Finding Union and Intersection of 2 Large Lists


One of my colleague who works on performance tracking and tuning of applications asked for some helped around how to find the union and intersection of large lists. He works mostly on Python and Perl. Being a Java guy, I prepared a sample program for him to do this allowing to determine the size of the lists by himself. He was happy to get the program and once I explained him a bit about the retailAll(), addAll() and removeAll() API of Java and how I used those to determine the union and intersection, it was very clear to him. I am giving that program here in case it helps you as a reference implementation.


package com.salesforce.test;

import java.util.List;
import java.util.ArrayList;
import java.util.Date;

/**
* To compiple: javac -d . ListPerformanceTest.java
* To run: java com.salesforce.test.ListPerformanceTest
*
* @author ashik
*/
public class ListPerformanceTest {

private int LOOP_COUNT = 50000;
private List firstList;
private List secondList;

public ListPerformanceTest() {
firstList = new ArrayList();
secondList = new ArrayList();
for(int i = 0; i < LOOP_COUNT; i++) {
if(i % 3 != 0 || i % 5 != 0) {
firstList.add("ashik - " + i);
}
if(i % 9 != 0) {
secondList.add("ashik - " + i);
}
}
}

public static void main(String[] args) {
System.out.println("\nListPerformanceTest starts.....\n");
ListPerformanceTest perf = new ListPerformanceTest();
List intersection = new ArrayList();
List union = new ArrayList();

Date d1 = new Date(System.currentTimeMillis());
System.out.println("d1 = " + d1);
for(String value : perf.firstList) {
System.out.println("value for firstList = " + value);
}
Date d2 = new Date(System.currentTimeMillis());
System.out.println("d2 = " + d2);
for(String value : perf.secondList) {
System.out.println("value for secondList = " + value);
}
Date d3 = new Date(System.currentTimeMillis());
System.out.println("d3 = " + d3);

System.out.println("perf.firstList.size() = " + perf.firstList.size() + " and perf.secondList.size() = " + perf.secondList.size());

if(perf.firstList.size() >= perf.secondList.size()) {
intersection.addAll(perf.firstList);
intersection.retainAll(perf.secondList);
} else {
intersection.addAll(perf.secondList);
intersection.retainAll(perf.firstList);
}
Date d4 = new Date(System.currentTimeMillis());
System.out.println("d4 = " + d4);
System.out.println("intersection.size() = " + intersection.size());

if(perf.firstList.size() >= perf.secondList.size()) {
union.addAll(perf.firstList);
union.removeAll(perf.secondList);
union.addAll(perf.secondList);
} else {
union.addAll(perf.secondList);
union.removeAll(perf.firstList);
union.addAll(perf.firstList);
}
Date d5 = new Date(System.currentTimeMillis());
System.out.println("d5 = " + d5);
System.out.println("union.size() = " + union.size());

System.out.println("\nListPerformanceTest ends.....\n");
}

}

The significant part from the output when you run the program is given below.

d3 = Fri Nov 25 10:03:36 PST 2011
perf.firstList.size() = 46666 and perf.secondList.size() = 44444
d4 = Fri Nov 25 10:03:55 PST 2011
intersection.size() = 42222
d5 = Fri Nov 25 10:04:14 PST 2011
union.size() = 48888

Leave a comment

This site uses Akismet to reduce spam. Learn how your comment data is processed.