Security Now episode 989 (https://www.grc.com/sn/sn-989-notes.pdf) is a "propeller head" episode that discusses Bloom Filters. This inspired me to code up an example of one (in Java), which I will present here for those interested. First, here's an old example I coded up ages ago, that uses Google Guava's implementation of a Bloom Filter. This is to show you that you probably don't need to roll your own, but you may enjoy learning from doing so.
Java:
import org.eclipse.jdt.annotation.Nullable;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnel;
import com.google.common.hash.PrimitiveSink;
final class BloomFilterExample
{
private static final int MAX_NUMBER_OF_ENTRIES = 10_000;
private static final int EXTRA_ROOM_FOR_ENTRIES_FOR_FALSE_POSITIVE_PREVENTION = 0;
// The default value for false positive probability is 0.03 (3%)
private static final double FALSE_POSITIVE_PROBABILITY = 0.01;
private static final int STARTING_PRIME = 7883; // 7883 is one of the first one thousand primes
private final BloomFilter<BigInteger> bloomFilter = BloomFilter.create(BigIntegerFunnel.INSTANCE,
MAX_NUMBER_OF_ENTRIES + EXTRA_ROOM_FOR_ENTRIES_FOR_FALSE_POSITIVE_PREVENTION,
FALSE_POSITIVE_PROBABILITY);
private final List<BigInteger> listOfBigIntsPutIntoBloomFilter = new ArrayList<>(MAX_NUMBER_OF_ENTRIES);
public void doExample()
{
// Find a series of [probable] prime numbers
BigInteger curPrime = BigInteger.valueOf(STARTING_PRIME);
for (int i=0; i<MAX_NUMBER_OF_ENTRIES; i++)
{
curPrime = curPrime.nextProbablePrime();
bloomFilter.put(curPrime);
listOfBigIntsPutIntoBloomFilter.add(curPrime);
}
// Test the bloom filter, tracking the results
// Note that the testing range includes more than just primes, so the number of tests will
// be significant larger than the number of entries we installed into the bloom filter
// We do, however, skip testing the even numbers, because if this bloom filter was to actually
// be used for primality indication, no good programmer would bother testing even numbers.
int numberOfValuesTested = 0;
int numberOfFalsePositives = 0;
for (int i=STARTING_PRIME; i<curPrime.intValue(); i+=2)
{
final BigInteger soughtValue = BigInteger.valueOf(i);
numberOfValuesTested++;
if (bloomFilter.mightContain(soughtValue))
{
if (listOfBigIntsPutIntoBloomFilter.contains(soughtValue))
{
//System.out.println(soughtValue + " was in both");
}
else
{
//System.out.println(soughtValue + " was a false positive");
numberOfFalsePositives++;
}
}
else
{
//System.out.println(soughtValue + " not in bloom filter");
}
}
System.out.println("When " + numberOfValuesTested + " different values were tested, there were " + numberOfFalsePositives + " false positives.");
final double falsePositivePercentage = ((double)(int)((double)(numberOfFalsePositives*10000) / numberOfValuesTested))/100;
System.out.println("The actual false positive rate was " + falsePositivePercentage + "%, while the requested rate was " + (FALSE_POSITIVE_PROBABILITY * 100) + "%.");
}
private static enum BigIntegerFunnel implements Funnel<BigInteger>
{
// The Guava docs recommend making the Funnel an enum to help with
// [de]serialization
INSTANCE;
@Override
// This is quite probably @NonNull but it's painful to override and this check is safe and easy
public void funnel(final BigInteger from, final @Nullable PrimitiveSink into)
{
Objects.requireNonNull(into);
into.putBytes(from.toByteArray());
}
}
}