import sys
import utils
import pathlib
hmap = __import__("6_hashmaps")
# A Bloom filter is a probabilistic data structure that tells you, efficiently, whether an element is present in a set.
# It tells us whether an element is;
# - DEFINITELY NOT in the set, or
# - may be in the set.
#
# GoogleChrome used to use a bloomFilter to detect malicious URLS.
# It shipped with a bloomFilter containing all known malicious URLs.
# Then when you click on a link, chrome checks if that link is in the bloomFilter.
# If the link is not in the bloomFilter; then that link is safe.
# This is because bloomFilters tell us when something is DEFINITELY NOT in the set.
# You may also have a large list of already compromised passwords and
# you want to check whether a new password is part of this list of compromised passwords.
# https://lemire.me/blog/2019/12/19/xor-filters-faster-and-smaller-than-bloom-filters
# https://llimllib.github.io/bloomfilter-tutorial/
# https://www.kdnuggets.com/2016/08/gentle-introduction-bloom-filter.html
# https://stackoverflow.com/questions/18447874/google-chrome-usage-of-bloom-filter
class BloomFilter:
def __init__(self, size):
self.size = size
self.bit_array = [0] * self.size # ie bit array(bit vector)
self.hash_count = 3 # increases effectiveness.
def set(self, item):
# O(k). k== number of hash funcs(or self.hash_count in our case)
# Note it is possible to hash `item` only once.
# But hashing more times increases effectiveness.
# Ideally you should use different hash functions
for i in range(self.hash_count):
index = hmap.paul_larson_hash(item + str(i), self.size)
self.bit_array[index] = 1
def get(self, item) -> bool:
# O(k)
present = True
for i in range(self.hash_count):
index = hmap.paul_larson_hash(item + str(i), self.size)
if self.bit_array[index] == 0:
present = False
return present
bloom = BloomFilter(100)
animals = [
"dog",
"cat",
"giraffe",
"fly",
"mosquito",
"horse",
"eagle",
"bird",
"bison",
"boar",
"butterfly",
"ant",
"anaconda",
"bear",
"chicken",
"dolphin",
"donkey",
"crow",
"crocodile",
]
# First insertion of animals into the bloom filter
for animal in animals:
bloom.set(animal)
# Membership existence for already inserted animals
# There should NOT be any false negatives
for animal in animals:
if not bloom.get(animal):
# ie: False Negative should not happen
utils.assert_r("bloom_filter", bloom.get(animal), True)
false_positives = []
correct = []
# Membership existence for not inserted animals
# There could be false positives
other_animals = [
"badger",
"cow",
"pig",
"sheep",
"bee",
"wolf",
"fox",
"whale",
"shark",
"fish",
"turkey",
"duck",
"dove",
"deer",
"elephant",
"frog",
"falcon",
"goat",
"gorilla",
"hawk",
]
for other_animal in other_animals:
if bloom.get(other_animal):
false_positives.append(1)
else:
correct.append(1)
utils.assert_r("bloom_filter", len(false_positives), 6)
utils.assert_r("bloom_filter", len(correct), 14)
def test_space_efficiency():
"""
BloomFilters are more space efficient compared to things like hashmaps or arrays
https://lemire.me/blog/2019/12/19/xor-filters-faster-and-smaller-than-bloom-filters/
"""
f = pathlib.Path.cwd().joinpath("test_data/10_000_words.txt").open()
fRead = f.readlines()
f.close()
lines = []
for i in fRead:
lines.append(i)
bf = BloomFilter(int(len(lines) / 10))
theDict = {}
for h in lines:
bf.set(h)
theDict[h] = h
size_of_lines_list = sys.getsizeof(lines)
size_of_bloom_filter = sys.getsizeof(bf.bit_array)
size_of_dict = sys.getsizeof(theDict)
bloom_better_than_dict = int(size_of_dict / size_of_bloom_filter)
bloom_better_than_list = int(size_of_lines_list / size_of_bloom_filter)
if bloom_better_than_dict < 30:
utils.assert_r(
"bloom_filter",
"bloom_filter is not as space efficient compared to a dictionary",
"bloom_filter should be much more space efficient compared to dictionary",
)
if bloom_better_than_list < 10:
utils.assert_r(
"bloom_filter",
"bloom_filter is not as space efficient compared to a list",
"bloom_filter should be much more space efficient compared to list",
)
test_space_efficiency()