Home   About Me   Blog   Algorithms & datastructures Home  



import sys
import utils
import pathlib

hmap = __import__("6_hashmaps")


# A Bloom filter is a probabilistic data structure that tells you, efficiently, whether an element is present in a set.
# It tells us whether an element is;
# - DEFINITELY NOT in the set, or
# - may be in the set.
#
# GoogleChrome used to use a bloomFilter to detect malicious URLS.
# It shipped with a bloomFilter containing all known malicious URLs.
# Then when you click on a link, chrome checks if that link is in the bloomFilter.
# If the link is not in the bloomFilter; then that link is safe.
# This is because bloomFilters tell us when something is DEFINITELY NOT in the set.

# You may also have a large list of already compromised passwords and
# you want to check whether a new password is part of this list of compromised passwords.
# https://lemire.me/blog/2019/12/19/xor-filters-faster-and-smaller-than-bloom-filters


# https://llimllib.github.io/bloomfilter-tutorial/
# https://www.kdnuggets.com/2016/08/gentle-introduction-bloom-filter.html
# https://stackoverflow.com/questions/18447874/google-chrome-usage-of-bloom-filter
class BloomFilter:
    def __init__(self, size):
        self.size = size
        self.bit_array = [0] * self.size  # ie bit array(bit vector)
        self.hash_count = 3  # increases effectiveness.

    def set(self, item):
        # O(k). k== number of hash funcs(or self.hash_count in our case)

        # Note it is possible to hash `item` only once.
        # But hashing more times increases effectiveness.
        # Ideally you should use different hash functions
        for i in range(self.hash_count):
            index = hmap.paul_larson_hash(item + str(i), self.size)
            self.bit_array[index] = 1

    def get(self, item) -> bool:
        # O(k)
        present = True
        for i in range(self.hash_count):
            index = hmap.paul_larson_hash(item + str(i), self.size)
            if self.bit_array[index] == 0:
                present = False
        return present


bloom = BloomFilter(100)

animals = [
    "dog",
    "cat",
    "giraffe",
    "fly",
    "mosquito",
    "horse",
    "eagle",
    "bird",
    "bison",
    "boar",
    "butterfly",
    "ant",
    "anaconda",
    "bear",
    "chicken",
    "dolphin",
    "donkey",
    "crow",
    "crocodile",
]
# First insertion of animals into the bloom filter
for animal in animals:
    bloom.set(animal)

# Membership existence for already inserted animals
# There should NOT be any false negatives
for animal in animals:
    if not bloom.get(animal):
        # ie: False Negative should not happen
        utils.assert_r("bloom_filter", bloom.get(animal), True)


false_positives = []
correct = []
# Membership existence for not inserted animals
# There could be false positives
other_animals = [
    "badger",
    "cow",
    "pig",
    "sheep",
    "bee",
    "wolf",
    "fox",
    "whale",
    "shark",
    "fish",
    "turkey",
    "duck",
    "dove",
    "deer",
    "elephant",
    "frog",
    "falcon",
    "goat",
    "gorilla",
    "hawk",
]
for other_animal in other_animals:
    if bloom.get(other_animal):
        false_positives.append(1)
    else:
        correct.append(1)

utils.assert_r("bloom_filter", len(false_positives), 6)
utils.assert_r("bloom_filter", len(correct), 14)


def test_space_efficiency():
    """
    BloomFilters are more space efficient compared to things like hashmaps or arrays
    https://lemire.me/blog/2019/12/19/xor-filters-faster-and-smaller-than-bloom-filters/
    """
    f = pathlib.Path.cwd().joinpath("test_data/10_000_words.txt").open()
    fRead = f.readlines()
    f.close()
    lines = []
    for i in fRead:
        lines.append(i)

    bf = BloomFilter(int(len(lines) / 10))
    theDict = {}
    for h in lines:
        bf.set(h)
        theDict[h] = h

    size_of_lines_list = sys.getsizeof(lines)
    size_of_bloom_filter = sys.getsizeof(bf.bit_array)
    size_of_dict = sys.getsizeof(theDict)

    bloom_better_than_dict = int(size_of_dict / size_of_bloom_filter)
    bloom_better_than_list = int(size_of_lines_list / size_of_bloom_filter)

    if bloom_better_than_dict < 30:
        utils.assert_r(
            "bloom_filter",
            "bloom_filter is not as space efficient compared to a dictionary",
            "bloom_filter should be much more space efficient compared to dictionary",
        )
    if bloom_better_than_list < 10:
        utils.assert_r(
            "bloom_filter",
            "bloom_filter is not as space efficient compared to a list",
            "bloom_filter should be much more space efficient compared to list",
        )


test_space_efficiency()