Home   About Me   Blog   Algorithms & datastructures Home  



import typing
import pathlib

import utils


# Collision in Hashmaps is when more than one key hashes to the same slot.
# The pigeonhole-principle means that any hash function with more inputs than outputs MUST have collisions.
# The birthday-paradox places an upper bound on collision resistance;
# If a hash function produces N bits of output, an attacker who computes √2^N hash operations on random input is likely to find two matching outputs.
# The number of N-bit hashes that can be generated before getting a collision is 2^(N/2)
#   - https://en.wikipedia.org/wiki/Birthday_problem#Probability_of_a_shared_birthday_(collision)
#
# Collision can be eliminated by the following two ways:

# 1. Chaining
# In chaining, we place all the elements that hash to the same slot into the same linked list.
# Downside: It requires additional memory outside the hmap.

# 2. Open Addressing
# It's where collisions are resolved by probing. One method is by using double hashing.
# Downside: the hash table can "fill-up" so that no further insertions can be made
# Linear probing: If we're trying to insert an item but there's one already there, simply move to the next slot.
#                 If the next slot is full too, move along again, until you find an empty one,
#                 wrapping around to the beginning if you hit the end of the array.
#                 This technique is a lot faster than linked lists, because your CPU's
#                 cache has probably fetched the next items already.
#                 https://benhoyt.com/writings/hash-table-in-c/
# Look at the email conversation I had with Ben Hoyt with the email-title `hash table probe stats`

#
# Universal hashing;
# Its where we select the hash function from a random list of hash_funcs at runtime.
# This protects against adversaries been able to choose keys that will cause collision thus degrading perfomance.
# Python switched away from FNV-hash for this reason: https://www.python.org/dev/peps/pep-0456/
# The Golang Map implementation rotates the hash-seed to make it harder for attackers to trigger collisions.
# see: (i)   https://github.com/golang/go/blob/go1.16.4/src/runtime/map.go#L298-L313
#      (ii)  https://github.com/golang/go/blob/go1.16.4/src/runtime/map.go#L1001-L1003
#      (iii) https://github.com/golang/go/issues/25237
#      (iv)  https://hackernoon.com/some-insights-on-maps-in-golang-rm5v3ywh
#      (v) https://www.youtube.com/watch?v=DMQ_HcNSOAI (Great video on hashmaps & perfect hashmaps by Strager)

# For an example of creating a hashMap using probing(open addressing) see the link below
# It also shows how to grow the hashmap at runtime.
# https://github.com/benhoyt/counter/blob/6f3e137837f95fe029899f58eef350f38a18be15/counter.go#L41-L80
# https://gophers.slack.com/archives/C0VP8EF3R/p1615147587400800

# Another hashmap in Go by Josh Baker(tidwall)
# https://github.com/tidwall/hashmap/tree/v1.4.2

# Different hashmap algorithms;
# https://www.andreinc.net/2021/11/08/a-tale-of-java-hash-tables

# You can also prevent DOS by using a keyed hash,
# where the output hash depends not only on the string, but on a secret key.
# Typically the key is randomly generated at startup, or when creating a new hashtable.
# SipHash, is an example of a cryptographic keyed hash function.
# https://blog.polybdenum.com/2017/03/02/generating-64-bit-hash-collisions-to-dos-python.html

# "Zig's ArrayHashMap is basically ArrayList(Key) + ArrayList(Value) + an index object stored separately
# this is why ArrayHashMap(void, void) is sometimes useful" - Andrew Kelley[1]
# This also has a HashMap[3]. Difference is, ArrayHashMap is optimized for iteration.
#
#  1. https://discord.com/channels/605571803288698900/785499283368706060/951284933484245052
#  2. https://github.com/ziglang/zig/blob/f736cde397a6abb1399827ed5988c43001706580/lib/std/array_hash_map.zig#L460
#  3. https://github.com/ziglang/zig/blob/f736cde397a6abb1399827ed5988c43001706580/lib/std/hash_map.zig#L688

# Most hash funcs have more collisions than necessary.
# To deal with this, you can permute the final result using a "finalizer" such as found in murmurhash.
#  final_hash = finalizer(paul_larson_hash("hey"))
# https://www.andreinc.net/2021/10/02/implementing-hash-tables-in-c-part-1
# https://www.reddit.com/r/C_Programming/comments/q88m49/implementing_hash_tables_in_c_an_article_ive/hgo9qz3/
# https://github.com/aappleby/smhasher/blob/61a0530f28277f2e850bfc39600ce61d02b518de/src/MurmurHash3.cpp#L66-L77


# https://stackoverflow.com/a/2909572/2768067
# https://www.strchr.com/hash_functions
def paul_larson_hash(s: str, capacity: typing.Union[int, None] = None):
    hash = 0
    for i in s:
        hash = 101 * hash + ord(i)

    if capacity:
        # constrain hash in range [0, capacity - 1]
        return hash % capacity
    return hash


class Node:
    def __init__(self, key, val):
        self.key = key
        self.val = val
        self.next = None
        self.previous = None

    def __repr__(self):
        return f"{self.key}:{self.val}"


# Hmap using chaining
# Released under GNU General Public License v3.0
# full license is available: https://github.com/graphoarty/python-dsa/blob/cad73db712b147ba236919461e2263e46480761a/LICENSE
class Hmap:
    def __init__(self, capacity=3, hash_func=None):
        self.capacity = capacity
        self.buckets: list[Node] = [None] * self.capacity
        self.hash_func = hash_func

    def hash(self, key):
        if self.hash_func:
            # for tests
            return self.hash_func(key, self.capacity)
        return paul_larson_hash(key, self.capacity)

    def insert(self, key, val):
        index = self.hash(key)
        node = self.buckets[index]
        if node is None:
            # bucket is empty, add node
            self.buckets[index] = Node(key, val)
            return
        else:
            # handle collision
            while node is not None and node.next is not None:
                node = node.next
            new_node = Node(key, val)
            new_node.previous = node
            node.next = new_node

    def get(self, key):
        index = self.hash(key)
        node = self.buckets[index]
        while node is not None:
            if node.key == key:
                return node.val
            node = node.next
        return None

    def delete(self, key):
        index = self.hash(key)
        node = self.buckets[index]
        while node is not None:
            if node.key == key:
                res = node.val
                # delete
                if node.previous:
                    node.previous.next = node.next
                if node.next:
                    node.next.previous = node.previous
                if (node.previous is None) and (node.next is None):
                    # node was on it's own
                    self.buckets[index] = None
                return res
            node = node.next
        return None


ht = Hmap()


def test_hash_collision():
    # 10k words
    # https://www.mit.edu/~ecprice/wordlist.10000
    f = pathlib.Path.cwd().joinpath("test_data/10_000_words.txt").open()
    lines = f.readlines()
    f.close()

    hashes = []
    for i in lines:
        hash = paul_larson_hash(i)
        if hash in hashes:
            raise Exception(f"Collision for word: {i} with hash: {hash}")
        hashes.append(hash)


test_hash_collision()


def test_hash_func():
    utils.assert_r("hashmap", ht.hash("hello"), ht.hash("hello"))
    utils.assert_r("hashmap", ht.hash("hello"), 0)


test_hash_func()


def test_insert():
    ht.insert("test_key", "test_value")
    utils.assert_r("hashmap", ht.buckets[ht.hash("test_key")].val, "test_value")


test_insert()


def test_get():
    obj = "hello"
    ht.insert("key1", obj)
    utils.assert_r("hashmap", ht.get("key1"), obj)

    obj2 = ["this", "is", "a", "list"]
    ht.insert("key2", obj2)
    utils.assert_r("hashmap", ht.get("key2"), obj2)


test_get()


def test_delete():
    def fake_hash(s, capacity):
        # this hash func will produce collisions irrespective of input
        return 1

    ht = Hmap(hash_func=fake_hash)
    ht.insert("key1", "One")
    obj = "Two object"
    ht.insert("key2", obj)
    ht.insert("key3", "Three")

    utils.assert_r("hashmap", ht.delete("key2"), obj)
    utils.assert_r("hashmap", ht.delete("key2"), None)
    utils.assert_r("hashmap", ht.delete("key2"), None)
    utils.assert_r("hashmap", ht.delete("some random key"), None)
    utils.assert_r("hashmap", ht.delete("key2"), None)
    utils.assert_r("hashmap", ht.buckets[1].key, "key1")
    utils.assert_r("hashmap", ht.buckets[1].next.key, "key3")

    ht2 = Hmap()
    ht2.insert("A", 5)
    ht2.insert("B", 10)
    ht2.insert("Ball", "hello")

    utils.assert_r("hashmap", ht2.delete("A"), 5)
    utils.assert_r("hashmap", ht2.get("A"), None)
    utils.assert_r("hashmap", ht2.delete("A"), None)
    utils.assert_r("hashmap", ht2.delete("A"), None)


test_delete()


def test_capacity():
    # Test all public methods in one run at a large capacity
    for i in range(0, 1000):
        ht.insert("key" + str(i), "value")
    for i in range(0, 1000):
        _got = ht.get("key" + str(i))
        _deleted = ht.delete("key" + str(i))
        utils.assert_r("hashmap", _got, _deleted)


test_capacity()