import typing
import pathlib
import utils
# Collision in Hashmaps is when more than one key hashes to the same slot.
# The pigeonhole-principle means that any hash function with more inputs than outputs MUST have collisions.
# The birthday-paradox places an upper bound on collision resistance;
# If a hash function produces N bits of output, an attacker who computes √2^N hash operations on random input is likely to find two matching outputs.
# The number of N-bit hashes that can be generated before getting a collision is 2^(N/2)
# - https://en.wikipedia.org/wiki/Birthday_problem#Probability_of_a_shared_birthday_(collision)
#
# Collision can be eliminated by the following two ways:
# 1. Chaining
# In chaining, we place all the elements that hash to the same slot into the same linked list.
# Downside: It requires additional memory outside the hmap.
# 2. Open Addressing
# It's where collisions are resolved by probing. One method is by using double hashing.
# Downside: the hash table can "fill-up" so that no further insertions can be made
# Linear probing: If we're trying to insert an item but there's one already there, simply move to the next slot.
# If the next slot is full too, move along again, until you find an empty one,
# wrapping around to the beginning if you hit the end of the array.
# This technique is a lot faster than linked lists, because your CPU's
# cache has probably fetched the next items already.
# https://benhoyt.com/writings/hash-table-in-c/
# Look at the email conversation I had with Ben Hoyt with the email-title `hash table probe stats`
#
# Universal hashing;
# Its where we select the hash function from a random list of hash_funcs at runtime.
# This protects against adversaries been able to choose keys that will cause collision thus degrading perfomance.
# Python switched away from FNV-hash for this reason: https://www.python.org/dev/peps/pep-0456/
# The Golang Map implementation rotates the hash-seed to make it harder for attackers to trigger collisions.
# see: (i) https://github.com/golang/go/blob/go1.16.4/src/runtime/map.go#L298-L313
# (ii) https://github.com/golang/go/blob/go1.16.4/src/runtime/map.go#L1001-L1003
# (iii) https://github.com/golang/go/issues/25237
# (iv) https://hackernoon.com/some-insights-on-maps-in-golang-rm5v3ywh
# (v) https://www.youtube.com/watch?v=DMQ_HcNSOAI (Great video on hashmaps & perfect hashmaps by Strager)
# For an example of creating a hashMap using probing(open addressing) see the link below
# It also shows how to grow the hashmap at runtime.
# https://github.com/benhoyt/counter/blob/6f3e137837f95fe029899f58eef350f38a18be15/counter.go#L41-L80
# https://gophers.slack.com/archives/C0VP8EF3R/p1615147587400800
# Another hashmap in Go by Josh Baker(tidwall)
# https://github.com/tidwall/hashmap/tree/v1.4.2
# Different hashmap algorithms;
# https://www.andreinc.net/2021/11/08/a-tale-of-java-hash-tables
# You can also prevent DOS by using a keyed hash,
# where the output hash depends not only on the string, but on a secret key.
# Typically the key is randomly generated at startup, or when creating a new hashtable.
# SipHash, is an example of a cryptographic keyed hash function.
# https://blog.polybdenum.com/2017/03/02/generating-64-bit-hash-collisions-to-dos-python.html
# "Zig's ArrayHashMap is basically ArrayList(Key) + ArrayList(Value) + an index object stored separately
# this is why ArrayHashMap(void, void) is sometimes useful" - Andrew Kelley[1]
# This also has a HashMap[3]. Difference is, ArrayHashMap is optimized for iteration.
#
# 1. https://discord.com/channels/605571803288698900/785499283368706060/951284933484245052
# 2. https://github.com/ziglang/zig/blob/f736cde397a6abb1399827ed5988c43001706580/lib/std/array_hash_map.zig#L460
# 3. https://github.com/ziglang/zig/blob/f736cde397a6abb1399827ed5988c43001706580/lib/std/hash_map.zig#L688
# Most hash funcs have more collisions than necessary.
# To deal with this, you can permute the final result using a "finalizer" such as found in murmurhash.
# final_hash = finalizer(paul_larson_hash("hey"))
# https://www.andreinc.net/2021/10/02/implementing-hash-tables-in-c-part-1
# https://www.reddit.com/r/C_Programming/comments/q88m49/implementing_hash_tables_in_c_an_article_ive/hgo9qz3/
# https://github.com/aappleby/smhasher/blob/61a0530f28277f2e850bfc39600ce61d02b518de/src/MurmurHash3.cpp#L66-L77
# https://stackoverflow.com/a/2909572/2768067
# https://www.strchr.com/hash_functions
def paul_larson_hash(s: str, capacity: typing.Union[int, None] = None):
hash = 0
for i in s:
hash = 101 * hash + ord(i)
if capacity:
# constrain hash in range [0, capacity - 1]
return hash % capacity
return hash
class Node:
def __init__(self, key, val):
self.key = key
self.val = val
self.next = None
self.previous = None
def __repr__(self):
return f"{self.key}:{self.val}"
# Hmap using chaining
# Released under GNU General Public License v3.0
# full license is available: https://github.com/graphoarty/python-dsa/blob/cad73db712b147ba236919461e2263e46480761a/LICENSE
class Hmap:
def __init__(self, capacity=3, hash_func=None):
self.capacity = capacity
self.buckets: list[Node] = [None] * self.capacity
self.hash_func = hash_func
def hash(self, key):
if self.hash_func:
# for tests
return self.hash_func(key, self.capacity)
return paul_larson_hash(key, self.capacity)
def insert(self, key, val):
index = self.hash(key)
node = self.buckets[index]
if node is None:
# bucket is empty, add node
self.buckets[index] = Node(key, val)
return
else:
# handle collision
while node is not None and node.next is not None:
node = node.next
new_node = Node(key, val)
new_node.previous = node
node.next = new_node
def get(self, key):
index = self.hash(key)
node = self.buckets[index]
while node is not None:
if node.key == key:
return node.val
node = node.next
return None
def delete(self, key):
index = self.hash(key)
node = self.buckets[index]
while node is not None:
if node.key == key:
res = node.val
# delete
if node.previous:
node.previous.next = node.next
if node.next:
node.next.previous = node.previous
if (node.previous is None) and (node.next is None):
# node was on it's own
self.buckets[index] = None
return res
node = node.next
return None
ht = Hmap()
def test_hash_collision():
# 10k words
# https://www.mit.edu/~ecprice/wordlist.10000
f = pathlib.Path.cwd().joinpath("test_data/10_000_words.txt").open()
lines = f.readlines()
f.close()
hashes = []
for i in lines:
hash = paul_larson_hash(i)
if hash in hashes:
raise Exception(f"Collision for word: {i} with hash: {hash}")
hashes.append(hash)
test_hash_collision()
def test_hash_func():
utils.assert_r("hashmap", ht.hash("hello"), ht.hash("hello"))
utils.assert_r("hashmap", ht.hash("hello"), 0)
test_hash_func()
def test_insert():
ht.insert("test_key", "test_value")
utils.assert_r("hashmap", ht.buckets[ht.hash("test_key")].val, "test_value")
test_insert()
def test_get():
obj = "hello"
ht.insert("key1", obj)
utils.assert_r("hashmap", ht.get("key1"), obj)
obj2 = ["this", "is", "a", "list"]
ht.insert("key2", obj2)
utils.assert_r("hashmap", ht.get("key2"), obj2)
test_get()
def test_delete():
def fake_hash(s, capacity):
# this hash func will produce collisions irrespective of input
return 1
ht = Hmap(hash_func=fake_hash)
ht.insert("key1", "One")
obj = "Two object"
ht.insert("key2", obj)
ht.insert("key3", "Three")
utils.assert_r("hashmap", ht.delete("key2"), obj)
utils.assert_r("hashmap", ht.delete("key2"), None)
utils.assert_r("hashmap", ht.delete("key2"), None)
utils.assert_r("hashmap", ht.delete("some random key"), None)
utils.assert_r("hashmap", ht.delete("key2"), None)
utils.assert_r("hashmap", ht.buckets[1].key, "key1")
utils.assert_r("hashmap", ht.buckets[1].next.key, "key3")
ht2 = Hmap()
ht2.insert("A", 5)
ht2.insert("B", 10)
ht2.insert("Ball", "hello")
utils.assert_r("hashmap", ht2.delete("A"), 5)
utils.assert_r("hashmap", ht2.get("A"), None)
utils.assert_r("hashmap", ht2.delete("A"), None)
utils.assert_r("hashmap", ht2.delete("A"), None)
test_delete()
def test_capacity():
# Test all public methods in one run at a large capacity
for i in range(0, 1000):
ht.insert("key" + str(i), "value")
for i in range(0, 1000):
_got = ht.get("key" + str(i))
_deleted = ht.delete("key" + str(i))
utils.assert_r("hashmap", _got, _deleted)
test_capacity()