7.8: Implementations
While many programming languages already provide hash table functionality, [1] there are several independent implementations worth mentioning.
- Google Sparse Hash The Google SparseHash project contains several hash-map implementations in use at Google, with different performance characteristics, including an implementation that optimizes for space and one that optimizes for speed. The memory-optimized one is extremely memory-efficient with only 2 bits/entry of overhead.
-
MCT
provides hashtables similar to Google's
dense_hash_map
, but without restriction on contained values; it also comes with exception safety and support for C++0x features. - A number of runtime languages and/or standard libraries use hash tables to implement their support for associative arrays because of their efficiency.
A python implementation of extendible hashing
The file - block management routines aren't there, so this could be added in to make this a real implementation of a database hash index.
A full page is split according to the (local depth)th bit, first by collecting all the directory indices pointing to the full page, updating the pointers according to the d bit being 0 or 1 corresponding to first and second new pages, then reloading all the key-values after hashing each key and using the d bit of each hash to see which page to allocate to. The local depth of each new page is one greater than the old page's local depth, so that the next d bit can be used next time when splitting.
PAGE_SZ = 20 class Page: def __init__(self): self.m = {} self.d = 0 def full(self): return len(self.m) > PAGE_SZ def put(self,k,v): self.m[k] = v def get(self,k): return self.m.get(k) class EH: def __init__(self): self.gd = 0 p = Page() self.pp= [p] def get_page(self,k): h = hash(k) p = self.pp[ h & (( 1 << self.gd) -1)] return p def put(self, k, v): p = self.get_page(k) if p.full() and p.d == self.gd: self.pp = self.pp + self.pp self.gd += 1 if p.full() and p.d < self.gd: p.put(k,v); p1 = Page() p2 = Page() for k2 in p.m.keys(): v2 = p.m[k2] h = k2.__hash__() h = h & ((1 << self.gd) -1) if (h | (1 << p.d) == h): p2.put(k2,v2) else: p1.put(k2,v2) l = [] for i in xrange(0, len(self.pp)): if self.pp[i] == p: l.append(i) for i in l: if (i | ( 1 << p.d) == i): self.pp[i] = p2 else: self.pp[i] = p1 p1.d = p.d + 1 p2.d = p1.d else: p.put(k, v) def get(self, k): p = self.get_page(k) return p.get(k) if __name__ == "__main__": eh = EH() N = 10000 l = [] for i in range(0,N): l.append(i) import random random.shuffle(l) for i in l: eh.put(i,i) print l for i in range(0, N): print eh.get(i)
A Java implementation of extendible hashing
A direct Java translation of the above python code, tested to work.
package ext_hashing; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Random; public class EH2<K, V> { static class Page<K, V> { static int PAGE_SZ = 20; private Map<K, V> m = new HashMap<K, V>(); int d = 0; boolean full() { return m.size() > PAGE_SZ; } void put(K k, V v) { m.put(k, v); } V get(K k) { return m.get(k); } } int gd = 0; List<Page<K, V>> pp = new ArrayList<Page<K, V>>(); public EH2() { pp.add(new Page<K, V>()); } Page<K, V> getPage(K k) { int h = k.hashCode(); Page<K, V> p = pp.get(h & ((1 << gd) - 1)); return p; } void put(K k, V v) { Page<K, V> p = getPage(k); if (p.full() && p.d == gd) { List<Page<K, V>> pp2 = new ArrayList<EH2.Page<K, V>>(pp); pp.addAll(pp2); ++gd; } if (p.full() && p.d < gd) { p.put(k, v); Page<K, V> p1, p2; p1 = new Page<K, V>(); p2 = new Page<K, V>(); for (K k2 : p.m.keySet()) { V v2 = p.m.get(k2); int h = k2.hashCode() & ((1 << gd) - 1); if ((h | (1 << p.d)) == h) p2.put(k2, v2); else p1.put(k2, v2); } List<Integer> l = new ArrayList<Integer>(); for (int i = 0; i < pp.size(); ++i) if (pp.get(i) == p) l.add(i); for (int i : l) if ((i | (1 << p.d)) == i) pp.set(i, p2); else pp.set(i, p1); p1.d = p.d + 1; p2.d = p1.d; } else p.put(k, v); } public V get(K k) { return getPage(k).get(k); } public static void main(String[] args) { int N = 500000; Random r = new Random(); List<Integer> l = new ArrayList<Integer>(); for (int i = 0; i < N; ++i) { l.add(i); } for (int i = 0; i < N; ++i) { int j = r.nextInt(N); int t = l.get(j); l.set(j, l.get(i)); l.set(i, t); } EH2<Integer, Integer> eh = new EH2<Integer, Integer>(); for (int i = 0; i < N; ++i) { eh.put(l.get(i), l.get(i)); } for (int i = 0; i < N; ++i) { System.out.printf("%d:%d , ", i, eh.get(i)); if (i % 10 == 0) System.out.println(); } } }
A Java implementation of linear hashing
( usable for simple database indexing of arbitrary size, (almost ?) )
This code evolved out of a need for a bigger Java HashMap. Originally, a Java HashMap standard object was used as a Map to index a heap like database file ( DBF format). But then, a file was encountered with so many records that the OutOfMemoryError was being thrown, so linear hashing seemed like a reasonably simple algorithm to use as a disk based index scheme.
Initially, linear hashing was implemented behind the Java Map interface, mainly the put(k,v) and get(k,v) methods. Generics was used, in order not to get too involved in the key and value details.
debugging to achieve functionality
Custom dumps to System.err was used to verify that blocks were being created, filled, and overflow blocks were of expected number if they existed ( no. = 1 ). This was all done in a purely in-memory implementation.
Later, the standard Java decoupling was introduced, where the original LHMap2 class accepted listeners to events, such as when a block was needed. The listener was then implemented as a block file manager, loading blocks into the block list of the memory LHMap2 object whenever a null block was encountered on the block list, and checking whether virtual machine runtime free memory was low, and retiring blocks using just a basic first-to-last-listed cache removal algorithm (not least recently used (LRU), not least often used ) by saving them to disk file, and then actively invoking the system garbage collector.
Because an application use case existed, which was to externally index a large DBF table, this use case was used as the main test harness for the algorithm implementation.
package linearhashmap; import java.io.Externalizable; import java.io.IOException; import java.io.ObjectInput; import java.io.ObjectOutput; import java.io.Serializable; import java.util.ArrayList; import java.util.Collection; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; /** * * @param <K> * key type , must implement equals() and hashCode() * @param <V> * value type * * */ public class LHMap2<K, V> implements Map<K, V>, Serializable { /** * */ private static final long serialVersionUID = 3095071852466632996L; /** * */ public static interface BlockListener<K, V> { public void blockRequested(int block, LHMap2<K, V> map); } List<BlockListener<K, V>> listeners = new ArrayList<BlockListener<K, V>>(); // int savedBlocks; int N; int level = 0; int split = 0; int blockSize; long totalWrites = 0; List<Block<K, V>> blockList = new ArrayList<Block<K, V>>(); public void addBlockListener(BlockListener<K, V> listener) { listeners.add(listener); } void notifyBlockRequested(int block) { for (BlockListener<K, V> l : listeners) { l.blockRequested(block, this); } } public LHMap2(int blockSize, int nStartingBlocks) { this.blockSize = blockSize; this.N = nStartingBlocks; for (int i = 0; i < nStartingBlocks; ++i) { addBlock(); } Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() { @Override public void run() { showStructure(); } })); } public static class Block<K, V> implements Externalizable { /** * */ int j = 0; Block<K, V> overflow = null; LinkedList<K> keyList = new LinkedList<K>(); LinkedList<V> valueList = new LinkedList<V>(); transient private LHMap2<K, V> owner; transient private Map<K, V> shadow = new TreeMap<K, V>(); private boolean changed = false; private int size = 0; public LHMap2<K, V> getOwner() { return owner; } public void setOwner(LHMap2<K, V> owner) { this.owner = owner; Block<K, V> ov = overflow; while (ov != null) { overflow.setOwner(owner); ov = ov.overflow; } } public Block() { super(); } public Block(LHMap2<K, V> map) { setOwner(map); } public V put(K k, V v) { setChanged(true); V v2 = replace(k, v); if (v2 == null) { ++size; if (keyList.size() == getOwner().blockSize) { if (overflow == null) { getOwner().blockOverflowed(this, k, v); } else { overflow.put(k, v); } } else { keyList.addFirst(k); valueList.addFirst(v); } } return v2; } void setChanged(boolean b) { changed = b; } public Map<K, V> drainToMap(Map<K, V> map) { while (!keyList.isEmpty()) { K k = keyList.removeLast(); V v = valueList.removeLast(); map.put(k, v); } if (overflow != null) map = overflow.drainToMap(map); garbageCollectionOverflow(); return map; } public void updateMap(Map<K, V> map) { Iterator<K> ik = keyList.descendingIterator(); Iterator<V> iv = valueList.descendingIterator(); while (ik.hasNext() && iv.hasNext()) { map.put(ik.next(), iv.next()); } if (overflow != null) overflow.updateMap(map); } private void garbageCollectionOverflow() { if (overflow != null) { overflow.garbageCollectionOverflow(); overflow = null; } } public void addOverflowBucket() { // assert overflow is needed if (keyList.size() < getOwner().blockSize) return; if (overflow == null) { overflow = new Block<K, V>(getOwner()); } else { overflow.addOverflowBucket(); } } public V replace(K key, V v2) { if (overflow != null) { V v = overflow.replace(key, v2); if (v != null) return v; } Iterator<K> i = keyList.listIterator(); int j = 0; while (i.hasNext()) { if (key.equals(i.next())) { V v = valueList.get(j); if (v2 != null) { valueList.set(j, v2); } return v; } ++j; } return null; } public boolean isChanged() { return changed; } @Override public void readExternal(ObjectInput arg0) throws IOException, ClassNotFoundException { int sz = arg0.readInt(); for (int i = 0; i < sz; ++i) { K k = (K) arg0.readObject(); V v = (V) arg0.readObject(); shadow.put(k, v); } } public void loadFromShadow(LHMap2<K, V> owner) { setOwner(owner); Block<K, V> b = this; for (K k : shadow.keySet()) { if (b.keyList.size() == owner.blockSize) { Block<K, V> overflow = new Block<K, V>(owner); b.overflow = overflow; b = overflow; } b.keyList.add(k); b.valueList.add(shadow.get(k)); } shadow.clear(); } @Override public void writeExternal(ObjectOutput arg0) throws IOException { if (!changed) return; Map<K, V> map = new TreeMap<K, V>(); updateMap(map); int sz = map.size(); arg0.writeInt(sz); for (K k : map.keySet()) { arg0.writeObject(k); arg0.writeObject(map.get(k)); } setChanged(false); } } void init() { for (int i = 0; i < N; ++i) { addBlock(); } } /** * @param hashValue * @return a bucket number. */ int getDynamicHash(int hashValue) { long unsignedHash = ((long) hashValue << 32) >>> 32; // ^^ this long cast needed int h = (int) (unsignedHash % (N << level)); // System.err.println("h = " + h); if (h < split) { h = (int) (unsignedHash % (N << (level + 1))); // System.err.println("h < split, new h = " + h); } return h; } @Override public V put(K k, V v) { ++totalWrites; int h = getDynamicHash(k.hashCode()); Block<K, V> b = getBlock(h); b.put(k, v); return v; } public long getTotalWrites() { return totalWrites; } private Block<K, V> getBlock(int h) { notifyBlockRequested(h); return blockList.get(h); } void blockOverflowed(Block<K, V> b, K k, V v) { splitNextBucket(); b.addOverflowBucket(); b.put(k, v); } private void splitNextBucket() { Block<K, V> b = getBlock(split); TreeMap<K, V> map = new TreeMap<K, V>(); b.drainToMap(map); addBlock(); System.err.printf("split N LEVEL %d %d %d \n", split, N, level); if (++split >= (N << level)) { ++level; split = 0; } for (K k : map.keySet()) { V v = map.get(k); int h = getDynamicHash(k.hashCode()); System.err.println(h + " "); Block<K, V> b2 = getBlock(h); b2.put(k, v); } } private Block<K, V> addBlock() { Block<K, V> b = new Block<K, V>(this); blockList.add(b); return b; } @Override public void clear() { blockList = new ArrayList<Block<K, V>>(); split = 0; level = 0; totalWrites = 0;// savedBlocks = 0; } @Override public boolean containsKey(Object key) { return get(key) != null; } @Override public boolean containsValue(Object value) { return values().contains(value); } @Override public Set<java.util.Map.Entry<K, V>> entrySet() { TreeSet<Map.Entry<K, V>> set = new TreeSet<Map.Entry<K, V>>(); Set<K> kk = keySet(); for (K k : kk) { final K k2 = k; set.add(new Entry<K, V>() { @Override public K getKey() { return k2; } @Override public V getValue() { return get(k2); } @Override public V setValue(V value) { return put(k2, value); } }); } return set; } @Override public V get(Object key) { int h = getDynamicHash(key.hashCode()); Block<K, V> b = getBlock(h); return b.replace((K) key, null); } @Override public boolean isEmpty() { return size() == 0; } @Override public Set<K> keySet() { TreeSet<K> kk = new TreeSet<K>(); for (int i = 0; i < blockList.size(); ++i) { Block<K, V> b = getBlock(i); kk.addAll(b.keyList); Block<K, V> ov = b.overflow; while (ov != null) { kk.addAll(ov.keyList); ov = ov.overflow; } } return kk; } @Override public void putAll(Map<? extends K, ? extends V> m) { for (K k : m.keySet()) { put(k, m.get(k)); } } @Override public V remove(Object key) { return null; } @Override public int size() { long sz = longSize(); return (int) (sz > Integer.MAX_VALUE ? Integer.MAX_VALUE : sz); } public long longSize() { long sz = 0; for (Block<K, V> b : blockList) { Block<K, V> b1 = b; while (b1 != null) { sz += b1.size; b1 = b.overflow; } } return sz; } @Override public Collection<V> values() { ArrayList<V> list = new ArrayList<V>(); Set<K> kk = keySet(); for (K k : kk) { list.add(get(k)); } return list; } private void showStructure() { for (int i = 0; i < blockList.size(); ++i) { Block<K, V> b = getBlock(i); Block<K, V> ov = b.overflow; int k = 0; while (ov != null) { ov = ov.overflow; ++k; } System.out.println("Block " + i + " size " + b.keyList.size() + " overflow blocks = " + k); } } }
Each block is a file in this implementation, because blocks are variably sized here to account for generics and variable sized key and value pairs, and overflow blocks are conceptual, not actual as on disc storage, because the block's contents and that of its overflow bucket(s), are saved as a list of alternating keys and values in this implementation. Methods for saving and loading to Object streams was used in a standard java customized object persistence way by implementing the Externalizable interface in the Block data class.
package linearhashmap; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.io.Serializable; import java.util.ArrayList; import java.util.Random; /** * This class manages the LHMap2 class block disk swapping, and saves and load an instance of the LHMap2 class. * It has been used to externally index a legacy file based database of 100,000 record master table, and 1,000,000 record * sized child tables, and accounts for heap space available in the java virtual machine, so that OutOfMemory errors * are avoided when the heap space is low by putting blocks back on files, and garbage collecting them. * The main performance bottleneck appeared when loading a million record table for an index , on initial creation * of the index. * @author doctor * * @param <K> * @param <V> */ public class LHMap2BlockFileManager<K, V> implements LHMap2.BlockListener<K, V>, Serializable { /** * */ private static final long serialVersionUID = 2615265603397988894L; LHMap2BlockFileManagerData data = new LHMap2BlockFileManagerData( new byte[10000], new Random(), 0, new ArrayList<Integer>(), 0); public LHMap2BlockFileManager(File baseDir, String name, int maxBlocks, double unloadRatio) { data.home = new File(baseDir, name); if (!data.home.exists()) data.home.mkdir(); this.data.maxBlocks = maxBlocks; this.data.unloadRatio = unloadRatio; } @Override public void blockRequested(int block, LHMap2<K, V> map) { LHMap2.Block<K, V> b = map.blockList.get(block); if (b == null) { int tries = 3; File f = new File(data.home, Integer.toString(block)); do { if (f.exists()) break; if (!f.exists()) { if (--tries >= 0) fatal(block); try { Thread.sleep(100); } catch (InterruptedException e) { e.printStackTrace(); } } } while (true); try { ByteArrayInputStream bis = new ByteArrayInputStream(data.buf); FileInputStream fis = new FileInputStream(f); int sz = fis.read(data.buf); fis.close(); addByteStats(sz); ObjectInputStream ois = new ObjectInputStream(bis); b = new LHMap2.Block<K, V>(); b.readExternal(ois); ois.close(); b.loadFromShadow(map); map.blockList.set(block, b); --data.retired; } catch (FileNotFoundException e) { e.printStackTrace(); fatal(block); } catch (IOException e) { e.printStackTrace(); fatal(block); } catch (ClassNotFoundException e) { e.printStackTrace(); fatal(block); } } int size = map.blockList.size(); try { long freeMemory = Runtime.getRuntime().freeMemory(); long limitMemory = (long) (data.avgBlockSize * data.unloadRatio * size); if (block % 30 == 0) System.err.println("free memory =" + freeMemory + " limit " + limitMemory); if (map.split % 20 == 19) { // this is just to add statistics before really needing to retire retireRandomBlock(map, block); ++data.retired; } else if (freeMemory < limitMemory) { for (int i = 0; i < size / 4; ++i) { retireRandomBlock(map, block); ++data.retired; } System.runFinalization(); System.gc(); } } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } private void addByteStats(int sz) { ++data.avgCount; data.avgBlockSize = (int) ((data.avgBlockSize * (data.avgCount - 1) + sz) / data.avgCount); } public void retireRandomBlock(LHMap2<K, V> map, int notThisOne) throws FileNotFoundException, IOException { int pick = 0; int size = map.blockList.size(); LHMap2.Block<K, V> b = null; for (pick = 0; pick < size && (pick == notThisOne || (b = map.blockList.get(pick)) == null); ++pick) ; if (pick < size) retireOneBlock(map, pick, b); } private void retireOneBlock(LHMap2<K, V> map, int pick, LHMap2.Block<K, V> b) throws IOException, FileNotFoundException { if (b == null) return; if (b.isChanged()) { // System.err.println("Retiring " + pick); File f = new File(data.home, Integer.toString(pick)); ByteArrayOutputStream bos = new ByteArrayOutputStream(); ObjectOutputStream oos = new ObjectOutputStream(bos); b.writeExternal(oos); oos.flush(); oos.close(); FileOutputStream fos = new FileOutputStream(f); byte[] bb = bos.toByteArray(); fos.write(bb); fos.flush(); fos.close(); if (bb.length > data.buf.length) { data.buf = bb; } } map.blockList.set(pick, null); b = null; } private void fatal(int block) { Exception e = new Exception(); try { throw e; } catch (Exception e2) { e2.printStackTrace(); } System.err.println("block " + block + " requested and it is not in blocklist and not a file"); for (int i : data.retirees) { System.err.print(i + " "); } System.err.println(" were retired"); System.exit(-2); } public static boolean metaExists(File indexDir, String name) { File home = new File(indexDir, name); return new File(home, "LinearMap2").exists(); } public static <K, V> LHMap2<K, V> load(File baseDir, String name) throws FileNotFoundException, IOException, ClassNotFoundException { File home = new File(baseDir, name); File f2 = new File(home, "LinearMap2"); ObjectInputStream ois = new ObjectInputStream(new FileInputStream(f2)); LHMap2<K, V> map = (LHMap2<K, V>) ois.readObject(); ois.close(); loadBlocks(map); return map; } private static <K, V> void loadBlocks(LHMap2<K, V> map) { LHMap2BlockFileManager<K, V> mgr = getBlockManagerListener(map); int size = map.blockList.size(); for (int i = 0; i < size; ++i) { mgr.blockRequested(i, map); } } public static <K, V> LHMap2BlockFileManager<K, V> getBlockManagerListener( LHMap2<K, V> map) { LHMap2BlockFileManager<K, V> mgr = (LHMap2BlockFileManager<K, V>) map.listeners .get(0); return mgr; } public static void save(File indexDir, String name, LHMap2<?, ?> offsetMap) throws FileNotFoundException, IOException { retireAllBlocks(offsetMap); File home = new File(indexDir, name); File f2 = new File(home, "LinearMap2"); ObjectOutputStream oos = new ObjectOutputStream( new FileOutputStream(f2)); oos.writeObject(offsetMap); oos.close(); } private static <K, V> void retireAllBlocks(LHMap2<K, V> offsetMap) throws FileNotFoundException, IOException { LHMap2BlockFileManager<K, V> mgr = getBlockManagerListener(offsetMap); int sz = offsetMap.blockList.size(); for (int i = 0; i < sz; ++i) { LHMap2.Block<K, V> b = offsetMap.blockList.get(i); // offsetMap.blockList.set(i, null); // mark for reloading as block // destroyed after writing if (b != null) { mgr.retireOneBlock(offsetMap, i, b); } } } }
package linearhashmap; import java.io.File; import java.io.Serializable; import java.util.List; import java.util.Random; public class LHMap2BlockFileManagerData implements Serializable{ /** * */ private static final long serialVersionUID = 1L; public byte[] buf; public Random r; public File baseDir; public File home; public int maxBlocks; public int retired; public double unloadRatio; public List<Integer> retirees; public int avgBlockSize; public long avgCount; public LHMap2BlockFileManagerData(byte[] buf, Random r, int retired, List<Integer> retirees, long avgCount) { this.buf = buf; this.r = r; this.retired = retired; this.retirees = retirees; this.avgCount = avgCount; } }
Footnotes
- ↑ Wikipedia: Comparison of programming languages (mapping) shows how many programming languages provide hash table functionality.