Bloom Filter Implementation for Efficient Membership Testing
Thread-safe Bloom filter that computes optimal hash counts, supports add and contains, and can be serialized for persistence.
main.go
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215
package main
import (
"bytes"
"crypto/sha256"
"encoding/binary"
"fmt"
"math"
"sync"
)
// BloomFilter represents a thread-safe Bloom Filter data structure
type BloomFilter struct {
bitArray []bool // Underlying bit array
size uint // Size of bit array
hashCount uint // Number of hash functions to use
mu sync.RWMutex // For thread safety
}
// NewBloomFilter creates a new Bloom Filter optimized for expected items and false positive rate
func NewBloomFilter(expectedItems uint, falsePositiveRate float64) *BloomFilter {
// Calculate optimal bit array size
size := calculateOptimalSize(expectedItems, falsePositiveRate)
// Calculate optimal number of hash functions
hashCount := calculateOptimalHashCount(size, expectedItems)
return &BloomFilter{
bitArray: make([]bool, size),
size: size,
hashCount: hashCount,
}
}
// calculateOptimalSize computes the optimal bit array size
func calculateOptimalSize(n uint, p float64) uint {
if p <= 0 || p >= 1 {
p = 0.01 // Default to 1% false positive rate
}
size := uint(-(float64(n) * math.Log(p)) / (math.Log(2) * math.Log(2)))
if size == 0 {
size = 1 // Prevent zero size
}
return size
}
// calculateOptimalHashCount computes the optimal number of hash functions
func calculateOptimalHashCount(m, n uint) uint {
if m == 0 || n == 0 {
return 5 // Default hash count
}
hashCount := uint((float64(m) / float64(n)) * math.Log(2))
if hashCount == 0 {
hashCount = 1 // At least one hash function
}
return hashCount
}
// hash generates multiple hash values for a given data
func (b *BloomFilter) hash(data []byte) []uint {
hashes := make([]uint, b.hashCount)
// Use SHA256 as the base hash function
baseHash := sha256.Sum256(data)
// Generate multiple hash values by splitting the base hash
for i := uint(0); i < b.hashCount; i++ {
// Create a unique seed for each hash
seed := append(baseHash[:], byte(i))
// Hash the seed to get a unique value
hash := sha256.Sum256(seed)
// Convert hash bytes to uint and mod by size to get bit position
position := binary.BigEndian.Uint64(hash[:8]) % uint64(b.size)
hashes[i] = uint(position)
}
return hashes
}
// Add inserts an item into the Bloom Filter (thread-safe)
func (b *BloomFilter) Add(data []byte) {
b.mu.Lock()
defer b.mu.Unlock()
hashes := b.hash(data)
for _, pos := range hashes {
b.bitArray[pos] = true
}
}
// Contains checks if an item is in the Bloom Filter (thread-safe)
// Returns true if item is possibly present, false if definitely not present
func (b *BloomFilter) Contains(data []byte) bool {
b.mu.RLock()
defer b.mu.RUnlock()
hashes := b.hash(data)
for _, pos := range hashes {
if !b.bitArray[pos] {
return false // Definitely not present
}
}
return true // Possibly present
}
// Serialize converts the Bloom Filter to bytes for persistence
func (b *BloomFilter) Serialize() ([]byte, error) {
b.mu.RLock()
defer b.mu.RUnlock()
// Create buffer to hold serialized data
var buf bytes.Buffer
// Write metadata (size, hashCount)
if err := binary.Write(&buf, binary.BigEndian, b.size); err != nil {
return nil, fmt.Errorf("failed to write size: %w", err)
}
if err := binary.Write(&buf, binary.BigEndian, b.hashCount); err != nil {
return nil, fmt.Errorf("failed to write hash count: %w", err)
}
// Convert bit array to bytes (pack 8 bits per byte)
byteCount := (b.size + 7) / 8
bytesArray := make([]byte, byteCount)
for i := uint(0); i < b.size; i++ {
if b.bitArray[i] {
byteIdx := i / 8
bitIdx := i % 8
bytesArray[byteIdx] |= (1 << bitIdx)
}
}
// Write bit array bytes
if _, err := buf.Write(bytesArray); err != nil {
return nil, fmt.Errorf("failed to write bit array: %w", err)
}
return buf.Bytes(), nil
}
// Deserialize recreates a Bloom Filter from serialized bytes
func Deserialize(data []byte) (*BloomFilter, error) {
if len(data) < 16 { // At least size (8 bytes) + hashCount (8 bytes)
return nil, fmt.Errorf("invalid serialized data: too short")
}
// Read metadata
buf := bytes.NewReader(data)
var size, hashCount uint
if err := binary.Read(buf, binary.BigEndian, &size); err != nil {
return nil, fmt.Errorf("failed to read size: %w", err)
}
if err := binary.Read(buf, binary.BigEndian, &hashCount); err != nil {
return nil, fmt.Errorf("failed to read hash count: %w", err)
}
// Read bit array bytes
byteCount := (size + 7) / 8
bytesArray := make([]byte, byteCount)
if _, err := buf.Read(bytesArray); err != nil {
return nil, fmt.Errorf("failed to read bit array: %w", err)
}
// Convert bytes back to bit array
bitArray := make([]bool, size)
for i := uint(0); i < size; i++ {
byteIdx := i / 8
bitIdx := i % 8
bitArray[i] = (bytesArray[byteIdx] & (1 << bitIdx)) != 0
}
return &BloomFilter{
bitArray: bitArray,
size: size,
hashCount: hashCount,
}, nil
}
func main() {
// Create Bloom Filter for 1000 items with 1% false positive rate
bf := NewBloomFilter(1000, 0.01)
fmt.Printf("Bloom Filter created with size: %d, hash count: %d\n", bf.size, bf.hashCount)
// Add items
items := [][]byte{
[]byte("apple"),
[]byte("banana"),
[]byte("cherry"),
}
for _, item := range items {
bf.Add(item)
}
// Test membership
fmt.Println("Contains 'apple':", bf.Contains([]byte("apple"))) // true (definite)
fmt.Println("Contains 'banana':", bf.Contains([]byte("banana"))) // true (definite)
fmt.Println("Contains 'orange':", bf.Contains([]byte("orange"))) // false (definite)
fmt.Println("Contains 'grape':", bf.Contains([]byte("grape"))) // false (definite)
// Serialize and deserialize
data, err := bf.Serialize()
if err != nil {
fmt.Printf("Serialization failed: %v\n", err)
return
}
bf2, err := Deserialize(data)
if err != nil {
fmt.Printf("Deserialization failed: %v\n", err)
return
}
// Verify deserialized filter
fmt.Println("\nDeserialized filter contains 'cherry':", bf2.Contains([]byte("cherry"))) // true
}How It Works
Thread-safe Bloom filter that computes optimal hash counts, supports add and contains, and can be serialized for persistence.
Calculates bitset size and hash count from the desired false-positive rate, uses multiple hash functions to set bits for each element, tests membership by checking all hashes, and provides save or load helpers.
Key Concepts
- 1Computes optimal k based on capacity and target false-positive rate.
- 2Bitset stored in a byte slice keeps memory usage compact.
- 3Mutex guards allow concurrent reads and writes safely.
When to Use This Pattern
- Fast membership checks for caches or spam filters.
- Reducing expensive database lookups for likely missing keys.
- Set membership in distributed systems where memory is tight.
Best Practices
- Choose capacity and false-positive rate deliberately; cannot shrink later.
- Avoid using Bloom filters when false negatives are unacceptable.
- Persist and restore bitsets to reuse training across restarts.
Go Version1.18
Difficultyadvanced
Production ReadyYes
Lines of Code215