main.go
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215
package main

import (
	"bytes"
	"crypto/sha256"
	"encoding/binary"
	"fmt"
	"math"
	"sync"
)

// BloomFilter represents a thread-safe Bloom Filter data structure
type BloomFilter struct {
	bitArray []bool       // Underlying bit array
	size     uint         // Size of bit array
	hashCount uint        // Number of hash functions to use
	mu       sync.RWMutex // For thread safety
}

// NewBloomFilter creates a new Bloom Filter optimized for expected items and false positive rate
func NewBloomFilter(expectedItems uint, falsePositiveRate float64) *BloomFilter {
	// Calculate optimal bit array size
	size := calculateOptimalSize(expectedItems, falsePositiveRate)
	// Calculate optimal number of hash functions
	hashCount := calculateOptimalHashCount(size, expectedItems)

	return &BloomFilter{
		bitArray:  make([]bool, size),
		size:      size,
		hashCount: hashCount,
	}
}

// calculateOptimalSize computes the optimal bit array size
func calculateOptimalSize(n uint, p float64) uint {
	if p <= 0 || p >= 1 {
		p = 0.01 // Default to 1% false positive rate
	}
	size := uint(-(float64(n) * math.Log(p)) / (math.Log(2) * math.Log(2)))
	if size == 0 {
		size = 1 // Prevent zero size
	}
	return size
}

// calculateOptimalHashCount computes the optimal number of hash functions
func calculateOptimalHashCount(m, n uint) uint {
	if m == 0 || n == 0 {
		return 5 // Default hash count
	}
	hashCount := uint((float64(m) / float64(n)) * math.Log(2))
	if hashCount == 0 {
		hashCount = 1 // At least one hash function
	}
	return hashCount
}

// hash generates multiple hash values for a given data
func (b *BloomFilter) hash(data []byte) []uint {
	hashes := make([]uint, b.hashCount)
	// Use SHA256 as the base hash function
	baseHash := sha256.Sum256(data)

	// Generate multiple hash values by splitting the base hash
	for i := uint(0); i < b.hashCount; i++ {
		// Create a unique seed for each hash
		seed := append(baseHash[:], byte(i))
		// Hash the seed to get a unique value
		hash := sha256.Sum256(seed)
		// Convert hash bytes to uint and mod by size to get bit position
		position := binary.BigEndian.Uint64(hash[:8]) % uint64(b.size)
		hashes[i] = uint(position)
	}

	return hashes
}

// Add inserts an item into the Bloom Filter (thread-safe)
func (b *BloomFilter) Add(data []byte) {
	b.mu.Lock()
	defer b.mu.Unlock()

	hashes := b.hash(data)
	for _, pos := range hashes {
		b.bitArray[pos] = true
	}
}

// Contains checks if an item is in the Bloom Filter (thread-safe)
// Returns true if item is possibly present, false if definitely not present
func (b *BloomFilter) Contains(data []byte) bool {
	b.mu.RLock()
	defer b.mu.RUnlock()

	hashes := b.hash(data)
	for _, pos := range hashes {
		if !b.bitArray[pos] {
			return false // Definitely not present
		}
	}
	return true // Possibly present
}

// Serialize converts the Bloom Filter to bytes for persistence
func (b *BloomFilter) Serialize() ([]byte, error) {
	b.mu.RLock()
	defer b.mu.RUnlock()

	// Create buffer to hold serialized data
	var buf bytes.Buffer

	// Write metadata (size, hashCount)
	if err := binary.Write(&buf, binary.BigEndian, b.size); err != nil {
		return nil, fmt.Errorf("failed to write size: %w", err)
	}
	if err := binary.Write(&buf, binary.BigEndian, b.hashCount); err != nil {
		return nil, fmt.Errorf("failed to write hash count: %w", err)
	}

	// Convert bit array to bytes (pack 8 bits per byte)
	byteCount := (b.size + 7) / 8
	bytesArray := make([]byte, byteCount)

	for i := uint(0); i < b.size; i++ {
		if b.bitArray[i] {
			byteIdx := i / 8
			bitIdx := i % 8
			bytesArray[byteIdx] |= (1 << bitIdx)
		}
	}

	// Write bit array bytes
	if _, err := buf.Write(bytesArray); err != nil {
		return nil, fmt.Errorf("failed to write bit array: %w", err)
	}

	return buf.Bytes(), nil
}

// Deserialize recreates a Bloom Filter from serialized bytes
func Deserialize(data []byte) (*BloomFilter, error) {
	if len(data) < 16 { // At least size (8 bytes) + hashCount (8 bytes)
		return nil, fmt.Errorf("invalid serialized data: too short")
	}

	// Read metadata
	buf := bytes.NewReader(data)
	var size, hashCount uint
	if err := binary.Read(buf, binary.BigEndian, &size); err != nil {
		return nil, fmt.Errorf("failed to read size: %w", err)
	}
	if err := binary.Read(buf, binary.BigEndian, &hashCount); err != nil {
		return nil, fmt.Errorf("failed to read hash count: %w", err)
	}

	// Read bit array bytes
	byteCount := (size + 7) / 8
	bytesArray := make([]byte, byteCount)
	if _, err := buf.Read(bytesArray); err != nil {
		return nil, fmt.Errorf("failed to read bit array: %w", err)
	}

	// Convert bytes back to bit array
	bitArray := make([]bool, size)
	for i := uint(0); i < size; i++ {
		byteIdx := i / 8
		bitIdx := i % 8
		bitArray[i] = (bytesArray[byteIdx] & (1 << bitIdx)) != 0
	}

	return &BloomFilter{
		bitArray:  bitArray,
		size:      size,
		hashCount: hashCount,
	}, nil
}

func main() {
	// Create Bloom Filter for 1000 items with 1% false positive rate
	bf := NewBloomFilter(1000, 0.01)
	fmt.Printf("Bloom Filter created with size: %d, hash count: %d\n", bf.size, bf.hashCount)

	// Add items
	items := [][]byte{
		[]byte("apple"),
		[]byte("banana"),
		[]byte("cherry"),
	}

	for _, item := range items {
		bf.Add(item)
	}

	// Test membership
	fmt.Println("Contains 'apple':", bf.Contains([]byte("apple")))    // true (definite)
	fmt.Println("Contains 'banana':", bf.Contains([]byte("banana")))  // true (definite)
	fmt.Println("Contains 'orange':", bf.Contains([]byte("orange")))  // false (definite)
	fmt.Println("Contains 'grape':", bf.Contains([]byte("grape")))    // false (definite)

	// Serialize and deserialize
	data, err := bf.Serialize()
	if err != nil {
		fmt.Printf("Serialization failed: %v\n", err)
		return
	}

	bf2, err := Deserialize(data)
	if err != nil {
		fmt.Printf("Deserialization failed: %v\n", err)
		return
	}

	// Verify deserialized filter
	fmt.Println("\nDeserialized filter contains 'cherry':", bf2.Contains([]byte("cherry"))) // true
}

How It Works

Thread-safe Bloom filter that computes optimal hash counts, supports add and contains, and can be serialized for persistence.

Calculates bitset size and hash count from the desired false-positive rate, uses multiple hash functions to set bits for each element, tests membership by checking all hashes, and provides save or load helpers.

Key Concepts

  • 1Computes optimal k based on capacity and target false-positive rate.
  • 2Bitset stored in a byte slice keeps memory usage compact.
  • 3Mutex guards allow concurrent reads and writes safely.

When to Use This Pattern

  • Fast membership checks for caches or spam filters.
  • Reducing expensive database lookups for likely missing keys.
  • Set membership in distributed systems where memory is tight.

Best Practices

  • Choose capacity and false-positive rate deliberately; cannot shrink later.
  • Avoid using Bloom filters when false negatives are unacceptable.
  • Persist and restore bitsets to reuse training across restarts.
Go Version1.18
Difficultyadvanced
Production ReadyYes
Lines of Code215