File size: 5,032 Bytes
dcc24f8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 |
"""
FinEE Cache - Tier 0 Hash Cache for deduplication.
Provides LRU caching of extraction results to avoid redundant computation.
Uses SHA256 hash of input text as cache key.
"""
import copy
import hashlib
import json
import threading
import time
from collections import OrderedDict
from dataclasses import dataclass, asdict
from typing import Optional, Dict, Any

from .schema import ExtractionResult
@dataclass
class CacheStats:
    """Counters describing how a cache has been used.

    Attributes:
        hits: Number of lookups that found an entry.
        misses: Number of lookups that found nothing.
        evictions: Number of entries dropped to make room.
        size: Number of entries currently stored.
        max_size: Capacity of the cache.
    """

    hits: int = 0
    misses: int = 0
    evictions: int = 0
    size: int = 0
    max_size: int = 1000

    @property
    def hit_rate(self) -> float:
        """Fraction of lookups that were hits (0.0 when there were none)."""
        lookups = self.hits + self.misses
        if lookups > 0:
            return self.hits / lookups
        return 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Return all counters plus ``hit_rate`` as a percentage string."""
        summary = asdict(self)
        # hit_rate is a property, so asdict() does not include it.
        summary['hit_rate'] = f"{self.hit_rate:.2%}"
        return summary
class LRUCache:
"""
Thread-safe LRU (Least Recently Used) cache for extraction results.
Features:
- SHA256 hashing of input text
- Configurable max size
- Automatic LRU eviction
- Statistics tracking
"""
def __init__(self, max_size: int = 1000):
"""
Initialize the cache.
Args:
max_size: Maximum number of items to store (default: 1000)
"""
self.max_size = max_size
self._cache: OrderedDict[str, ExtractionResult] = OrderedDict()
self._stats = CacheStats(max_size=max_size)
@staticmethod
def hash_text(text: str) -> str:
"""
Generate SHA256 hash of input text.
Args:
text: Input text to hash
Returns:
Hex string of SHA256 hash
"""
# Normalize text before hashing (lowercase, strip whitespace)
normalized = text.strip().lower()
return hashlib.sha256(normalized.encode('utf-8')).hexdigest()
def get(self, text: str) -> Optional[ExtractionResult]:
"""
Retrieve cached result for input text.
Args:
text: Input text to look up
Returns:
ExtractionResult if found, None otherwise
"""
key = self.hash_text(text)
if key in self._cache:
# Move to end (most recently used)
self._cache.move_to_end(key)
self._stats.hits += 1
# Return a copy with cache metadata
result = self._cache[key]
result.from_cache = True
result.processing_time_ms = 0.0
return result
self._stats.misses += 1
return None
def set(self, text: str, result: ExtractionResult) -> None:
"""
Store extraction result in cache.
Args:
text: Original input text (used as key)
result: Extraction result to cache
"""
key = self.hash_text(text)
# If key exists, update and move to end
if key in self._cache:
self._cache.move_to_end(key)
self._cache[key] = result
return
# Check if we need to evict
while len(self._cache) >= self.max_size:
self._cache.popitem(last=False) # Remove oldest
self._stats.evictions += 1
# Add new item
self._cache[key] = result
self._stats.size = len(self._cache)
def contains(self, text: str) -> bool:
"""Check if text is in cache without updating LRU order."""
key = self.hash_text(text)
return key in self._cache
def clear(self) -> None:
"""Clear all cached items."""
self._cache.clear()
self._stats.size = 0
def get_stats(self) -> CacheStats:
"""Get cache statistics."""
self._stats.size = len(self._cache)
return self._stats
def __len__(self) -> int:
"""Return number of cached items."""
return len(self._cache)
def __contains__(self, text: str) -> bool:
"""Support 'in' operator."""
return self.contains(text)
# Global cache instance (singleton pattern).
# Starts as None; get_cache() creates the LRUCache on its first call, while
# clear_cache() and get_cache_stats() only act on it once it exists.
_global_cache: Optional[LRUCache] = None
def get_cache(max_size: int = 1000) -> LRUCache:
    """
    Return the shared module-level cache, building it on first use.

    Args:
        max_size: Capacity for the cache; honoured only by the call that
            actually creates it and ignored afterwards.

    Returns:
        The global LRUCache instance.
    """
    global _global_cache
    cache = _global_cache
    if cache is None:
        # First call: create the singleton and remember it.
        cache = _global_cache = LRUCache(max_size=max_size)
    return cache
def clear_cache() -> None:
    """Empty the global cache; do nothing if it was never created."""
    cache = _global_cache
    if cache is None:
        return
    cache.clear()
def get_cache_stats() -> Optional[CacheStats]:
    """Return the global cache's statistics, or None if it does not exist yet."""
    cache = _global_cache
    return None if cache is None else cache.get_stats()
|