File size: 5,032 Bytes
dcc24f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
"""
FinEE Cache - Tier 0 Hash Cache for deduplication.

Provides LRU caching of extraction results to avoid redundant computation.
Uses SHA256 hash of input text as cache key.
"""

import copy
import hashlib
import json
import threading
import time
from collections import OrderedDict
from dataclasses import dataclass, asdict
from typing import Optional, Dict, Any

from .schema import ExtractionResult


@dataclass
class CacheStats:
    """Statistics for cache performance monitoring."""
    hits: int = 0        # lookups served from cache
    misses: int = 0      # lookups that fell through
    evictions: int = 0   # entries dropped to make room
    size: int = 0        # current number of cached entries
    max_size: int = 1000 # configured capacity
    
    @property
    def hit_rate(self) -> float:
        """Fraction of lookups served from cache; 0.0 before any lookup."""
        lookups = self.hits + self.misses
        if lookups == 0:
            return 0.0
        return self.hits / lookups
    
    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary, with hit_rate rendered as a percent string."""
        data = asdict(self)
        data['hit_rate'] = f"{self.hit_rate:.2%}"
        return data


class LRUCache:
    """
    Thread-safe LRU (Least Recently Used) cache for extraction results.
    
    Features:
    - SHA256 hashing of input text
    - Configurable max size
    - Automatic LRU eviction
    - Statistics tracking
    
    All operations that touch the underlying store or the statistics are
    serialized on an internal lock, so one instance may be shared across
    threads (the previous implementation claimed thread-safety but had
    no lock at all).
    """
    
    def __init__(self, max_size: int = 1000):
        """
        Initialize the cache.
        
        Args:
            max_size: Maximum number of items to store (default: 1000).
                A value of 0 disables storage entirely.
        """
        self.max_size = max_size
        self._cache: "OrderedDict[str, ExtractionResult]" = OrderedDict()
        self._stats = CacheStats(max_size=max_size)
        # Guards _cache and _stats; RLock so helper methods may call each
        # other without deadlocking.
        self._lock = threading.RLock()
    
    @staticmethod
    def hash_text(text: str) -> str:
        """
        Generate SHA256 hash of input text.
        
        Args:
            text: Input text to hash
            
        Returns:
            Hex string of SHA256 hash
        """
        # Normalize text before hashing (lowercase, strip whitespace) so
        # trivially different renditions of the same text dedupe together.
        normalized = text.strip().lower()
        return hashlib.sha256(normalized.encode('utf-8')).hexdigest()
    
    def get(self, text: str) -> "Optional[ExtractionResult]":
        """
        Retrieve cached result for input text.
        
        Args:
            text: Input text to look up
            
        Returns:
            A shallow copy of the cached ExtractionResult (flagged with
            from_cache=True and processing_time_ms=0.0) if found,
            None otherwise.
        """
        key = self.hash_text(text)
        
        with self._lock:
            if key in self._cache:
                # Move to end (most recently used)
                self._cache.move_to_end(key)
                self._stats.hits += 1
                
                # Shallow-copy before tagging with cache metadata: the old
                # code mutated the stored object itself, polluting the cache
                # entry and aliasing state across all callers.
                result = copy.copy(self._cache[key])
                result.from_cache = True
                result.processing_time_ms = 0.0
                return result
            
            self._stats.misses += 1
            return None
    
    def set(self, text: str, result: "ExtractionResult") -> None:
        """
        Store extraction result in cache.
        
        Args:
            text: Original input text (used as key)
            result: Extraction result to cache
        """
        key = self.hash_text(text)
        
        with self._lock:
            # If key exists, update and move to end
            if key in self._cache:
                self._cache.move_to_end(key)
                self._cache[key] = result
                return
            
            # Evict least-recently-used entries until there is room.
            # The emptiness guard prevents popitem() raising KeyError
            # when max_size == 0 (the old code crashed in that case).
            while self._cache and len(self._cache) >= self.max_size:
                self._cache.popitem(last=False)  # Remove oldest
                self._stats.evictions += 1
            
            # Add new item (skipped entirely when the cache is disabled).
            if self.max_size > 0:
                self._cache[key] = result
            self._stats.size = len(self._cache)
    
    def contains(self, text: str) -> bool:
        """Check if text is in cache without updating LRU order or stats."""
        key = self.hash_text(text)
        with self._lock:
            return key in self._cache
    
    def clear(self) -> None:
        """Clear all cached items (hit/miss/eviction counters are kept)."""
        with self._lock:
            self._cache.clear()
            self._stats.size = 0
    
    def get_stats(self) -> "CacheStats":
        """Get cache statistics, with size refreshed to the current count."""
        with self._lock:
            self._stats.size = len(self._cache)
            return self._stats
    
    def __len__(self) -> int:
        """Return number of cached items."""
        with self._lock:
            return len(self._cache)
    
    def __contains__(self, text: str) -> bool:
        """Support 'in' operator."""
        return self.contains(text)


# Module-level singleton; created lazily by get_cache().
_global_cache: Optional[LRUCache] = None


def get_cache(max_size: int = 1000) -> LRUCache:
    """
    Get or create the global cache instance.
    
    Args:
        max_size: Capacity used only when the singleton is first built;
            ignored on every subsequent call.
        
    Returns:
        The shared LRUCache instance.
    """
    global _global_cache
    cache = _global_cache
    if cache is None:
        cache = LRUCache(max_size=max_size)
        _global_cache = cache
    return cache


def clear_cache() -> None:
    """Empty the global cache; no-op if it was never created."""
    cache = _global_cache
    if cache is not None:
        cache.clear()


def get_cache_stats() -> Optional[CacheStats]:
    """Return statistics for the global cache, or None if it doesn't exist."""
    cache = _global_cache
    if cache is None:
        return None
    return cache.get_stats()