File size: 6,520 Bytes
070daf8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
"""
Semantic Caching Layer - Intelligent caching using embedding similarity
"""

import hashlib
import json
import logging
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

logger = logging.getLogger(__name__)


@dataclass
class CacheEntry:
    """One stored result plus the data needed to match and expire it."""
    # Identifier the entry is filed under in the cache.
    key: str
    # Query text the result was produced for.
    query: str
    # Vector representation of ``query`` used for similarity comparison.
    embedding: List[float]
    # The cached payload itself.
    result: Any
    # Free-form extra information attached when the entry was stored.
    metadata: Dict[str, Any]
    # ``time.time()`` timestamp taken when the entry was created.
    created_at: float
    # Lifetime in seconds, counted from ``created_at``.
    ttl: int

    @property
    def age_seconds(self) -> float:
        """Seconds elapsed between ``created_at`` and the current ``time.time()``."""
        now = time.time()
        return now - self.created_at

    @property
    def is_expired(self) -> bool:
        """Whether the entry's age is strictly greater than its ``ttl``."""
        return self.age_seconds > self.ttl


@dataclass
class CacheHit:
    """What a successful cache lookup hands back to the caller."""
    # The cached payload of the matched entry.
    result: Any
    # Similarity score between the looked-up query and the matched entry.
    similarity: float
    # Age of the matched entry, in seconds, at lookup time.
    age_seconds: float
    # Token count credited as saved by this hit.
    savings_tokens: int
    # Cost credited as saved by this hit.
    savings_cost: float


class SemanticCache:
    """
    In-memory semantic cache using embedding similarity.
    Caches: Tool execution results, LLM completions, web search results, code execution outputs

    Entries are stored under the MD5 hex digest of their query text. A lookup
    first tries that exact key and otherwise scans all live entries for the
    one whose embedding has the highest cosine similarity above
    ``similarity_threshold``. Hit/miss counters and the token/cost savings
    reported by entry metadata are accumulated for ``get_stats``.
    """

    # Length of the embedding vector and width of the character n-grams
    # that are hashed into it.
    _EMBEDDING_DIM = 128
    _NGRAM_SIZE = 3

    def __init__(
        self,
        similarity_threshold: float = 0.92,
        default_ttl: int = 604800,  # 7 days
        max_size: int = 10000
    ):
        """
        Args:
            similarity_threshold: Cosine similarity a non-identical query must
                strictly exceed to count as a hit.
            default_ttl: Lifetime in seconds for entries stored without an
                explicit ``ttl``.
            max_size: Entry count at which ``store`` starts evicting.
        """
        self.similarity_threshold = similarity_threshold
        self.default_ttl = default_ttl
        self.max_size = max_size
        self._cache: Dict[str, CacheEntry] = {}
        self._hits = 0
        self._misses = 0
        self._tokens_saved = 0
        self._cost_saved = 0.0

    def _compute_embedding(self, text: str) -> List[float]:
        """
        Compute a simple embedding for the text.
        In production, use OpenAI or sentence-transformers.
        For now, use a simple hash-based approach.

        The text is lower-cased and stripped, split into character n-grams,
        and each n-gram increments one bucket of a fixed-size vector, which
        is then L2-normalised. Empty text yields the all-zero vector.
        """
        text = text.lower().strip()
        width = self._NGRAM_SIZE
        dim = self._EMBEDDING_DIM

        if len(text) >= width:
            ngrams = [text[i:i + width] for i in range(len(text) - width + 1)]
        elif text:
            # Text shorter than one n-gram: treat the whole text as a single
            # token so short queries do not all collapse to the zero vector.
            ngrams = [text]
        else:
            ngrams = []

        embedding = [0.0] * dim
        for ngram in ngrams:
            # A digest-based bucket is stable across interpreter runs; the
            # built-in hash() of a str is salted per process.
            digest = hashlib.md5(ngram.encode("utf-8", "surrogatepass")).digest()
            embedding[int.from_bytes(digest[:4], "big") % dim] += 1.0

        # Normalize
        norm = sum(x**2 for x in embedding) ** 0.5
        if norm > 0:
            embedding = [x / norm for x in embedding]

        return embedding

    def _cosine_similarity(self, a: List[float], b: List[float]) -> float:
        """Compute cosine similarity between two vectors (0.0 if either has zero norm)."""
        dot = sum(x * y for x, y in zip(a, b))
        norm_a = sum(x**2 for x in a) ** 0.5
        norm_b = sum(x**2 for x in b) ** 0.5

        if norm_a == 0 or norm_b == 0:
            return 0.0

        return dot / (norm_a * norm_b)

    def _generate_key(self, query: str) -> str:
        """Generate a cache key (MD5 hex digest) from query"""
        # "surrogatepass" yields the same bytes as the default for any string
        # that encodes cleanly, and avoids raising on lone surrogates.
        return hashlib.md5(query.encode("utf-8", "surrogatepass")).hexdigest()

    async def check(self, query: str) -> Optional["CacheHit"]:
        """
        Check if similar query exists in cache.

        A live entry stored under the identical query text is returned
        directly with similarity 1.0. Otherwise the live entry with the
        highest cosine similarity strictly above ``similarity_threshold`` is
        used. Expired entries are ignored (not removed).

        Returns:
            A ``CacheHit`` for the matched entry, or ``None`` on a miss.
            Hit/miss counters and saved token/cost totals are updated.
        """
        best_match = None
        best_similarity = 0.0

        # Exact-text fast path: an identical query must always hit, without
        # depending on float rounding of the similarity or on the embedding
        # being non-zero.
        exact = self._cache.get(self._generate_key(query))
        if exact is not None and not exact.is_expired:
            best_match = exact
            best_similarity = 1.0
        else:
            query_embedding = self._compute_embedding(query)

            # Find best matching entry
            for entry in self._cache.values():
                if entry.is_expired:
                    continue

                similarity = self._cosine_similarity(query_embedding, entry.embedding)

                if similarity > self.similarity_threshold and similarity > best_similarity:
                    best_similarity = similarity
                    best_match = entry

        if best_match is None:
            self._misses += 1
            return None

        self._hits += 1
        tokens_saved = best_match.metadata.get("prompt_tokens", 0)
        cost_saved = best_match.metadata.get("estimated_cost", 0.0)
        self._tokens_saved += tokens_saved
        self._cost_saved += cost_saved

        logger.info(
            "Cache hit: similarity=%.3f, key=%s", best_similarity, best_match.key[:8]
        )

        return CacheHit(
            result=best_match.result,
            similarity=best_similarity,
            age_seconds=best_match.age_seconds,
            savings_tokens=tokens_saved,
            savings_cost=cost_saved
        )

    async def store(
        self,
        query: str,
        result: Any,
        metadata: Optional[Dict[str, Any]] = None,
        ttl: Optional[int] = None
    ) -> str:
        """
        Store result with embedding for future semantic matching.

        Args:
            query: Query text; its MD5 digest becomes the entry key, so
                storing the same text again overwrites the previous entry.
            result: Payload to cache.
            metadata: Optional extra data; ``check`` reads the
                ``"prompt_tokens"`` and ``"estimated_cost"`` keys from it.
                A shallow copy is stored.
            ttl: Lifetime in seconds. ``None`` selects ``default_ttl``; an
                explicit ``0`` is honoured rather than replaced by the default.

        Returns:
            The key the entry was stored under.
        """
        key = self._generate_key(query)

        # Overwriting an existing key does not grow the cache, so only make
        # room when a genuinely new key arrives at capacity.
        if key not in self._cache and len(self._cache) >= self.max_size:
            # Clean up expired entries if cache is full
            self._cleanup_expired()

            # If still full, remove oldest entries. The emptiness guard keeps
            # a non-positive max_size from calling min() on an empty dict.
            while self._cache and len(self._cache) >= self.max_size:
                oldest_key = min(
                    self._cache,
                    key=lambda k: self._cache[k].created_at
                )
                del self._cache[oldest_key]

        entry = CacheEntry(
            key=key,
            query=query,
            embedding=self._compute_embedding(query),
            result=result,
            metadata=dict(metadata) if metadata else {},
            created_at=time.time(),
            ttl=self.default_ttl if ttl is None else ttl
        )

        self._cache[key] = entry
        logger.info("Cache stored: key=%s, entries=%d", key[:8], len(self._cache))

        return key

    def _cleanup_expired(self):
        """Remove expired entries"""
        # Collect first: deleting while iterating the dict would raise.
        expired_keys = [
            k for k, v in self._cache.items()
            if v.is_expired
        ]
        for key in expired_keys:
            del self._cache[key]

        if expired_keys:
            logger.info("Cleaned up %d expired cache entries", len(expired_keys))

    def get_stats(self) -> Dict[str, Any]:
        """Get cache statistics (hits, misses, hit rate, entry count, savings)"""
        total = self._hits + self._misses
        hit_rate = self._hits / total if total > 0 else 0.0

        return {
            "hits": self._hits,
            "misses": self._misses,
            "hit_rate": hit_rate,
            "entries": len(self._cache),
            "tokens_saved": self._tokens_saved,
            "cost_saved": self._cost_saved
        }

    def clear(self):
        """Clear all cache entries and reset the statistics counters"""
        self._cache.clear()
        self._hits = 0
        self._misses = 0
        self._tokens_saved = 0
        self._cost_saved = 0.0
        logger.info("Cache cleared")


# Global cache instance: a module-level SemanticCache built at import time
# with the constructor's default threshold, TTL and size limit.
semantic_cache = SemanticCache()