"""In-memory TTL cache for generation results.""" from __future__ import annotations import hashlib import time from typing import Optional from src.core.generator import GenerationResult def cache_key( query: str, model: str, top_k: int, *, provider: str = "ollama", use_rerank: bool = True, reranker_model: str = "", corpus_fingerprint: str = "documents", response_mode: str = "sync", ) -> str: """response_mode distinguishes streaming vs non-streaming cache entries (must not collide).""" raw = "\n".join( [ query, provider, model, str(top_k), str(use_rerank), reranker_model, corpus_fingerprint, response_mode, ] ) return hashlib.sha256(raw.encode("utf-8")).hexdigest() class ResponseCache: """Simple process-local cache with TTL.""" def __init__(self, ttl_seconds: int = 300) -> None: self.ttl_seconds = ttl_seconds self._store: dict[str, tuple[GenerationResult, float]] = {} def get(self, key: str) -> Optional[GenerationResult]: if self.ttl_seconds <= 0: return None item = self._store.get(key) if not item: return None result, expires_at = item if time.monotonic() > expires_at: del self._store[key] return None return result def set(self, key: str, result: GenerationResult) -> None: if self.ttl_seconds <= 0: return self._store[key] = (result, time.monotonic() + float(self.ttl_seconds)) def clear(self) -> None: self._store.clear()