Spaces:
Running
Running
"""
FilGoalBot — On-disk LLM response cache.

Keyed on (model, intent, sorted chunk_ids, normalised query). Intentionally
file-based + JSON: trivial to inspect, trivial to invalidate by deletion, and
survives across processes (eval re-runs, API restarts).

Saves Groq tokens on:
- eval re-runs (same 50 questions, same retrieved chunks)
- production duplicates (the same query within the eviction window)
"""
| import hashlib | |
| import json | |
| import time | |
| from pathlib import Path | |
# Cache root on disk, relative to the working directory. Created eagerly at
# import time so get()/put() can assume the directory exists.
CACHE_DIR = Path(".cache/llm")
CACHE_DIR.mkdir(parents=True, exist_ok=True)

# 30 days. Football news goes stale much faster, but the cache key includes the
# retrieved chunk IDs — when new articles come in, the chunk set shifts and the
# key changes naturally.
DEFAULT_TTL_SECONDS = 30 * 24 * 3600
| def _make_key( | |
| model: str, | |
| intent: str, | |
| chunk_ids: list[str], | |
| query: str, | |
| ) -> str: | |
| payload = json.dumps( | |
| { | |
| "model": model, | |
| "intent": intent, | |
| "chunks": sorted(chunk_ids), | |
| "query": query.strip().lower(), | |
| }, | |
| ensure_ascii=False, | |
| sort_keys=True, | |
| ) | |
| return hashlib.sha256(payload.encode("utf-8")).hexdigest()[:32] | |
def get(model: str, intent: str, chunk_ids: list[str], query: str,
        ttl_seconds: int = DEFAULT_TTL_SECONDS) -> str | None:
    """Return the cached answer for this request, or None on a miss.

    A miss is any of: no cache file, unreadable or corrupt JSON, a JSON
    payload that is not a dict, or an entry older than ttl_seconds.
    All failure modes are swallowed deliberately — the caller simply
    falls through to a live LLM call.
    """
    key = _make_key(model, intent, chunk_ids, query)
    path = CACHE_DIR / f"{key}.json"
    # EAFP: read directly and treat any failure as a miss. The previous
    # path.exists() pre-check was racy — a file deleted between the check
    # and the read raised FileNotFoundError instead of returning None.
    # FileNotFoundError is an OSError subclass, so one tuple covers both.
    try:
        entry = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, json.JSONDecodeError):
        return None
    if not isinstance(entry, dict):
        # Valid JSON but wrong shape (e.g. a stray list) — previously this
        # crashed with AttributeError on entry.get; treat it as a miss.
        return None
    if time.time() - entry.get("ts", 0) >= ttl_seconds:
        return None  # expired; key rotation usually handles this first
    return entry.get("answer")
def put(model: str, intent: str, chunk_ids: list[str], query: str,
        answer: str) -> None:
    """Store an answer in the on-disk cache, overwriting any prior entry.

    The entry records the timestamp (for TTL checks in get()) plus the raw
    query and intent for human inspection of cache files. The write goes
    to a temp file first, then an atomic rename — the module is explicitly
    used across processes, and writing the final path directly let a
    concurrent get() observe a half-written (corrupt) JSON file.
    """
    key = _make_key(model, intent, chunk_ids, query)
    path = CACHE_DIR / f"{key}.json"
    payload = json.dumps(
        {"ts": time.time(), "answer": answer, "query": query, "intent": intent},
        ensure_ascii=False,
    )
    tmp = path.with_name(path.name + ".tmp")
    tmp.write_text(payload, encoding="utf-8")
    # Path.replace is atomic on POSIX (os.rename semantics), so readers
    # only ever see the old complete entry or the new complete entry.
    tmp.replace(path)
def estimate_tokens(text: str) -> int:
    """Rough token estimate for the budget guard.

    Arabic averages ~3 chars/token on llama tokenizers; we divide by 3 to
    err conservative (overestimate). Always returns at least 1, even for
    the empty string.
    """
    chars_per_token = 3
    rough = len(text) // chars_per_token
    return rough if rough >= 1 else 1