# Fil-RAG-Goal / qa_engine/cache.py
# Omar10lfc — Initial Space deployment (18fd039)
"""
FilGoalBot — On-disk LLM response cache.
Keyed on (model, intent, sorted chunk_ids, normalised query). Intentionally
file-based + JSON: trivial to inspect, trivial to invalidate by deletion, and
survives across processes (eval re-runs, API restarts).
Saves Groq tokens on:
- eval re-runs (same 50 questions, same retrieved chunks)
- production duplicates (the same query repeated within the TTL window)
"""
import hashlib
import json
import time
from pathlib import Path
# Directory holding one JSON file per cached response. The path is relative,
# so it is resolved against the current working directory whenever it is used.
CACHE_DIR = Path(".cache/llm")
# Module-level side effect: the directory (and any missing parents) is created
# when this module is imported; an already-existing directory is not an error.
CACHE_DIR.mkdir(parents=True, exist_ok=True)
# 30 days. Football news goes stale much faster, but the cache key includes the
# retrieved chunk IDs — when new articles come in, the chunk set shifts and the
# key changes naturally.
DEFAULT_TTL_SECONDS = 30 * 24 * 3600
def _make_key(
    model: str,
    intent: str,
    chunk_ids: list[str],
    query: str,
) -> str:
    """Derive the cache key for one (model, intent, chunks, query) request.

    The chunk IDs are sorted so their order does not matter, and the query is
    stripped of surrounding whitespace and lower-cased so trivially different
    spellings share an entry. The four fields are serialised as JSON with
    sorted keys and hashed with SHA-256.

    Returns:
        The first 32 hex characters of the digest.
    """
    fields = {
        "chunks": sorted(chunk_ids),
        "intent": intent,
        "model": model,
        "query": query.strip().lower(),
    }
    # sort_keys makes the serialisation independent of dict insertion order;
    # ensure_ascii=False keeps non-ASCII text as-is before UTF-8 encoding.
    serialised = json.dumps(fields, sort_keys=True, ensure_ascii=False)
    digest = hashlib.sha256(serialised.encode("utf-8"))
    return digest.hexdigest()[:32]
def get(model: str, intent: str, chunk_ids: list[str], query: str,
        ttl_seconds: int = DEFAULT_TTL_SECONDS) -> str | None:
    """Return the cached answer for this request, or ``None`` on a miss.

    A miss is any of the following:
    - no cache file exists for the derived key (or it cannot be read);
    - the file is not valid UTF-8 / valid JSON, or is not a JSON object;
    - the stored ``ts`` is not a number;
    - the entry's age (now minus ``ts``) is at least ``ttl_seconds``.

    Args:
        model: Model identifier, part of the cache key.
        intent: Intent label, part of the cache key.
        chunk_ids: IDs of the retrieved chunks, part of the cache key.
        query: User query, part of the cache key.
        ttl_seconds: Maximum age in seconds for an entry to count as fresh.

    Returns:
        The stored ``answer`` value, or ``None`` on a miss.
    """
    key = _make_key(model, intent, chunk_ids, query)
    path = CACHE_DIR / f"{key}.json"
    try:
        # No exists() pre-check: a missing file raises FileNotFoundError (an
        # OSError), which avoids a check-then-read race with deletion.
        entry = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError,
        # so a truncated or corrupted file is a miss rather than a crash.
        return None
    if not isinstance(entry, dict):
        # Valid JSON but not an object (e.g. a list or a bare string).
        return None
    ts = entry.get("ts", 0)  # a missing timestamp is treated as the epoch
    if not isinstance(ts, (int, float)):
        return None
    if time.time() - ts >= ttl_seconds:
        return None
    return entry.get("answer")
def put(model: str, intent: str, chunk_ids: list[str], query: str,
        answer: str) -> None:
    """Store ``answer`` on disk under the key derived from the request.

    The entry is a JSON object with the write time (``ts``), the ``answer``,
    and the original ``query`` and ``intent``. Any existing entry for the same
    key is overwritten.

    Args:
        model: Model identifier, part of the cache key.
        intent: Intent label, part of the cache key; also stored in the entry.
        chunk_ids: IDs of the retrieved chunks, part of the cache key.
        query: User query, part of the cache key; stored un-normalised.
        answer: The LLM response to cache.

    Raises:
        OSError: If the cache directory or file cannot be written.
    """
    key = _make_key(model, intent, chunk_ids, query)
    path = CACHE_DIR / f"{key}.json"
    # The directory may have been removed since import (e.g. to invalidate the
    # cache), or the working directory may have changed under the relative
    # path; recreate it so the write does not fail with FileNotFoundError.
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    record = {"ts": time.time(), "answer": answer, "query": query, "intent": intent}
    path.write_text(json.dumps(record, ensure_ascii=False), encoding="utf-8")
def estimate_tokens(text: str) -> int:
    """Return a rough token estimate for ``text``, for use as a budget guard.

    The heuristic is one token per 3 characters (the figure the original
    author cites for Arabic on llama tokenizers). The division rounds up so
    the result errs on the high side, and the result is never below 1, even
    for an empty string.
    """
    # Ceiling division: (n + 2) // 3 == ceil(n / 3) for n >= 0. Plain floor
    # division would round down and under-count any partial token.
    return max(1, (len(text) + 2) // 3)