"""
FilGoalBot — On-disk LLM response cache.
Keyed on (model, intent, sorted chunk_ids, normalised query, prompt_version).
Intentionally file-based + JSON: trivial to inspect, trivial to invalidate by
deletion, and survives across processes (eval re-runs, API restarts).
TTL is intent-aware. Match results go stale in hours; player bios stay fresh
for weeks. See INTENT_TTL_SECONDS.
Saves Groq tokens on:
- eval re-runs (same 50 questions, same retrieved chunks)
- production duplicates (the same query within the eviction window)
"""
import hashlib
import json
import time
from pathlib import Path
from qa_engine import prompts # PROMPT_VERSION read lazily — see _make_key
# Cache location. The path is relative, so it resolves against the working
# directory of whichever process imports this module.
CACHE_DIR = Path(".cache/llm")
# Import-time side effect: create the directory (and any missing parents).
CACHE_DIR.mkdir(parents=True, exist_ok=True)

# Default for unknown intents and ablation tests (30 days). Real intents are
# looked up in INTENT_TTL_SECONDS below.
DEFAULT_TTL_SECONDS = 30 * 24 * 3600

# Intent-aware TTLs, in seconds. The cache key includes chunk_ids — so when
# fresh articles arrive the key shifts and stale entries are skipped
# naturally — but TTL is the safety net for cases where the corpus is static
# (e.g. eval re-runs) and the underlying facts move on.
#
# Tuned to the rate at which each intent's facts go stale in reality:
#   match_result     : a score changes once, but lineup/scorer chatter
#                      evolves through the day → short window.
#   lineup           : valid until kickoff, sometimes 12–48h.
#   team_news        : presser quotes age out in a few days.
#   transfer_news    : rumours linger for a window, finalised deals are
#                      stable once announced → a week is the right middle.
#   player_info      : bios and stats are slow-moving.
#   general_football : true trivia (rules, history) effectively immortal.
INTENT_TTL_SECONDS: dict[str, int] = {
    "match_result": 6 * 3600,  # 6 hours
    "lineup": 12 * 3600,  # 12 hours
    "team_news": 3 * 24 * 3600,  # 3 days
    "transfer_news": 7 * 24 * 3600,  # 7 days
    "player_info": 14 * 24 * 3600,  # 14 days
    "general_football": 30 * 24 * 3600,  # 30 days
}
def ttl_for(intent: str) -> int:
    """Return the cache lifetime, in seconds, that applies to ``intent``.

    Intents with no entry in INTENT_TTL_SECONDS get DEFAULT_TTL_SECONDS.
    """
    try:
        return INTENT_TTL_SECONDS[intent]
    except KeyError:
        # Unknown intent: use the module-wide default lifetime.
        return DEFAULT_TTL_SECONDS
def _make_key(
    model: str,
    intent: str,
    chunk_ids: list[str],
    query: str,
) -> str:
    """Derive the cache key (filename stem) for one request.

    The key is the first 32 hex characters of the SHA-256 of a canonical
    JSON document built from the model, the intent, the sorted chunk ids,
    the stripped and lower-cased query, and the current prompt version.
    """
    # Sorting makes the key independent of the order chunks were retrieved in.
    ordered_chunks = sorted(chunk_ids)
    normalised_query = query.strip().lower()
    fields = {
        "model": model,
        "intent": intent,
        "chunks": ordered_chunks,
        "query": normalised_query,
        # PROMPT_VERSION is part of the key so that editing prompts.py
        # invalidates every previously cached answer instead of letting
        # stale completions shadow the new prompt until their TTL runs out.
        # It is read here, at call time, so bumping (or monkeypatching)
        # prompts.PROMPT_VERSION at runtime really does change the key.
        "prompt_v": prompts.PROMPT_VERSION,
    }
    # sort_keys gives one canonical serialisation regardless of dict order.
    serialised = json.dumps(fields, sort_keys=True, ensure_ascii=False)
    digest = hashlib.sha256(serialised.encode("utf-8"))
    return digest.hexdigest()[:32]
def get(model: str, intent: str, chunk_ids: list[str], query: str,
        ttl_seconds: int | None = None) -> str | None:
    """Look up a cached answer.

    Args:
        model: Model identifier; part of the cache key.
        intent: Query intent; part of the cache key and, when ``ttl_seconds``
            is None, the selector for the TTL.
        chunk_ids: Ids of the retrieved chunks; part of the cache key.
        query: User query; part of the cache key.
        ttl_seconds: Maximum entry age in seconds. None (the production
            default) uses the per-intent TTL from ``ttl_for``. Pass an
            explicit value to override (tests use 0 to force staleness).

    Returns:
        The stored ``"answer"`` value, or None when there is no entry, the
        entry cannot be read or parsed, it is not a JSON object with a
        numeric ``"ts"``, or it is at least ``ttl_seconds`` old.
    """
    if ttl_seconds is None:
        ttl_seconds = ttl_for(intent)
    key = _make_key(model, intent, chunk_ids, query)
    path = CACHE_DIR / f"{key}.json"
    try:
        # No exists() pre-check: a missing file raises FileNotFoundError,
        # which is an OSError, so the read itself covers that case without
        # a race between the check and the read.
        entry = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and the
        # UnicodeDecodeError that read_text raises on invalid UTF-8 bytes.
        return None
    if not isinstance(entry, dict):
        # Valid JSON of the wrong shape (list, string, number): a miss.
        return None
    written_at = entry.get("ts", 0)
    if not isinstance(written_at, (int, float)):
        # A non-numeric timestamp cannot be aged; treat as a corrupt entry.
        return None
    if time.time() - written_at >= ttl_seconds:
        return None
    return entry.get("answer")
def put(model: str, intent: str, chunk_ids: list[str], query: str,
        answer: str) -> None:
    """Store ``answer`` in the on-disk cache under the key for this request.

    The entry is a JSON object holding the write timestamp (``ts``), the
    answer, and the query, intent and prompt version for inspection.

    Args:
        model: Model identifier; part of the cache key.
        intent: Query intent; part of the cache key and stored in the entry.
        chunk_ids: Ids of the retrieved chunks; part of the cache key.
        query: User query; part of the cache key and stored as given.
        answer: The completion text to cache.

    Raises:
        OSError: If the cache directory or file cannot be written.
    """
    key = _make_key(model, intent, chunk_ids, query)
    path = CACHE_DIR / f"{key}.json"
    record = {
        "ts": time.time(),
        "answer": answer,
        "query": query,
        "intent": intent,
        "prompt_v": prompts.PROMPT_VERSION,
    }
    serialised = json.dumps(record, ensure_ascii=False)
    # The cache is meant to be invalidated by deletion. The directory is
    # otherwise only created at import time, so if it was removed while this
    # process is running, recreate it rather than fail every later write.
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(serialised, encoding="utf-8")
def estimate_tokens(text: str, *, chars_per_token: int = 3) -> int:
    """Rough token estimate for the budget guard.

    The default ratio of 3 characters per token follows the module's working
    assumption that Arabic averages ~3 chars/token on llama tokenizers and
    was chosen to err on the conservative side. Pass a different ratio for
    text with another average token length.

    Args:
        text: The text whose token count is being estimated.
        chars_per_token: Assumed average characters per token; must be >= 1.

    Returns:
        ``len(text) // chars_per_token``, but never less than 1.

    Raises:
        ValueError: If ``chars_per_token`` is less than 1.
    """
    if chars_per_token < 1:
        raise ValueError("chars_per_token must be >= 1")
    # Floor division, clamped so that even an empty string costs one token.
    return max(1, len(text) // chars_per_token)
|