Spaces:
Sleeping
Sleeping
File size: 3,199 Bytes
754d8d3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 | import os
import json
from datetime import datetime
from pathlib import Path
def ensure_directories():
    """Make sure the working directories used by this module exist.

    Creates ``data/policies``, ``logs`` and ``chroma_db`` relative to the
    current working directory, including missing parents. Directories that
    are already present are left untouched.
    """
    for folder in ("data/policies", "logs", "chroma_db"):
        Path(folder).mkdir(parents=True, exist_ok=True)
def log_query(question, retrieved_chunks, response, prompt_type="improved",
              log_file="logs/queries.jsonl"):
    """Append one query record as a single JSON line to a JSONL log file.

    Args:
        question: The question that was asked.
        retrieved_chunks: Iterable of chunk dicts. Each chunk must have a
            ``"text"`` entry and may have a ``"metadata"`` entry. ``None``
            is treated as "no chunks".
        response: The response to record; must be JSON-serializable.
        prompt_type: Label of the prompt variant that produced the response.
        log_file: Path of the JSONL file to append to. Its parent directory
            is created when missing.

    Raises:
        KeyError: If a chunk has no ``"text"`` entry.
        TypeError: If the record cannot be serialized to JSON.
    """
    chunks = list(retrieved_chunks or [])

    logged_chunks = []
    for chunk in chunks:
        text = chunk["text"]
        # Keep the log compact: store only a 200-character preview.
        preview = text[:200] + "..." if len(text) > 200 else text
        logged_chunks.append(
            {"text": preview, "metadata": chunk.get("metadata", {})}
        )

    log_entry = {
        "timestamp": datetime.now().isoformat(),
        "question": question,
        "prompt_type": prompt_type,
        "num_chunks_retrieved": len(chunks),
        "chunks": logged_chunks,
        "response": response,
    }

    # Serialize before touching the file so a failure leaves the log intact.
    line = json.dumps(log_entry, ensure_ascii=False) + "\n"

    log_path = Path(log_file)
    # Do not depend on ensure_directories() having been called beforehand.
    log_path.parent.mkdir(parents=True, exist_ok=True)
    with open(log_path, "a", encoding="utf-8") as f:
        f.write(line)
def get_groq_api_key():
    """Return the Groq API key read from the ``GROQ_API_KEY`` environment variable.

    Raises:
        ValueError: If the variable is unset or empty.
    """
    key = os.environ.get("GROQ_API_KEY")
    if key:
        return key
    raise ValueError("GROQ_API_KEY environment variable not set")
def safe_json_parse(text):
    """Extract and parse the first JSON object embedded in an LLM response.

    The text is scanned for ``{`` characters and a parse is attempted at each
    one, so surrounding prose or stray braces before or after the object do
    not prevent it from being found.

    Args:
        text: The raw response text. Non-string input yields ``None``.

    Returns:
        The parsed JSON object (a ``dict``), or ``None`` when no parseable
        object is present.
    """
    if not isinstance(text, str):
        return None

    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode tolerates trailing text after the JSON document.
            parsed, _ = decoder.raw_decode(text[start:])
        except (ValueError, RecursionError):
            # Not valid JSON from this brace; try the next candidate.
            start = text.find("{", start + 1)
        else:
            return parsed
    return None
# ============================================================
# ⭐ NEW: Simple RAG Evaluation Metrics
# ============================================================
def evaluate_response(question: str, response: dict, prompt_type: str) -> dict:
    """Generate simple heuristic evaluation metrics for a RAG output.

    Args:
        question: The question that was asked (currently not used by the
            heuristics; kept for interface stability).
        response: Parsed model output, expected to carry ``"answer"`` and
            ``"evidence"`` entries. Anything that is not a dict (for example
            ``None`` from a failed JSON parse) is treated as an empty response.
        prompt_type: Label of the prompt variant, echoed back in the result.

    Returns:
        A dict with these keys:

        - ``"Accuracy"``: "⚠️" when the answer is missing, blank, or starts
          with "I don't know"; otherwise "✅".
        - ``"Groundedness"``: "✅" when evidence is present, otherwise "⚠️".
        - ``"Hallucination Risk"``: "LOW" when the model declined to answer
          or supplied evidence, otherwise "MEDIUM".
        - ``"Prompt Version"``: the given ``prompt_type``.
    """
    if not isinstance(response, dict):
        # A failed parse upstream must not crash the evaluation.
        response = {}

    answer = response.get("answer", "")
    evidence = response.get("evidence", [])

    if isinstance(answer, str):
        # Ignore surrounding whitespace so " I don't know" is still a refusal.
        answer_text = answer.strip()
        declined = answer_text.startswith("I don't know")
        has_answer = bool(answer_text)
    else:
        declined = False
        has_answer = answer is not None

    # Accuracy (simple heuristic): an absent or declined answer is not
    # counted as accurate.
    accuracy = "✅" if has_answer and not declined else "⚠️"

    # Groundedness: based purely on whether any evidence was returned.
    groundedness = "✅" if evidence else "⚠️"

    # Hallucination risk: a refusal or an evidence-backed answer is low risk.
    hallucination = "LOW" if declined or evidence else "MEDIUM"

    return {
        "Accuracy": accuracy,
        "Groundedness": groundedness,
        "Hallucination Risk": hallucination,
        "Prompt Version": prompt_type,
    }
|