Spaces:
Sleeping
Sleeping
| """Module C: Generation Utility — measures improvement in AI responses when using the document.""" | |
| from __future__ import annotations | |
| import json | |
| import anthropic | |
| from kvl.ingestor import Document | |
# Prompt template for the question-generation step. It is filled in with
# str.format using two fields: {n} (how many questions to request) and
# {document} (the document text). The model is told to reply with a bare JSON
# array of question strings.
| _QA_GEN_PROMPT = """Generate {n} specific questions whose answers can ONLY be found in this document (not general knowledge). | |
| Focus on questions about specific findings, methods, results, or recommendations in the document. | |
| Return ONLY a JSON array of question strings. | |
| Document: | |
| {document}""" | |
# Prompt template for the judging step. It is filled in with str.format using
# three fields: {question}, {baseline} and {rag_answer}. The doubled braces on
# the last line are str.format escapes, so the rendered prompt shows a literal
# JSON object describing the expected reply (three 1-5 criteria, a 0-100
# improvement score and a one-sentence reason).
| _JUDGE_PROMPT = """Compare two AI-generated answers to the same question. One is generated without any document context (baseline), the other with a relevant document as context (RAG). | |
| Question: {question} | |
| Baseline answer (no document): {baseline} | |
| RAG answer (with document context): {rag_answer} | |
| Rate the RAG answer improvement over baseline on these criteria (1-5 each): | |
| - accuracy: Is the RAG answer more factually correct? | |
| - completeness: Does the RAG answer cover more relevant aspects? | |
| - specificity: Is the RAG answer more specific and precise? | |
| Also provide an overall improvement score (0-100) where: | |
| - 0 = RAG answer is no better or worse than baseline | |
| - 50 = moderate improvement | |
| - 100 = dramatically better, document was essential | |
| Return ONLY JSON: {{"accuracy": int, "completeness": int, "specificity": int, "improvement": int, "reason": "one sentence"}}""" | |
| def _call_claude(client: anthropic.Anthropic, system: str, user: str, model: str = "claude-sonnet-4-6") -> str: | |
| msg = client.messages.create( | |
| model=model, | |
| max_tokens=1024, | |
| messages=[{"role": "user", "content": user}], | |
| system=system, | |
| ) | |
| return msg.content[0].text.strip() | |
def _generate_questions(client: anthropic.Anthropic, doc: Document, n: int = 8) -> list[str]:
    """Ask the model for up to *n* questions answerable only from the document.

    Only the first 5000 whitespace-separated words of ``doc.raw`` are sent.

    Args:
        client: Client forwarded to ``_call_claude``.
        doc: Document whose ``raw`` text is the question source.
        n: Number of questions to request; also the maximum returned.

    Returns:
        A list of non-empty question strings, at most *n* long. An empty list
        is returned when the reply is not valid JSON or is not a JSON array.
    """
    text = " ".join(doc.raw.split()[:5000])
    raw = _call_claude(
        client,
        "You generate evaluation questions for knowledge documents.",
        _QA_GEN_PROMPT.format(n=n, document=text),
        model="claude-haiku-4-5-20251001",
    ).strip()

    # The model sometimes wraps its JSON in a Markdown code fence: drop the
    # opening fence line (which may carry a language tag) and everything from
    # the closing fence onwards.
    if raw.startswith("```"):
        raw = "\n".join(raw.split("\n")[1:])
        raw = raw.rsplit("```", 1)[0]

    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        return []

    # Valid JSON is not necessarily an array. Iterating a JSON string would
    # yield single characters as "questions", a JSON object would yield its
    # keys, and a number would raise TypeError. Accept arrays only.
    if not isinstance(parsed, list):
        return []

    questions = [item.strip() for item in parsed if isinstance(item, str) and item.strip()]
    # max(n, 0) keeps a negative n from slicing from the end of the list.
    return questions[: max(n, 0)]
def _baseline_answer(client: anthropic.Anthropic, question: str) -> str:
    """Answer *question* with no document context (the baseline side of the comparison)."""
    system_prompt = "Answer based only on your pre-trained knowledge. Be honest if you don't know."
    return _call_claude(client, system_prompt, question, model="claude-haiku-4-5-20251001")
def _rag_answer(client: anthropic.Anthropic, question: str, context: str) -> str:
    """Answer *question* with *context* (document text) placed ahead of it in the user message."""
    system_prompt = (
        "Answer the question using the provided document context. "
        "Be specific and cite details from the document."
    )
    user_message = "Document context:\n{}\n\nQuestion: {}".format(context, question)
    return _call_claude(client, system_prompt, user_message, model="claude-haiku-4-5-20251001")
def _judge(client: anthropic.Anthropic, question: str, baseline: str, rag: str) -> dict:
    """Have the model score the RAG answer against the baseline answer.

    The request uses ``_call_claude``'s default model (no ``model`` argument
    is passed).

    Args:
        client: Client forwarded to ``_call_claude``.
        question: The question both answers respond to.
        baseline: Answer produced without document context.
        rag: Answer produced with document context.

    Returns:
        The JSON object parsed from the model's reply. When the reply is not
        valid JSON, or is valid JSON but not an object, a neutral fallback is
        returned instead: 3 for each criterion, 50 for ``improvement`` and
        ``"Parse error."`` as the reason.
    """
    fallback = {"accuracy": 3, "completeness": 3, "specificity": 3, "improvement": 50, "reason": "Parse error."}

    raw = _call_claude(
        client,
        "You are an expert evaluator comparing AI answer quality.",
        _JUDGE_PROMPT.format(question=question, baseline=baseline, rag_answer=rag),
    ).strip()

    # Strip a surrounding Markdown code fence if the model added one.
    if raw.startswith("```"):
        raw = "\n".join(raw.split("\n")[1:])
        raw = raw.rsplit("```", 1)[0]

    try:
        verdict = json.loads(raw)
    except json.JSONDecodeError:
        return fallback

    # A JSON list, string or number parses fine but has no .get(), which the
    # caller relies on. Treat it like any other unusable reply.
    return verdict if isinstance(verdict, dict) else fallback
def evaluate(client: anthropic.Anthropic, doc: Document, progress_cb=None, max_workers: int = 6) -> dict:
    """Return generation utility score (0-100) and per-question details.

    Questions are generated from the document, each is answered once without
    and once with the document as context, and a judge call rates the
    improvement. The score is the rounded mean of the per-question
    ``improvement`` values.

    Args:
        client: Client used for every model call.
        doc: Document under evaluation; its ``raw`` text is read.
        progress_cb: Optional callable that receives human-readable status
            strings.
        max_workers: Upper bound on the number of threads used to evaluate
            questions concurrently.

    Returns:
        A dict with ``score`` (int), ``details`` (one dict per question with
        the question, both answers, the three criteria, ``improvement`` and
        ``reason``) and ``summary`` (str). When no questions could be
        generated, the score is 50 and ``details`` is empty.

    Raises:
        Any exception raised by a model call while evaluating a question
        propagates to the caller.
    """
    from concurrent.futures import ThreadPoolExecutor

    if progress_cb:
        progress_cb("Generating evaluation questions...")
    questions = _generate_questions(client, doc)
    if not questions:
        return {"score": 50, "details": [], "summary": "Could not generate test questions."}

    # The RAG context must cover the window the questions were drawn from
    # (_generate_questions reads the first 5000 words). With a shorter
    # context, questions about the tail of that window cannot be answered
    # from the context and would unfairly lower the score.
    context = " ".join(doc.raw.split()[:5000])

    def _bounded(value, default, low, high):
        # Judge output is model-generated JSON: accept real numbers only
        # (bool is an int subclass, so exclude it) and clamp to the range.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return default
        return min(max(value, low), high)

    def _evaluate_question(question):
        baseline = _baseline_answer(client, question)
        rag = _rag_answer(client, question, context)
        judgment = _judge(client, question, baseline, rag)
        if not isinstance(judgment, dict):
            judgment = {}
        return {
            "question": question,
            "baseline_answer": baseline,
            "rag_answer": rag,
            "accuracy": _bounded(judgment.get("accuracy"), 3, 1, 5),
            "completeness": _bounded(judgment.get("completeness"), 3, 1, 5),
            "specificity": _bounded(judgment.get("specificity"), 3, 1, 5),
            "improvement": _bounded(judgment.get("improvement"), 50, 0, 100),
            "reason": judgment.get("reason", ""),
        }

    if progress_cb:
        progress_cb(f"Evaluating {len(questions)} questions in parallel (baseline vs RAG)...")

    # ThreadPoolExecutor rejects max_workers <= 0, and more threads than
    # questions are never used. None keeps its "let the pool decide" meaning.
    if max_workers is None:
        workers = None
    else:
        workers = max(1, min(max_workers, len(questions)))

    with ThreadPoolExecutor(max_workers=workers) as pool:
        # pool.map preserves input order, so results line up with questions.
        results = list(pool.map(_evaluate_question, questions))

    avg_improvement = sum(r["improvement"] for r in results) / len(results)
    score = round(avg_improvement)
    return {
        "score": score,
        "details": results,
        "summary": f"Average RAG improvement: {score}/100 across {len(questions)} questions.",
    }