"""Module C: Generation Utility — measures improvement in AI responses when using the document.""" from __future__ import annotations import json import anthropic from kvl.ingestor import Document _QA_GEN_PROMPT = """Generate {n} specific questions whose answers can ONLY be found in this document (not general knowledge). Focus on questions about specific findings, methods, results, or recommendations in the document. Return ONLY a JSON array of question strings. Document: {document}""" _JUDGE_PROMPT = """Compare two AI-generated answers to the same question. One is generated without any document context (baseline), the other with a relevant document as context (RAG). Question: {question} Baseline answer (no document): {baseline} RAG answer (with document context): {rag_answer} Rate the RAG answer improvement over baseline on these criteria (1-5 each): - accuracy: Is the RAG answer more factually correct? - completeness: Does the RAG answer cover more relevant aspects? - specificity: Is the RAG answer more specific and precise? Also provide an overall improvement score (0-100) where: - 0 = RAG answer is no better or worse than baseline - 50 = moderate improvement - 100 = dramatically better, document was essential Return ONLY JSON: {{"accuracy": int, "completeness": int, "specificity": int, "improvement": int, "reason": "one sentence"}}""" def _call_claude(client: anthropic.Anthropic, system: str, user: str, model: str = "claude-sonnet-4-6") -> str: msg = client.messages.create( model=model, max_tokens=1024, messages=[{"role": "user", "content": user}], system=system, ) return msg.content[0].text.strip() def _generate_questions(client: anthropic.Anthropic, doc: Document, n: int = 8) -> list[str]: text = " ".join(doc.raw.split()[:5000]) raw = _call_claude( client, "You generate evaluation questions for knowledge documents.", _QA_GEN_PROMPT.format(n=n, document=text), model="claude-haiku-4-5-20251001", ) raw = raw.strip() if raw.startswith("```"): raw = "\n".join(raw.split("\n")[1:]) raw = raw.rsplit("```", 1)[0] try: questions = json.loads(raw) return [q for q in questions if isinstance(q, str)][:n] except json.JSONDecodeError: return [] def _baseline_answer(client: anthropic.Anthropic, question: str) -> str: return _call_claude( client, "Answer based only on your pre-trained knowledge. Be honest if you don't know.", question, model="claude-haiku-4-5-20251001", ) def _rag_answer(client: anthropic.Anthropic, question: str, context: str) -> str: return _call_claude( client, "Answer the question using the provided document context. Be specific and cite details from the document.", f"Document context:\n{context}\n\nQuestion: {question}", model="claude-haiku-4-5-20251001", ) def _judge(client: anthropic.Anthropic, question: str, baseline: str, rag: str) -> dict: raw = _call_claude( client, "You are an expert evaluator comparing AI answer quality.", _JUDGE_PROMPT.format(question=question, baseline=baseline, rag_answer=rag), ) raw = raw.strip() if raw.startswith("```"): raw = "\n".join(raw.split("\n")[1:]) raw = raw.rsplit("```", 1)[0] try: return json.loads(raw) except json.JSONDecodeError: return {"accuracy": 3, "completeness": 3, "specificity": 3, "improvement": 50, "reason": "Parse error."} def evaluate(client: anthropic.Anthropic, doc: Document, progress_cb=None, max_workers: int = 6) -> dict: """Return generation utility score (0-100) and per-question details.""" if progress_cb: progress_cb("Generating evaluation questions...") questions = _generate_questions(client, doc) if not questions: return {"score": 50, "details": [], "summary": "Could not generate test questions."} # Use the full document as RAG context (capped for token limits) context = " ".join(doc.raw.split()[:4000]) def _evaluate_question(args): client, question, context = args baseline = _baseline_answer(client, question) rag = _rag_answer(client, question, context) judgment = _judge(client, question, baseline, rag) return { "question": question, "baseline_answer": baseline, "rag_answer": rag, "accuracy": judgment.get("accuracy", 3), "completeness": judgment.get("completeness", 3), "specificity": judgment.get("specificity", 3), "improvement": judgment.get("improvement", 50), "reason": judgment.get("reason", ""), } if progress_cb: progress_cb(f"Evaluating {len(questions)} questions in parallel (baseline vs RAG)...") from concurrent.futures import ThreadPoolExecutor with ThreadPoolExecutor(max_workers=max_workers) as pool: results = list(pool.map(_evaluate_question, [(client, q, context) for q in questions])) improvement_scores = [r["improvement"] for r in results] avg_improvement = sum(improvement_scores) / len(improvement_scores) score = round(avg_improvement) return { "score": score, "details": results, "summary": f"Average RAG improvement: {score}/100 across {len(questions)} questions.", }