knowledge-value-lab / kvl /modules /generation.py
feedcomposer's picture
Upload folder using huggingface_hub
7cc493d verified
Raw
History Blame Contribute Delete
5.44 kB
"""Module C: Generation Utility — measures improvement in AI responses when using the document."""
from __future__ import annotations
import json
import anthropic
from kvl.ingestor import Document
# Prompt template for question generation. Filled in with ``str.format``:
#   {n}        - number of questions to request
#   {document} - document text the questions must be answerable from
# The model is instructed to reply with a bare JSON array of strings.
_QA_GEN_PROMPT = """Generate {n} specific questions whose answers can ONLY be found in this document (not general knowledge).
Focus on questions about specific findings, methods, results, or recommendations in the document.
Return ONLY a JSON array of question strings.
Document:
{document}"""
# Prompt template for the judge model. Filled in with ``str.format``:
#   {question}   - the evaluation question
#   {baseline}   - answer produced without document context
#   {rag_answer} - answer produced with document context
# The doubled braces on the last line are ``str.format`` escapes, so the
# rendered prompt shows a literal single-brace JSON object as the reply schema.
_JUDGE_PROMPT = """Compare two AI-generated answers to the same question. One is generated without any document context (baseline), the other with a relevant document as context (RAG).
Question: {question}
Baseline answer (no document): {baseline}
RAG answer (with document context): {rag_answer}
Rate the RAG answer improvement over baseline on these criteria (1-5 each):
- accuracy: Is the RAG answer more factually correct?
- completeness: Does the RAG answer cover more relevant aspects?
- specificity: Is the RAG answer more specific and precise?
Also provide an overall improvement score (0-100) where:
- 0 = RAG answer is no better or worse than baseline
- 50 = moderate improvement
- 100 = dramatically better, document was essential
Return ONLY JSON: {{"accuracy": int, "completeness": int, "specificity": int, "improvement": int, "reason": "one sentence"}}"""
def _call_claude(client: anthropic.Anthropic, system: str, user: str, model: str = "claude-sonnet-4-6") -> str:
    """Send a single-turn request and return the reply text, stripped.

    Args:
        client: Client object exposing ``messages.create``.
        system: System prompt for the request.
        user: Content of the single user message.
        model: Model identifier passed through to the API.

    Returns:
        The concatenated text of every text-bearing content block in the
        response with surrounding whitespace removed, or ``""`` when the
        response carries no text at all.
    """
    msg = client.messages.create(
        model=model,
        max_tokens=1024,
        messages=[{"role": "user", "content": user}],
        system=system,
    )
    # The response content is a list of blocks. Indexing ``[0].text`` blindly
    # raises IndexError on an empty list and AttributeError when the first
    # block is not a text block, so collect only the blocks that have text.
    text_parts = [
        part.text
        for part in (msg.content or [])
        if isinstance(getattr(part, "text", None), str)
    ]
    return "".join(text_parts).strip()
def _generate_questions(client: anthropic.Anthropic, doc: Document, n: int = 8) -> list[str]:
    """Ask the model for up to *n* questions answerable only from *doc*.

    Args:
        client: Client used for the model call.
        doc: Document whose ``raw`` text is the question source.
        n: Number of questions to request and the maximum number returned.

    Returns:
        A list of at most *n* non-blank question strings. Returns ``[]`` when
        the reply is not valid JSON or is not a JSON array.
    """
    # Only the first 5000 whitespace-separated words are sent to the model.
    text = " ".join(doc.raw.split()[:5000])
    raw = _call_claude(
        client,
        "You generate evaluation questions for knowledge documents.",
        _QA_GEN_PROMPT.format(n=n, document=text),
        model="claude-haiku-4-5-20251001",
    )
    raw = raw.strip()
    # Tolerate a Markdown code fence around the JSON: drop the opening fence
    # line (which may carry a language tag) and everything from the last fence.
    if raw.startswith("```"):
        raw = "\n".join(raw.split("\n")[1:])
        raw = raw.rsplit("```", 1)[0]
    try:
        questions = json.loads(raw)
    except json.JSONDecodeError:
        return []
    # Valid JSON is not necessarily an array: iterating ``null`` or a number
    # raises TypeError, and iterating an object would yield its keys.
    if not isinstance(questions, list):
        return []
    # Skip non-strings and blank strings; a blank question would be sent on
    # as an empty user message by the answer helpers.
    return [q for q in questions if isinstance(q, str) and q.strip()][:n]
def _baseline_answer(client: anthropic.Anthropic, question: str) -> str:
    """Answer *question* without any document context.

    The model is told to rely on its own knowledge only, which gives the
    reference point the document-backed answer is compared against.
    """
    system_prompt = "Answer based only on your pre-trained knowledge. Be honest if you don't know."
    answer = _call_claude(client, system_prompt, question, model="claude-haiku-4-5-20251001")
    return answer
def _rag_answer(client: anthropic.Anthropic, question: str, context: str) -> str:
    """Answer *question* with *context* (document text) placed in the prompt."""
    system_prompt = (
        "Answer the question using the provided document context. "
        "Be specific and cite details from the document."
    )
    # Document text comes first, then the question, in one user message.
    user_prompt = f"Document context:\n{context}\n\nQuestion: {question}"
    answer = _call_claude(client, system_prompt, user_prompt, model="claude-haiku-4-5-20251001")
    return answer
def _judge(client: anthropic.Anthropic, question: str, baseline: str, rag: str) -> dict:
    """Have the judge model compare the baseline and document-backed answers.

    Args:
        client: Client used for the model call.
        question: The evaluation question both answers respond to.
        baseline: Answer generated without document context.
        rag: Answer generated with document context.

    Returns:
        The parsed JSON object from the judge. When the reply is not valid
        JSON, or is valid JSON but not an object, a neutral verdict is
        returned instead (3 for each criterion, improvement 50).
    """
    raw = _call_claude(
        client,
        "You are an expert evaluator comparing AI answer quality.",
        _JUDGE_PROMPT.format(question=question, baseline=baseline, rag_answer=rag),
    )
    raw = raw.strip()
    # Tolerate a Markdown code fence around the JSON: drop the opening fence
    # line (which may carry a language tag) and everything from the last fence.
    if raw.startswith("```"):
        raw = "\n".join(raw.split("\n")[1:])
        raw = raw.rsplit("```", 1)[0]
    fallback = {"accuracy": 3, "completeness": 3, "specificity": 3, "improvement": 50, "reason": "Parse error."}
    try:
        verdict = json.loads(raw)
    except json.JSONDecodeError:
        return fallback
    # A JSON list, string or number parses fine but breaks the ``-> dict``
    # contract that callers rely on for ``.get``; treat it like a parse error.
    if not isinstance(verdict, dict):
        return fallback
    return verdict
def _coerce_improvement(value, default=50):
    """Convert a judge-supplied improvement value to a number within 0-100."""
    if isinstance(value, bool):
        # bool is an int subclass, but True/False is not a meaningful score.
        return default
    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        return default
    if number != number:  # NaN compares unequal to itself
        return default
    number = min(100.0, max(0.0, number))
    # Keep whole scores as ints so the per-question details stay integral.
    return int(number) if number.is_integer() else number


def evaluate(client: anthropic.Anthropic, doc: Document, progress_cb=None, max_workers: int = 6) -> dict:
    """Return generation utility score (0-100) and per-question details.

    Questions are generated from the document, then each one is answered
    without and with the document as context and the pair is scored by a
    judge model. The final score is the rounded mean of the per-question
    ``improvement`` values.

    Args:
        client: Client used for every model call.
        doc: Document under evaluation; its ``raw`` text is used.
        progress_cb: Optional callable invoked with a status string before
            question generation and before the parallel evaluation.
        max_workers: Thread-pool size for per-question evaluation. Values
            below 1 are treated as 1.

    Returns:
        A dict with ``score`` (int), ``details`` (one dict per question) and
        ``summary`` (str). When no questions could be generated the score is
        50 and ``details`` is empty.
    """
    from concurrent.futures import ThreadPoolExecutor

    if progress_cb:
        progress_cb("Generating evaluation questions...")
    questions = _generate_questions(client, doc)
    if not questions:
        return {"score": 50, "details": [], "summary": "Could not generate test questions."}

    # RAG context is the first 4000 whitespace-separated words of the document
    # (capped for token limits).
    # NOTE(review): _generate_questions reads the first 5000 words, so a
    # question may target text that is absent from this context - confirm
    # whether the two caps are meant to differ.
    context = " ".join(doc.raw.split()[:4000])

    def _evaluate_question(question):
        baseline = _baseline_answer(client, question)
        rag = _rag_answer(client, question, context)
        judgment = _judge(client, question, baseline, rag)
        if not isinstance(judgment, dict):
            # Fall back to the neutral defaults below rather than failing on .get.
            judgment = {}
        return {
            "question": question,
            "baseline_answer": baseline,
            "rag_answer": rag,
            "accuracy": judgment.get("accuracy", 3),
            "completeness": judgment.get("completeness", 3),
            "specificity": judgment.get("specificity", 3),
            # The judge's JSON is model output: the value may be missing, null,
            # a string or out of range, any of which would break the average.
            "improvement": _coerce_improvement(judgment.get("improvement", 50)),
            "reason": judgment.get("reason", ""),
        }

    if progress_cb:
        progress_cb(f"Evaluating {len(questions)} questions in parallel (baseline vs RAG)...")
    # ThreadPoolExecutor raises ValueError for max_workers <= 0.
    with ThreadPoolExecutor(max_workers=max(1, max_workers)) as pool:
        # pool.map yields results in question order; an exception raised by
        # any worker propagates from here.
        results = list(pool.map(_evaluate_question, questions))

    improvement_scores = [r["improvement"] for r in results]
    avg_improvement = sum(improvement_scores) / len(improvement_scores)
    score = round(avg_improvement)
    return {
        "score": score,
        "details": results,
        "summary": f"Average RAG improvement: {score}/100 across {len(questions)} questions.",
    }