Spaces:

CGIAR
/

knowledge-value-lab

Sleeping

App Files Files Community

knowledge-value-lab / kvl /modules /generation.py

feedcomposer

Upload folder using huggingface_hub

7cc493d verified about 1 month ago

Raw

History Blame Contribute Delete

5.44 kB

	"""Module C: Generation Utility — measures improvement in AI responses when using the document."""

	from __future__ import annotations
	import json
	import anthropic
	from kvl.ingestor import Document

	# Prompt template used by _generate_questions. It is filled in with
	# str.format: {n} is the number of questions requested and {document} is the
	# (already truncated) document text. The model is asked to reply with a bare
	# JSON array of question strings, which the caller parses with json.loads.
	_QA_GEN_PROMPT = """Generate {n} specific questions whose answers can ONLY be found in this document (not general knowledge).
	Focus on questions about specific findings, methods, results, or recommendations in the document.

	Return ONLY a JSON array of question strings.

	Document:
	{document}"""

	# Prompt template used by _judge. It is filled in with str.format using the
	# placeholders {question}, {baseline} and {rag_answer}. The doubled braces on
	# the last line are str.format escapes that render as a literal JSON object,
	# showing the model the keys expected in its reply: accuracy, completeness,
	# specificity (1-5 each), improvement (0-100) and reason.
	_JUDGE_PROMPT = """Compare two AI-generated answers to the same question. One is generated without any document context (baseline), the other with a relevant document as context (RAG).

	Question: {question}

	Baseline answer (no document): {baseline}

	RAG answer (with document context): {rag_answer}

	Rate the RAG answer improvement over baseline on these criteria (1-5 each):
	- accuracy: Is the RAG answer more factually correct?
	- completeness: Does the RAG answer cover more relevant aspects?
	- specificity: Is the RAG answer more specific and precise?

	Also provide an overall improvement score (0-100) where:
	- 0 = RAG answer is no better or worse than baseline
	- 50 = moderate improvement
	- 100 = dramatically better, document was essential

	Return ONLY JSON: {{"accuracy": int, "completeness": int, "specificity": int, "improvement": int, "reason": "one sentence"}}"""


	def _call_claude(client: anthropic.Anthropic, system: str, user: str, model: str = "claude-sonnet-4-6") -> str:
	    """Send one single-turn prompt to Claude and return the reply text.

	    Args:
	        client: Anthropic SDK client used to issue the request.
	        system: System prompt for the request.
	        user: Content of the single user message.
	        model: Model identifier passed through to the API.

	    Returns:
	        The text of every text-bearing content block in the reply,
	        concatenated and stripped of surrounding whitespace. An empty string
	        is returned when the reply carries no text at all.
	    """
	    msg = client.messages.create(
	        model=model,
	        max_tokens=1024,
	        messages=[{"role": "user", "content": user}],
	        system=system,
	    )
	    # The reply is a list of content blocks. It can be empty, and a block is
	    # not guaranteed to expose ``text``, so indexing ``content[0].text``
	    # directly can raise IndexError/AttributeError. Collect whatever text
	    # blocks are present instead.
	    parts = [
	        text
	        for text in (getattr(block, "text", None) for block in (msg.content or []))
	        if isinstance(text, str)
	    ]
	    return "".join(parts).strip()


	def _generate_questions(client: anthropic.Anthropic, doc: Document, n: int = 8) -> list[str]:
	    """Ask the model for up to ``n`` questions answerable only from ``doc``.

	    Only the first 5000 whitespace-separated words of ``doc.raw`` are sent to
	    the model.

	    Args:
	        client: Anthropic SDK client used to issue the request.
	        doc: Document whose ``raw`` text the questions are generated from.
	        n: Maximum number of questions to return.

	    Returns:
	        At most ``n`` non-blank question strings. An empty list is returned
	        when the reply cannot be parsed as a JSON array.
	    """
	    text = " ".join(doc.raw.split()[:5000])
	    raw = _call_claude(
	        client,
	        "You generate evaluation questions for knowledge documents.",
	        _QA_GEN_PROMPT.format(n=n, document=text),
	        model="claude-haiku-4-5-20251001",
	    )
	    raw = raw.strip()
	    # Drop a surrounding Markdown code fence (```json ... ```), if any.
	    if raw.startswith("```"):
	        raw = "\n".join(raw.split("\n")[1:])
	        raw = raw.rsplit("```", 1)[0]
	    try:
	        questions = json.loads(raw)
	    except json.JSONDecodeError:
	        # The reply may wrap the array in prose; retry on the outermost
	        # "[ ... ]" span before giving up.
	        start, end = raw.find("["), raw.rfind("]")
	        if start == -1 or end <= start:
	            return []
	        try:
	            questions = json.loads(raw[start:end + 1])
	        except json.JSONDecodeError:
	            return []
	    # Valid JSON that is not an array (object, number, string, null) must not
	    # be iterated: a dict would yield its keys as "questions" and a number
	    # would raise TypeError.
	    if not isinstance(questions, list):
	        return []
	    # Blank strings are dropped so they do not reach the evaluation stage;
	    # max() keeps a negative ``n`` from slicing off the tail of the list.
	    return [q for q in questions if isinstance(q, str) and q.strip()][: max(n, 0)]


	def _baseline_answer(client: anthropic.Anthropic, question: str) -> str:
	    """Return the model's answer to ``question`` with no document supplied.

	    The question is sent as the user message under a system prompt that
	    restricts the model to its pre-trained knowledge.
	    """
	    system_prompt = "Answer based only on your pre-trained knowledge. Be honest if you don't know."
	    answer = _call_claude(client, system_prompt, question, model="claude-haiku-4-5-20251001")
	    return answer


	def _rag_answer(client: anthropic.Anthropic, question: str, context: str) -> str:
	    """Return the model's answer to ``question`` given ``context`` as reference.

	    The document context and the question are combined into a single user
	    message; the system prompt asks for specific, document-cited answers.
	    """
	    system_prompt = (
	        "Answer the question using the provided document context. "
	        "Be specific and cite details from the document."
	    )
	    user_message = f"Document context:\n{context}\n\nQuestion: {question}"
	    return _call_claude(client, system_prompt, user_message, model="claude-haiku-4-5-20251001")


	def _judge(client: anthropic.Anthropic, question: str, baseline: str, rag: str) -> dict:
	    """Ask the model to score the RAG answer against the baseline answer.

	    Args:
	        client: Anthropic SDK client used to issue the request.
	        question: The question both answers respond to.
	        baseline: Answer produced without document context.
	        rag: Answer produced with document context.

	    Returns:
	        A dict with integer ``accuracy``, ``completeness`` and
	        ``specificity`` (clamped to 1-5), integer ``improvement`` (clamped to
	        0-100) and whatever ``reason`` the model supplied. When the reply is
	        not a JSON object, a neutral fallback (3/3/3, improvement 50,
	        reason "Parse error.") is returned.
	    """
	    fallback = {"accuracy": 3, "completeness": 3, "specificity": 3, "improvement": 50, "reason": "Parse error."}
	    raw = _call_claude(
	        client,
	        "You are an expert evaluator comparing AI answer quality.",
	        _JUDGE_PROMPT.format(question=question, baseline=baseline, rag_answer=rag),
	    )
	    raw = raw.strip()
	    # Drop a surrounding Markdown code fence (```json ... ```), if any.
	    if raw.startswith("```"):
	        raw = "\n".join(raw.split("\n")[1:])
	        raw = raw.rsplit("```", 1)[0]
	    try:
	        verdict = json.loads(raw)
	    except json.JSONDecodeError:
	        return fallback
	    # json.loads also accepts arrays, numbers, strings and null; callers use
	    # ``.get`` on the result, so anything but an object is a parse failure.
	    if not isinstance(verdict, dict):
	        return fallback

	    # Normalise the numeric fields so downstream averaging never sees a
	    # string, a missing key or an out-of-range value.
	    bounds = {"accuracy": (1, 5), "completeness": (1, 5), "specificity": (1, 5), "improvement": (0, 100)}
	    for key, (low, high) in bounds.items():
	        value = verdict.get(key)
	        if isinstance(value, bool) or not isinstance(value, (int, float, str)):
	            verdict[key] = fallback[key]
	            continue
	        try:
	            number = round(float(value))
	        except (ValueError, OverflowError):
	            # Non-numeric text, NaN or infinity.
	            verdict[key] = fallback[key]
	            continue
	        verdict[key] = min(max(number, low), high)
	    return verdict


	def evaluate(client: anthropic.Anthropic, doc: Document, progress_cb=None, max_workers: int = 6) -> dict:
	    """Return generation utility score (0-100) and per-question details.

	    Questions are generated from the document, each one is answered once
	    without the document (baseline) and once with it (RAG), and a judge model
	    rates the improvement. The score is the rounded mean of those ratings.

	    Args:
	        client: Anthropic SDK client shared by all requests.
	        doc: Document under evaluation; its ``raw`` text is used.
	        progress_cb: Optional callable invoked with a status string before
	            question generation and before the parallel evaluation.
	        max_workers: Thread-pool size for evaluating questions concurrently.

	    Returns:
	        A dict with ``score`` (int), ``details`` (one dict per question with
	        the question, both answers, the judge's ratings and reason) and
	        ``summary`` (human-readable string). When no questions could be
	        generated, ``score`` is 50 and ``details`` is empty.
	    """
	    import math
	    from concurrent.futures import ThreadPoolExecutor

	    if progress_cb:
	        progress_cb("Generating evaluation questions...")

	    questions = _generate_questions(client, doc)
	    if not questions:
	        return {"score": 50, "details": [], "summary": "Could not generate test questions."}

	    # _generate_questions builds its questions from the first 5000 words of
	    # the document, so the RAG context must cover at least that window.
	    # A shorter cap would leave some questions unanswerable from the context
	    # and bias the score downward.
	    context = " ".join(doc.raw.split()[:5000])

	    def _numeric(judgment: dict, key: str, default: int):
	        """Return ``judgment[key]`` as a finite number, else ``default``."""
	        value = judgment.get(key, default)
	        if isinstance(value, bool):
	            return default
	        if isinstance(value, int):
	            return value
	        try:
	            parsed = float(value)
	        except (TypeError, ValueError):
	            return default
	        return parsed if math.isfinite(parsed) else default

	    def _evaluate_question(question: str) -> dict:
	        """Run baseline, RAG and judge calls for a single question."""
	        baseline = _baseline_answer(client, question)
	        rag = _rag_answer(client, question, context)
	        judgment = _judge(client, question, baseline, rag)
	        # Guard against a judge reply that is not a JSON object.
	        if not isinstance(judgment, dict):
	            judgment = {}
	        return {
	            "question": question,
	            "baseline_answer": baseline,
	            "rag_answer": rag,
	            "accuracy": _numeric(judgment, "accuracy", 3),
	            "completeness": _numeric(judgment, "completeness", 3),
	            "specificity": _numeric(judgment, "specificity", 3),
	            "improvement": _numeric(judgment, "improvement", 50),
	            "reason": judgment.get("reason", ""),
	        }

	    if progress_cb:
	        progress_cb(f"Evaluating {len(questions)} questions in parallel (baseline vs RAG)...")

	    # pool.map preserves input order, so results line up with questions.
	    with ThreadPoolExecutor(max_workers=max_workers) as pool:
	        results = list(pool.map(_evaluate_question, questions))

	    improvement_scores = [r["improvement"] for r in results]
	    avg_improvement = sum(improvement_scores) / len(improvement_scores)
	    score = round(avg_improvement)

	    return {
	        "score": score,
	        "details": results,
	        "summary": f"Average RAG improvement: {score}/100 across {len(questions)} questions.",
	    }