Spaces:

CGIAR
/

knowledge-value-lab

Sleeping

App Files Files Community

knowledge-value-lab / kvl /modules /demand.py

feedcomposer

Upload folder using huggingface_hub

11d4a48 verified about 1 month ago

Raw

History Blame Contribute Delete

4.38 kB

	"""Module E: Demand Utility — estimates how frequently this knowledge is needed by users."""

	from __future__ import annotations
	import json
	import anthropic
	from kvl.ingestor import Document

	# Prompt template for the topic-extraction call made by evaluate(). It is rendered with
	# str.format(document=...), so the doubled braces below are escapes that come out as
	# single literal braces in the JSON example shown to the model. evaluate() reads the
	# "query_frequency", "priority_domain" and "unmet_need" keys of each returned object.
	_TOPIC_PROMPT = """Analyze this document and identify its main topics and themes.

	For each topic, estimate:
	1. How frequently real users would search for or need this information (1-10 scale)
	2. Whether this is a high-priority domain (healthcare, climate, food security, education, policy, etc.)
	3. Whether there are significant unmet information needs in this area

	Return ONLY a JSON array of objects:
	[
	{{
	"topic": "topic name",
	"description": "what aspect of the document this covers",
	"query_frequency": <int 1-10>,
	"priority_domain": <bool>,
	"unmet_need": <bool>,
	"rationale": "one sentence"
	}},
	...
	]

	Extract 5-8 topics.

	Document:
	{document}"""

	# Prompt template for the holistic demand assessment made by evaluate(). It is rendered
	# with str.format(topics=..., summary=...); the doubled braces are escapes for literal
	# braces in the JSON example. evaluate() reads only "demand_score" and "summary" from the
	# reply; the remaining keys are passed through to the caller under "assessment".
	_DEMAND_SUMMARY_PROMPT = """Based on this knowledge document's topics and characteristics, assess its overall demand utility.

	Topics identified:
	{topics}

	Document summary:
	{summary}

	Evaluate:
	1. How broad vs. specialized is the audience for this knowledge?
	2. Are there gaps in existing AI model knowledge that this document fills?
	3. What is the geographic or linguistic scope?
	4. How actionable is this knowledge for typical users?

	Return ONLY JSON:
	{{
	"audience_breadth": <int 1-10>,
	"knowledge_gap_fill": <int 1-10>,
	"geographic_relevance": <int 1-10>,
	"actionability": <int 1-10>,
	"demand_score": <int 0-100>,
	"summary": "two sentence demand assessment"
	}}"""


	def _call_claude(client: anthropic.Anthropic, prompt: str, model: str = "claude-sonnet-4-6") -> str:
	    """Send ``prompt`` as a single user message and return the reply text, stripped.

	    Args:
	        client: Object exposing ``messages.create(...)`` (an Anthropic client).
	        prompt: Full user-message text.
	        model: Model identifier forwarded to the API.

	    Returns:
	        The concatenated text of every text-bearing content block in the reply,
	        with surrounding whitespace removed. Returns ``""`` when the reply has no
	        text blocks, so callers that parse the result fall back gracefully
	        instead of this function raising ``IndexError``/``AttributeError``.
	    """
	    response = client.messages.create(
	        model=model,
	        max_tokens=1024,
	        messages=[{"role": "user", "content": prompt}],
	        system="You are an expert in knowledge management and information demand analysis.",
	    )
	    # The content list may be empty or may contain blocks without a ``text``
	    # attribute; keep only the blocks that actually carry text.
	    text_parts = [
	        part.text
	        for part in (response.content or [])
	        if isinstance(getattr(part, "text", None), str)
	    ]
	    return "".join(text_parts).strip()


	def _parse_json(raw: str) -> dict | list | None:
	    """Parse a JSON object or array out of a model reply.

	    Handles three reply shapes:
	      * bare JSON;
	      * JSON wrapped in a Markdown code fence that starts the reply
	        (the opening fence line, including any language tag, is dropped);
	      * JSON embedded in surrounding prose, or in a fence that does not start
	        the reply — the first decodable object/array is returned.

	    Args:
	        raw: Text returned by the model.

	    Returns:
	        The decoded ``dict`` or ``list``, or ``None`` when no JSON object/array
	        can be decoded. Scalars such as ``42`` or ``"text"`` also yield ``None``.
	    """
	    text = raw.strip()
	    if text.startswith("```"):
	        # Drop the opening fence line, then everything from the last fence on.
	        _, _, text = text.partition("\n")
	        text = text.rsplit("```", 1)[0]

	    try:
	        parsed = json.loads(text)
	    except json.JSONDecodeError:
	        parsed = None
	        decoder = json.JSONDecoder()
	        # Fall back to the first position where a complete object/array decodes;
	        # this tolerates prose before or after the JSON payload.
	        for start, char in enumerate(text):
	            if char not in "[{":
	                continue
	            try:
	                parsed, _ = decoder.raw_decode(text, start)
	            except json.JSONDecodeError:
	                continue
	            break

	    return parsed if isinstance(parsed, (dict, list)) else None


	def _coerce_number(value, default: float) -> float:
	    """Return ``value`` as a finite float, or ``default`` when it is not usable as a number."""
	    # bool is an int subclass; a JSON true/false is not a rating, so reject it.
	    if isinstance(value, bool):
	        return default
	    try:
	        number = float(value)
	    except (TypeError, ValueError, OverflowError):
	        return default
	    # NaN or infinity would poison the averages and make round() raise.
	    if number != number or number in (float("inf"), float("-inf")):
	        return default
	    return number


	def evaluate(client: anthropic.Anthropic, doc: Document, progress_cb=None) -> dict:
	    """Return demand utility score (0-100) and topic analysis.

	    Runs two model calls: one to extract topics from the first 5000 words of
	    ``doc.raw``, and one to produce a holistic demand assessment from those
	    topics plus ``doc.title``, ``doc.word_count`` and ``len(doc.sections)``.

	    Args:
	        client: Anthropic client forwarded to ``_call_claude``.
	        doc: Document to assess.
	        progress_cb: Optional callable invoked with a status string before each
	            model call.

	    Returns:
	        A dict with keys ``"score"`` (int, 0-100), ``"topics"`` (list of topic
	        dicts), ``"assessment"`` (the parsed holistic assessment, or ``{}``) and
	        ``"summary"`` (str). When no topics can be extracted the score is 50 and
	        ``"topics"``/``"assessment"`` are empty.
	    """
	    if progress_cb:
	        progress_cb("Extracting document topics and themes...")

	    text = " ".join(doc.raw.split()[:5000])
	    raw_topics = _call_claude(client, _TOPIC_PROMPT.format(document=text), model="claude-haiku-4-5-20251001")
	    parsed_topics = _parse_json(raw_topics)

	    # Keep only JSON objects: a stray string or number in the array would
	    # otherwise crash the .get() lookups below.
	    topics = [t for t in parsed_topics if isinstance(t, dict)] if isinstance(parsed_topics, list) else []
	    if not topics:
	        return {
	            "score": 50,
	            "topics": [],
	            # Same key set as the normal return so callers can index uniformly.
	            "assessment": {},
	            "summary": "Could not extract topics from document.",
	        }

	    # Compute a base score from topic frequency ratings. Ratings are coerced to
	    # numbers and clamped to the 1-10 scale the prompt asks for, which keeps the
	    # base score within 0-100.
	    frequencies = [
	        min(10.0, max(1.0, _coerce_number(t.get("query_frequency", 5), 5.0)))
	        for t in topics
	    ]
	    avg_frequency = sum(frequencies) / len(topics)
	    priority_bonus = sum(1 for t in topics if t.get("priority_domain", False)) / len(topics)
	    unmet_bonus = sum(1 for t in topics if t.get("unmet_need", False)) / len(topics)

	    # Base score from topic analysis: up to 60 for frequency, 20 for the share of
	    # priority-domain topics, 20 for the share of topics with unmet need.
	    base_score = (avg_frequency / 10) * 60 + priority_bonus * 20 + unmet_bonus * 20

	    if progress_cb:
	        progress_cb("Estimating overall demand utility...")

	    # Get holistic demand assessment from Claude
	    doc_summary = f"Title: {doc.title}. Words: {doc.word_count}. Sections: {len(doc.sections)}."
	    topics_str = json.dumps(topics, indent=2)
	    raw_demand = _call_claude(client, _DEMAND_SUMMARY_PROMPT.format(topics=topics_str, summary=doc_summary))
	    demand_assessment = _parse_json(raw_demand)
	    if not isinstance(demand_assessment, dict):
	        # A missing or non-object reply must not leak into the result.
	        demand_assessment = {}

	    if demand_assessment:
	        # A missing, null or non-numeric demand_score falls back to the base
	        # score; the value is clamped to the 0-100 scale the prompt asks for.
	        llm_score = _coerce_number(demand_assessment.get("demand_score"), base_score)
	        llm_score = min(100.0, max(0.0, llm_score))
	        # Blend base score (40%) with LLM holistic score (60%)
	        final_score = round(0.4 * base_score + 0.6 * llm_score)
	        assessment_summary = demand_assessment.get("summary", "")
	        if not isinstance(assessment_summary, str):
	            assessment_summary = ""
	    else:
	        final_score = round(base_score)
	        assessment_summary = ""

	    final_score = max(0, min(100, final_score))

	    return {
	        "score": final_score,
	        "topics": topics,
	        "assessment": demand_assessment,
	        "summary": assessment_summary or f"Demand estimated from {len(topics)} topic areas. Base score: {round(base_score)}/100.",
	    }