feedcomposer's picture
Upload folder using huggingface_hub
11d4a48 verified
Raw
History Blame Contribute Delete
4.38 kB
"""Module E: Demand Utility — estimates how frequently this knowledge is needed by users."""
from __future__ import annotations
import json
import anthropic
from kvl.ingestor import Document
# Prompt template for the topic-extraction call. It is filled in with
# ``str.format(document=...)``, so ``{document}`` is the only placeholder and
# the doubled braces (``{{`` / ``}}``) are format escapes that render as the
# literal braces of the JSON example shown to the model.
_TOPIC_PROMPT = """Analyze this document and identify its main topics and themes.
For each topic, estimate:
1. How frequently real users would search for or need this information (1-10 scale)
2. Whether this is a high-priority domain (healthcare, climate, food security, education, policy, etc.)
3. Whether there are significant unmet information needs in this area
Return ONLY a JSON array of objects:
[
{{
"topic": "topic name",
"description": "what aspect of the document this covers",
"query_frequency": <int 1-10>,
"priority_domain": <bool>,
"unmet_need": <bool>,
"rationale": "one sentence"
}},
...
]
Extract 5-8 topics.
Document:
{document}"""
# Prompt template for the holistic demand assessment. It is filled in with
# ``str.format(topics=..., summary=...)``; the doubled braces are format
# escapes that render as the literal braces of the JSON object the model is
# asked to return.
_DEMAND_SUMMARY_PROMPT = """Based on this knowledge document's topics and characteristics, assess its overall demand utility.
Topics identified:
{topics}
Document summary:
{summary}
Evaluate:
1. How broad vs. specialized is the audience for this knowledge?
2. Are there gaps in existing AI model knowledge that this document fills?
3. What is the geographic or linguistic scope?
4. How actionable is this knowledge for typical users?
Return ONLY JSON:
{{
"audience_breadth": <int 1-10>,
"knowledge_gap_fill": <int 1-10>,
"geographic_relevance": <int 1-10>,
"actionability": <int 1-10>,
"demand_score": <int 0-100>,
"summary": "two sentence demand assessment"
}}"""
def _call_claude(
    client: anthropic.Anthropic,
    prompt: str,
    model: str = "claude-sonnet-4-6",
    max_tokens: int = 1024,
) -> str:
    """Send a single-turn prompt to the model and return its text reply.

    Args:
        client: Client object whose ``messages.create`` method is called.
        prompt: Text sent as the sole user message.
        model: Model identifier passed through to the API.
        max_tokens: Upper bound on the length of the reply.

    Returns:
        The stripped text of the first content block that carries text, or
        an empty string when the response contains no such block.
    """
    msg = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        messages=[{"role": "user", "content": prompt}],
        system="You are an expert in knowledge management and information demand analysis.",
    )
    # The content list may be empty or may start with a block that has no
    # ``text`` attribute; indexing ``content[0].text`` blindly would raise.
    # Returning "" lets callers fall back the same way they do for a reply
    # that is not valid JSON.
    for block in msg.content or ():
        text = getattr(block, "text", None)
        if isinstance(text, str):
            return text.strip()
    return ""
def _parse_json(raw: str) -> dict | list | None:
    """Extract a JSON object or array from a model reply.

    The reply is parsed as-is after stripping an optional Markdown code
    fence. If that fails, parsing is retried starting at the first ``{`` or
    ``[`` so that explanatory prose around the JSON does not defeat it.

    Args:
        raw: Raw text returned by the model.

    Returns:
        The decoded ``dict`` or ``list``, or ``None`` when no JSON object or
        array can be decoded (scalar JSON values also yield ``None``).
    """
    text = raw.strip()
    if text.startswith("```"):
        # Drop the opening fence line (it may carry a language tag such as
        # ``json``) and everything from the closing fence onwards.
        _, _, text = text.partition("\n")
        text = text.rsplit("```", 1)[0]

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        parsed = None
        # Fall back to decoding from the first opening bracket; raw_decode
        # tolerates trailing text after the JSON value.
        starts = [pos for pos in (text.find("{"), text.find("[")) if pos != -1]
        if starts:
            try:
                parsed, _ = json.JSONDecoder().raw_decode(text, min(starts))
            except json.JSONDecodeError:
                parsed = None

    # Honour the declared return type: callers only handle containers.
    return parsed if isinstance(parsed, (dict, list)) else None
def _coerce_number(value, default: float) -> float:
    """Return *value* as a finite float, or *default* when it is not numeric."""
    if isinstance(value, bool):
        # bool is a subclass of int, but True/False is not a meaningful rating.
        return default
    if isinstance(value, (int, float)):
        number = float(value)
    elif isinstance(value, str):
        try:
            number = float(value)
        except ValueError:
            return default
    else:
        return default
    # Rejects NaN (every comparison is False) as well as +/- infinity.
    if not (float("-inf") < number < float("inf")):
        return default
    return number


def evaluate(client: anthropic.Anthropic, doc: Document, progress_cb=None) -> dict:
    """Return demand utility score (0-100) and topic analysis.

    Two model calls are made: one extracts topics from the first 5000
    whitespace-separated words of ``doc.raw``; the other asks for a holistic
    demand assessment based on those topics and a short document summary.

    Args:
        client: Client forwarded to ``_call_claude``.
        doc: Document providing ``raw``, ``title``, ``word_count`` and
            ``sections``.
        progress_cb: Optional callable invoked with a status message before
            each model call.

    Returns:
        A dict with keys ``"score"`` (int, clamped to 0-100), ``"topics"``
        (list of topic dicts), ``"assessment"`` (the parsed holistic
        assessment, or ``{}``) and ``"summary"`` (str). When no topics can be
        extracted the score is 50 and ``"topics"`` is empty.
    """
    if progress_cb:
        progress_cb("Extracting document topics and themes...")

    text = " ".join(doc.raw.split()[:5000])
    raw_topics = _call_claude(
        client,
        _TOPIC_PROMPT.format(document=text),
        model="claude-haiku-4-5-20251001",
    )
    parsed_topics = _parse_json(raw_topics)

    # Keep only well-formed entries: the model may return strings or other
    # values inside the array, which would break the ``.get`` calls below.
    topics = (
        [t for t in parsed_topics if isinstance(t, dict)]
        if isinstance(parsed_topics, list)
        else []
    )
    if not topics:
        return {
            "score": 50,
            "topics": [],
            "assessment": {},
            "summary": "Could not extract topics from document.",
        }

    # Base score from topic analysis. Ratings are coerced to numbers and held
    # to the 1-10 scale requested in the prompt so one malformed value cannot
    # raise or dominate the average.
    frequencies = [
        min(10.0, max(1.0, _coerce_number(t.get("query_frequency"), 5.0)))
        for t in topics
    ]
    avg_frequency = sum(frequencies) / len(topics)
    priority_bonus = sum(1 for t in topics if t.get("priority_domain", False)) / len(topics)
    unmet_bonus = sum(1 for t in topics if t.get("unmet_need", False)) / len(topics)
    base_score = (avg_frequency / 10) * 60 + priority_bonus * 20 + unmet_bonus * 20

    if progress_cb:
        progress_cb("Estimating overall demand utility...")

    # Holistic demand assessment from the model.
    doc_summary = f"Title: {doc.title}. Words: {doc.word_count}. Sections: {len(doc.sections)}."
    topics_str = json.dumps(topics, indent=2)
    raw_demand = _call_claude(
        client,
        _DEMAND_SUMMARY_PROMPT.format(topics=topics_str, summary=doc_summary),
    )
    demand_assessment = _parse_json(raw_demand)

    if isinstance(demand_assessment, dict) and demand_assessment:
        # A missing or non-numeric demand_score falls back to the base score.
        llm_score = _coerce_number(demand_assessment.get("demand_score"), base_score)
        llm_score = max(0.0, min(100.0, llm_score))
        # Blend base score (40%) with LLM holistic score (60%).
        final_score = round(0.4 * base_score + 0.6 * llm_score)
        summary_text = demand_assessment.get("summary", "")
        assessment_summary = summary_text if isinstance(summary_text, str) else ""
    else:
        # Never expose a non-dict (e.g. a JSON array) as the assessment.
        demand_assessment = {}
        final_score = round(base_score)
        assessment_summary = ""

    final_score = max(0, min(100, final_score))
    return {
        "score": final_score,
        "topics": topics,
        "assessment": demand_assessment,
        "summary": assessment_summary or f"Demand estimated from {len(topics)} topic areas. Base score: {round(base_score)}/100.",
    }