"""Module E: Demand Utility — estimates how frequently this knowledge is needed by users.""" from __future__ import annotations import json import anthropic from kvl.ingestor import Document _TOPIC_PROMPT = """Analyze this document and identify its main topics and themes. For each topic, estimate: 1. How frequently real users would search for or need this information (1-10 scale) 2. Whether this is a high-priority domain (healthcare, climate, food security, education, policy, etc.) 3. Whether there are significant unmet information needs in this area Return ONLY a JSON array of objects: [ {{ "topic": "topic name", "description": "what aspect of the document this covers", "query_frequency": , "priority_domain": , "unmet_need": , "rationale": "one sentence" }}, ... ] Extract 5-8 topics. Document: {document}""" _DEMAND_SUMMARY_PROMPT = """Based on this knowledge document's topics and characteristics, assess its overall demand utility. Topics identified: {topics} Document summary: {summary} Evaluate: 1. How broad vs. specialized is the audience for this knowledge? 2. Are there gaps in existing AI model knowledge that this document fills? 3. What is the geographic or linguistic scope? 4. How actionable is this knowledge for typical users? Return ONLY JSON: {{ "audience_breadth": , "knowledge_gap_fill": , "geographic_relevance": , "actionability": , "demand_score": , "summary": "two sentence demand assessment" }}""" def _call_claude(client: anthropic.Anthropic, prompt: str, model: str = "claude-sonnet-4-6") -> str: msg = client.messages.create( model=model, max_tokens=1024, messages=[{"role": "user", "content": prompt}], system="You are an expert in knowledge management and information demand analysis.", ) return msg.content[0].text.strip() def _parse_json(raw: str) -> dict | list | None: raw = raw.strip() if raw.startswith("```"): raw = "\n".join(raw.split("\n")[1:]) raw = raw.rsplit("```", 1)[0] try: return json.loads(raw) except json.JSONDecodeError: return None def evaluate(client: anthropic.Anthropic, doc: Document, progress_cb=None) -> dict: """Return demand utility score (0-100) and topic analysis.""" if progress_cb: progress_cb("Extracting document topics and themes...") text = " ".join(doc.raw.split()[:5000]) raw_topics = _call_claude(client, _TOPIC_PROMPT.format(document=text), model="claude-haiku-4-5-20251001") topics = _parse_json(raw_topics) if not topics or not isinstance(topics, list): return {"score": 50, "topics": [], "summary": "Could not extract topics from document."} # Compute a base score from topic frequency ratings avg_frequency = sum(t.get("query_frequency", 5) for t in topics) / len(topics) priority_bonus = sum(1 for t in topics if t.get("priority_domain", False)) / len(topics) unmet_bonus = sum(1 for t in topics if t.get("unmet_need", False)) / len(topics) # Base score from topic analysis base_score = (avg_frequency / 10) * 60 + priority_bonus * 20 + unmet_bonus * 20 if progress_cb: progress_cb("Estimating overall demand utility...") # Get holistic demand assessment from Claude doc_summary = f"Title: {doc.title}. Words: {doc.word_count}. Sections: {len(doc.sections)}." topics_str = json.dumps(topics, indent=2) raw_demand = _call_claude(client, _DEMAND_SUMMARY_PROMPT.format(topics=topics_str, summary=doc_summary)) demand_assessment = _parse_json(raw_demand) if demand_assessment and isinstance(demand_assessment, dict): llm_score = demand_assessment.get("demand_score", base_score) # Blend base score (40%) with LLM holistic score (60%) final_score = round(0.4 * base_score + 0.6 * llm_score) assessment_summary = demand_assessment.get("summary", "") else: final_score = round(base_score) assessment_summary = "" final_score = max(0, min(100, final_score)) return { "score": final_score, "topics": topics, "assessment": demand_assessment or {}, "summary": assessment_summary or f"Demand estimated from {len(topics)} topic areas. Base score: {round(base_score)}/100.", }