Spaces:

kaburia
/

policy-analysis

Running

App Files Community

kaburia committed on Aug 29, 2025

Commit

9f75f1e

1 Parent(s): daa29bf

reverting back

Browse files

Files changed (1) hide show

utils/model_generation.py +43 -232

utils/model_generation.py CHANGED Viewed

@@ -1,76 +1,32 @@
 import json
 import requests
-from typing import List, Dict, Any, Union, Optional
 import time
 import numpy as np
 import os
-import re
-# ---------------------------
-# Comparison & Rendering Config
-# ---------------------------
-COMPARISON_CONFIG = {
-    "trigger_keywords": [
-        "compare", "comparison", "vs", "versus", "delta", "differences",
-        "benchmark", "matrix", "table", "side-by-side", "contrast", "diff", "gap analysis"
-    ],
-    "default_dimensions": [
-        "Scope/Entities", "Definitions", "Obligations/Controls", "Exemptions/Thresholds",
-        "Deadlines/Effective Dates", "Reporting/Recordkeeping",
-        "Enforcement Authority", "Penalties/Sanctions", "Cross-Border/Transfer",
-        "Data Retention", "Audits/Inspections"
-    ],
-    "unknown_token": "Not stated in sources",
-    "default_render_format": "markdown",  # 'markdown' | 'csv' | 'json'
-}
-VERBOSITY_HINT = "Target depth: medium-to-long. Use complete sentences and adequate context; avoid terse bullet-only outputs."
 PROMPT_TEMPLATES = {
     "verbatim_sentiment": {
         "system": (
-            "You are a compliance-grade policy analyst assistant. Prime directive: be faithful to the provided sources. "
-            "Do NOT speculate. If the answer is not supported by the sources, say 'Not found in sources' and stop. "
-            "Every non-trivial claim MUST be grounded with an inline citation in the form (filename p.X). "
-            "Prefer 'unknown/not stated' over guessing. "
-            "Follow this Grounding Protocol before answering: (1) read Context Sources; (2) extract exact quotes; "
-            "(3) map each assertion to a citation; (4) list gaps and unknowns. "
-            "Write in a direct, corporate tone; skeptical and gap-seeking. "
-            "Avoid hallucinations. Base everything strictly on the content provided. "
-            f"{VERBOSITY_HINT} "
-            "If sentiment or coherence inputs are disabled or empty, omit those sections entirely—do not mention they were omitted. "
-            "If comparison triggers are present, begin with a table-first comparative section as specified."
         ),
         "user_template": """
 Query: {query}
-Deliverables (use the exact section headers below; omit any section whose input is empty/disabled):
-1) Quoted Policy Excerpts
-   - Quote the provided text and append citations like (filename p.X). Group by subtopic ordered.
-2) Evidence-Backed Findings
-   - Paraphrase what the excerpts establish. Each bullet ends with a citation (filename p.X).
-3) Sentiment Summary
-   - Using the Sentiment JSON, explain tone, gaps, penalties, and enforcement clarity in plain English. Do not invent fields that aren't present.
-4) Coherence Assessment
-   - From the coherence report: state on-topic vs off-topic; call out which sections were coherent, off-topic, or repeated.
-5) Risks & Unknowns
-   - Explicitly list ambiguities, missing definitions, or conflicts across sources.
-6) Compliance Implications
-   - Concrete next steps or checks a compliance team should run based strictly on the sources.
-# Comparative Table (render only if comparison triggers are present or {force_table}=True)
-- Render format: {render_format} (default markdown).
-- Columns (strict): Dimension | Document | Provision (Summary) | Deadlines/Dates | Enforcement/Penalties | Citation | Notes/Risks
-- Dimensions to use: {dimensions_hint_or_default}
-- Populate rows for each (Dimension × Document) where evidence exists.
-- Every non-trivial cell ends with a citation (filename p.X). Unknowns = '{unknown_token}'.
-- Sort by Dimension, then Document. If only one document exists, produce a single-column table (no invented comparisons).
-Constraints:
-- No external knowledge. No speculation. If a user ask is outside the sources, state 'Not found in sources.'
-- Each substantive statement has a citation.
-- Avoid quotes unless legally binding language is essential.
 Topic hint: {topic_hint}
@@ -88,29 +44,12 @@ Context Sources:
     "abstractive_summary": {
         "system": (
             "You are a policy analyst summarizing government documents for a general audience. "
-            "Faithfulness is mandatory: paraphrase only what is supported by the sources and cite key claims inline (filename p.X). "
-            "Avoid quotes unless legally binding language is essential. "
-            f"{VERBOSITY_HINT} "
-            "If critical info is absent, say 'Not found in sources'—do not infer. "
-            "If comparison triggers are present, begin with a table-first comparative section as specified."
         ),
         "user_template": """Query: {query}
-Write a comprehensive, plain-language summary with these sections:
-- What It Covers (scope, entities, timelines) [cite]
-- Key Requirements & Controls (what must be done) [cite]
-- Enforcement & Penalties (who enforces, how, consequences) [cite]
-- Deadlines & Effective Dates (explicit dates or 'not stated') [cite]
-- Exemptions/Thresholds (if any; otherwise 'not stated') [cite]
-- Risks & Open Questions (gaps/ambiguities; no speculation)
-- Action Checklist (practical steps derived strictly from the sources) [cite]
-# Comparative Table (render only if comparison triggers are present or {force_table}=True)
-- Render format: {render_format} (default markdown).
-- Columns (strict): Dimension | Document | Provision (Summary) | Deadlines/Dates | Enforcement/Penalties | Citation | Notes/Risks
-- Dimensions to use: {dimensions_hint_or_default}
-- Every non-trivial cell ends with a citation (filename p.X). Unknowns = '{unknown_token}'.
-- Sort by Dimension, then Document. If only one document exists, produce a single-column table.
 Topic hint: {topic_hint}
@@ -122,29 +61,11 @@ Context DOCS:
     "followup_reasoning": {
         "system": (
             "You are an assistant that explains policy documents interactively, reasoning step-by-step. "
-            "Be strictly faithful to the documents; if a detail is absent, say so. "
-            "Cite document filename and page for each factual claim. "
-            f"{VERBOSITY_HINT} "
-            "If comparison triggers are present, begin with a table-first comparative section as specified."
         ),
         "user_template": """User query: {query}
-# Comparative Table (render only if comparison triggers are present or {force_table}=True)
-- Render format: {render_format} (default markdown).
-- Columns (strict): Dimension | Document | Provision (Summary) | Deadlines/Dates | Enforcement/Penalties | Citation | Notes/Risks
-- Dimensions to use: {dimensions_hint_or_default}
-- Every non-trivial cell ends with a citation (filename p.X). Unknowns = '{unknown_token}'.
-- Sort by Dimension, then Document.
-Then answer step-by-step:
-1) Direct Answer (what the sources actually support) with inline citations (filename p.X).
-2) Why (short reasoning mapped to specific passages) with citations.
-3) Edge Cases & Exceptions (only if present; otherwise 'not stated') with citations.
-4) What’s Missing (explicitly note absent info; no speculation).
-Follow-up Q&A:
-- List 3–6 follow-up questions a reader might ask, and answer each using the docs.
-- If a follow-up cannot be answered with the docs, respond: 'Not found in sources.'
 Topic: {topic_hint}
@@ -153,42 +74,7 @@ DOCS:
 """
     },
-    "comparison_matrix": {
-        "system": (
-            "You are a compliance-grade policy analyst. Zero hallucinations. "
-            "Derive ONLY from the provided documents. If a detail is missing, write "
-            f"'{COMPARISON_CONFIG['unknown_token']}'. No external knowledge. "
-            "Produce a table-first answer with per-cell citations (filename p.X). "
-            f"{VERBOSITY_HINT} "
-            "If there is only one relevant document, state that and produce a single-column table; do not invent comparisons."
-        ),
-        "user_template": """Query: {query}
-Operating mode: Comparison/Matrix
-Output spec (in this order):
-1) Comparison Table
-   - Render format: {render_format} (default markdown).
-   - Columns (strict): Dimension | Document | Provision (Summary) | Deadlines/Dates | Enforcement/Penalties | Citation | Notes/Risks
-   - Dimensions to use: {dimensions_hint_or_default}
-   - Populate rows for each (Dimension × Document) where evidence exists.
-   - Every non-trivial cell ends with a citation (filename p.X). Unknowns = '{unknown_token}'.
-   - Sort by Dimension, then Document.
-2) Insights & Deltas (bulleted)
-   - Top 3 material differences (what, where, why it matters) with citations.
-   - Tightest requirement, earliest deadline, and heaviest penalty across docs (each with citations).
-   - Ambiguities/Conflicts to watch (cite).
-3) Risks & Unknowns
-   - List critical gaps by Dimension. No speculation—flag as '{unknown_token}' with citations to show you checked.
-Topic hint: {topic_hint}
-Context DOCS:
-{context_block}
-"""
-    },
 }
@@ -222,6 +108,7 @@ def get_do_completion(api_key, model_name, messages, temperature=0.2, max_tokens
         return None
 # --- Prompt context builder ---
 def _clip(text: str, max_chars: int = 1400) -> str:
     """Trim content to limit prompt size."""
@@ -242,10 +129,10 @@ def build_context_block(top_docs: List[Dict[str, Any]]) -> str:
     for i, item in enumerate(top_docs):
         if hasattr(item, "page_content"):
             text = item.page_content
-            meta = getattr(item, "metadata", {}) or {}
         else:
-            text = item.get("text") or item.get("page_content", "") or ""
-            meta = item.get("metadata", {}) or {}
         # Get file name from path
         full_path = meta.get("source", "")
@@ -255,99 +142,38 @@ def build_context_block(top_docs: List[Dict[str, Any]]) -> str:
         page_label = meta.get("page_label") or meta.get("page") or "unknown"
         citation = f"{filename}, p. {page_label}"
         blocks.append(f"<<<SOURCE: {citation}>>>\n{_clip(text)}\n</SOURCE>")
     return "\n".join(blocks)
-# --- Comparison trigger detection ---
-def has_comparison_trigger(*texts: Optional[str], extra_keywords: Optional[List[str]] = None) -> bool:
-    """
-    Returns True if any of the provided strings contain comparison triggers.
-    """
-    keys = set(COMPARISON_CONFIG["trigger_keywords"])
-    if extra_keywords:
-        keys.update(k.lower() for k in extra_keywords)
-    pattern = re.compile(r"|".join(re.escape(k) for k in sorted(keys, key=len, reverse=True)), flags=re.IGNORECASE)
-    for t in texts:
-        if t and pattern.search(t):
-            return True
-    return False
-def _dimensions_hint_or_default(dimensions_hint: Optional[Union[str, List[str]]]) -> str:
-    if isinstance(dimensions_hint, list):
-        dims = [str(d).strip() for d in dimensions_hint if str(d).strip()]
-        if dims:
-            return ", ".join(dims)
-    if isinstance(dimensions_hint, str) and dimensions_hint.strip():
-        return dimensions_hint.strip()
-    return ", ".join(COMPARISON_CONFIG["default_dimensions"])
 # --- Message builder ---
 def build_messages(
     query: str,
     top_docs: List[Dict[str, Any]],
     task_mode: str,
-    sentiment_rollup: Optional[Dict[str, Any]] = None,
     coherence_report: str = "",
-    topic_hint: str = "energy policy",
-    force_table: bool = False,
-    dimensions_hint: Optional[Union[str, List[str]]] = None,
-    render_format: str = COMPARISON_CONFIG["default_render_format"],
-    verbosity_hint: str = VERBOSITY_HINT,
-    extra_comparison_keywords: Optional[List[str]] = None
 ) -> List[Dict[str, str]]:
-    """
-    Builds messages with conditional comparison mode and conditional sections.
-    """
-    # Auto-switch to comparison template if requested or triggered
-    comparison_triggered = force_table or has_comparison_trigger(query, topic_hint, extra_keywords=extra_comparison_keywords)
-    effective_task_mode = task_mode
-    if task_mode == "auto" and comparison_triggered:
-        effective_task_mode = "comparison_matrix"
-    template = PROMPT_TEMPLATES.get(effective_task_mode)
     if not template:
-        raise ValueError(f"Unknown task mode: {effective_task_mode}")
     context_block = build_context_block(top_docs)
     sentiment_json = json.dumps(sentiment_rollup or {}, ensure_ascii=False)
-    # Prepare shared placeholders
-    user_kwargs = {
-        "query": query,
-        "topic_hint": topic_hint,
-        "sentiment_json": sentiment_json,
-        "context_block": context_block,
-        "coherence_report": coherence_report or "",
-        "force_table": str(force_table),
-        "render_format": (render_format or COMPARISON_CONFIG["default_render_format"]).lower(),
-        "dimensions_hint_or_default": _dimensions_hint_or_default(dimensions_hint),
-        "unknown_token": COMPARISON_CONFIG["unknown_token"],
-    }
-    user_prompt = template["user_template"].format(**user_kwargs)
-    # If not using the dedicated comparison template but a table is triggered, ensure instructions are present.
-    if comparison_triggered and effective_task_mode not in ("comparison_matrix",):
-        # already included a “Comparative Table” section in templates above; nothing additional required
-        pass
-    system_prompt = template["system"]
-    # Reinforce verbosity dynamically if caller passed a different hint
-    if verbosity_hint and verbosity_hint not in system_prompt:
-        system_prompt = system_prompt + " " + verbosity_hint
-    # Conditional redaction: if sentiment/coherence are empty, strip their input blocks to avoid nudging the model
-    if not sentiment_rollup:
-        user_prompt = user_prompt.replace("Sentiment JSON (rolled-up across top docs):\n{sentiment_json}\n", "")
-    if not coherence_report:
-        user_prompt = user_prompt.replace("Coherence report:\n{coherence_report}\n", "")
     return [
-        {"role": "system", "content": system_prompt},
         {"role": "user", "content": user_prompt}
     ]
@@ -358,20 +184,12 @@ def generate_policy_answer(
     model_name: str,
     query: str,
     top_docs: List[Union[Dict[str, Any], Any]],
-    sentiment_rollup: Optional[Dict[str, Any]] = None,
     coherence_report: str = "",
-    task_mode: str = "verbatim_sentiment",   # 'verbatim_sentiment' | 'abstractive_summary' | 'followup_reasoning' | 'comparison_matrix' | 'auto'
     temperature: float = 0.2,
-    max_tokens: int = 2000,
-    force_table: bool = False,
-    dimensions_hint: Optional[Union[str, List[str]]] = None,
-    render_format: str = COMPARISON_CONFIG["default_render_format"],
-    verbosity_hint: str = VERBOSITY_HINT,
-    extra_comparison_keywords: Optional[List[str]] = None
 ) -> str:
-    """
-    Orchestrates the request with faithfulness guardrails and table-first comparison when indicated.
-    """
     if not top_docs:
         return "No documents available to answer."
@@ -380,21 +198,14 @@ def generate_policy_answer(
         top_docs=top_docs,
         task_mode=task_mode,
         sentiment_rollup=sentiment_rollup,
-        coherence_report=coherence_report,
-        topic_hint="energy policy",  # can be overridden by caller
-        force_table=force_table,
-        dimensions_hint=dimensions_hint,
-        render_format=render_format,
-        verbosity_hint=verbosity_hint,
-        extra_comparison_keywords=extra_comparison_keywords
     )
     resp = get_do_completion(api_key, model_name, messages, temperature=temperature, max_tokens=max_tokens)
     if resp is None:
         return "Upstream model error. No response."
     try:
         return resp["choices"][0]["message"]["content"].strip()
     except Exception:
-        # Fallback: return raw JSON so the caller can debug payload shape
         return json.dumps(resp, indent=2)

 import json
 import requests
+from typing import List, Dict, Any, Union
 import time
 import numpy as np
 import os
 PROMPT_TEMPLATES = {
     "verbatim_sentiment": {
         "system": (
+            "You are a compliance-grade policy analyst assistant. "
+            "Your job is to return a precise, fact-grounded response. "
+            "Avoid hallucinations. Base everything strictly on the content provided. "
+            "If the coherence and/or sentiment analysis is not enabled, do not mention it in the response."
         ),
         "user_template": """
 Query: {query}
+Deliverables:
+1) **Quoted Policy Excerpts**: Quote key policy content directly. Cite the source using filename and page. Do not leave out any information provided.
+2) **Sentiment Summary**: Use the sentiment JSON to explain tone, gaps, penalties, or enforcement clarity in plain English.
+3) **Coherence Assessment**: Summarize the coherence report below. Highlight:
+   - Whether the answer was mostly on-topic or off-topic
+   - point out the sections that were coherent, off topic and repeated
 Topic hint: {topic_hint}
     "abstractive_summary": {
         "system": (
             "You are a policy analyst summarizing government documents for a general audience. "
+            "Your response should paraphrase clearly, avoiding quotes unless absolutely necessary. "
+            "Highlight high-level goals, enforcement strategies, and important deadlines or penalties."
         ),
         "user_template": """Query: {query}
+Summarize the answer in natural, non-technical language. Emphasize clarity and coverage. Avoid quoting unless the phrase is legally binding.
 Topic hint: {topic_hint}
     "followup_reasoning": {
         "system": (
             "You are an assistant that explains policy documents interactively, reasoning step-by-step. "
+            "Always cite document IDs and indicate if certain info is absent."
         ),
         "user_template": """User query: {query}
+Explain the answer step-by-step. Add follow-up questions that a reader might ask, and try to answer them using the documents below.
 Topic: {topic_hint}
 """
     },
+    # Add more templates as needed
 }
         return None
 # --- Prompt context builder ---
 def _clip(text: str, max_chars: int = 1400) -> str:
     """Trim content to limit prompt size."""
     for i, item in enumerate(top_docs):
         if hasattr(item, "page_content"):
             text = item.page_content
+            meta = getattr(item, "metadata", {})
         else:
+            text = item.get("text") or item.get("page_content", "")
+            meta = item.get("metadata", {})
         # Get file name from path
         full_path = meta.get("source", "")
         page_label = meta.get("page_label") or meta.get("page") or "unknown"
         citation = f"{filename}, p. {page_label}"
         blocks.append(f"<<<SOURCE: {citation}>>>\n{_clip(text)}\n</SOURCE>")
     return "\n".join(blocks)
 # --- Message builder ---
 def build_messages(
     query: str,
     top_docs: List[Dict[str, Any]],
     task_mode: str,
+    sentiment_rollup: Dict[str, List[str]],
     coherence_report: str = "",
+    topic_hint: str = "energy policy"
 ) -> List[Dict[str, str]]:
+    template = PROMPT_TEMPLATES.get(task_mode)
     if not template:
+        raise ValueError(f"Unknown task mode: {task_mode}")
     context_block = build_context_block(top_docs)
     sentiment_json = json.dumps(sentiment_rollup or {}, ensure_ascii=False)
+    user_prompt = template["user_template"].format(
+        query=query,
+        topic_hint=topic_hint,
+        sentiment_json=sentiment_json,
+        context_block=context_block,
+        coherence_report=coherence_report
+    )
     return [
+        {"role": "system", "content": template["system"]},
         {"role": "user", "content": user_prompt}
     ]
     model_name: str,
     query: str,
     top_docs: List[Union[Dict[str, Any], Any]],
+    sentiment_rollup: Dict[str, List[str]],
     coherence_report: str = "",
+    task_mode: str = "verbatim_sentiment",
     temperature: float = 0.2,
+    max_tokens: int = 2000
 ) -> str:
     if not top_docs:
         return "No documents available to answer."
         top_docs=top_docs,
         task_mode=task_mode,
         sentiment_rollup=sentiment_rollup,
+        coherence_report=coherence_report
     )
     resp = get_do_completion(api_key, model_name, messages, temperature=temperature, max_tokens=max_tokens)
     if resp is None:
         return "Upstream model error. No response."
     try:
         return resp["choices"][0]["message"]["content"].strip()
     except Exception:
         return json.dumps(resp, indent=2)