Spaces:

kaburia
/

policy-analysis

Running

App Files Files Community

kaburia committed on Aug 29, 2025

Commit

b022bee

1 Parent(s): 4c5d178

rewrite

Browse files

Files changed (6) hide show

utils/coherence_bbscore.py +2 -4
utils/conversation_logging.py +74 -0
utils/hybrid_retrieval.py +70 -0
utils/ingest_pdf.py +139 -0
utils/model_generation.py +34 -40
utils/query_constraints.py +58 -0

utils/coherence_bbscore.py CHANGED Viewed

@@ -3,7 +3,7 @@ import math, re, unicodedata
 from typing import List, Dict, Any, Optional, Tuple
 import numpy as np
 import os, re, unicodedata, numpy as np
 try:
     from sentence_transformers import SentenceTransformer
 except Exception:
@@ -242,14 +242,12 @@ def coherence_assessment_std(
     }
 # Get the coherence report
-def coherence_report(embedder="BAAI/bge-m3",
                         input_text=None,
                         reranked_results=None,
                         run_zero_shot=True):
     embedder = Embedder(embedder) if isinstance(embedder, str) else embedder
     if reranked_results is None:
-        # Import here to avoid circular imports
-        from utils.retrieve_n_rerank import retrieve_and_rerank
         reranked_results = retrieve_and_rerank(input_text)
     if not reranked_results:
         return []

 from typing import List, Dict, Any, Optional, Tuple
 import numpy as np
 import os, re, unicodedata, numpy as np
+from utils.retrieve_n_rerank import retrieve_and_rerank
 try:
     from sentence_transformers import SentenceTransformer
 except Exception:
     }
 # Get the coherence report
+def coherence_report(embedder="MoritzLaurer/deberta-v3-base-zeroshot-v2.0",
                         input_text=None,
                         reranked_results=None,
                         run_zero_shot=True):
     embedder = Embedder(embedder) if isinstance(embedder, str) else embedder
     if reranked_results is None:
         reranked_results = retrieve_and_rerank(input_text)
     if not reranked_results:
         return []

utils/conversation_logging.py ADDED Viewed

	@@ -0,0 +1,74 @@

+import os, json, time, threading, logging
+from datetime import datetime
+from typing import List, Tuple
+try:
+    import boto3
+    from botocore.exceptions import ClientError, NoCredentialsError
+except Exception:
+    boto3 = None
+    ClientError = NoCredentialsError = Exception
+LOG_FILE = os.getenv("CONVO_LOG_FILE", "conversation_history.jsonl")
+UPLOAD_ENABLED = os.getenv("SPACES_UPLOAD_CONVO", "true").lower() == "true"
+SPACES_KEY = os.getenv("SPACES_KEY")
+SPACES_SECRET = os.getenv("SPACES_SECRET")
+SPACES_BUCKET = os.getenv("SPACES_BUCKET")
+SPACES_REGION = os.getenv("SPACES_REGION", "ams3")
+_lock = threading.Lock()
def load_history(max_lines: int = 500) -> List[Tuple[str,str]]:
    """Return the most recent (user, assistant) exchanges from the JSONL log.

    Only the last ``max_lines`` lines of ``LOG_FILE`` are inspected. Lines that
    are not valid JSON, are not JSON objects, or whose ``role`` is not
    ``"exchange"`` are skipped.

    Args:
        max_lines: Maximum number of trailing log lines to inspect. Values
            less than or equal to zero yield an empty history.

    Returns:
        A list of ``(user, assistant)`` string pairs in file order. An empty
        list is returned when the log file is missing or cannot be read.
    """
    # Local import: the module's import block does not provide deque.
    from collections import deque

    if max_lines <= 0 or not os.path.exists(LOG_FILE):
        return []
    pairs: List[Tuple[str,str]] = []
    try:
        with open(LOG_FILE, "r", encoding="utf-8") as f:
            # A bounded deque keeps only the tail instead of the whole file.
            tail = deque(f, maxlen=max_lines)
    except Exception as e:
        logging.error(f"Failed to load history: {e}")
        return pairs
    for line in tail:
        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            continue
        # Valid JSON that is not an object (list, string, number) has no
        # .get(); skip it rather than aborting the whole load.
        if isinstance(obj, dict) and obj.get("role") == "exchange":
            pairs.append((obj.get("user", ""), obj.get("assistant", "")))
    return pairs
def _write_line(obj: dict):
    """Append ``obj`` to ``LOG_FILE`` as a single JSON line (UTF-8, non-ASCII kept as-is)."""
    with open(LOG_FILE, mode="a", encoding="utf-8") as handle:
        handle.write(f"{json.dumps(obj, ensure_ascii=False)}\n")
def _upload_file():
    """Upload ``LOG_FILE`` to the configured Spaces bucket, logging any failure.

    Does nothing unless uploads are enabled, boto3 was imported, and the key,
    secret and bucket settings are all set. Errors are logged, never raised.
    """
    prerequisites = (UPLOAD_ENABLED, boto3, SPACES_KEY, SPACES_SECRET, SPACES_BUCKET)
    if not all(prerequisites):
        return
    try:
        client = boto3.session.Session().client(
            's3',
            region_name=SPACES_REGION,
            endpoint_url=f"https://{SPACES_REGION}.digitaloceanspaces.com",
            aws_access_key_id=SPACES_KEY,
            aws_secret_access_key=SPACES_SECRET,
        )
        # The object key can be overridden via the environment; otherwise it
        # is derived from the log file's base name.
        default_object = f"chat-logs/{os.path.basename(LOG_FILE)}"
        object_name = os.getenv("SPACES_CONVO_OBJECT", default_object)
        client.upload_file(LOG_FILE, SPACES_BUCKET, object_name)
    except (ClientError, NoCredentialsError) as e:
        logging.error(f"Spaces upload failed: {e}")
    except Exception as e:
        logging.error(f"Unexpected upload error: {e}")
def log_exchange(user_msg: str, assistant_msg: str, meta: dict = None):
    """Append one user/assistant exchange to the log and trigger an upload.

    Args:
        user_msg: Text of the user's message.
        assistant_msg: Text of the assistant's reply.
        meta: Optional extra data stored under ``"meta"``; ``None`` or an
            empty value is stored as ``{}``.

    The record is written while holding the module lock; the upload then runs
    in a daemon thread so the caller is not blocked by it.
    """
    # Local import: the module only imports `datetime` from the datetime package.
    from datetime import timezone

    ts = time.time()
    # datetime.utcfromtimestamp() is deprecated since Python 3.12. Build an
    # aware UTC datetime and drop tzinfo so the serialized form stays
    # "<naive ISO timestamp>Z", exactly as before.
    utc_stamp = datetime.fromtimestamp(ts, tz=timezone.utc).replace(tzinfo=None)
    record = {
        "role": "exchange",
        "timestamp": utc_stamp.isoformat() + "Z",
        "user": user_msg,
        "assistant": assistant_msg,
        "meta": meta or {}
    }
    with _lock:
        _write_line(record)
    # Upload in background thread to avoid blocking UI
    threading.Thread(target=_upload_file, daemon=True).start()

utils/hybrid_retrieval.py ADDED Viewed

	@@ -0,0 +1,70 @@

+"""Hybrid retrieval (BM25 + dense) with deterministic filtering + page consolidation."""
+from typing import List, Optional, Dict, Any, Tuple
+from rank_bm25 import BM25Okapi
+import numpy as np
+from utils.encoding_input import encode_text
+from utils.retrieve_n_rerank import rerank_cross_encoder
def TOK_SPLIT(t):
    """Split ``t`` on whitespace and lowercase every token."""
    # str.split() with no argument never yields empty or whitespace-only
    # strings, so no extra filtering is required.
    return [word.lower() for word in t.split()]
class HybridRetriever:
    """Combine dense (vector index) and BM25 keyword retrieval over one corpus.

    The corpus is read once from the vector store at construction time and
    tokenized with ``TOK_SPLIT`` for BM25.
    """

    def __init__(self, vectorstore):
        """Load every document from ``vectorstore`` and build the BM25 index.

        Args:
            vectorstore: Object exposing ``index``, ``docstore.search`` and
                ``index_to_docstore_id`` (looked up by positions 0..n-1).
        """
        self.vs = vectorstore
        # Build BM25 corpus from all docs
        id_map = self.vs.index_to_docstore_id
        self.docs = [self.vs.docstore.search(id_map[i]) for i in range(len(id_map))]
        corpus_tokens = [TOK_SPLIT(d.page_content) for d in self.docs]
        # NOTE(review): BM25Okapi appears to divide by the corpus size while
        # initialising, so it is only built for a non-empty corpus — confirm.
        self.bm25 = BM25Okapi(corpus_tokens) if corpus_tokens else None

    def _docs_at(self, indices) -> List[Any]:
        """Return the corpus documents at the valid positions in ``indices``.

        Negative and out-of-range positions are dropped. The vector index
        reports a missing neighbour as -1, which plain Python indexing would
        silently map to the *last* document.
        """
        total = len(self.docs)
        return [self.docs[int(i)] for i in indices if 0 <= int(i) < total]

    def fetch(self, query: str, k_dense=30, k_bm25=30, filters: Dict[str, Any] = None, rerank_top=12) -> List[Any]:
        """Retrieve, de-duplicate, filter and rerank documents for ``query``.

        Args:
            query: Natural-language query text.
            k_dense: Number of neighbours requested from the dense index.
            k_bm25: Number of top BM25-scored documents to take.
            filters: Optional ``{metadata_key: value}`` constraints. Values are
                compared as strings; entries whose value is ``None`` are ignored.
            rerank_top: ``top_m`` passed to the cross-encoder reranker.

        Returns:
            Documents in reranked order; an empty list when the corpus is
            empty or nothing passes the filters.
        """
        if not self.docs:
            return []
        filters = filters or {}
        q_emb = encode_text(query)
        # Dense search
        q = np.asarray(q_emb, dtype="float32").reshape(1, -1)
        _, neighbours = self.vs.index.search(q, k_dense)
        dense_docs = self._docs_at(neighbours[0])
        # BM25
        bm_scores = self.bm25.get_scores(TOK_SPLIT(query))
        top_bm_idx = np.argsort(bm_scores)[::-1][:k_bm25]
        bm25_docs = [self.docs[i] for i in top_bm_idx]
        # Union: the first document seen for each (source, page_label, page) wins.
        # NOTE(review): documents sharing this key (or all lacking metadata)
        # collapse into a single entry — confirm this is intended for
        # chunk-level indexes.
        uniq = {}
        for d in dense_docs + bm25_docs:
            m = getattr(d, 'metadata', {})
            key = (m.get('source'), m.get('page_label'), m.get('page'))
            if key not in uniq:
                uniq[key] = d
        docs = list(uniq.values())

        # Apply filters
        def ok(d):
            m = getattr(d, 'metadata', {})
            for k, v in filters.items():
                if v is None:
                    continue
                if str(m.get(k)) != str(v):
                    return False
            return True

        docs = [d for d in docs if ok(d)] if filters else docs
        if not docs:
            return []
        # Rerank
        reranked = rerank_cross_encoder(query, docs, top_m=rerank_top)
        return [d for d, _ in reranked]
def consolidate_page(docs: List[Any], target_page: Optional[str]) -> List[Any]:
    """Merge all chunks on ``target_page`` into one document per source.

    Args:
        docs: Documents exposing ``page_content`` and ``metadata``.
        target_page: Page to consolidate; a falsy value returns ``docs`` unchanged.

    Returns:
        One merged document per source whose page (``page_label`` or, failing
        that, ``page``) equals ``target_page`` as a string. Each carries the
        first chunk's metadata plus ``merged_chunks``. If nothing matches, the
        original ``docs`` list is returned.
    """
    if not target_page:
        return docs
    wanted = str(target_page)
    # Group matching chunks by (source, page); dict order keeps first-seen order.
    grouped: Dict[Tuple[str,str], List[Any]] = {}
    for doc in docs:
        meta = getattr(doc, 'metadata', {})
        page_key = str(meta.get('page_label') or meta.get('page'))
        if page_key != wanted:
            continue
        grouped.setdefault((meta.get('source'), page_key), []).append(doc)
    from langchain.schema import Document
    merged = []
    for chunks in grouped.values():
        combined_meta = dict(chunks[0].metadata)
        combined_meta['merged_chunks'] = len(chunks)
        combined_text = "\n".join(chunk.page_content for chunk in chunks)
        merged.append(Document(page_content=combined_text, metadata=combined_meta))
    return merged or docs

utils/ingest_pdf.py ADDED Viewed

	@@ -0,0 +1,139 @@

+"""Ingestion pipeline to build a page‑level FAISS index with rich metadata.
+Features:
+ - Per page extraction (page_index 0-based, page_label as shown in PDF)
+ - Optional OCR fallback for blank / low-text pages (scanned PDFs)
+ - Records include: doc_id, doc_title, page_index, page_label, text,
+                    section_heading (heuristic), span_start/stop (page chars),
+                    has_anchor flags for configured phrases.
+ - Outputs JSONL + builds FAISS vector store (sentence-transformers/all-MiniLM-L6-v2)
+Note: This is a lightweight scaffold; tune heading detection + anchors as needed.
+"""
+from __future__ import annotations
+import os, re, json, uuid
+from dataclasses import dataclass, asdict
+from typing import List, Dict, Iterable
+from pypdf import PdfReader
+import pytesseract
+from PIL import Image
+from io import BytesIO
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.vectorstores import FAISS
+from langchain.schema import Document
+ANCHOR_PHRASES = [
+    "Specifically these objectives are",
+]
+HEADING_PATTERN = re.compile(r"^\s*(?:[A-Z][A-Z \-]{3,}|\d+\.[0-9.]*\s+.+)$")
@dataclass
class PageRecord:
    """One extracted PDF page together with the metadata stored alongside it."""

    # Identifier and title of the document the page belongs to.
    doc_id: str
    doc_title: str
    # 0-based position of the page in the PDF.
    page_index: int
    # Page label as a string (falls back to index + 1 at extraction time).
    page_label: str
    # Page text (extracted, or OCR output when extraction yields little text).
    text: str
    # Heuristically detected heading; empty string when none is found.
    section_heading: str
    # Character span of `text` within the page.
    span_start: int
    span_stop: int
    # Maps each configured anchor phrase to whether it occurs in `text`.
    has_anchors: Dict[str, bool]
    source: str  # original path
def _extract_page_label(reader, idx: int) -> str:
    """Return the reader's label for page ``idx``, or ``str(idx + 1)`` if unavailable."""
    try:
        label = reader.page_labels[idx]
    except Exception:
        # No usable label (attribute missing, index out of range, ...):
        # fall back to the 1-based page number.
        return str(idx + 1)
    else:
        return label
def _ocr_page(page) -> str:
    """OCR every image embedded in ``page`` and return the combined text.

    Images that cannot be opened or recognised are skipped. Returns an empty
    string when the page exposes no images or OCR produces no text.
    """
    try:
        embedded = page.images
    except Exception:
        embedded = []
    recognised = []
    for item in embedded:
        try:
            picture = Image.open(BytesIO(item.data))
            extracted = pytesseract.image_to_string(picture)
            if extracted.strip():
                recognised.append(extracted)
        except Exception:
            # Best effort: one unreadable image must not fail the whole page.
            continue
    return "\n".join(recognised).strip()
def _heading_from_text(text: str) -> str:
    """Return the first heading-like line near the top of ``text``, or ``""``.

    Only the first eight non-blank lines are inspected. A line qualifies when
    it matches ``HEADING_PATTERN`` and has at most 16 words; the result is
    truncated to 120 characters.
    """
    stripped = (raw.strip() for raw in text.splitlines())
    non_blank = [line for line in stripped if line]
    return next(
        (
            line[:120]
            for line in non_blank[:8]
            if HEADING_PATTERN.match(line) and len(line.split()) <= 16
        ),
        "",
    )
def ingest_pdf(path: str, doc_id: str = None, doc_title: str = None) -> List[PageRecord]:
    """Extract one ``PageRecord`` per page of the PDF at ``path``.

    Args:
        path: Location of the PDF file.
        doc_id: Document identifier; when falsy, the first 12 hex characters
            of a UUID5 derived from ``path`` are used.
        doc_title: Document title; when falsy, the file name without its
            extension is used.

    Returns:
        Records in page order. Pages with fewer than 20 non-whitespace-trimmed
        characters of extracted text go through OCR, and the OCR text is kept
        when it is longer.
    """
    reader = PdfReader(path)
    if not doc_id:
        doc_id = uuid.uuid5(uuid.NAMESPACE_URL, path).hex[:12]
    if not doc_title:
        doc_title = os.path.splitext(os.path.basename(path))[0]
    records: List[PageRecord] = []
    for page_index, page in enumerate(reader.pages):
        try:
            page_text = page.extract_text() or ""
        except Exception:
            page_text = ""
        # fallback to OCR for likely scanned page
        if len(page_text.strip()) < 20:
            ocr_text = _ocr_page(page)
            if len(ocr_text) > len(page_text):
                page_text = ocr_text
        label = _extract_page_label(reader, page_index)
        heading = _heading_from_text(page_text)
        lowered = page_text.lower()
        anchors = {phrase: (phrase.lower() in lowered) for phrase in ANCHOR_PHRASES}
        records.append(
            PageRecord(
                doc_id=doc_id,
                doc_title=doc_title,
                page_index=page_index,
                page_label=str(label),
                text=page_text,
                section_heading=heading,
                span_start=0,
                span_stop=len(page_text),
                has_anchors=anchors,
                source=path,
            )
        )
    return records
def build_vectorstore(records: List[PageRecord], index_dir: str = "faiss_index_new") -> str:
    """Embed ``records``, save a FAISS index and a ``pages.jsonl`` dump.

    Args:
        records: Page records to index; each becomes one document.
        index_dir: Output directory, created if missing.

    Returns:
        ``index_dir``.
    """
    os.makedirs(index_dir, exist_ok=True)
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    docs = []
    for rec in records:
        metadata = {
            "doc_id": rec.doc_id,
            "doc_title": rec.doc_title,
            "page_index": rec.page_index,
            "page_label": rec.page_label,
            "section_heading": rec.section_heading,
            "span_start": rec.span_start,
            "span_stop": rec.span_stop,
            "source": rec.source,
        }
        # Each anchor phrase becomes its own "anchor_<phrase>" metadata flag.
        metadata.update({f"anchor_{phrase}": found for phrase, found in rec.has_anchors.items()})
        docs.append(Document(page_content=rec.text, metadata=metadata))
    store = FAISS.from_documents(docs, embeddings)
    store.save_local(index_dir)
    # also write JSONL
    jsonl_path = os.path.join(index_dir, "pages.jsonl")
    with open(jsonl_path, "w", encoding="utf-8") as out:
        for rec in records:
            out.write(json.dumps(asdict(rec), ensure_ascii=False) + "\n")
    return index_dir
if __name__ == "__main__":
    # Command-line entry point: ingest one PDF and build its index directory.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("pdf", help="Path to PDF")
    parser.add_argument("--doc-id")
    parser.add_argument("--doc-title")
    parser.add_argument("--out", default="faiss_index_new")
    cli = parser.parse_args()
    pages = ingest_pdf(cli.pdf, doc_id=cli.doc_id, doc_title=cli.doc_title)
    build_vectorstore(pages, cli.out)
    print(f"Ingested {len(pages)} pages -> {cli.out}")

utils/model_generation.py CHANGED Viewed

@@ -26,37 +26,9 @@ PROMPT_TEMPLATES = {
             "The context is already searched, retrieved and reranked when handed to you."
-        ),
-        "user_template": """
-Query: {query}
-Deliverables (use the exact section headers below; omit any section whose input is empty/disabled):
-1) Quoted Policy Excerpts
-   - Quote the  necessary text and append citations like (filename p.X). Group by subtopic.
-   - Try to meet the user's specification as much as possible where if they only want items from a certain page only give out data from that page or if it is from a certain document please only retrieve just from that document
-    - Order by page
-2) Sentiment Summary
-   - Using the Sentiment JSON, explain tone, gaps, penalties, and enforcement clarity in plain English. Do not invent fields that aren't present.
-3) Coherence Assessment
-   - From the coherence report only provide when ticked: state on-topic vs off-topic; call out which sections were coherent, off-topic, or repeated.
-Constraints:
-- No external knowledge. No speculation. If a user ask is outside the sources, state 'Not found in sources.'
-- Use full sentences (no telegraphic fragments).
-- Each substantive statement has a citation.
-Topic hint: {topic_hint}
-Sentiment JSON (rolled-up across top docs):
-{sentiment_json}
-Coherence report:
-{coherence_report}
-Context Sources:
-{context_block}
-"""
     },
     "abstractive_summary": {
@@ -183,7 +155,7 @@ def build_context_block(top_docs: List[Dict[str, Any]]) -> str:
         citation = f"{filename}, p. {page_label}"
-        blocks.append(f"<<<SOURCE: {citation}>>>\n{_clip(text)}\n</SOURCE>")
     return "\n".join(blocks)
@@ -195,25 +167,47 @@ def build_messages(
     task_mode: str,
     sentiment_rollup: Dict[str, List[str]],
     coherence_report: str = "",
-    topic_hint: str = "energy policy"
 ) -> List[Dict[str, str]]:
     template = PROMPT_TEMPLATES.get(task_mode)
     if not template:
         raise ValueError(f"Unknown task mode: {task_mode}")
     context_block = build_context_block(top_docs)
     sentiment_json = json.dumps(sentiment_rollup or {}, ensure_ascii=False)
-    user_prompt = template["user_template"].format(
-        query=query,
-        topic_hint=topic_hint,
-        sentiment_json=sentiment_json,
-        context_block=context_block,
-        coherence_report=coherence_report
     )
     return [
-        {"role": "system", "content": template["system"]},
         {"role": "user", "content": user_prompt}
     ]

             "The context is already searched, retrieved and reranked when handed to you."
+    ),
+    # dynamic assembly; placeholders kept for backward compatibility but sections may be removed
+    "user_template": "DYNAMIC"
     },
     "abstractive_summary": {
         citation = f"{filename}, p. {page_label}"
+    blocks.append(f"<<<SOURCE: {citation}>>>\n{_clip(text)}\n</SOURCE>")
     return "\n".join(blocks)
     task_mode: str,
     sentiment_rollup: Dict[str, List[str]],
     coherence_report: str = "",
+    topic_hint: str = "energy policy",
+    allowlist_meta: Dict[str, Any] = None
 ) -> List[Dict[str, str]]:
     template = PROMPT_TEMPLATES.get(task_mode)
     if not template:
         raise ValueError(f"Unknown task mode: {task_mode}")
     context_block = build_context_block(top_docs)
+    sentiment_present = bool(sentiment_rollup)
+    coherence_present = bool(coherence_report)
     sentiment_json = json.dumps(sentiment_rollup or {}, ensure_ascii=False)
+    # Build user prompt dynamically to truly omit absent sections
+    parts = [
+        f"Query: {query}\n",
+        "Deliverables (omit any section whose input is empty/disabled):",
+        "1) Quoted Policy Excerpts\n   - Quote the necessary text and append citations like (filename p.X). Group by subtopic.\n   - Honor any page or document restriction from the query strictly.\n   - Order by page",
+    ]
+    if sentiment_present:
+        parts.append("2) Sentiment Summary\n   - Using the Sentiment JSON, explain tone, gaps, penalties, and enforcement clarity in plain English. Do not invent fields that aren't present.")
+    if coherence_present:
+        idx = 3 if sentiment_present else 2
+        parts.append(f"{idx}) Coherence Assessment\n   - From the coherence report: on-topic vs off-topic; note coherent/off-topic/repeated sections only if present.")
+    parts.append(
+        "\nConstraints:\n- No external knowledge. No speculation. If a user ask is outside the sources, state 'Not found in sources.'\n- Use full sentences.\n- Each substantive statement has a citation."
     )
+    parts.append(f"\nTopic hint: {topic_hint}\n")
+    if sentiment_present:
+        parts.append(f"Sentiment JSON (rolled-up across top docs):\n{sentiment_json}\n")
+    if coherence_present:
+        parts.append(f"Coherence report:\n{coherence_report}\n")
+    guard = ""
+    if allowlist_meta:
+        doc_id = allowlist_meta.get('doc_id')
+        pages = allowlist_meta.get('pages')
+        guard = f"[ALLOWLIST_DOCS] doc_id={doc_id}; pages={pages}\nOnly use text from chunks where doc_id={doc_id} and page_label in {pages}. If none present reply exactly: Not found in sources for page {pages} of {doc_id}. Do not use any other documents.\n"
+    parts.append(f"{guard}Context Sources:\n{context_block}")
+    user_prompt = "\n".join(parts)
     return [
+    {"role": "system", "content": template["system"]},
         {"role": "user", "content": user_prompt}
     ]

utils/query_constraints.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import re
+from typing import Optional, Dict, List
+PAGE_PATTERN = re.compile(r"page\s+(\d+)", re.IGNORECASE)
+# crude pattern capturing phrases like "Kenya Energy Policy 2018" or any sequence ending with a 4-digit year
+DOC_PHRASE_PATTERN = re.compile(r"([A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+){0,6}\s+20\d{2})")
+STOP = {"the","and","of","in","policy","document","national"}
def _doc_tokens(phrase: str) -> List[str]:
    """Return lowercase keyword tokens from ``phrase`` followed by any 20xx years.

    Alphanumeric tokens are lowercased; stop words (``STOP``) and purely
    numeric tokens are dropped. Every ``20dd`` year found in ``phrase`` is
    then appended.
    """
    keywords = []
    for token in re.findall(r"[A-Za-z0-9]+", phrase):
        lowered = token.lower()
        if lowered in STOP or token.isdigit():
            continue
        keywords.append(lowered)
    years = re.findall(r"20\d{2}", phrase)
    return keywords + years
def parse_query_constraints(query: str) -> Dict[str, object]:
    """Extract simple structured constraints from the natural language query.

    Supported constraints:
      - page: "page 17" -> ``page=17`` (first match of ``PAGE_PATTERN``).
      - doc_tokens: tokens of the first phrase matched by
        ``DOC_PHRASE_PATTERN`` that yields at least one token.

    Args:
        query: User query; ``None`` or an empty string yields no constraints.

    Returns:
        ``{"page": int or None, "doc_tokens": list of str}``.
    """
    page = None
    doc_tokens: List[str] = []
    if query:
        page_hit = PAGE_PATTERN.search(query)
        if page_hit:
            try:
                page = int(page_hit.group(1))
            except ValueError:
                # int() can still reject a digit run, e.g. one exceeding the
                # interpreter's integer string conversion length limit.
                page = None
        for phrase_hit in DOC_PHRASE_PATTERN.finditer(query):
            doc_tokens = _doc_tokens(phrase_hit.group(1))
            if doc_tokens:
                break
    return {"page": page, "doc_tokens": doc_tokens}
def page_matches(meta, target_page: int) -> bool:
    """Return True if metadata page/page_label matches the requested page.

    Accepts numeric or string page labels. When the label contains several
    numbers (e.g. a range or list), any number equal to ``target_page`` matches.

    Args:
        meta: Metadata mapping; ``page_label`` is preferred, ``page`` is the
            fallback when ``page_label`` is missing or empty.
        target_page: Requested page number; ``None`` means no constraint.

    Returns:
        True when there is no constraint or the label contains ``target_page``;
        False when no label is available or no number matches.
    """
    if target_page is None:
        return True
    label = meta.get("page_label")
    # Test for None/"" explicitly: a plain `or` chain would treat page 0 as
    # missing, so page 0 could never match.
    if label is None or label == "":
        label = meta.get("page")
    if label is None:
        return False
    # Normalize to string and extract integers present
    numbers = re.findall(r"\d+", str(label))
    return any(int(n) == target_page for n in numbers)
def doc_matches(meta, tokens: List[str]) -> bool:
    """Return True when enough of ``tokens`` occur in the document's path.

    The lowercased ``source`` (or ``path``) metadata value is searched for
    each token as a substring. An empty token list always matches; a missing
    path never does.
    """
    if not tokens:
        return True
    location = (meta.get("source") or meta.get("path") or "").lower()
    if not location:
        return False
    found = len([token for token in tokens if token in location])
    # require at least 60% of tokens present
    return found / max(1, len(tokens)) >= 0.6