kaburia committed on
Commit
c0a58c4
·
1 Parent(s): 5d99375
Files changed (1) hide show
  1. app.py +89 -16
app.py CHANGED
@@ -16,6 +16,7 @@ try:
16
  from utils.query_constraints import parse_query_constraints, page_matches, doc_matches
17
  from utils.conversation_logging import load_history, log_exchange
18
  from langchain.schema import Document
 
19
  except ImportError as e:
20
  print(f"Import error: {e}")
21
  print("Make sure you're running from the correct directory and all dependencies are installed.")
@@ -30,7 +31,49 @@ ENABLE_COHERENCE = True
30
  # Load persisted history (if any) for memory retention
31
  PERSISTED_HISTORY = load_history()
32
 
33
- def chat_response(message, history):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  """
35
  Generate response for chat interface.
36
 
@@ -47,17 +90,28 @@ def chat_response(message, history):
47
  want_page = constraints.get("page")
48
  doc_tokens = constraints.get("doc_tokens", [])
49
 
50
- # Increase initial recall if a specific page is requested
51
- base_k = 120 if want_page is not None else 50
52
- reranked_results = retrieve_and_rerank(
53
- query_text=message,
54
- vectorstore=vectorstore,
55
- k=base_k,
56
- rerank_model="cross-encoder/ms-marco-MiniLM-L-6-v2",
57
- top_m=40 if want_page is not None else 20,
58
- min_score=0.4 if want_page is not None else 0.5, # relax threshold for page-constrained queries
59
- only_docs=False
60
- )
 
 
 
 
 
 
 
 
 
 
 
61
 
62
  if not reranked_results:
63
  return "I'm sorry, I couldn't find any relevant information in the policy documents to answer your question. Could you try rephrasing your question or asking about a different topic?"
@@ -96,6 +150,24 @@ def chat_response(message, history):
96
 
97
  top_docs = [doc for doc, score in reranked_results]
98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  # Perform sentiment and coherence analysis if enabled
100
  sentiment_rollup = get_sentiment(top_docs) if ENABLE_SENTIMENT else {}
101
  coherence_report_ = coherence_report(reranked_results=top_docs, input_text=message) if ENABLE_COHERENCE else ""
@@ -274,9 +346,9 @@ with gr.Blocks(title="Kenya Policy Assistant - Chat", theme=gr.themes.Soft()) as
274
  )
275
 
276
  # Chat functionality
277
- def respond(message, history):
278
  if message.strip():
279
- bot_message = chat_response(message, history)
280
  history.append([message, ""])
281
 
282
  for partial_response in bot_message:
@@ -285,8 +357,9 @@ with gr.Blocks(title="Kenya Policy Assistant - Chat", theme=gr.themes.Soft()) as
285
  else:
286
  yield history, ""
287
 
288
- submit_btn.click(respond, [msg, chatbot], [chatbot, msg])
289
- msg.submit(respond, [msg, chatbot], [chatbot, msg])
 
290
  clear_btn.click(lambda: ([], ""), outputs=[chatbot, msg])
291
 
292
  # Update settings when toggles change
 
16
  from utils.query_constraints import parse_query_constraints, page_matches, doc_matches
17
  from utils.conversation_logging import load_history, log_exchange
18
  from langchain.schema import Document
19
+ from utils.hybrid_retrieval import HybridRetriever, consolidate_page
20
  except ImportError as e:
21
  print(f"Import error: {e}")
22
  print("Make sure you're running from the correct directory and all dependencies are installed.")
 
31
  # Load persisted history (if any) for memory retention
32
  PERSISTED_HISTORY = load_history()
33
 
34
+ # Default verbatim mode flag (quotes only, no generative summarization)
35
+ VERBATIM_MODE_DEFAULT = True
36
+
37
def _citation(meta):
    """Format a short inline citation like ``(policy.pdf p.5)`` from document metadata.

    Args:
        meta: metadata dict of a retrieved Document; reads ``source`` and
            ``page_label``/``page``.

    Returns:
        A parenthesized ``(filename p.page)`` string; falls back to
        ``Unknown`` / ``unknown`` when metadata is absent.
    """
    src = os.path.basename(meta.get('source', 'Unknown'))
    # Prefer the human-readable page_label; fall back to the raw page index.
    # Explicit missing-value checks (not truthiness) so a legitimate page 0
    # from 0-indexed PDF loaders is not misreported as "unknown".
    page = meta.get('page_label')
    if page in (None, ''):
        page = meta.get('page')
    if page is None:
        page = 'unknown'
    return f"({src} p.{page})"
41
+
42
def _extract_quotes(query: str, docs, max_quotes: int = 12):
    """Collect sentences from *docs* that share vocabulary with *query*.

    Each sentence is scored by the number of distinct query terms it
    contains, normalized by log2 of its length; results are sorted by
    score, deduplicated by (text, source, page_label), and returned as
    cited bullet lines — at most *max_quotes* of them.
    """
    import re, math
    # Query terms: alphanumeric tokens longer than 2 chars, lowercased.
    query_terms = {t.lower() for t in re.findall(r"[A-Za-z0-9]+", query) if len(t) > 2}
    candidates = []
    for doc in docs:
        metadata = getattr(doc, 'metadata', {})
        # Naive sentence segmentation on terminal punctuation.
        for raw in re.split(r"(?<=[\.!?])\s+", doc.page_content):
            sentence = raw.strip()
            if not sentence:
                continue
            words = [w.lower() for w in re.findall(r"[A-Za-z0-9]+", sentence)]
            if not words:
                continue
            hits = len(query_terms & set(words))
            if hits:
                # Longer sentences are damped logarithmically.
                weight = hits / math.log(len(words) + 1, 2)
                candidates.append((weight, sentence, metadata))
    # Stable sort keeps document order among equally-scored sentences.
    candidates.sort(key=lambda item: item[0], reverse=True)
    bullets = []
    already_seen = set()
    for _weight, sentence, metadata in candidates:
        fingerprint = (sentence, metadata.get('source'), metadata.get('page_label'))
        if fingerprint in already_seen:
            continue
        already_seen.add(fingerprint)
        bullets.append(f"• \"{sentence}\" {_citation(metadata)}")
        if len(bullets) >= max_quotes:
            break
    return bullets
75
+
76
+ def chat_response(message, history, verbatim_mode=True):
77
  """
78
  Generate response for chat interface.
79
 
 
90
  want_page = constraints.get("page")
91
  doc_tokens = constraints.get("doc_tokens", [])
92
 
93
+ # Attempt hybrid deterministic retrieval first
94
+ reranked_results = []
95
+ try:
96
+ hybrid = HybridRetriever(vectorstore)
97
+ filters = {}
98
+ if want_page is not None:
99
+ filters['page_label'] = str(want_page)
100
+ hybrid_docs = hybrid.fetch(message, k_dense=40, k_bm25=40, filters=filters if filters else None, rerank_top=30)
101
+ reranked_results = [(d, 0.0) for d in hybrid_docs]
102
+ except Exception:
103
+ pass
104
+ if not reranked_results: # fallback legacy path
105
+ base_k = 120 if want_page is not None else 50
106
+ reranked_results = retrieve_and_rerank(
107
+ query_text=message,
108
+ vectorstore=vectorstore,
109
+ k=base_k,
110
+ rerank_model="cross-encoder/ms-marco-MiniLM-L-6-v2",
111
+ top_m=40 if want_page is not None else 20,
112
+ min_score=0.4 if want_page is not None else 0.5,
113
+ only_docs=False
114
+ )
115
 
116
  if not reranked_results:
117
  return "I'm sorry, I couldn't find any relevant information in the policy documents to answer your question. Could you try rephrasing your question or asking about a different topic?"
 
150
 
151
  top_docs = [doc for doc, score in reranked_results]
152
 
153
+ # If a single page was requested, consolidate all of its fragments so quotes are not truncated
154
+ if want_page is not None:
155
+ consolidated = consolidate_page(top_docs, str(want_page))
156
+ if consolidated:
157
+ top_docs = consolidated
158
+
159
+ if verbatim_mode:
160
+ quotes = _extract_quotes(message, top_docs)
161
+ if not quotes:
162
+ return "Not found in sources."
163
+ answer = "Quoted Policy Excerpts (verbatim)\n" + "\n".join(quotes)
164
+ yield answer
165
+ try:
166
+ log_exchange(message, answer, meta={"mode":"verbatim","page": want_page})
167
+ except Exception:
168
+ pass
169
+ return
170
+
171
  # Perform sentiment and coherence analysis if enabled
172
  sentiment_rollup = get_sentiment(top_docs) if ENABLE_SENTIMENT else {}
173
  coherence_report_ = coherence_report(reranked_results=top_docs, input_text=message) if ENABLE_COHERENCE else ""
 
346
  )
347
 
348
  # Chat functionality
349
+ def respond(message, history, verbatim_mode):
350
  if message.strip():
351
+ bot_message = chat_response(message, history, verbatim_mode=verbatim_mode)
352
  history.append([message, ""])
353
 
354
  for partial_response in bot_message:
 
357
  else:
358
  yield history, ""
359
 
360
+ verbatim_toggle = gr.Checkbox(label="Verbatim Mode", value=VERBATIM_MODE_DEFAULT, info="Return only exact quoted excerpts (no generation).")
361
+ submit_btn.click(respond, [msg, chatbot, verbatim_toggle], [chatbot, msg])
362
+ msg.submit(respond, [msg, chatbot, verbatim_toggle], [chatbot, msg])
363
  clear_btn.click(lambda: ([], ""), outputs=[chatbot, msg])
364
 
365
  # Update settings when toggles change