Spaces:
Running
Running
rewrite
Browse files- utils/coherence_bbscore.py +2 -4
- utils/conversation_logging.py +74 -0
- utils/hybrid_retrieval.py +70 -0
- utils/ingest_pdf.py +139 -0
- utils/model_generation.py +34 -40
- utils/query_constraints.py +58 -0
utils/coherence_bbscore.py
CHANGED
|
@@ -3,7 +3,7 @@ import math, re, unicodedata
|
|
| 3 |
from typing import List, Dict, Any, Optional, Tuple
|
| 4 |
import numpy as np
|
| 5 |
import os, re, unicodedata, numpy as np
|
| 6 |
-
|
| 7 |
try:
|
| 8 |
from sentence_transformers import SentenceTransformer
|
| 9 |
except Exception:
|
|
@@ -242,14 +242,12 @@ def coherence_assessment_std(
|
|
| 242 |
}
|
| 243 |
|
| 244 |
# Get the coherence report
|
| 245 |
-
def coherence_report(embedder="
|
| 246 |
input_text=None,
|
| 247 |
reranked_results=None,
|
| 248 |
run_zero_shot=True):
|
| 249 |
embedder = Embedder(embedder) if isinstance(embedder, str) else embedder
|
| 250 |
if reranked_results is None:
|
| 251 |
-
# Import here to avoid circular imports
|
| 252 |
-
from utils.retrieve_n_rerank import retrieve_and_rerank
|
| 253 |
reranked_results = retrieve_and_rerank(input_text)
|
| 254 |
if not reranked_results:
|
| 255 |
return []
|
|
|
|
| 3 |
from typing import List, Dict, Any, Optional, Tuple
|
| 4 |
import numpy as np
|
| 5 |
import os, re, unicodedata, numpy as np
|
| 6 |
+
from utils.retrieve_n_rerank import retrieve_and_rerank
|
| 7 |
try:
|
| 8 |
from sentence_transformers import SentenceTransformer
|
| 9 |
except Exception:
|
|
|
|
| 242 |
}
|
| 243 |
|
| 244 |
# Get the coherence report
|
| 245 |
+
def coherence_report(embedder="MoritzLaurer/deberta-v3-base-zeroshot-v2.0",
|
| 246 |
input_text=None,
|
| 247 |
reranked_results=None,
|
| 248 |
run_zero_shot=True):
|
| 249 |
embedder = Embedder(embedder) if isinstance(embedder, str) else embedder
|
| 250 |
if reranked_results is None:
|
|
|
|
|
|
|
| 251 |
reranked_results = retrieve_and_rerank(input_text)
|
| 252 |
if not reranked_results:
|
| 253 |
return []
|
utils/conversation_logging.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os, json, time, threading, logging
|
| 2 |
+
from datetime import datetime
|
| 3 |
+
from typing import List, Tuple
|
| 4 |
+
|
| 5 |
+
try:
|
| 6 |
+
import boto3
|
| 7 |
+
from botocore.exceptions import ClientError, NoCredentialsError
|
| 8 |
+
except Exception:
|
| 9 |
+
boto3 = None
|
| 10 |
+
ClientError = NoCredentialsError = Exception
|
| 11 |
+
|
| 12 |
+
LOG_FILE = os.getenv("CONVO_LOG_FILE", "conversation_history.jsonl")
|
| 13 |
+
UPLOAD_ENABLED = os.getenv("SPACES_UPLOAD_CONVO", "true").lower() == "true"
|
| 14 |
+
|
| 15 |
+
SPACES_KEY = os.getenv("SPACES_KEY")
|
| 16 |
+
SPACES_SECRET = os.getenv("SPACES_SECRET")
|
| 17 |
+
SPACES_BUCKET = os.getenv("SPACES_BUCKET")
|
| 18 |
+
SPACES_REGION = os.getenv("SPACES_REGION", "ams3")
|
| 19 |
+
|
| 20 |
+
_lock = threading.Lock()
|
| 21 |
+
|
| 22 |
+
def load_history(max_lines: int = 500) -> List[Tuple[str,str]]:
    """Return the most recent (user, assistant) pairs stored in LOG_FILE.

    Only the last ``max_lines`` lines of the file are inspected. Lines that are
    not valid JSON, are not JSON objects, or whose ``"role"`` is not
    ``"exchange"`` are skipped.

    Args:
        max_lines: Number of trailing log lines to inspect. Values <= 0 yield
            an empty history.

    Returns:
        A list of ``(user, assistant)`` tuples in file order; empty when the
        log file is missing or cannot be read.
    """
    # A plain ``[-0:]`` slice would return the *whole* file, so handle the
    # non-positive case explicitly.
    if max_lines <= 0 or not os.path.exists(LOG_FILE):
        return []

    from collections import deque  # local import: only needed in this function

    pairs: List[Tuple[str, str]] = []
    try:
        with open(LOG_FILE, "r", encoding="utf-8") as f:
            # Bounded deque keeps only the tail while streaming the file.
            tail = deque(f, maxlen=max_lines)
    except Exception as e:
        logging.error(f"Failed to load history: {e}")
        return pairs

    for line in tail:
        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            continue
        # Valid JSON that is not an object (e.g. a list) has no ``.get``;
        # skip it instead of aborting the whole load.
        if isinstance(obj, dict) and obj.get("role") == "exchange":
            pairs.append((obj.get("user", ""), obj.get("assistant", "")))
    return pairs
|
| 38 |
+
|
| 39 |
+
def _write_line(obj: dict):
|
| 40 |
+
with open(LOG_FILE, "a", encoding="utf-8") as f:
|
| 41 |
+
f.write(json.dumps(obj, ensure_ascii=False) + "\n")
|
| 42 |
+
|
| 43 |
+
def _upload_file():
    """Upload LOG_FILE to the configured Spaces bucket, logging any failure.

    Does nothing unless uploads are enabled, boto3 was importable, and the
    key, secret and bucket settings are all set. Errors are logged, never
    raised.
    """
    prerequisites = (UPLOAD_ENABLED, boto3, SPACES_KEY, SPACES_SECRET, SPACES_BUCKET)
    if not all(prerequisites):
        return
    try:
        s3 = boto3.session.Session().client(
            's3',
            region_name=SPACES_REGION,
            endpoint_url=f"https://{SPACES_REGION}.digitaloceanspaces.com",
            aws_access_key_id=SPACES_KEY,
            aws_secret_access_key=SPACES_SECRET,
        )
        # The object key can be overridden through the environment.
        fallback_key = f"chat-logs/{os.path.basename(LOG_FILE)}"
        remote_key = os.getenv("SPACES_CONVO_OBJECT", fallback_key)
        s3.upload_file(LOG_FILE, SPACES_BUCKET, remote_key)
    except (ClientError, NoCredentialsError) as err:
        logging.error(f"Spaces upload failed: {err}")
    except Exception as err:
        logging.error(f"Unexpected upload error: {err}")
|
| 61 |
+
|
| 62 |
+
def log_exchange(user_msg: str, assistant_msg: str, meta: dict = None):
    """Append one user/assistant exchange to the log and trigger an upload.

    Args:
        user_msg: Text sent by the user.
        assistant_msg: Text returned by the assistant.
        meta: Optional extra fields stored under ``"meta"``; ``None`` becomes
            an empty dict.
    """
    from datetime import timezone  # local import: the module only imports `datetime`

    # ``datetime.utcfromtimestamp`` is deprecated since Python 3.12. Build an
    # aware UTC datetime and drop tzinfo so the serialized form stays
    # "<naive ISO timestamp>Z", exactly as before.
    moment = datetime.fromtimestamp(time.time(), tz=timezone.utc)
    record = {
        "role": "exchange",
        "timestamp": moment.replace(tzinfo=None).isoformat() + "Z",
        "user": user_msg,
        "assistant": assistant_msg,
        "meta": meta or {},
    }
    # Serialize writers so concurrent exchanges do not interleave lines.
    with _lock:
        _write_line(record)
    # Upload in background thread to avoid blocking UI
    threading.Thread(target=_upload_file, daemon=True).start()
|
utils/hybrid_retrieval.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Hybrid retrieval (BM25 + dense) with deterministic filtering + page consolidation."""
|
| 2 |
+
from typing import List, Optional, Dict, Any, Tuple
|
| 3 |
+
from rank_bm25 import BM25Okapi
|
| 4 |
+
import numpy as np
|
| 5 |
+
from utils.encoding_input import encode_text
|
| 6 |
+
from utils.retrieve_n_rerank import rerank_cross_encoder
|
| 7 |
+
|
| 8 |
+
def TOK_SPLIT(t):
    """Split ``t`` on whitespace and return the lower-cased tokens."""
    return [piece.lower() for piece in t.split() if piece.strip()]
|
| 9 |
+
|
| 10 |
+
class HybridRetriever:
    """Combine dense vector search and BM25 keyword search over one vector store.

    The constructor reads every document out of the store's docstore (in
    index order) and builds a BM25 index over their whitespace tokens.
    """

    def __init__(self, vectorstore):
        """Cache all documents from ``vectorstore`` and build the BM25 index."""
        self.vs = vectorstore
        id_map = self.vs.index_to_docstore_id
        # Build BM25 corpus from all docs
        self.docs = [self.vs.docstore.search(id_map[i]) for i in range(len(id_map))]
        corpus_tokens = [TOK_SPLIT(d.page_content) for d in self.docs]
        self.bm25 = BM25Okapi(corpus_tokens)

    @staticmethod
    def _valid_positions(indices, size):
        """Return the entries of ``indices`` that are valid positions in a list of ``size``."""
        return [int(i) for i in indices if 0 <= i < size]

    def fetch(self, query: str, k_dense=30, k_bm25=30, filters: Dict[str, Any] = None, rerank_top=12) -> List[Any]:
        """Retrieve, filter and rerank documents for ``query``.

        Args:
            query: Natural-language query text.
            k_dense: Number of neighbours requested from the vector index.
            k_bm25: Number of top BM25-scored documents to take.
            filters: Optional metadata equality filters; values are compared
                as strings and ``None`` values are ignored.
            rerank_top: ``top_m`` passed to the cross-encoder reranker.

        Returns:
            The reranked documents, or an empty list when nothing survives
            filtering.
        """
        filters = filters or {}
        q_emb = encode_text(query)
        # Dense search
        q = np.asarray(q_emb, dtype="float32").reshape(1, -1)
        _, I = self.vs.index.search(q, k_dense)
        # The index may report -1 for missing neighbours (FAISS convention when
        # fewer than k_dense hits exist). A negative value would otherwise wrap
        # around and silently return the last document.
        dense_docs = [self.docs[i] for i in self._valid_positions(I[0], len(self.docs))]
        # BM25
        bm_scores = self.bm25.get_scores(TOK_SPLIT(query))
        top_bm_idx = np.argsort(bm_scores)[::-1][:k_bm25]
        bm25_docs = [self.docs[i] for i in top_bm_idx]
        # Union, keeping the first document seen for each key.
        # NOTE(review): documents sharing (source, page_label, page) collapse
        # into one entry — confirm this is intended for multi-chunk pages.
        uniq = {}
        for d in dense_docs + bm25_docs:
            m = getattr(d, 'metadata', {})
            key = (m.get('source'), m.get('page_label'), m.get('page'))
            if key not in uniq:
                uniq[key] = d
        docs = list(uniq.values())

        # Apply filters
        def ok(d):
            m = getattr(d, 'metadata', {})
            for k, v in filters.items():
                if v is None:
                    continue
                if str(m.get(k)) != str(v):
                    return False
            return True

        docs = [d for d in docs if ok(d)] if filters else docs
        if not docs:
            return []
        # Rerank
        reranked = rerank_cross_encoder(query, docs, top_m=rerank_top)
        return [d for d, _ in reranked]
|
| 51 |
+
|
| 52 |
+
def consolidate_page(docs: List[Any], target_page: Optional[str]) -> List[Any]:
    """Merge all chunks that belong to ``target_page`` into one document per source.

    Args:
        docs: Retrieved documents; each is expected to expose ``metadata``
            and ``page_content``.
        target_page: Page to consolidate, compared as a string against
            ``page_label`` (or ``page`` when the label is missing). ``None``
            or ``""`` disables consolidation.

    Returns:
        One merged document per source for the target page, with a
        ``merged_chunks`` count added to the metadata. ``docs`` is returned
        unchanged when no target is given or no chunk matches it.
    """
    # Compare against None/"" explicitly: page 0 is a legitimate target and
    # must not be treated as "no page requested".
    if target_page is None or target_page == "":
        return docs
    wanted = str(target_page)
    # Merge all docs with same (source,page_label)
    by_key: Dict[Tuple[Any, str], List[Any]] = {}
    for d in docs:
        m = getattr(d, 'metadata', {})
        page = str(m.get('page_label') or m.get('page'))
        if page != wanted:
            continue
        by_key.setdefault((m.get('source'), page), []).append(d)
    if not by_key:
        return docs
    # Imported only when a merged document actually has to be built.
    from langchain.schema import Document
    merged = []
    for group in by_key.values():
        text = "\n".join(g.page_content for g in group)
        meta = dict(group[0].metadata)
        meta['merged_chunks'] = len(group)
        merged.append(Document(page_content=text, metadata=meta))
    return merged
|
utils/ingest_pdf.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Ingestion pipeline to build a page‑level FAISS index with rich metadata.
|
| 2 |
+
|
| 3 |
+
Features:
|
| 4 |
+
- Per page extraction (page_index 0-based, page_label as shown in PDF)
|
| 5 |
+
- Optional OCR fallback for blank / low-text pages (scanned PDFs)
|
| 6 |
+
- Records include: doc_id, doc_title, page_index, page_label, text,
|
| 7 |
+
section_heading (heuristic), span_start/stop (page chars),
|
| 8 |
+
has_anchor flags for configured phrases.
|
| 9 |
+
- Outputs JSONL + builds FAISS vector store (sentence-transformers/all-MiniLM-L6-v2)
|
| 10 |
+
|
| 11 |
+
Note: This is a lightweight scaffold; tune heading detection + anchors as needed.
|
| 12 |
+
"""
|
| 13 |
+
from __future__ import annotations
|
| 14 |
+
import os, re, json, uuid
|
| 15 |
+
from dataclasses import dataclass, asdict
|
| 16 |
+
from typing import List, Dict, Iterable
|
| 17 |
+
from pypdf import PdfReader
|
| 18 |
+
import pytesseract
|
| 19 |
+
from PIL import Image
|
| 20 |
+
from io import BytesIO
|
| 21 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 22 |
+
from langchain_community.vectorstores import FAISS
|
| 23 |
+
from langchain.schema import Document
|
| 24 |
+
|
| 25 |
+
ANCHOR_PHRASES = [
|
| 26 |
+
"Specifically these objectives are",
|
| 27 |
+
]
|
| 28 |
+
|
| 29 |
+
HEADING_PATTERN = re.compile(r"^\s*(?:[A-Z][A-Z \-]{3,}|\d+\.[0-9.]*\s+.+)$")
|
| 30 |
+
|
| 31 |
+
@dataclass
class PageRecord:
    """One extracted PDF page together with the metadata stored for it."""

    # Document identity
    doc_id: str
    doc_title: str
    # Position of the page: enumeration index and the label used for display
    page_index: int
    page_label: str
    # Extracted (or OCR'd) page text and the heading guessed from it
    text: str
    section_heading: str
    # Character span of `text` within the page
    span_start: int
    span_stop: int
    # Anchor phrase -> whether it occurs in `text`
    has_anchors: Dict[str, bool]
    source: str  # original path
|
| 43 |
+
|
| 44 |
+
def _extract_page_label(reader, idx: int) -> str:
|
| 45 |
+
# Attempt to read logical page label from PDF (if present); fallback to idx+1
|
| 46 |
+
try:
|
| 47 |
+
return reader.page_labels[idx]
|
| 48 |
+
except Exception:
|
| 49 |
+
return str(idx + 1)
|
| 50 |
+
|
| 51 |
+
def _ocr_page(page) -> str:
|
| 52 |
+
try:
|
| 53 |
+
images = page.images
|
| 54 |
+
except Exception:
|
| 55 |
+
images = []
|
| 56 |
+
texts = []
|
| 57 |
+
for img_obj in images:
|
| 58 |
+
try:
|
| 59 |
+
data = img_obj.data
|
| 60 |
+
im = Image.open(BytesIO(data))
|
| 61 |
+
txt = pytesseract.image_to_string(im)
|
| 62 |
+
if txt.strip():
|
| 63 |
+
texts.append(txt)
|
| 64 |
+
except Exception:
|
| 65 |
+
continue
|
| 66 |
+
return "\n".join(texts).strip()
|
| 67 |
+
|
| 68 |
+
def _heading_from_text(text: str) -> str:
|
| 69 |
+
lines = [l.strip() for l in text.splitlines() if l.strip()]
|
| 70 |
+
for l in lines[:8]: # inspect first few lines
|
| 71 |
+
if HEADING_PATTERN.match(l) and len(l.split()) <= 16:
|
| 72 |
+
return l[:120]
|
| 73 |
+
return ""
|
| 74 |
+
|
| 75 |
+
def ingest_pdf(path: str, doc_id: str = None, doc_title: str = None) -> List[PageRecord]:
    """Extract one PageRecord per page of the PDF at ``path``.

    Args:
        path: Path to the PDF file.
        doc_id: Optional document id; defaults to the first 12 hex characters
            of a UUIDv5 derived from ``path``.
        doc_title: Optional title; defaults to the file name without extension.

    Returns:
        Page records in page order. Pages with fewer than 20 characters of
        extracted text use the OCR text instead when it is longer.
    """
    reader = PdfReader(path)
    doc_id = doc_id or uuid.uuid5(uuid.NAMESPACE_URL, path).hex[:12]
    doc_title = doc_title or os.path.splitext(os.path.basename(path))[0]
    # Read the label list once instead of re-reading it from the reader for
    # every page. When labels are unavailable, every page falls back to its
    # 1-based number.
    try:
        labels = list(reader.page_labels)
    except Exception:
        labels = []
    records: List[PageRecord] = []
    for i, page in enumerate(reader.pages):
        try:
            raw = page.extract_text() or ""
        except Exception:
            raw = ""
        if len(raw.strip()) < 20:  # fallback to OCR for likely scanned page
            raw_ocr = _ocr_page(page)
            if len(raw_ocr) > len(raw):
                raw = raw_ocr
        page_label = labels[i] if i < len(labels) else str(i + 1)
        heading = _heading_from_text(raw)
        # Lower-case the page text once, not once per anchor phrase.
        lowered = raw.lower()
        has_anchors = {a: (a.lower() in lowered) for a in ANCHOR_PHRASES}
        records.append(PageRecord(
            doc_id=doc_id,
            doc_title=doc_title,
            page_index=i,
            page_label=str(page_label),
            text=raw,
            section_heading=heading,
            span_start=0,
            span_stop=len(raw),
            has_anchors=has_anchors,
            source=path,
        ))
    return records
|
| 106 |
+
|
| 107 |
+
def build_vectorstore(records: List[PageRecord], index_dir: str = "faiss_index_new") -> str:
    """Embed ``records`` into a FAISS store saved under ``index_dir``.

    Also writes every record as one JSON line to ``pages.jsonl`` in the same
    directory. Returns ``index_dir``.
    """
    os.makedirs(index_dir, exist_ok=True)
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    docs = []
    for rec in records:
        metadata = {
            "doc_id": rec.doc_id,
            "doc_title": rec.doc_title,
            "page_index": rec.page_index,
            "page_label": rec.page_label,
            "section_heading": rec.section_heading,
            "span_start": rec.span_start,
            "span_stop": rec.span_stop,
            "source": rec.source,
        }
        # One boolean metadata flag per anchor phrase.
        for phrase, found in rec.has_anchors.items():
            metadata[f"anchor_{phrase}"] = found
        docs.append(Document(page_content=rec.text, metadata=metadata))

    store = FAISS.from_documents(docs, embeddings)
    store.save_local(index_dir)

    # also write JSONL
    jsonl_path = os.path.join(index_dir, "pages.jsonl")
    with open(jsonl_path, "w", encoding="utf-8") as out:
        for rec in records:
            out.write(json.dumps(asdict(rec), ensure_ascii=False) + "\n")
    return index_dir
|
| 128 |
+
|
| 129 |
+
if __name__ == "__main__":
    # Command-line entry point: ingest one PDF and build its index directory.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("pdf", help="Path to PDF")
    parser.add_argument("--doc-id")
    parser.add_argument("--doc-title")
    parser.add_argument("--out", default="faiss_index_new")
    cli = parser.parse_args()

    pages = ingest_pdf(cli.pdf, doc_id=cli.doc_id, doc_title=cli.doc_title)
    build_vectorstore(pages, cli.out)
    print(f"Ingested {len(pages)} pages -> {cli.out}")
|
utils/model_generation.py
CHANGED
|
@@ -26,37 +26,9 @@ PROMPT_TEMPLATES = {
|
|
| 26 |
"The context is already searched, retrieved and reranked when handed to you."
|
| 27 |
|
| 28 |
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
Deliverables (use the exact section headers below; omit any section whose input is empty/disabled):
|
| 34 |
-
1) Quoted Policy Excerpts
|
| 35 |
-
- Quote the necessary text and append citations like (filename p.X). Group by subtopic.
|
| 36 |
-
- Try to meet the user's specification as much as possible where if they only want items from a certain page only give out data from that page or if it is from a certain document please only retrieve just from that document
|
| 37 |
-
- Order by page
|
| 38 |
-
2) Sentiment Summary
|
| 39 |
-
- Using the Sentiment JSON, explain tone, gaps, penalties, and enforcement clarity in plain English. Do not invent fields that aren't present.
|
| 40 |
-
3) Coherence Assessment
|
| 41 |
-
- From the coherence report only provide when ticked: state on-topic vs off-topic; call out which sections were coherent, off-topic, or repeated.
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
Constraints:
|
| 45 |
-
- No external knowledge. No speculation. If a user ask is outside the sources, state 'Not found in sources.'
|
| 46 |
-
- Use full sentences (no telegraphic fragments).
|
| 47 |
-
- Each substantive statement has a citation.
|
| 48 |
-
|
| 49 |
-
Topic hint: {topic_hint}
|
| 50 |
-
|
| 51 |
-
Sentiment JSON (rolled-up across top docs):
|
| 52 |
-
{sentiment_json}
|
| 53 |
-
|
| 54 |
-
Coherence report:
|
| 55 |
-
{coherence_report}
|
| 56 |
-
|
| 57 |
-
Context Sources:
|
| 58 |
-
{context_block}
|
| 59 |
-
"""
|
| 60 |
},
|
| 61 |
|
| 62 |
"abstractive_summary": {
|
|
@@ -183,7 +155,7 @@ def build_context_block(top_docs: List[Dict[str, Any]]) -> str:
|
|
| 183 |
|
| 184 |
citation = f"{filename}, p. {page_label}"
|
| 185 |
|
| 186 |
-
|
| 187 |
|
| 188 |
return "\n".join(blocks)
|
| 189 |
|
|
@@ -195,25 +167,47 @@ def build_messages(
|
|
| 195 |
task_mode: str,
|
| 196 |
sentiment_rollup: Dict[str, List[str]],
|
| 197 |
coherence_report: str = "",
|
| 198 |
-
topic_hint: str = "energy policy"
|
|
|
|
| 199 |
) -> List[Dict[str, str]]:
|
| 200 |
template = PROMPT_TEMPLATES.get(task_mode)
|
| 201 |
if not template:
|
| 202 |
raise ValueError(f"Unknown task mode: {task_mode}")
|
| 203 |
|
| 204 |
context_block = build_context_block(top_docs)
|
|
|
|
|
|
|
| 205 |
sentiment_json = json.dumps(sentiment_rollup or {}, ensure_ascii=False)
|
| 206 |
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
|
| 215 |
return [
|
| 216 |
-
|
| 217 |
{"role": "user", "content": user_prompt}
|
| 218 |
]
|
| 219 |
|
|
|
|
| 26 |
"The context is already searched, retrieved and reranked when handed to you."
|
| 27 |
|
| 28 |
|
| 29 |
+
),
|
| 30 |
+
# dynamic assembly; placeholders kept for backward compatibility but sections may be removed
|
| 31 |
+
"user_template": "DYNAMIC"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
},
|
| 33 |
|
| 34 |
"abstractive_summary": {
|
|
|
|
| 155 |
|
| 156 |
citation = f"{filename}, p. {page_label}"
|
| 157 |
|
| 158 |
+
blocks.append(f"<<<SOURCE: {citation}>>>\n{_clip(text)}\n</SOURCE>")
|
| 159 |
|
| 160 |
return "\n".join(blocks)
|
| 161 |
|
|
|
|
| 167 |
task_mode: str,
|
| 168 |
sentiment_rollup: Dict[str, List[str]],
|
| 169 |
coherence_report: str = "",
|
| 170 |
+
topic_hint: str = "energy policy",
|
| 171 |
+
allowlist_meta: Dict[str, Any] = None
|
| 172 |
) -> List[Dict[str, str]]:
|
| 173 |
template = PROMPT_TEMPLATES.get(task_mode)
|
| 174 |
if not template:
|
| 175 |
raise ValueError(f"Unknown task mode: {task_mode}")
|
| 176 |
|
| 177 |
context_block = build_context_block(top_docs)
|
| 178 |
+
sentiment_present = bool(sentiment_rollup)
|
| 179 |
+
coherence_present = bool(coherence_report)
|
| 180 |
sentiment_json = json.dumps(sentiment_rollup or {}, ensure_ascii=False)
|
| 181 |
|
| 182 |
+
# Build user prompt dynamically to truly omit absent sections
|
| 183 |
+
parts = [
|
| 184 |
+
f"Query: {query}\n",
|
| 185 |
+
"Deliverables (omit any section whose input is empty/disabled):",
|
| 186 |
+
"1) Quoted Policy Excerpts\n - Quote the necessary text and append citations like (filename p.X). Group by subtopic.\n - Honor any page or document restriction from the query strictly.\n - Order by page",
|
| 187 |
+
]
|
| 188 |
+
if sentiment_present:
|
| 189 |
+
parts.append("2) Sentiment Summary\n - Using the Sentiment JSON, explain tone, gaps, penalties, and enforcement clarity in plain English. Do not invent fields that aren't present.")
|
| 190 |
+
if coherence_present:
|
| 191 |
+
idx = 3 if sentiment_present else 2
|
| 192 |
+
parts.append(f"{idx}) Coherence Assessment\n - From the coherence report: on-topic vs off-topic; note coherent/off-topic/repeated sections only if present.")
|
| 193 |
+
parts.append(
|
| 194 |
+
"\nConstraints:\n- No external knowledge. No speculation. If a user ask is outside the sources, state 'Not found in sources.'\n- Use full sentences.\n- Each substantive statement has a citation."
|
| 195 |
)
|
| 196 |
+
parts.append(f"\nTopic hint: {topic_hint}\n")
|
| 197 |
+
if sentiment_present:
|
| 198 |
+
parts.append(f"Sentiment JSON (rolled-up across top docs):\n{sentiment_json}\n")
|
| 199 |
+
if coherence_present:
|
| 200 |
+
parts.append(f"Coherence report:\n{coherence_report}\n")
|
| 201 |
+
guard = ""
|
| 202 |
+
if allowlist_meta:
|
| 203 |
+
doc_id = allowlist_meta.get('doc_id')
|
| 204 |
+
pages = allowlist_meta.get('pages')
|
| 205 |
+
guard = f"[ALLOWLIST_DOCS] doc_id={doc_id}; pages={pages}\nOnly use text from chunks where doc_id={doc_id} and page_label in {pages}. If none present reply exactly: Not found in sources for page {pages} of {doc_id}. Do not use any other documents.\n"
|
| 206 |
+
parts.append(f"{guard}Context Sources:\n{context_block}")
|
| 207 |
+
user_prompt = "\n".join(parts)
|
| 208 |
|
| 209 |
return [
|
| 210 |
+
{"role": "system", "content": template["system"]},
|
| 211 |
{"role": "user", "content": user_prompt}
|
| 212 |
]
|
| 213 |
|
utils/query_constraints.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from typing import Optional, Dict, List
|
| 3 |
+
|
| 4 |
+
PAGE_PATTERN = re.compile(r"page\s+(\d+)", re.IGNORECASE)
|
| 5 |
+
# crude pattern capturing phrases like "Kenya Energy Policy 2018" or any sequence ending with a 4-digit year
|
| 6 |
+
DOC_PHRASE_PATTERN = re.compile(r"([A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+){0,6}\s+20\d{2})")
|
| 7 |
+
STOP = {"the","and","of","in","policy","document","national"}
|
| 8 |
+
|
| 9 |
+
def _doc_tokens(phrase: str) -> List[str]:
|
| 10 |
+
return [t.lower() for t in re.findall(r"[A-Za-z0-9]+", phrase) if t.lower() not in STOP and not t.isdigit()] + \
|
| 11 |
+
[y for y in re.findall(r"20\d{2}", phrase)]
|
| 12 |
+
|
| 13 |
+
def parse_query_constraints(query: str) -> Dict[str, object]:
    """Extract simple structured constraints from the natural language query.

    Returns a dict with two keys:
    - ``"page"``: the integer after the first PAGE_PATTERN match
      ("page 17" -> 17), or ``None`` when no page is mentioned.
    - ``"doc_tokens"``: tokens from the first DOC_PHRASE_PATTERN match that
      yields any, or ``[]`` when there is none.

    An empty or ``None`` query yields ``{"page": None, "doc_tokens": []}``.
    """
    page: Optional[int] = None
    doc_tokens: List[str] = []
    if not query:
        return {"page": page, "doc_tokens": doc_tokens}

    page_hit = PAGE_PATTERN.search(query)
    if page_hit:
        try:
            page = int(page_hit.group(1))
        except ValueError:
            # e.g. a digit run too long for int conversion
            page = None

    for m in DOC_PHRASE_PATTERN.finditer(query):
        doc_tokens = _doc_tokens(m.group(1))
        if doc_tokens:
            break
    return {"page": page, "doc_tokens": doc_tokens}
|
| 35 |
+
|
| 36 |
+
def page_matches(meta, target_page: int) -> bool:
    """Return True if metadata page/page_label matches the requested page.

    Accepts numeric or string page labels. Every integer found in the label is
    compared, so a label such as "12-13" matches 12 and 13 (but not values in
    between that are not written out).

    Args:
        meta: Metadata mapping; ``page_label`` is preferred and ``page`` is
            used when the label is missing or empty.
        target_page: Requested page number; ``None`` matches everything.
    """
    if target_page is None:
        return True
    label = meta.get("page_label")
    if label is None or label == "":
        # Explicit None/"" checks rather than `or`, so page 0 is not discarded.
        label = meta.get("page")
    if label is None:
        return False
    # Normalize to string and extract integers present
    nums = re.findall(r"\d+", str(label))
    return any(int(n) == target_page for n in nums)
|
| 49 |
+
|
| 50 |
+
def doc_matches(meta, tokens: List[str]) -> bool:
    """Return True when enough of ``tokens`` occur in the document's source path.

    An empty token list always matches. Otherwise the lower-cased ``source``
    (or ``path``) must exist and contain at least 60% of the tokens as
    substrings.
    """
    if not tokens:
        return True
    location = meta.get("source") or meta.get("path") or ""
    location = location.lower()
    if not location:
        return False
    found = 0
    for token in tokens:
        if token in location:
            found += 1
    # require at least 60% of tokens present
    return found / max(1, len(tokens)) >= 0.6
|