Spaces:
Paused
Paused
Upload app.py
Browse files
app.py
CHANGED
|
@@ -19,7 +19,7 @@ from sentence_transformers import SentenceTransformer
|
|
| 19 |
from extracted_phase2_core import AgenticSelfRAG, Chunk, K_PASSAGES
|
| 20 |
|
| 21 |
|
| 22 |
-
APP_NAME = "SourceTruth
|
| 23 |
APP_TAGLINE = "Ask grounded questions over the preloaded Phase 2 project corpus and inspect cited evidence."
|
| 24 |
|
| 25 |
APP_ROOT = Path(__file__).resolve().parent
|
|
@@ -27,13 +27,14 @@ UPLOAD_ROOT = APP_ROOT / "testing_uploads"
|
|
| 27 |
LOG_ROOT = APP_ROOT / "testing_logs"
|
| 28 |
EVENT_LOG_PATH = LOG_ROOT / "events.jsonl"
|
| 29 |
INTERACTION_LOG_PATH = LOG_ROOT / "interactions.jsonl"
|
| 30 |
-
LOCAL_CORPUS_DIR = Path(r"C:\4 Sem Project\Phase 2\phase 2 corpus")
|
| 31 |
CORPUS_CANDIDATES = [
|
| 32 |
APP_ROOT / "phase2_corpus",
|
| 33 |
APP_ROOT / "phase 2 corpus",
|
| 34 |
APP_ROOT,
|
| 35 |
-
LOCAL_CORPUS_DIR,
|
| 36 |
]
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "20"))
|
| 39 |
MAX_PAGES = int(os.getenv("MAX_PAGES", "75"))
|
|
@@ -48,10 +49,10 @@ LOAD_IN_4BIT = os.getenv("LOAD_IN_4BIT", "0") == "1"
|
|
| 48 |
MAX_SUMMARY_SENTENCES = int(os.getenv("MAX_SUMMARY_SENTENCES", "3"))
|
| 49 |
|
| 50 |
PRIVACY_NOTICE = (
|
| 51 |
-
"
|
| 52 |
-
"
|
| 53 |
-
"
|
| 54 |
-
"
|
| 55 |
)
|
| 56 |
|
| 57 |
CSS = """
|
|
@@ -69,8 +70,8 @@ CSS = """
|
|
| 69 |
"""
|
| 70 |
|
| 71 |
PERSON_RE = re.compile(r"\b(?:Dr\.?\s+)?[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2}\b")
|
| 72 |
-
CURRENCY_RE = re.compile(r"\b(?:INR|Rs\.?)\s*[\d,]+(?:\.\d+)?
|
| 73 |
-
AMOUNT_RE = re.compile(r"\b(?:paid amount|amount paid|total price|price|amount)\b[:\s-]*(INR|Rs\.?)?\s*([\d,]+(?:\.\d+)?)", re.I)
|
| 74 |
VERSION_RE = re.compile(r"\b\d+(?:\.\d+){1,3}\b")
|
| 75 |
DATE_RE = re.compile(
|
| 76 |
r"\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|"
|
|
@@ -255,7 +256,7 @@ def question_plan(question: str) -> QuestionPlan:
|
|
| 255 |
if any(cue in q for cue in ["how to", "how do", "how should", "steps", "process", "procedure", "workflow", "manage ", "handling "]):
|
| 256 |
mode = "procedural"
|
| 257 |
expected = "procedure"
|
| 258 |
-
allow_agentic_fallback =
|
| 259 |
elif q.startswith("who") or "who is" in q or "who was" in q:
|
| 260 |
mode = "factoid"
|
| 261 |
expected = "person"
|
|
@@ -377,6 +378,8 @@ def cleanup_expired_sessions():
|
|
| 377 |
expired: List[str] = []
|
| 378 |
with SESSIONS_LOCK:
|
| 379 |
for session_id, session in list(SESSIONS.items()):
|
|
|
|
|
|
|
| 380 |
if session.last_activity < cutoff:
|
| 381 |
expired.append(session_id)
|
| 382 |
for session_id in expired:
|
|
@@ -1032,6 +1035,12 @@ def parse_vmp_table(page_records: List[PageRecord]) -> Dict[str, Dict[str, str]]
|
|
| 1032 |
idx += 5
|
| 1033 |
continue
|
| 1034 |
idx += 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1035 |
return rows
|
| 1036 |
|
| 1037 |
|
|
@@ -1246,16 +1255,37 @@ def ask_question(question: str, session_id: Optional[str]):
|
|
| 1246 |
abstained = False
|
| 1247 |
hallucination_rate = 0.0
|
| 1248 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1249 |
if not evidence_pairs:
|
| 1250 |
abstained = True
|
| 1251 |
answer_text = "I don't have enough evidence in the project corpus to answer that reliably."
|
| 1252 |
elif plan.mode in {"procedural", "descriptive"}:
|
| 1253 |
-
|
| 1254 |
-
|
| 1255 |
-
answer_text =
|
|
|
|
| 1256 |
else:
|
| 1257 |
-
|
| 1258 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1259 |
else:
|
| 1260 |
concise_answer = concise_factoid_answer(question, plan, evidence_pairs)
|
| 1261 |
if concise_answer and evidence_has_expected_type(plan, evidence_sentences):
|
|
@@ -1264,19 +1294,9 @@ def ask_question(question: str, session_id: Optional[str]):
|
|
| 1264 |
abstained = True
|
| 1265 |
answer_text = "I don't have enough evidence in the project corpus to answer that reliably."
|
| 1266 |
elif plan.allow_agentic_fallback:
|
| 1267 |
-
|
| 1268 |
-
|
| 1269 |
-
|
| 1270 |
-
answer_text = (
|
| 1271 |
-
"I don't have enough evidence in the project corpus to answer that reliably."
|
| 1272 |
-
if output.abstained
|
| 1273 |
-
else (output.answer or "No answer produced.")
|
| 1274 |
-
)
|
| 1275 |
-
abstained = output.abstained
|
| 1276 |
-
hallucination_rate = output.hallucination_rate or 0.0
|
| 1277 |
-
except Exception as exc:
|
| 1278 |
-
log_event("inference_failed", session_id=session.session_id, question=question, error=str(exc))
|
| 1279 |
-
return error_html(f"Inference failed: {exc}"), None, ""
|
| 1280 |
else:
|
| 1281 |
abstained = True
|
| 1282 |
answer_text = "I don't have enough evidence in the project corpus to answer that reliably."
|
|
|
|
| 19 |
from extracted_phase2_core import AgenticSelfRAG, Chunk, K_PASSAGES
|
| 20 |
|
| 21 |
|
| 22 |
+
APP_NAME = "SourceTruth"
|
| 23 |
APP_TAGLINE = "Ask grounded questions over the preloaded Phase 2 project corpus and inspect cited evidence."
|
| 24 |
|
| 25 |
APP_ROOT = Path(__file__).resolve().parent
|
|
|
|
| 27 |
LOG_ROOT = APP_ROOT / "testing_logs"
|
| 28 |
EVENT_LOG_PATH = LOG_ROOT / "events.jsonl"
|
| 29 |
INTERACTION_LOG_PATH = LOG_ROOT / "interactions.jsonl"
|
|
|
|
| 30 |
CORPUS_CANDIDATES = [
|
| 31 |
APP_ROOT / "phase2_corpus",
|
| 32 |
APP_ROOT / "phase 2 corpus",
|
| 33 |
APP_ROOT,
|
|
|
|
| 34 |
]
|
| 35 |
+
LOCAL_CORPUS_DIR = os.getenv("LOCAL_CORPUS_DIR", "").strip()
|
| 36 |
+
if LOCAL_CORPUS_DIR:
|
| 37 |
+
CORPUS_CANDIDATES.append(Path(LOCAL_CORPUS_DIR).expanduser())
|
| 38 |
|
| 39 |
MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "20"))
|
| 40 |
MAX_PAGES = int(os.getenv("MAX_PAGES", "75"))
|
|
|
|
| 49 |
MAX_SUMMARY_SENTENCES = int(os.getenv("MAX_SUMMARY_SENTENCES", "3"))
|
| 50 |
|
| 51 |
PRIVACY_NOTICE = (
|
| 52 |
+
"The preloaded project PDFs are processed only to answer your questions and produce citations. "
|
| 53 |
+
"Documents are not used to train models. Interaction logs may store the question, answer, citation, "
|
| 54 |
+
"and proxy evaluation metrics for testing analysis. Avoid using the application for confidential, "
|
| 55 |
+
"personal, medical, or legal decisions without direct document verification."
|
| 56 |
)
|
| 57 |
|
| 58 |
CSS = """
|
|
|
|
| 70 |
"""
|
| 71 |
|
| 72 |
PERSON_RE = re.compile(r"\b(?:Dr\.?\s+)?[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2}\b")
|
| 73 |
+
CURRENCY_RE = re.compile(r"(?:₹\s*[\d,]+(?:\.\d+)?|\b(?:INR|Rs\.?)\s*[\d,]+(?:\.\d+)?)", re.I)
|
| 74 |
+
AMOUNT_RE = re.compile(r"\b(?:paid amount|amount paid|total price|price|amount|budget|cost)\b[:\s-]*(₹|INR|Rs\.?)?\s*([\d,]+(?:\.\d+)?)", re.I)
|
| 75 |
VERSION_RE = re.compile(r"\b\d+(?:\.\d+){1,3}\b")
|
| 76 |
DATE_RE = re.compile(
|
| 77 |
r"\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|"
|
|
|
|
| 256 |
if any(cue in q for cue in ["how to", "how do", "how should", "steps", "process", "procedure", "workflow", "manage ", "handling "]):
|
| 257 |
mode = "procedural"
|
| 258 |
expected = "procedure"
|
| 259 |
+
allow_agentic_fallback = True
|
| 260 |
elif q.startswith("who") or "who is" in q or "who was" in q:
|
| 261 |
mode = "factoid"
|
| 262 |
expected = "person"
|
|
|
|
| 378 |
expired: List[str] = []
|
| 379 |
with SESSIONS_LOCK:
|
| 380 |
for session_id, session in list(SESSIONS.items()):
|
| 381 |
+
if session_id == "phase2-corpus":
|
| 382 |
+
continue
|
| 383 |
if session.last_activity < cutoff:
|
| 384 |
expired.append(session_id)
|
| 385 |
for session_id in expired:
|
|
|
|
| 1035 |
idx += 5
|
| 1036 |
continue
|
| 1037 |
idx += 1
|
| 1038 |
+
if not rows:
|
| 1039 |
+
log_event(
|
| 1040 |
+
"vmp_table_parse_empty",
|
| 1041 |
+
source_file="02_Validation_Master_Plan.pdf",
|
| 1042 |
+
page_count=len(page_records),
|
| 1043 |
+
)
|
| 1044 |
return rows
|
| 1045 |
|
| 1046 |
|
|
|
|
| 1255 |
abstained = False
|
| 1256 |
hallucination_rate = 0.0
|
| 1257 |
|
| 1258 |
+
def run_agentic_fallback() -> Tuple[str, bool, float, Optional[Chunk], Optional[str]]:
|
| 1259 |
+
nonlocal output, best_chunk
|
| 1260 |
+
try:
|
| 1261 |
+
output = session.agent.run(question)
|
| 1262 |
+
best_chunk = output.best_chunk or best_chunk
|
| 1263 |
+
answer = (
|
| 1264 |
+
"I don't have enough evidence in the project corpus to answer that reliably."
|
| 1265 |
+
if output.abstained
|
| 1266 |
+
else (output.answer or "No answer produced.")
|
| 1267 |
+
)
|
| 1268 |
+
return answer, output.abstained, output.hallucination_rate or 0.0, best_chunk, None
|
| 1269 |
+
except Exception as exc:
|
| 1270 |
+
log_event("inference_failed", session_id=session.session_id, question=question, error=str(exc))
|
| 1271 |
+
return "", False, 0.0, best_chunk, str(exc)
|
| 1272 |
+
|
| 1273 |
if not evidence_pairs:
|
| 1274 |
abstained = True
|
| 1275 |
answer_text = "I don't have enough evidence in the project corpus to answer that reliably."
|
| 1276 |
elif plan.mode in {"procedural", "descriptive"}:
|
| 1277 |
+
summary_answer = summarize_procedural_answer(evidence_pairs) if evidence_has_expected_type(plan, evidence_sentences) else None
|
| 1278 |
+
if summary_answer:
|
| 1279 |
+
answer_text = summary_answer
|
| 1280 |
+
abstained = False
|
| 1281 |
else:
|
| 1282 |
+
if plan.allow_agentic_fallback:
|
| 1283 |
+
answer_text, abstained, hallucination_rate, best_chunk, inference_error = run_agentic_fallback()
|
| 1284 |
+
if inference_error:
|
| 1285 |
+
return error_html(f"Inference failed: {inference_error}"), None, ""
|
| 1286 |
+
else:
|
| 1287 |
+
abstained = True
|
| 1288 |
+
answer_text = "I don't have enough evidence in the project corpus to answer that reliably."
|
| 1289 |
else:
|
| 1290 |
concise_answer = concise_factoid_answer(question, plan, evidence_pairs)
|
| 1291 |
if concise_answer and evidence_has_expected_type(plan, evidence_sentences):
|
|
|
|
| 1294 |
abstained = True
|
| 1295 |
answer_text = "I don't have enough evidence in the project corpus to answer that reliably."
|
| 1296 |
elif plan.allow_agentic_fallback:
|
| 1297 |
+
answer_text, abstained, hallucination_rate, best_chunk, inference_error = run_agentic_fallback()
|
| 1298 |
+
if inference_error:
|
| 1299 |
+
return error_html(f"Inference failed: {inference_error}"), None, ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1300 |
else:
|
| 1301 |
abstained = True
|
| 1302 |
answer_text = "I don't have enough evidence in the project corpus to answer that reliably."
|