aravindkb7 committed on
Commit
25e0d89
·
verified ·
1 Parent(s): 06c7f52

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -28
app.py CHANGED
@@ -19,7 +19,7 @@ from sentence_transformers import SentenceTransformer
19
  from extracted_phase2_core import AgenticSelfRAG, Chunk, K_PASSAGES
20
 
21
 
22
- APP_NAME = "SourceTruth Testing"
23
  APP_TAGLINE = "Ask grounded questions over the preloaded Phase 2 project corpus and inspect cited evidence."
24
 
25
  APP_ROOT = Path(__file__).resolve().parent
@@ -27,13 +27,14 @@ UPLOAD_ROOT = APP_ROOT / "testing_uploads"
27
  LOG_ROOT = APP_ROOT / "testing_logs"
28
  EVENT_LOG_PATH = LOG_ROOT / "events.jsonl"
29
  INTERACTION_LOG_PATH = LOG_ROOT / "interactions.jsonl"
30
- LOCAL_CORPUS_DIR = Path(r"C:\4 Sem Project\Phase 2\phase 2 corpus")
31
  CORPUS_CANDIDATES = [
32
  APP_ROOT / "phase2_corpus",
33
  APP_ROOT / "phase 2 corpus",
34
  APP_ROOT,
35
- LOCAL_CORPUS_DIR,
36
  ]
 
 
 
37
 
38
  MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "20"))
39
  MAX_PAGES = int(os.getenv("MAX_PAGES", "75"))
@@ -48,10 +49,10 @@ LOAD_IN_4BIT = os.getenv("LOAD_IN_4BIT", "0") == "1"
48
  MAX_SUMMARY_SENTENCES = int(os.getenv("MAX_SUMMARY_SENTENCES", "3"))
49
 
50
  PRIVACY_NOTICE = (
51
- "Upload only PDF files. Documents are processed only to answer your questions and "
52
- "produce citations. Files are not used to train models. Interaction logs may store "
53
- "an anonymized file hash, question, answer, citation, and proxy evaluation metrics "
54
- "for testing analysis. Avoid uploading confidential, personal, medical, or legal files."
55
  )
56
 
57
  CSS = """
@@ -69,8 +70,8 @@ CSS = """
69
  """
70
 
71
  PERSON_RE = re.compile(r"\b(?:Dr\.?\s+)?[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2}\b")
72
- CURRENCY_RE = re.compile(r"\b(?:INR|Rs\.?)\s*[\d,]+(?:\.\d+)?\b", re.I)
73
- AMOUNT_RE = re.compile(r"\b(?:paid amount|amount paid|total price|price|amount)\b[:\s-]*(INR|Rs\.?)?\s*([\d,]+(?:\.\d+)?)", re.I)
74
  VERSION_RE = re.compile(r"\b\d+(?:\.\d+){1,3}\b")
75
  DATE_RE = re.compile(
76
  r"\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|"
@@ -255,7 +256,7 @@ def question_plan(question: str) -> QuestionPlan:
255
  if any(cue in q for cue in ["how to", "how do", "how should", "steps", "process", "procedure", "workflow", "manage ", "handling "]):
256
  mode = "procedural"
257
  expected = "procedure"
258
- allow_agentic_fallback = False
259
  elif q.startswith("who") or "who is" in q or "who was" in q:
260
  mode = "factoid"
261
  expected = "person"
@@ -377,6 +378,8 @@ def cleanup_expired_sessions():
377
  expired: List[str] = []
378
  with SESSIONS_LOCK:
379
  for session_id, session in list(SESSIONS.items()):
 
 
380
  if session.last_activity < cutoff:
381
  expired.append(session_id)
382
  for session_id in expired:
@@ -1032,6 +1035,12 @@ def parse_vmp_table(page_records: List[PageRecord]) -> Dict[str, Dict[str, str]]
1032
  idx += 5
1033
  continue
1034
  idx += 1
 
 
 
 
 
 
1035
  return rows
1036
 
1037
 
@@ -1246,16 +1255,37 @@ def ask_question(question: str, session_id: Optional[str]):
1246
  abstained = False
1247
  hallucination_rate = 0.0
1248
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1249
  if not evidence_pairs:
1250
  abstained = True
1251
  answer_text = "I don't have enough evidence in the project corpus to answer that reliably."
1252
  elif plan.mode in {"procedural", "descriptive"}:
1253
- if not evidence_has_expected_type(plan, evidence_sentences):
1254
- abstained = True
1255
- answer_text = "I don't have enough evidence in the project corpus to answer that reliably."
 
1256
  else:
1257
- answer_text = summarize_procedural_answer(evidence_pairs) or "I don't have enough evidence in the project corpus to answer that reliably."
1258
- abstained = answer_text.startswith("I don't have enough evidence")
 
 
 
 
 
1259
  else:
1260
  concise_answer = concise_factoid_answer(question, plan, evidence_pairs)
1261
  if concise_answer and evidence_has_expected_type(plan, evidence_sentences):
@@ -1264,19 +1294,9 @@ def ask_question(question: str, session_id: Optional[str]):
1264
  abstained = True
1265
  answer_text = "I don't have enough evidence in the project corpus to answer that reliably."
1266
  elif plan.allow_agentic_fallback:
1267
- try:
1268
- output = session.agent.run(question)
1269
- best_chunk = output.best_chunk or best_chunk
1270
- answer_text = (
1271
- "I don't have enough evidence in the project corpus to answer that reliably."
1272
- if output.abstained
1273
- else (output.answer or "No answer produced.")
1274
- )
1275
- abstained = output.abstained
1276
- hallucination_rate = output.hallucination_rate or 0.0
1277
- except Exception as exc:
1278
- log_event("inference_failed", session_id=session.session_id, question=question, error=str(exc))
1279
- return error_html(f"Inference failed: {exc}"), None, ""
1280
  else:
1281
  abstained = True
1282
  answer_text = "I don't have enough evidence in the project corpus to answer that reliably."
 
19
  from extracted_phase2_core import AgenticSelfRAG, Chunk, K_PASSAGES
20
 
21
 
22
+ APP_NAME = "SourceTruth"
23
  APP_TAGLINE = "Ask grounded questions over the preloaded Phase 2 project corpus and inspect cited evidence."
24
 
25
  APP_ROOT = Path(__file__).resolve().parent
 
27
  LOG_ROOT = APP_ROOT / "testing_logs"
28
  EVENT_LOG_PATH = LOG_ROOT / "events.jsonl"
29
  INTERACTION_LOG_PATH = LOG_ROOT / "interactions.jsonl"
 
30
  CORPUS_CANDIDATES = [
31
  APP_ROOT / "phase2_corpus",
32
  APP_ROOT / "phase 2 corpus",
33
  APP_ROOT,
 
34
  ]
35
+ LOCAL_CORPUS_DIR = os.getenv("LOCAL_CORPUS_DIR", "").strip()
36
+ if LOCAL_CORPUS_DIR:
37
+ CORPUS_CANDIDATES.append(Path(LOCAL_CORPUS_DIR).expanduser())
38
 
39
  MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "20"))
40
  MAX_PAGES = int(os.getenv("MAX_PAGES", "75"))
 
49
  MAX_SUMMARY_SENTENCES = int(os.getenv("MAX_SUMMARY_SENTENCES", "3"))
50
 
51
  PRIVACY_NOTICE = (
52
+ "The preloaded project PDFs are processed only to answer your questions and produce citations. "
53
+ "Documents are not used to train models. Interaction logs may store the question, answer, citation, "
54
+ "and proxy evaluation metrics for testing analysis. Avoid using the application for confidential, "
55
+ "personal, medical, or legal decisions without direct document verification."
56
  )
57
 
58
  CSS = """
 
70
  """
71
 
72
  PERSON_RE = re.compile(r"\b(?:Dr\.?\s+)?[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2}\b")
73
+ CURRENCY_RE = re.compile(r"(?:₹\s*[\d,]+(?:\.\d+)?|\b(?:INR|Rs\.?)\s*[\d,]+(?:\.\d+)?)", re.I)
74
+ AMOUNT_RE = re.compile(r"\b(?:paid amount|amount paid|total price|price|amount|budget|cost)\b[:\s-]*(₹|INR|Rs\.?)?\s*([\d,]+(?:\.\d+)?)", re.I)
75
  VERSION_RE = re.compile(r"\b\d+(?:\.\d+){1,3}\b")
76
  DATE_RE = re.compile(
77
  r"\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|"
 
256
  if any(cue in q for cue in ["how to", "how do", "how should", "steps", "process", "procedure", "workflow", "manage ", "handling "]):
257
  mode = "procedural"
258
  expected = "procedure"
259
+ allow_agentic_fallback = True
260
  elif q.startswith("who") or "who is" in q or "who was" in q:
261
  mode = "factoid"
262
  expected = "person"
 
378
  expired: List[str] = []
379
  with SESSIONS_LOCK:
380
  for session_id, session in list(SESSIONS.items()):
381
+ if session_id == "phase2-corpus":
382
+ continue
383
  if session.last_activity < cutoff:
384
  expired.append(session_id)
385
  for session_id in expired:
 
1035
  idx += 5
1036
  continue
1037
  idx += 1
1038
+ if not rows:
1039
+ log_event(
1040
+ "vmp_table_parse_empty",
1041
+ source_file="02_Validation_Master_Plan.pdf",
1042
+ page_count=len(page_records),
1043
+ )
1044
  return rows
1045
 
1046
 
 
1255
  abstained = False
1256
  hallucination_rate = 0.0
1257
 
1258
+ def run_agentic_fallback() -> Tuple[str, bool, float, Optional[Chunk], Optional[str]]:
1259
+ nonlocal output, best_chunk
1260
+ try:
1261
+ output = session.agent.run(question)
1262
+ best_chunk = output.best_chunk or best_chunk
1263
+ answer = (
1264
+ "I don't have enough evidence in the project corpus to answer that reliably."
1265
+ if output.abstained
1266
+ else (output.answer or "No answer produced.")
1267
+ )
1268
+ return answer, output.abstained, output.hallucination_rate or 0.0, best_chunk, None
1269
+ except Exception as exc:
1270
+ log_event("inference_failed", session_id=session.session_id, question=question, error=str(exc))
1271
+ return "", False, 0.0, best_chunk, str(exc)
1272
+
1273
  if not evidence_pairs:
1274
  abstained = True
1275
  answer_text = "I don't have enough evidence in the project corpus to answer that reliably."
1276
  elif plan.mode in {"procedural", "descriptive"}:
1277
+ summary_answer = summarize_procedural_answer(evidence_pairs) if evidence_has_expected_type(plan, evidence_sentences) else None
1278
+ if summary_answer:
1279
+ answer_text = summary_answer
1280
+ abstained = False
1281
  else:
1282
+ if plan.allow_agentic_fallback:
1283
+ answer_text, abstained, hallucination_rate, best_chunk, inference_error = run_agentic_fallback()
1284
+ if inference_error:
1285
+ return error_html(f"Inference failed: {inference_error}"), None, ""
1286
+ else:
1287
+ abstained = True
1288
+ answer_text = "I don't have enough evidence in the project corpus to answer that reliably."
1289
  else:
1290
  concise_answer = concise_factoid_answer(question, plan, evidence_pairs)
1291
  if concise_answer and evidence_has_expected_type(plan, evidence_sentences):
 
1294
  abstained = True
1295
  answer_text = "I don't have enough evidence in the project corpus to answer that reliably."
1296
  elif plan.allow_agentic_fallback:
1297
+ answer_text, abstained, hallucination_rate, best_chunk, inference_error = run_agentic_fallback()
1298
+ if inference_error:
1299
+ return error_html(f"Inference failed: {inference_error}"), None, ""
 
 
 
 
 
 
 
 
 
 
1300
  else:
1301
  abstained = True
1302
  answer_text = "I don't have enough evidence in the project corpus to answer that reliably."