hchevva committed on
Commit
4bf9d97
·
verified ·
1 Parent(s): 9457c11

Upload 5 files

Browse files
Files changed (5) hide show
  1. app.py +551 -153
  2. cancer_risk_input_template.csv +1 -0
  3. literature_explorer.py +40 -27
  4. requirements.txt +5 -4
  5. runtime.txt +1 -0
app.py CHANGED
@@ -10,13 +10,137 @@ import numpy as np
10
  import pandas as pd
11
 
12
  from pypdf import PdfReader
13
- from sklearn.feature_extraction.text import TfidfVectorizer
 
 
 
14
 
15
  from openai import OpenAI
16
  from literature_explorer import build_literature_explorer_tab
 
 
 
 
 
17
 
18
 
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  # =============================
21
  # Defaults
22
  # =============================
@@ -236,6 +360,22 @@ def select_relevant_chunks(
236
  if not texts:
237
  return []
238
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), max_features=20000)
240
  X = vectorizer.fit_transform(texts)
241
 
@@ -897,7 +1037,7 @@ def run_extraction(
897
  if not files:
898
  return (
899
  "<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#666;'>Upload PDFs to run extraction.</div></div>",
900
- pd.DataFrame(), None, None, "Upload one or more PDFs.",
901
  gr.update(choices=[], value=None),
902
  [], [], pd.DataFrame(columns=["Field","Value"]), ""
903
  )
@@ -907,7 +1047,7 @@ def run_extraction(
907
  except Exception as e:
908
  return (
909
  "<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#b00;'>Invalid vocab JSON.</div></div>",
910
- pd.DataFrame(), None, None, f"Controlled vocab JSON invalid: {e}",
911
  gr.update(choices=[], value=None),
912
  [], [], pd.DataFrame(columns=["Field","Value"]), ""
913
  )
@@ -916,7 +1056,7 @@ def run_extraction(
916
  if not field_props:
917
  return (
918
  "<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#b00;'>No columns defined.</div></div>",
919
- pd.DataFrame(), None, None, "No extraction fields are defined. (Check selected endpoints or admin field spec.)",
920
  gr.update(choices=[], value=None),
921
  [], [], pd.DataFrame(columns=["Field","Value"]), ""
922
  )
@@ -936,13 +1076,14 @@ def run_extraction(
936
  except Exception as e:
937
  return (
938
  "<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#b00;'>Missing API key.</div></div>",
939
- pd.DataFrame(), None, None, str(e),
940
  gr.update(choices=[], value=None),
941
  [], [], pd.DataFrame(columns=["Field","Value"]), ""
942
  )
943
 
944
  paper_details: List[Dict[str, Any]] = []
945
  output_rows: List[Dict[str, Any]] = []
 
946
 
947
  tmpdir = Path(tempfile.mkdtemp(prefix="tox_extract_"))
948
 
@@ -963,20 +1104,62 @@ def run_extraction(
963
  "extracted": {k: ([] if field_props[k].get("type") == "array" else "") for k in field_props.keys()},
964
  "evidence": []
965
  }
 
 
 
 
 
 
 
 
 
966
  else:
967
  chunks = chunk_pages(pages, target_chars=int(chunk_chars))
968
-
969
- queries = [
970
  "regulatory acceptability risk hazard concern conclusion uncertainty evidence NOAEL LOAEL BMD",
971
  "chemical name CAS number",
972
  ]
973
- for ep in (selected_endpoints or []):
974
- queries.extend(ENDPOINT_QUERY_HINTS.get(ep, []))
975
- for k, ins in field_instr.items():
976
- queries.append(ins if ins else k)
 
 
 
977
 
978
- selected = select_relevant_chunks(chunks, queries, top_per_query=2, max_chunks=12)
979
- context = build_context(selected, max_chars=int(max_context_chars))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
980
 
981
  ex = openai_structured_extract(
982
  client=client,
@@ -1060,7 +1243,16 @@ def run_extraction(
1060
  csv_path = tmpdir / "extraction_table.csv"
1061
  json_path = tmpdir / "extraction_details.json"
1062
  df.to_csv(csv_path, index=False)
1063
- json_path.write_text(json.dumps(paper_details, indent=2), encoding="utf-8")
 
 
 
 
 
 
 
 
 
1064
 
1065
  choices = [r.get("record_id") for r in records if r.get("record_id")]
1066
  default = choices[0] if choices else None
@@ -1083,6 +1275,7 @@ def run_extraction(
1083
  overview,
1084
  str(csv_path),
1085
  str(json_path),
 
1086
  status,
1087
  gr.update(choices=choices, value=default),
1088
  records,
@@ -1145,6 +1338,135 @@ def export_reviewed_csv(records: List[Dict[str, Any]]):
1145
  return str(path), "Reviewed CSV ready to download."
1146
 
1147
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1148
  # =============================
1149
  # Synthesis tab handler
1150
  # =============================
@@ -1155,7 +1477,10 @@ def run_synthesis(api_key, model, extraction_json_file):
1155
  client = get_openai_client(api_key)
1156
  except Exception as e:
1157
  return str(e)
1158
- rows = json.loads(Path(extraction_json_file.name).read_text(encoding="utf-8"))
 
 
 
1159
  return openai_synthesize_across_papers(client, model, rows)
1160
 
1161
 
@@ -1173,11 +1498,24 @@ def set_admin_visibility(is_admin: bool):
1173
  # =============================
1174
  # Gradio UI
1175
  # =============================
1176
- with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
1177
- gr.Markdown(
1178
- "# Toxicology PDF → Grounded Extractor\n"
1179
- "Upload PDFs → choose endpoints → Run → review report → export.\n\n"
1180
- "**Note:** Text-based PDFs only (not scanned/image PDFs)."
 
 
 
 
 
 
 
 
 
 
 
 
 
1181
  )
1182
 
1183
  state_records = gr.State([])
@@ -1189,130 +1527,136 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
1189
  vocab_json = gr.Textbox(visible=False, interactive=False, lines=8)
1190
 
1191
  with gr.Tab("Extract"):
1192
- # --- Run section (simple) ---
1193
- with gr.Group():
1194
- files = gr.File(label="Upload toxicology PDFs", file_types=[".pdf"], file_count="multiple")
1195
-
1196
- with gr.Row():
1197
- api_key = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
1198
- model = gr.Dropdown(label="Model", choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"], value="gpt-4o-2024-08-06")
1199
-
1200
- with gr.Row():
1201
- endpoint_preset = gr.Dropdown(
1202
- label="Endpoint preset",
1203
- choices=list(ENDPOINT_PRESETS.keys()),
1204
- value="Required – Safety Assessor"
1205
- )
1206
- endpoints = gr.Dropdown(
1207
- label="Endpoints to extract (Core included automatically)",
1208
- choices=list(ENDPOINT_MODULES.keys()),
1209
- multiselect=True,
1210
- value=ENDPOINT_PRESETS["Required – Safety Assessor"]
1211
- )
1212
-
1213
- extract_btn = gr.Button("Run Extraction", variant="primary")
1214
- status = gr.Textbox(label="Status", interactive=False)
1215
-
1216
- # --- Report (results-first) ---
1217
- gr.Markdown("## Report")
1218
- summary_card = gr.HTML(render_summary_card("", []))
1219
-
1220
- overview_df = gr.Dataframe(
1221
- label="Batch Overview",
1222
- interactive=False,
1223
- wrap=True,
1224
- show_row_numbers=True
1225
- )
1226
-
1227
- with gr.Row():
1228
- out_csv = gr.File(label="Download: extraction_table.csv")
1229
- out_json = gr.File(label="Download: extraction_details.json (evidence + structured data)")
1230
-
1231
- record_pick = gr.Dropdown(label="Select record", choices=[], value=None)
1232
-
1233
- with gr.Row():
1234
- review_mode = gr.Checkbox(label="Review mode (enable editing)", value=False)
1235
- save_btn = gr.Button("Save edits")
1236
- export_btn = gr.Button("Export reviewed CSV")
1237
-
1238
- review_status = gr.Textbox(label="Review status", interactive=False)
1239
-
1240
- with gr.Row():
1241
- vertical_view = gr.Dataframe(
1242
- headers=["Field", "Value"],
1243
- interactive=False,
1244
- wrap=True,
1245
- show_row_numbers=False,
1246
- label="Extracted fields (vertical)"
1247
- )
1248
- evidence_md = gr.Markdown()
1249
-
1250
- reviewed_csv = gr.File(label="Download: reviewed_extraction_table.csv")
1251
-
1252
- # --- Advanced runtime settings (collapsed) ---
1253
- with gr.Accordion("Advanced runtime settings", open=False):
1254
- with gr.Row():
1255
- max_pages = gr.Slider(0, 250, value=0, step=1, label="Max pages to read (0 = all)")
1256
- chunk_chars = gr.Slider(1200, 9000, value=3200, step=100, label="Chunk size (chars)")
1257
- max_context_chars = gr.Slider(5000, 45000, value=20000, step=1000, label="Max context sent to GPT (chars)")
1258
-
1259
- # --- Admin tools (collapsed) ---
1260
- with gr.Accordion("Admin tools (taxonomy + custom columns)", open=False):
1261
- admin_mode = gr.Checkbox(label="Enable Admin mode", value=False)
1262
-
1263
- admin_group = gr.Group(visible=False)
1264
- admin_vocab_group = gr.Group(visible=False)
1265
- admin_fields_group = gr.Group(visible=False)
1266
-
1267
- with admin_group:
1268
- gr.Markdown("### Admin: Configure extraction taxonomy + custom columns.")
1269
-
1270
- with admin_vocab_group:
1271
- gr.Markdown("### Controlled vocabulary (lists only)")
1272
- vocab_category = gr.Dropdown(label="Category (lists only)", choices=[], value=None)
1273
- vocab_search = gr.Textbox(label="Search terms", placeholder="Type to filter (e.g., 471, AMES, comet)", lines=1)
1274
-
1275
- with gr.Row():
1276
- vocab_term_add = gr.Textbox(label="Add term", placeholder="type term and click Add")
1277
- vocab_add_btn = gr.Button("Add")
1278
- with gr.Row():
1279
- vocab_term_remove = gr.Textbox(label="Remove term", placeholder="type exact term and click Remove")
1280
- vocab_remove_btn = gr.Button("Remove")
1281
- vocab_apply_btn = gr.Button("Apply full list to category")
1282
- vocab_reset_btn = gr.Button("Reset vocab to defaults")
1283
-
1284
- vocab_terms_df = gr.Dataframe(headers=["term"], label="Terms (full list; edit directly)", interactive=True, wrap=True)
1285
- vocab_terms_filtered = gr.Dataframe(headers=["term"], label="Filtered preview (read-only)", interactive=False, wrap=True)
1286
- vocab_status = gr.Textbox(label="Vocab status", interactive=False)
1287
-
1288
- with gr.Accordion("Raw vocab JSON (auto-generated)", open=False):
1289
- vocab_json_admin = gr.Textbox(label="Controlled vocab JSON", lines=12, interactive=False)
1290
-
1291
- with admin_fields_group:
1292
- gr.Markdown("### Custom columns (Field Builder)")
1293
- gr.Markdown("Tip: Use endpoint selection to start, then tweak fields.")
1294
-
1295
- with gr.Row():
1296
- admin_apply_endpoints_btn = gr.Button("Load selected endpoints into builder (Replace)", variant="secondary")
1297
- fields_apply_btn = gr.Button("Apply builder table")
1298
-
1299
- with gr.Row():
1300
- field_name_in = gr.Textbox(label="Field name", placeholder="e.g., genotoxicity_result")
1301
- field_type_in = gr.Dropdown(label="Type", choices=TYPE_CHOICES, value="str")
1302
-
1303
- enum_values_in = gr.Textbox(label="Enum values (comma-separated; for enum/list[enum])", placeholder="a,b,c", lines=2)
1304
- instructions_in = gr.Textbox(label="Instructions", placeholder="Tell the extractor exactly what to pull.", lines=2)
1305
-
1306
- add_update_field_btn = gr.Button("Add/Update field")
1307
-
1308
- fields_df = gr.Dataframe(
1309
- label="Fields (edit and click Apply)",
1310
- headers=["field","type","enum_values","instructions"],
1311
- interactive=True,
1312
- wrap=True
1313
- )
1314
-
1315
- fields_status = gr.Textbox(label="Field builder status", interactive=False)
 
 
 
 
 
 
1316
 
1317
  # --- Wiring ---
1318
  admin_mode.change(
@@ -1336,7 +1680,7 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
1336
  extract_btn.click(
1337
  fn=run_extraction,
1338
  inputs=[files, api_key, model, endpoints, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars, admin_mode],
1339
- outputs=[summary_card, overview_df, out_csv, out_json, status, record_pick, state_records, state_details, vertical_view, evidence_md]
1340
  )
1341
 
1342
  record_pick.change(
@@ -1460,14 +1804,68 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
1460
  build_literature_explorer_tab()
1461
 
1462
  with gr.Tab("Cross-paper Synthesis"):
1463
- gr.Markdown("Upload `extraction_details.json` from Extract tab. Synthesis is based strictly on grounded extractions.")
1464
- api_key2 = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
1465
- model2 = gr.Dropdown(label="Model", choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"], value="gpt-4o-2024-08-06")
1466
- extraction_json_file = gr.File(label="Upload extraction_details.json", file_types=[".json"], file_count="single")
1467
- synth_btn = gr.Button("Synthesize Across Papers")
1468
- synth_md = gr.Markdown()
1469
- synth_btn.click(fn=run_synthesis, inputs=[api_key2, model2, extraction_json_file], outputs=[synth_md])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1470
 
1471
  if __name__ == "__main__":
1472
  port = int(os.environ.get("PORT", "7860"))
1473
- demo.queue().launch(server_name="0.0.0.0", server_port=port)
 
10
  import pandas as pd
11
 
12
  from pypdf import PdfReader
13
+ try:
14
+ from sklearn.feature_extraction.text import TfidfVectorizer
15
+ except Exception: # pragma: no cover - fallback path for minimal runtime
16
+ TfidfVectorizer = None
17
 
18
  from openai import OpenAI
19
  from literature_explorer import build_literature_explorer_tab
20
+ from toxra_core.artifacts import make_run_dir, write_dataframe_csv, write_json, write_markdown
21
+ from toxra_core.calculation_client import MCPClientError, run_batch_cancer_risk
22
+ from toxra_core.contracts import CANCER_RISK_TEMPLATE_COLUMNS
23
+ from toxra_core.nlp_pipeline import extract_evidence_span, expand_regulatory_queries, hybrid_rank_text_items
24
+ from toxra_core.regulatory_mapper import map_extraction_to_framework
25
 
26
 
27
 
28
+ # =============================
29
+ # UI theme
30
+ # =============================
31
+ APP_CSS = """
32
+ @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Sans:wght@400;500;600;700&display=swap');
33
+
34
+ :root {
35
+ --bg: #f5f7fb;
36
+ --panel: #ffffff;
37
+ --ink: #0f172a;
38
+ --muted: #516079;
39
+ --line: #e2e8f0;
40
+ --accent: #2563eb;
41
+ --accent-2: #0ea5e9;
42
+ --accent-soft: #e6efff;
43
+ --shadow: 0 10px 28px rgba(15, 23, 42, 0.08);
44
+ --radius: 14px;
45
+ }
46
+
47
+ .gradio-container {
48
+ background: var(--bg);
49
+ color: var(--ink);
50
+ font-family: "IBM Plex Sans", ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, "Helvetica Neue", Arial, "Noto Sans", "Apple Color Emoji", "Segoe UI Emoji";
51
+ }
52
+
53
+ .hero {
54
+ background: linear-gradient(180deg, #edf3ff 0%, #f4f8ff 100%);
55
+ color: var(--ink);
56
+ border-radius: 16px;
57
+ padding: 18px 22px;
58
+ box-shadow: var(--shadow);
59
+ border: 1px solid #dbe5f4;
60
+ display: flex;
61
+ align-items: center;
62
+ justify-content: space-between;
63
+ gap: 16px;
64
+ flex-wrap: wrap;
65
+ }
66
+ .hero-left { min-width: 240px; }
67
+ .hero-right { margin-left: auto; }
68
+ .hero-title { font-size: 22px; font-weight: 700; letter-spacing: 0.08em; }
69
+ .hero-sub { margin-top: 4px; font-size: 13px; color: #3b4b63; }
70
+ .hero-pills { margin-top: 10px; display: flex; gap: 8px; flex-wrap: wrap; }
71
+ .hero-pill {
72
+ background: var(--accent-soft);
73
+ color: #1e3a8a;
74
+ border: 1px solid #d6e3f6;
75
+ border-radius: 999px;
76
+ padding: 4px 10px;
77
+ font-size: 11px;
78
+ font-weight: 600;
79
+ }
80
+ .hero-status {
81
+ background: #ffffff;
82
+ color: #334155;
83
+ border: 1px solid #d9e2ef;
84
+ border-radius: 999px;
85
+ padding: 6px 12px;
86
+ font-size: 12px;
87
+ font-weight: 600;
88
+ box-shadow: 0 6px 16px rgba(15, 23, 42, 0.06);
89
+ }
90
+
91
+ .split-row { gap: 18px; }
92
+ .card {
93
+ background: var(--panel);
94
+ border: 1px solid var(--line);
95
+ border-radius: var(--radius);
96
+ padding: 16px;
97
+ box-shadow: var(--shadow);
98
+ }
99
+ .left-rail .card + .card { margin-top: 16px; }
100
+ .right-panel .card { margin-bottom: 14px; }
101
+ .section-title {
102
+ font-size: 12px;
103
+ text-transform: uppercase;
104
+ letter-spacing: 0.14em;
105
+ color: var(--muted);
106
+ margin-bottom: 8px;
107
+ }
108
+
109
+ .gradio-container input,
110
+ .gradio-container textarea,
111
+ .gradio-container select {
112
+ border-radius: 10px !important;
113
+ border-color: var(--line) !important;
114
+ }
115
+
116
+ .gradio-container button.primary {
117
+ background: var(--accent) !important;
118
+ border-color: var(--accent) !important;
119
+ }
120
+ .gradio-container button.primary:hover { background: #1d4ed8 !important; }
121
+
122
+ .gradio-container .tab-nav { gap: 8px; }
123
+ .gradio-container .tab-nav button {
124
+ background: var(--panel);
125
+ border: 1px solid var(--line);
126
+ border-radius: 999px;
127
+ padding: 6px 14px;
128
+ font-size: 12px;
129
+ color: var(--muted);
130
+ }
131
+ .gradio-container .tab-nav button.selected {
132
+ background: var(--accent);
133
+ border-color: var(--accent);
134
+ color: #ffffff;
135
+ }
136
+
137
+ .gradio-container .accordion {
138
+ border: 1px solid var(--line);
139
+ border-radius: var(--radius);
140
+ }
141
+ """
142
+
143
+
144
  # =============================
145
  # Defaults
146
  # =============================
 
360
  if not texts:
361
  return []
362
 
363
+ if TfidfVectorizer is None:
364
+ selected_idx: List[int] = []
365
+ for q in queries:
366
+ q_tokens = set([w for w in re.findall(r"[a-zA-Z0-9\\-]+", (q or "").lower()) if len(w) >= 3])
367
+ scored = []
368
+ for i, t in enumerate(texts):
369
+ tl = t.lower()
370
+ scored.append((sum(1 for tok in q_tokens if tok in tl), i))
371
+ scored.sort(key=lambda x: x[0], reverse=True)
372
+ for _, i in scored[:top_per_query]:
373
+ if i not in selected_idx:
374
+ selected_idx.append(i)
375
+ if not selected_idx:
376
+ selected_idx = list(range(min(len(chunks), max_chunks)))
377
+ return [chunks[i] for i in selected_idx[:max_chunks]]
378
+
379
  vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), max_features=20000)
380
  X = vectorizer.fit_transform(texts)
381
 
 
1037
  if not files:
1038
  return (
1039
  "<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#666;'>Upload PDFs to run extraction.</div></div>",
1040
+ pd.DataFrame(), None, None, None, "Upload one or more PDFs.",
1041
  gr.update(choices=[], value=None),
1042
  [], [], pd.DataFrame(columns=["Field","Value"]), ""
1043
  )
 
1047
  except Exception as e:
1048
  return (
1049
  "<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#b00;'>Invalid vocab JSON.</div></div>",
1050
+ pd.DataFrame(), None, None, None, f"Controlled vocab JSON invalid: {e}",
1051
  gr.update(choices=[], value=None),
1052
  [], [], pd.DataFrame(columns=["Field","Value"]), ""
1053
  )
 
1056
  if not field_props:
1057
  return (
1058
  "<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#b00;'>No columns defined.</div></div>",
1059
+ pd.DataFrame(), None, None, None, "No extraction fields are defined. (Check selected endpoints or admin field spec.)",
1060
  gr.update(choices=[], value=None),
1061
  [], [], pd.DataFrame(columns=["Field","Value"]), ""
1062
  )
 
1076
  except Exception as e:
1077
  return (
1078
  "<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#b00;'>Missing API key.</div></div>",
1079
+ pd.DataFrame(), None, None, None, str(e),
1080
  gr.update(choices=[], value=None),
1081
  [], [], pd.DataFrame(columns=["Field","Value"]), ""
1082
  )
1083
 
1084
  paper_details: List[Dict[str, Any]] = []
1085
  output_rows: List[Dict[str, Any]] = []
1086
+ nlp_diagnostics: List[Dict[str, Any]] = []
1087
 
1088
  tmpdir = Path(tempfile.mkdtemp(prefix="tox_extract_"))
1089
 
 
1104
  "extracted": {k: ([] if field_props[k].get("type") == "array" else "") for k in field_props.keys()},
1105
  "evidence": []
1106
  }
1107
+ nlp_diagnostics.append(
1108
+ {
1109
+ "file": filename,
1110
+ "ranking_method": "unavailable_no_text",
1111
+ "selected_indices": [],
1112
+ "coverage_by_query_family": {},
1113
+ "coverage_score": 0.0,
1114
+ }
1115
+ )
1116
  else:
1117
  chunks = chunk_pages(pages, target_chars=int(chunk_chars))
1118
+ base_queries = [
 
1119
  "regulatory acceptability risk hazard concern conclusion uncertainty evidence NOAEL LOAEL BMD",
1120
  "chemical name CAS number",
1121
  ]
1122
+ extra_terms = [ins if ins else k for k, ins in field_instr.items()]
1123
+ queries, families = expand_regulatory_queries(
1124
+ base_queries=base_queries,
1125
+ endpoint_modules=selected_endpoints or [],
1126
+ frameworks=["FDA CTP", "EPA"],
1127
+ extra_terms=extra_terms,
1128
+ )
1129
 
1130
+ emb_mat = None
1131
+ qemb = None
1132
+ try:
1133
+ texts = [c.get("text", "") for c in chunks]
1134
+ if texts:
1135
+ emb_mat = embed_texts(client, DEFAULT_EMBEDDING_MODEL, texts)
1136
+ qemb = embed_texts(client, DEFAULT_EMBEDDING_MODEL, [" ".join(queries[:20])])[0]
1137
+ except Exception:
1138
+ emb_mat = None
1139
+ qemb = None
1140
+
1141
+ selected, diag = hybrid_rank_text_items(
1142
+ items=chunks,
1143
+ query=" ".join(queries[:20]),
1144
+ families=families,
1145
+ top_k=12,
1146
+ item_embeddings=emb_mat,
1147
+ query_embedding=qemb,
1148
+ )
1149
+ nlp_diagnostics.append(dict({"file": filename}, **diag))
1150
+ span_blocks: List[str] = []
1151
+ chars = 0
1152
+ for c in selected:
1153
+ span = extract_evidence_span(c.get("text", ""), " ".join(queries[:20]), page=None, n_sentences=5)
1154
+ snippet = span.get("text", "") or c.get("text", "")
1155
+ block = f"[pages {c.get('pages','')}]\n{snippet}\n"
1156
+ if chars + len(block) > int(max_context_chars):
1157
+ break
1158
+ span_blocks.append(block)
1159
+ chars += len(block)
1160
+ context = "\n".join(span_blocks).strip()
1161
+ if not context:
1162
+ context = build_context(selected, max_chars=int(max_context_chars))
1163
 
1164
  ex = openai_structured_extract(
1165
  client=client,
 
1243
  csv_path = tmpdir / "extraction_table.csv"
1244
  json_path = tmpdir / "extraction_details.json"
1245
  df.to_csv(csv_path, index=False)
1246
+ details_payload = {
1247
+ "papers": paper_details,
1248
+ "toxra_extensions": {
1249
+ "nlp_diagnostics": nlp_diagnostics,
1250
+ "regulatory_gap_assessment": {},
1251
+ "risk_calculation_refs": [],
1252
+ },
1253
+ }
1254
+ json_path.write_text(json.dumps(details_payload, indent=2), encoding="utf-8")
1255
+ prefilled_template_path = export_prefilled_cancer_risk_template(records)
1256
 
1257
  choices = [r.get("record_id") for r in records if r.get("record_id")]
1258
  default = choices[0] if choices else None
 
1275
  overview,
1276
  str(csv_path),
1277
  str(json_path),
1278
+ str(prefilled_template_path),
1279
  status,
1280
  gr.update(choices=choices, value=default),
1281
  records,
 
1338
  return str(path), "Reviewed CSV ready to download."
1339
 
1340
 
1341
+ # =============================
1342
+ # New modules: template, mapping, MCP batch
1343
+ # =============================
1344
+ def _load_extraction_payload(file_obj: Any) -> Tuple[Any, List[Dict[str, Any]], Dict[str, Any]]:
1345
+ if file_obj is None:
1346
+ raise ValueError("Upload extraction_details.json first.")
1347
+ payload = json.loads(Path(file_obj.name).read_text(encoding="utf-8"))
1348
+ if isinstance(payload, list):
1349
+ return payload, payload, {}
1350
+ if isinstance(payload, dict):
1351
+ papers = payload.get("papers", [])
1352
+ if not isinstance(papers, list):
1353
+ raise ValueError("Invalid extraction_details.json format: papers must be a list.")
1354
+ ext = payload.get("toxra_extensions", {})
1355
+ return payload, papers, (ext if isinstance(ext, dict) else {})
1356
+ raise ValueError("Unsupported extraction_details.json format.")
1357
+
1358
+
1359
+ def export_blank_cancer_risk_template():
1360
+ tmpdir = Path(tempfile.mkdtemp(prefix="tox_template_"))
1361
+ path = tmpdir / "cancer_risk_input_template.csv"
1362
+ pd.DataFrame(columns=CANCER_RISK_TEMPLATE_COLUMNS).to_csv(path, index=False)
1363
+ return str(path), "Blank cancer risk template ready."
1364
+
1365
+
1366
+ def export_prefilled_cancer_risk_template(records: List[Dict[str, Any]]):
1367
+ tmpdir = Path(tempfile.mkdtemp(prefix="tox_template_prefilled_"))
1368
+ path = tmpdir / "cancer_risk_input_template_prefilled.csv"
1369
+ if not records:
1370
+ pd.DataFrame(columns=CANCER_RISK_TEMPLATE_COLUMNS).to_csv(path, index=False)
1371
+ return str(path)
1372
+
1373
+ rows: List[Dict[str, Any]] = []
1374
+ seen = set()
1375
+ for r in records:
1376
+ rid = str(r.get("record_id", "")).strip()
1377
+ if not rid or rid in seen:
1378
+ continue
1379
+ seen.add(rid)
1380
+ route = str(r.get("exposure_route", "")).strip().lower()
1381
+ if route not in {"oral", "inhalation"}:
1382
+ route = ""
1383
+ casn = str(r.get("cas_numbers", "")).split(";")[0].strip()
1384
+ rows.append(
1385
+ {
1386
+ "record_id": rid,
1387
+ "chemical_name": str(r.get("chemical", "")).strip(),
1388
+ "casrn": casn,
1389
+ "route": route,
1390
+ "exposure_value": "",
1391
+ "exposure_unit": "",
1392
+ "body_weight_kg": "",
1393
+ "csf_value": "",
1394
+ "csf_unit": "",
1395
+ "iur_value": "",
1396
+ "air_conc_value": "",
1397
+ "air_conc_unit": "",
1398
+ "source_reference": str(r.get("file", "")).strip(),
1399
+ }
1400
+ )
1401
+
1402
+ df = pd.DataFrame(rows, columns=CANCER_RISK_TEMPLATE_COLUMNS)
1403
+ df.to_csv(path, index=False)
1404
+ return str(path)
1405
+
1406
+
1407
+ def run_regulatory_gap_assessment(extraction_json_file, framework: str, override_notes: str):
1408
+ if extraction_json_file is None:
1409
+ return pd.DataFrame(), "Upload extraction_details.json first.", None, None, "No input file."
1410
+ try:
1411
+ payload, _, _ = _load_extraction_payload(extraction_json_file)
1412
+ matrix_df, report, report_md = map_extraction_to_framework(
1413
+ extraction_payload=payload,
1414
+ framework=framework,
1415
+ catalog_dir="regulatory_catalog",
1416
+ override_notes=override_notes or "",
1417
+ )
1418
+ except Exception as e:
1419
+ return pd.DataFrame(), f"(assessment unavailable: {e})", None, None, str(e)
1420
+
1421
+ run_dir = make_run_dir(base_dir="runs")
1422
+ matrix_path = write_dataframe_csv(run_dir / "regulatory_gap_matrix.csv", matrix_df)
1423
+ report_path = write_json(run_dir / "regulatory_gap_report.json", report)
1424
+ write_markdown(run_dir / "regulatory_gap_report.md", report_md)
1425
+
1426
+ md = "### Regulatory Gap Summary\n" + report_md
1427
+ status = f"✅ Gap assessment complete. Covered={report.get('summary', {}).get('covered', 0)} | Missing={report.get('summary', {}).get('missing', 0)}"
1428
+ return matrix_df, md, str(matrix_path), str(report_path), status
1429
+
1430
+
1431
+ def run_cancer_risk_batch_ui(input_csv_file):
1432
+ if input_csv_file is None:
1433
+ return pd.DataFrame(), None, None, None, "Upload a populated cancer risk input CSV."
1434
+ try:
1435
+ df = pd.read_csv(input_csv_file.name)
1436
+ except Exception as e:
1437
+ return pd.DataFrame(), None, None, None, f"Could not read CSV: {e}"
1438
+
1439
+ missing = [c for c in CANCER_RISK_TEMPLATE_COLUMNS if c not in df.columns]
1440
+ if missing:
1441
+ return pd.DataFrame(), None, None, None, f"Missing required columns: {missing}"
1442
+
1443
+ run_dir = make_run_dir(base_dir="runs")
1444
+ rows = df.fillna("").to_dict("records")
1445
+
1446
+ try:
1447
+ result = run_batch_cancer_risk(rows, run_dir=str(run_dir))
1448
+ except MCPClientError as e:
1449
+ return pd.DataFrame(), None, None, None, f"MCP server unavailable: {e}"
1450
+ except Exception as e:
1451
+ return pd.DataFrame(), None, None, None, f"Calculation failed: {e}"
1452
+
1453
+ result_rows = result.get("rows", []) if isinstance(result.get("rows", []), list) else []
1454
+ out_df = pd.DataFrame(result_rows)
1455
+ result_csv_path = write_dataframe_csv(run_dir / "cancer_risk_results.csv", out_df)
1456
+ write_json(run_dir / "cancer_risk_results.json", result)
1457
+
1458
+ artifacts = result.get("artifacts", {}) if isinstance(result, dict) else {}
1459
+ log_path = artifacts.get("log_jsonl", str(run_dir / "cancer_risk_log.jsonl"))
1460
+ report_path = artifacts.get("report_md", str(run_dir / "cancer_risk_report.md"))
1461
+
1462
+ summ = result.get("summary", {})
1463
+ status = (
1464
+ f"✅ Batch complete. total={summ.get('total_rows', 0)} "
1465
+ f"ok={summ.get('ok_rows', 0)} error={summ.get('error_rows', 0)}"
1466
+ )
1467
+ return out_df, str(result_csv_path), str(log_path), str(report_path), status
1468
+
1469
+
1470
  # =============================
1471
  # Synthesis tab handler
1472
  # =============================
 
1477
  client = get_openai_client(api_key)
1478
  except Exception as e:
1479
  return str(e)
1480
+ payload = json.loads(Path(extraction_json_file.name).read_text(encoding="utf-8"))
1481
+ rows = payload.get("papers", payload) if isinstance(payload, dict) else payload
1482
+ if not isinstance(rows, list):
1483
+ return "Invalid extraction JSON format for synthesis."
1484
  return openai_synthesize_across_papers(client, model, rows)
1485
 
1486
 
 
1498
  # =============================
1499
  # Gradio UI
1500
  # =============================
1501
+ with gr.Blocks(title="Toxicology PDF → Grounded Extractor", css=APP_CSS) as demo:
1502
+ gr.HTML(
1503
+ """
1504
+ <div class="hero">
1505
+ <div class="hero-left">
1506
+ <div class="hero-title">TOXRA.AI</div>
1507
+ <div class="hero-sub">Grounded toxicology extraction &amp; literature exploration</div>
1508
+ <div class="hero-pills">
1509
+ <span class="hero-pill">Text-based PDFs only</span>
1510
+ <span class="hero-pill">Results-first reporting</span>
1511
+ <span class="hero-pill">Admin-configurable extraction</span>
1512
+ </div>
1513
+ </div>
1514
+ <div class="hero-right">
1515
+ <span class="hero-status">Production · Beta</span>
1516
+ </div>
1517
+ </div>
1518
+ """
1519
  )
1520
 
1521
  state_records = gr.State([])
 
1527
  vocab_json = gr.Textbox(visible=False, interactive=False, lines=8)
1528
 
1529
  with gr.Tab("Extract"):
1530
+ with gr.Row(elem_classes="split-row"):
1531
+ with gr.Column(scale=4, min_width=320, elem_classes="left-rail"):
1532
+ with gr.Group(elem_classes="card"):
1533
+ gr.Markdown("Extract setup", elem_classes="section-title")
1534
+ files = gr.File(label="Upload toxicology PDFs", file_types=[".pdf"], file_count="multiple")
1535
+
1536
+ with gr.Row():
1537
+ api_key = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
1538
+ model = gr.Dropdown(label="Model", choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"], value="gpt-4o-2024-08-06")
1539
+
1540
+ with gr.Row():
1541
+ endpoint_preset = gr.Dropdown(
1542
+ label="Endpoint preset",
1543
+ choices=list(ENDPOINT_PRESETS.keys()),
1544
+ value="Required – Safety Assessor"
1545
+ )
1546
+ endpoints = gr.Dropdown(
1547
+ label="Endpoints to extract (Core included automatically)",
1548
+ choices=list(ENDPOINT_MODULES.keys()),
1549
+ multiselect=True,
1550
+ value=ENDPOINT_PRESETS["Required – Safety Assessor"]
1551
+ )
1552
+
1553
+ extract_btn = gr.Button("Run Extraction", variant="primary")
1554
+ status = gr.Textbox(label="Status", interactive=False)
1555
+
1556
+ with gr.Accordion("Advanced runtime settings", open=False, elem_classes="card"):
1557
+ with gr.Row():
1558
+ max_pages = gr.Slider(0, 250, value=0, step=1, label="Max pages to read (0 = all)")
1559
+ chunk_chars = gr.Slider(1200, 9000, value=3200, step=100, label="Chunk size (chars)")
1560
+ max_context_chars = gr.Slider(5000, 45000, value=20000, step=1000, label="Max context sent to GPT (chars)")
1561
+
1562
+ with gr.Accordion("Admin tools (taxonomy + custom columns)", open=False, elem_classes="card"):
1563
+ admin_mode = gr.Checkbox(label="Enable Admin mode", value=False)
1564
+
1565
+ admin_group = gr.Group(visible=False)
1566
+ admin_vocab_group = gr.Group(visible=False)
1567
+ admin_fields_group = gr.Group(visible=False)
1568
+
1569
+ with admin_group:
1570
+ gr.Markdown("### Admin: Configure extraction taxonomy + custom columns.")
1571
+
1572
+ with admin_vocab_group:
1573
+ gr.Markdown("### Controlled vocabulary (lists only)")
1574
+ vocab_category = gr.Dropdown(label="Category (lists only)", choices=[], value=None)
1575
+ vocab_search = gr.Textbox(label="Search terms", placeholder="Type to filter (e.g., 471, AMES, comet)", lines=1)
1576
+
1577
+ with gr.Row():
1578
+ vocab_term_add = gr.Textbox(label="Add term", placeholder="type term and click Add")
1579
+ vocab_add_btn = gr.Button("Add")
1580
+ with gr.Row():
1581
+ vocab_term_remove = gr.Textbox(label="Remove term", placeholder="type exact term and click Remove")
1582
+ vocab_remove_btn = gr.Button("Remove")
1583
+ vocab_apply_btn = gr.Button("Apply full list to category")
1584
+ vocab_reset_btn = gr.Button("Reset vocab to defaults")
1585
+
1586
+ vocab_terms_df = gr.Dataframe(headers=["term"], label="Terms (full list; edit directly)", interactive=True, wrap=True)
1587
+ vocab_terms_filtered = gr.Dataframe(headers=["term"], label="Filtered preview (read-only)", interactive=False, wrap=True)
1588
+ vocab_status = gr.Textbox(label="Vocab status", interactive=False)
1589
+
1590
+ with gr.Accordion("Raw vocab JSON (auto-generated)", open=False):
1591
+ vocab_json_admin = gr.Textbox(label="Controlled vocab JSON", lines=12, interactive=False)
1592
+
1593
+ with admin_fields_group:
1594
+ gr.Markdown("### Custom columns (Field Builder)")
1595
+ gr.Markdown("Tip: Use endpoint selection to start, then tweak fields.")
1596
+
1597
+ with gr.Row():
1598
+ admin_apply_endpoints_btn = gr.Button("Load selected endpoints into builder (Replace)", variant="secondary")
1599
+ fields_apply_btn = gr.Button("Apply builder table")
1600
+
1601
+ with gr.Row():
1602
+ field_name_in = gr.Textbox(label="Field name", placeholder="e.g., genotoxicity_result")
1603
+ field_type_in = gr.Dropdown(label="Type", choices=TYPE_CHOICES, value="str")
1604
+
1605
+ enum_values_in = gr.Textbox(label="Enum values (comma-separated; for enum/list[enum])", placeholder="a,b,c", lines=2)
1606
+ instructions_in = gr.Textbox(label="Instructions", placeholder="Tell the extractor exactly what to pull.", lines=2)
1607
+
1608
+ add_update_field_btn = gr.Button("Add/Update field")
1609
+
1610
+ fields_df = gr.Dataframe(
1611
+ label="Fields (edit and click Apply)",
1612
+ headers=["field","type","enum_values","instructions"],
1613
+ interactive=True,
1614
+ wrap=True
1615
+ )
1616
+
1617
+ fields_status = gr.Textbox(label="Field builder status", interactive=False)
1618
+
1619
+ with gr.Column(scale=7, min_width=480, elem_classes="right-panel"):
1620
+ with gr.Tabs(elem_classes="report-tabs"):
1621
+ with gr.Tab("Overview"):
1622
+ with gr.Group(elem_classes="card"):
1623
+ gr.Markdown("Report overview", elem_classes="section-title")
1624
+ summary_card = gr.HTML(render_summary_card("", []))
1625
+ with gr.Group(elem_classes="card"):
1626
+ overview_df = gr.Dataframe(
1627
+ label="Batch Overview",
1628
+ interactive=False,
1629
+ wrap=True,
1630
+ show_row_numbers=True
1631
+ )
1632
+
1633
+ with gr.Tab("Record"):
1634
+ with gr.Group(elem_classes="card"):
1635
+ record_pick = gr.Dropdown(label="Select record", choices=[], value=None)
1636
+ with gr.Row():
1637
+ review_mode = gr.Checkbox(label="Review mode (enable editing)", value=False)
1638
+ save_btn = gr.Button("Save edits")
1639
+ export_btn = gr.Button("Export reviewed CSV")
1640
+ review_status = gr.Textbox(label="Review status", interactive=False)
1641
+ with gr.Group(elem_classes="card"):
1642
+ vertical_view = gr.Dataframe(
1643
+ headers=["Field", "Value"],
1644
+ interactive=False,
1645
+ wrap=True,
1646
+ show_row_numbers=False,
1647
+ label="Extracted fields (vertical)"
1648
+ )
1649
+
1650
+ with gr.Tab("Evidence"):
1651
+ with gr.Group(elem_classes="card"):
1652
+ evidence_md = gr.Markdown()
1653
+
1654
+ with gr.Tab("Exports"):
1655
+ with gr.Group(elem_classes="card"):
1656
+ out_csv = gr.File(label="Download: extraction_table.csv")
1657
+ out_json = gr.File(label="Download: extraction_details.json (evidence + structured data)")
1658
+ risk_template_prefilled = gr.File(label="Download: cancer_risk_input_template_prefilled.csv (record_id linked)")
1659
+ reviewed_csv = gr.File(label="Download: reviewed_extraction_table.csv")
1660
 
1661
  # --- Wiring ---
1662
  admin_mode.change(
 
1680
  extract_btn.click(
1681
  fn=run_extraction,
1682
  inputs=[files, api_key, model, endpoints, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars, admin_mode],
1683
+ outputs=[summary_card, overview_df, out_csv, out_json, risk_template_prefilled, status, record_pick, state_records, state_details, vertical_view, evidence_md]
1684
  )
1685
 
1686
  record_pick.change(
 
1804
  build_literature_explorer_tab()
1805
 
1806
  with gr.Tab("Cross-paper Synthesis"):
1807
+ with gr.Group(elem_classes="card"):
1808
+ gr.Markdown("Upload `extraction_details.json` from Extract tab. Synthesis is based strictly on grounded extractions.")
1809
+ api_key2 = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
1810
+ model2 = gr.Dropdown(label="Model", choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"], value="gpt-4o-2024-08-06")
1811
+ extraction_json_file = gr.File(label="Upload extraction_details.json", file_types=[".json"], file_count="single")
1812
+ synth_btn = gr.Button("Synthesize Across Papers")
1813
+ synth_md = gr.Markdown()
1814
+ synth_btn.click(fn=run_synthesis, inputs=[api_key2, model2, extraction_json_file], outputs=[synth_md])
1815
+
1816
+ with gr.Tab("Regulatory Gap Assessment"):
1817
+ with gr.Group(elem_classes="card"):
1818
+ gr.Markdown(
1819
+ "Run clause-level mapping against regulatory catalogs. "
1820
+ "Use `extraction_details.json` from Extract tab."
1821
+ )
1822
+ with gr.Row():
1823
+ reg_extraction_json = gr.File(label="Upload extraction_details.json", file_types=[".json"], file_count="single")
1824
+ reg_framework = gr.Dropdown(label="Framework profile", choices=["FDA CTP", "EPA"], value="FDA CTP")
1825
+ reg_override_notes = gr.Textbox(
1826
+ label="Override notes (optional)",
1827
+ lines=2,
1828
+ placeholder="Context to include in gap prompts."
1829
+ )
1830
+ reg_run_btn = gr.Button("Run Regulatory Gap Assessment", variant="primary")
1831
+ reg_status = gr.Textbox(label="Status", interactive=False)
1832
+ reg_summary_md = gr.Markdown()
1833
+ reg_matrix_df = gr.Dataframe(label="Clause-level gap matrix", interactive=False, wrap=True)
1834
+ reg_matrix_file = gr.File(label="Download: regulatory_gap_matrix.csv")
1835
+ reg_report_file = gr.File(label="Download: regulatory_gap_report.json")
1836
+
1837
+ reg_run_btn.click(
1838
+ fn=run_regulatory_gap_assessment,
1839
+ inputs=[reg_extraction_json, reg_framework, reg_override_notes],
1840
+ outputs=[reg_matrix_df, reg_summary_md, reg_matrix_file, reg_report_file, reg_status]
1841
+ )
1842
+
1843
+ with gr.Tab("Cancer Risk Calculator"):
1844
+ with gr.Group(elem_classes="card"):
1845
+ gr.Markdown(
1846
+ "Deterministic FDA/EPA cancer risk calculations routed through a dedicated local MCP server. "
1847
+ "Use `record_id` values from extraction outputs for traceability."
1848
+ )
1849
+ with gr.Row():
1850
+ template_btn = gr.Button("Download Blank CSV Template")
1851
+ template_file = gr.File(label="Download: cancer_risk_input_template.csv")
1852
+ template_status = gr.Textbox(label="Template status", interactive=False)
1853
+ template_btn.click(fn=export_blank_cancer_risk_template, inputs=None, outputs=[template_file, template_status])
1854
+
1855
+ risk_input_csv = gr.File(label="Upload populated cancer risk input CSV", file_types=[".csv"], file_count="single")
1856
+ risk_run_btn = gr.Button("Run Cancer Risk Batch", variant="primary")
1857
+ risk_status = gr.Textbox(label="Status", interactive=False)
1858
+ risk_results_df = gr.Dataframe(label="Cancer risk results", interactive=False, wrap=True)
1859
+ risk_results_csv = gr.File(label="Download: cancer_risk_results.csv")
1860
+ risk_log_file = gr.File(label="Download: cancer_risk_log.jsonl")
1861
+ risk_report_file = gr.File(label="Download: cancer_risk_report.md")
1862
+
1863
+ risk_run_btn.click(
1864
+ fn=run_cancer_risk_batch_ui,
1865
+ inputs=[risk_input_csv],
1866
+ outputs=[risk_results_df, risk_results_csv, risk_log_file, risk_report_file, risk_status]
1867
+ )
1868
 
1869
  if __name__ == "__main__":
1870
  port = int(os.environ.get("PORT", "7860"))
1871
+ demo.queue().launch(server_name="0.0.0.0", server_port=port)
cancer_risk_input_template.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ record_id,chemical_name,casrn,route,exposure_value,exposure_unit,body_weight_kg,csf_value,csf_unit,iur_value,air_conc_value,air_conc_unit,source_reference
literature_explorer.py CHANGED
@@ -9,6 +9,11 @@ import numpy as np
9
  import pandas as pd
10
  from pypdf import PdfReader
11
  from openai import OpenAI
 
 
 
 
 
12
 
13
 
14
  # =============================
@@ -454,39 +459,41 @@ def search(
454
  if not filtered_idx:
455
  return pd.DataFrame(), "### Grounded mini-summary\n(No pages match your filters)", "### Evidence used\n"
456
 
457
- ranked: List[Tuple[float, Dict[str, Any]]] = []
458
-
459
- # embeddings path
460
  if idx.get("has_embeddings") and idx.get("embeddings") is not None:
461
  try:
462
  client = get_client(api_key)
463
  qemb = embed_texts(client, embedding_model or idx.get("embedding_model") or DEFAULT_EMBEDDING_MODEL, [query])[0]
464
- mat = idx["embeddings"][filtered_idx, :]
465
- scores = mat @ qemb
466
- order = np.argsort(scores)[::-1][:max(1, int(top_k))]
467
- for j in order:
468
- page_i = filtered_idx[int(j)]
469
- ranked.append((float(scores[int(j)]), pages[page_i]))
470
  except Exception:
471
- ranked = []
472
-
473
- # fallback ranking
474
- if not ranked:
475
- qwords = set([w for w in re.findall(r"[a-zA-Z0-9\-]+", query.lower()) if len(w) >= 3])
476
- tmp = []
477
- for i in filtered_idx:
478
- t = (pages[i].get("text") or "").lower()
479
- hits = sum(1 for w in qwords if w in t)
480
- tmp.append((hits, pages[i]))
481
- tmp.sort(key=lambda x: x[0], reverse=True)
482
- ranked = [(float(h), r) for h, r in tmp[:max(1, int(top_k))]]
 
 
 
 
 
 
483
 
484
  rows = []
485
  evidence = []
486
- for score, r in ranked:
487
  pid = r["paper_id"]
488
  org = (papers.get(pid, {}) or {}).get("organ", "unknown")
489
- ctx = expanded_context(r.get("text", ""), query, n_sentences=5)
 
490
  ctx_wrapped = hard_wrap(ctx, width=110)
491
 
492
  preview = ctx.strip()
@@ -495,7 +502,7 @@ def search(
495
  rows.append({
496
  "file": r.get("file",""),
497
  "page": r.get("page",""),
498
- "score": round(score, 4),
499
  "organ": org,
500
  "endpoints": "; ".join(r.get("endpoints") or []),
501
  "enzymes": "; ".join((r.get("enzymes") or [])[:12]),
@@ -530,6 +537,12 @@ def search(
530
  except Exception as e:
531
  mini_summary = f"(mini-summary unavailable: {e})"
532
 
 
 
 
 
 
 
533
  mini_md = "### Grounded mini-summary\n" + mini_summary
534
  return results_df, mini_md, evidence_md
535
 
@@ -551,8 +564,8 @@ def on_select_result(df: pd.DataFrame, idx: dict, query: str, evt: gr.SelectData
551
  meta = f"**{citation}**"
552
  return meta, citation, "(page text not found)", ""
553
 
554
- ctx = expanded_context(rec.get("text",""), query, n_sentences=5)
555
- ctx = hard_wrap(ctx, width=110)
556
  full_txt = hard_wrap(rec.get("text",""), width=110)
557
 
558
  meta = f"**{citation}** | organ: **{r.get('organ','')}** | score: **{r.get('score','')}**"
@@ -646,4 +659,4 @@ def build_literature_explorer_tab():
646
  fn=citation_ready,
647
  inputs=[citation_box],
648
  outputs=[copy_status]
649
- )
 
9
  import pandas as pd
10
  from pypdf import PdfReader
11
  from openai import OpenAI
12
+ from toxra_core.nlp_pipeline import (
13
+ expand_regulatory_queries,
14
+ extract_evidence_span,
15
+ hybrid_rank_text_items,
16
+ )
17
 
18
 
19
  # =============================
 
459
  if not filtered_idx:
460
  return pd.DataFrame(), "### Grounded mini-summary\n(No pages match your filters)", "### Evidence used\n"
461
 
462
+ filtered_pages = [pages[i] for i in filtered_idx]
463
+ emb_mat = None
464
+ qemb = None
465
  if idx.get("has_embeddings") and idx.get("embeddings") is not None:
466
  try:
467
  client = get_client(api_key)
468
  qemb = embed_texts(client, embedding_model or idx.get("embedding_model") or DEFAULT_EMBEDDING_MODEL, [query])[0]
469
+ emb_mat = idx["embeddings"][filtered_idx, :]
 
 
 
 
 
470
  except Exception:
471
+ emb_mat = None
472
+ qemb = None
473
+
474
+ _, query_families = expand_regulatory_queries(
475
+ base_queries=[query],
476
+ endpoint_modules=endpoint_filter or [],
477
+ frameworks=["FDA CTP", "EPA"],
478
+ extra_terms=[],
479
+ )
480
+
481
+ ranked_pages, rank_diag = hybrid_rank_text_items(
482
+ items=filtered_pages,
483
+ query=query,
484
+ families=query_families,
485
+ top_k=max(1, int(top_k)),
486
+ item_embeddings=emb_mat,
487
+ query_embedding=qemb,
488
+ )
489
 
490
  rows = []
491
  evidence = []
492
+ for r in ranked_pages:
493
  pid = r["paper_id"]
494
  org = (papers.get(pid, {}) or {}).get("organ", "unknown")
495
+ span = extract_evidence_span(r.get("text", ""), query, page=r.get("page"), n_sentences=5)
496
+ ctx = span.get("text", "")
497
  ctx_wrapped = hard_wrap(ctx, width=110)
498
 
499
  preview = ctx.strip()
 
502
  rows.append({
503
  "file": r.get("file",""),
504
  "page": r.get("page",""),
505
+ "score": round(float(r.get("_nlp_rrf_score", 0.0)), 4),
506
  "organ": org,
507
  "endpoints": "; ".join(r.get("endpoints") or []),
508
  "enzymes": "; ".join((r.get("enzymes") or [])[:12]),
 
537
  except Exception as e:
538
  mini_summary = f"(mini-summary unavailable: {e})"
539
 
540
+ if rank_diag:
541
+ mini_summary = (
542
+ f"{mini_summary}\n\n"
543
+ f"_NLP diagnostics: method={rank_diag.get('ranking_method','')}, "
544
+ f"coverage={rank_diag.get('coverage_score', 0.0)}._"
545
+ )
546
  mini_md = "### Grounded mini-summary\n" + mini_summary
547
  return results_df, mini_md, evidence_md
548
 
 
564
  meta = f"**{citation}**"
565
  return meta, citation, "(page text not found)", ""
566
 
567
+ span = extract_evidence_span(rec.get("text",""), query, page=page, n_sentences=5)
568
+ ctx = hard_wrap(span.get("text", ""), width=110)
569
  full_txt = hard_wrap(rec.get("text",""), width=110)
570
 
571
  meta = f"**{citation}** | organ: **{r.get('organ','')}** | score: **{r.get('score','')}**"
 
659
  fn=citation_ready,
660
  inputs=[citation_box],
661
  outputs=[copy_status]
662
+ )
requirements.txt CHANGED
@@ -1,6 +1,7 @@
1
- gradio>=5.0.0
 
2
  pandas>=2.0.0
3
- numpy>=2.0.0
4
- pypdf>=5.0.0
5
  scikit-learn>=1.4.0
6
- openai>=1.0.0
 
 
1
+ gradio>=4.0.0
2
+ numpy>=1.26.0
3
  pandas>=2.0.0
4
+ pypdf>=4.0.0
 
5
  scikit-learn>=1.4.0
6
+ openai>=1.40.0
7
+ pytest>=8.0.0
runtime.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ python-3.11