Spaces:
Running
Running
Upload 5 files
Browse files- app.py +551 -153
- cancer_risk_input_template.csv +1 -0
- literature_explorer.py +40 -27
- requirements.txt +5 -4
- runtime.txt +1 -0
app.py
CHANGED
|
@@ -10,13 +10,137 @@ import numpy as np
|
|
| 10 |
import pandas as pd
|
| 11 |
|
| 12 |
from pypdf import PdfReader
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
from openai import OpenAI
|
| 16 |
from literature_explorer import build_literature_explorer_tab
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
# =============================
|
| 21 |
# Defaults
|
| 22 |
# =============================
|
|
@@ -236,6 +360,22 @@ def select_relevant_chunks(
|
|
| 236 |
if not texts:
|
| 237 |
return []
|
| 238 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), max_features=20000)
|
| 240 |
X = vectorizer.fit_transform(texts)
|
| 241 |
|
|
@@ -897,7 +1037,7 @@ def run_extraction(
|
|
| 897 |
if not files:
|
| 898 |
return (
|
| 899 |
"<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#666;'>Upload PDFs to run extraction.</div></div>",
|
| 900 |
-
pd.DataFrame(), None, None, "Upload one or more PDFs.",
|
| 901 |
gr.update(choices=[], value=None),
|
| 902 |
[], [], pd.DataFrame(columns=["Field","Value"]), ""
|
| 903 |
)
|
|
@@ -907,7 +1047,7 @@ def run_extraction(
|
|
| 907 |
except Exception as e:
|
| 908 |
return (
|
| 909 |
"<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#b00;'>Invalid vocab JSON.</div></div>",
|
| 910 |
-
pd.DataFrame(), None, None, f"Controlled vocab JSON invalid: {e}",
|
| 911 |
gr.update(choices=[], value=None),
|
| 912 |
[], [], pd.DataFrame(columns=["Field","Value"]), ""
|
| 913 |
)
|
|
@@ -916,7 +1056,7 @@ def run_extraction(
|
|
| 916 |
if not field_props:
|
| 917 |
return (
|
| 918 |
"<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#b00;'>No columns defined.</div></div>",
|
| 919 |
-
pd.DataFrame(), None, None, "No extraction fields are defined. (Check selected endpoints or admin field spec.)",
|
| 920 |
gr.update(choices=[], value=None),
|
| 921 |
[], [], pd.DataFrame(columns=["Field","Value"]), ""
|
| 922 |
)
|
|
@@ -936,13 +1076,14 @@ def run_extraction(
|
|
| 936 |
except Exception as e:
|
| 937 |
return (
|
| 938 |
"<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#b00;'>Missing API key.</div></div>",
|
| 939 |
-
pd.DataFrame(), None, None, str(e),
|
| 940 |
gr.update(choices=[], value=None),
|
| 941 |
[], [], pd.DataFrame(columns=["Field","Value"]), ""
|
| 942 |
)
|
| 943 |
|
| 944 |
paper_details: List[Dict[str, Any]] = []
|
| 945 |
output_rows: List[Dict[str, Any]] = []
|
|
|
|
| 946 |
|
| 947 |
tmpdir = Path(tempfile.mkdtemp(prefix="tox_extract_"))
|
| 948 |
|
|
@@ -963,20 +1104,62 @@ def run_extraction(
|
|
| 963 |
"extracted": {k: ([] if field_props[k].get("type") == "array" else "") for k in field_props.keys()},
|
| 964 |
"evidence": []
|
| 965 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 966 |
else:
|
| 967 |
chunks = chunk_pages(pages, target_chars=int(chunk_chars))
|
| 968 |
-
|
| 969 |
-
queries = [
|
| 970 |
"regulatory acceptability risk hazard concern conclusion uncertainty evidence NOAEL LOAEL BMD",
|
| 971 |
"chemical name CAS number",
|
| 972 |
]
|
| 973 |
-
for
|
| 974 |
-
|
| 975 |
-
|
| 976 |
-
|
|
|
|
|
|
|
|
|
|
| 977 |
|
| 978 |
-
|
| 979 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 980 |
|
| 981 |
ex = openai_structured_extract(
|
| 982 |
client=client,
|
|
@@ -1060,7 +1243,16 @@ def run_extraction(
|
|
| 1060 |
csv_path = tmpdir / "extraction_table.csv"
|
| 1061 |
json_path = tmpdir / "extraction_details.json"
|
| 1062 |
df.to_csv(csv_path, index=False)
|
| 1063 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1064 |
|
| 1065 |
choices = [r.get("record_id") for r in records if r.get("record_id")]
|
| 1066 |
default = choices[0] if choices else None
|
|
@@ -1083,6 +1275,7 @@ def run_extraction(
|
|
| 1083 |
overview,
|
| 1084 |
str(csv_path),
|
| 1085 |
str(json_path),
|
|
|
|
| 1086 |
status,
|
| 1087 |
gr.update(choices=choices, value=default),
|
| 1088 |
records,
|
|
@@ -1145,6 +1338,135 @@ def export_reviewed_csv(records: List[Dict[str, Any]]):
|
|
| 1145 |
return str(path), "Reviewed CSV ready to download."
|
| 1146 |
|
| 1147 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1148 |
# =============================
|
| 1149 |
# Synthesis tab handler
|
| 1150 |
# =============================
|
|
@@ -1155,7 +1477,10 @@ def run_synthesis(api_key, model, extraction_json_file):
|
|
| 1155 |
client = get_openai_client(api_key)
|
| 1156 |
except Exception as e:
|
| 1157 |
return str(e)
|
| 1158 |
-
|
|
|
|
|
|
|
|
|
|
| 1159 |
return openai_synthesize_across_papers(client, model, rows)
|
| 1160 |
|
| 1161 |
|
|
@@ -1173,11 +1498,24 @@ def set_admin_visibility(is_admin: bool):
|
|
| 1173 |
# =============================
|
| 1174 |
# Gradio UI
|
| 1175 |
# =============================
|
| 1176 |
-
with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
| 1177 |
-
gr.
|
| 1178 |
-
"
|
| 1179 |
-
|
| 1180 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1181 |
)
|
| 1182 |
|
| 1183 |
state_records = gr.State([])
|
|
@@ -1189,130 +1527,136 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
|
| 1189 |
vocab_json = gr.Textbox(visible=False, interactive=False, lines=8)
|
| 1190 |
|
| 1191 |
with gr.Tab("Extract"):
|
| 1192 |
-
|
| 1193 |
-
|
| 1194 |
-
|
| 1195 |
-
|
| 1196 |
-
|
| 1197 |
-
|
| 1198 |
-
|
| 1199 |
-
|
| 1200 |
-
|
| 1201 |
-
|
| 1202 |
-
|
| 1203 |
-
|
| 1204 |
-
|
| 1205 |
-
|
| 1206 |
-
|
| 1207 |
-
|
| 1208 |
-
|
| 1209 |
-
|
| 1210 |
-
|
| 1211 |
-
|
| 1212 |
-
|
| 1213 |
-
|
| 1214 |
-
|
| 1215 |
-
|
| 1216 |
-
|
| 1217 |
-
|
| 1218 |
-
|
| 1219 |
-
|
| 1220 |
-
|
| 1221 |
-
|
| 1222 |
-
|
| 1223 |
-
|
| 1224 |
-
|
| 1225 |
-
|
| 1226 |
-
|
| 1227 |
-
|
| 1228 |
-
|
| 1229 |
-
|
| 1230 |
-
|
| 1231 |
-
|
| 1232 |
-
|
| 1233 |
-
|
| 1234 |
-
|
| 1235 |
-
|
| 1236 |
-
|
| 1237 |
-
|
| 1238 |
-
|
| 1239 |
-
|
| 1240 |
-
|
| 1241 |
-
|
| 1242 |
-
|
| 1243 |
-
|
| 1244 |
-
|
| 1245 |
-
|
| 1246 |
-
|
| 1247 |
-
|
| 1248 |
-
|
| 1249 |
-
|
| 1250 |
-
|
| 1251 |
-
|
| 1252 |
-
|
| 1253 |
-
|
| 1254 |
-
|
| 1255 |
-
|
| 1256 |
-
|
| 1257 |
-
|
| 1258 |
-
|
| 1259 |
-
|
| 1260 |
-
|
| 1261 |
-
|
| 1262 |
-
|
| 1263 |
-
|
| 1264 |
-
|
| 1265 |
-
|
| 1266 |
-
|
| 1267 |
-
|
| 1268 |
-
|
| 1269 |
-
|
| 1270 |
-
|
| 1271 |
-
|
| 1272 |
-
|
| 1273 |
-
|
| 1274 |
-
|
| 1275 |
-
|
| 1276 |
-
|
| 1277 |
-
|
| 1278 |
-
|
| 1279 |
-
|
| 1280 |
-
|
| 1281 |
-
|
| 1282 |
-
|
| 1283 |
-
|
| 1284 |
-
|
| 1285 |
-
|
| 1286 |
-
|
| 1287 |
-
|
| 1288 |
-
|
| 1289 |
-
|
| 1290 |
-
|
| 1291 |
-
|
| 1292 |
-
|
| 1293 |
-
|
| 1294 |
-
|
| 1295 |
-
|
| 1296 |
-
|
| 1297 |
-
|
| 1298 |
-
|
| 1299 |
-
|
| 1300 |
-
|
| 1301 |
-
|
| 1302 |
-
|
| 1303 |
-
|
| 1304 |
-
|
| 1305 |
-
|
| 1306 |
-
|
| 1307 |
-
|
| 1308 |
-
|
| 1309 |
-
|
| 1310 |
-
|
| 1311 |
-
|
| 1312 |
-
|
| 1313 |
-
|
| 1314 |
-
|
| 1315 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1316 |
|
| 1317 |
# --- Wiring ---
|
| 1318 |
admin_mode.change(
|
|
@@ -1336,7 +1680,7 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
|
| 1336 |
extract_btn.click(
|
| 1337 |
fn=run_extraction,
|
| 1338 |
inputs=[files, api_key, model, endpoints, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars, admin_mode],
|
| 1339 |
-
outputs=[summary_card, overview_df, out_csv, out_json, status, record_pick, state_records, state_details, vertical_view, evidence_md]
|
| 1340 |
)
|
| 1341 |
|
| 1342 |
record_pick.change(
|
|
@@ -1460,14 +1804,68 @@ with gr.Blocks(title="Toxicology PDF → Grounded Extractor") as demo:
|
|
| 1460 |
build_literature_explorer_tab()
|
| 1461 |
|
| 1462 |
with gr.Tab("Cross-paper Synthesis"):
|
| 1463 |
-
gr.
|
| 1464 |
-
|
| 1465 |
-
|
| 1466 |
-
|
| 1467 |
-
|
| 1468 |
-
|
| 1469 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1470 |
|
| 1471 |
if __name__ == "__main__":
|
| 1472 |
port = int(os.environ.get("PORT", "7860"))
|
| 1473 |
-
demo.queue().launch(server_name="0.0.0.0", server_port=port)
|
|
|
|
| 10 |
import pandas as pd
|
| 11 |
|
| 12 |
from pypdf import PdfReader
|
| 13 |
+
try:
|
| 14 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 15 |
+
except Exception: # pragma: no cover - fallback path for minimal runtime
|
| 16 |
+
TfidfVectorizer = None
|
| 17 |
|
| 18 |
from openai import OpenAI
|
| 19 |
from literature_explorer import build_literature_explorer_tab
|
| 20 |
+
from toxra_core.artifacts import make_run_dir, write_dataframe_csv, write_json, write_markdown
|
| 21 |
+
from toxra_core.calculation_client import MCPClientError, run_batch_cancer_risk
|
| 22 |
+
from toxra_core.contracts import CANCER_RISK_TEMPLATE_COLUMNS
|
| 23 |
+
from toxra_core.nlp_pipeline import extract_evidence_span, expand_regulatory_queries, hybrid_rank_text_items
|
| 24 |
+
from toxra_core.regulatory_mapper import map_extraction_to_framework
|
| 25 |
|
| 26 |
|
| 27 |
|
| 28 |
+
# =============================
|
| 29 |
+
# UI theme
|
| 30 |
+
# =============================
|
| 31 |
+
APP_CSS = """
|
| 32 |
+
@import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Sans:wght@400;500;600;700&display=swap');
|
| 33 |
+
|
| 34 |
+
:root {
|
| 35 |
+
--bg: #f5f7fb;
|
| 36 |
+
--panel: #ffffff;
|
| 37 |
+
--ink: #0f172a;
|
| 38 |
+
--muted: #516079;
|
| 39 |
+
--line: #e2e8f0;
|
| 40 |
+
--accent: #2563eb;
|
| 41 |
+
--accent-2: #0ea5e9;
|
| 42 |
+
--accent-soft: #e6efff;
|
| 43 |
+
--shadow: 0 10px 28px rgba(15, 23, 42, 0.08);
|
| 44 |
+
--radius: 14px;
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
.gradio-container {
|
| 48 |
+
background: var(--bg);
|
| 49 |
+
color: var(--ink);
|
| 50 |
+
font-family: "IBM Plex Sans", ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, "Helvetica Neue", Arial, "Noto Sans", "Apple Color Emoji", "Segoe UI Emoji";
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
.hero {
|
| 54 |
+
background: linear-gradient(180deg, #edf3ff 0%, #f4f8ff 100%);
|
| 55 |
+
color: var(--ink);
|
| 56 |
+
border-radius: 16px;
|
| 57 |
+
padding: 18px 22px;
|
| 58 |
+
box-shadow: var(--shadow);
|
| 59 |
+
border: 1px solid #dbe5f4;
|
| 60 |
+
display: flex;
|
| 61 |
+
align-items: center;
|
| 62 |
+
justify-content: space-between;
|
| 63 |
+
gap: 16px;
|
| 64 |
+
flex-wrap: wrap;
|
| 65 |
+
}
|
| 66 |
+
.hero-left { min-width: 240px; }
|
| 67 |
+
.hero-right { margin-left: auto; }
|
| 68 |
+
.hero-title { font-size: 22px; font-weight: 700; letter-spacing: 0.08em; }
|
| 69 |
+
.hero-sub { margin-top: 4px; font-size: 13px; color: #3b4b63; }
|
| 70 |
+
.hero-pills { margin-top: 10px; display: flex; gap: 8px; flex-wrap: wrap; }
|
| 71 |
+
.hero-pill {
|
| 72 |
+
background: var(--accent-soft);
|
| 73 |
+
color: #1e3a8a;
|
| 74 |
+
border: 1px solid #d6e3f6;
|
| 75 |
+
border-radius: 999px;
|
| 76 |
+
padding: 4px 10px;
|
| 77 |
+
font-size: 11px;
|
| 78 |
+
font-weight: 600;
|
| 79 |
+
}
|
| 80 |
+
.hero-status {
|
| 81 |
+
background: #ffffff;
|
| 82 |
+
color: #334155;
|
| 83 |
+
border: 1px solid #d9e2ef;
|
| 84 |
+
border-radius: 999px;
|
| 85 |
+
padding: 6px 12px;
|
| 86 |
+
font-size: 12px;
|
| 87 |
+
font-weight: 600;
|
| 88 |
+
box-shadow: 0 6px 16px rgba(15, 23, 42, 0.06);
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
.split-row { gap: 18px; }
|
| 92 |
+
.card {
|
| 93 |
+
background: var(--panel);
|
| 94 |
+
border: 1px solid var(--line);
|
| 95 |
+
border-radius: var(--radius);
|
| 96 |
+
padding: 16px;
|
| 97 |
+
box-shadow: var(--shadow);
|
| 98 |
+
}
|
| 99 |
+
.left-rail .card + .card { margin-top: 16px; }
|
| 100 |
+
.right-panel .card { margin-bottom: 14px; }
|
| 101 |
+
.section-title {
|
| 102 |
+
font-size: 12px;
|
| 103 |
+
text-transform: uppercase;
|
| 104 |
+
letter-spacing: 0.14em;
|
| 105 |
+
color: var(--muted);
|
| 106 |
+
margin-bottom: 8px;
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
.gradio-container input,
|
| 110 |
+
.gradio-container textarea,
|
| 111 |
+
.gradio-container select {
|
| 112 |
+
border-radius: 10px !important;
|
| 113 |
+
border-color: var(--line) !important;
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
.gradio-container button.primary {
|
| 117 |
+
background: var(--accent) !important;
|
| 118 |
+
border-color: var(--accent) !important;
|
| 119 |
+
}
|
| 120 |
+
.gradio-container button.primary:hover { background: #1d4ed8 !important; }
|
| 121 |
+
|
| 122 |
+
.gradio-container .tab-nav { gap: 8px; }
|
| 123 |
+
.gradio-container .tab-nav button {
|
| 124 |
+
background: var(--panel);
|
| 125 |
+
border: 1px solid var(--line);
|
| 126 |
+
border-radius: 999px;
|
| 127 |
+
padding: 6px 14px;
|
| 128 |
+
font-size: 12px;
|
| 129 |
+
color: var(--muted);
|
| 130 |
+
}
|
| 131 |
+
.gradio-container .tab-nav button.selected {
|
| 132 |
+
background: var(--accent);
|
| 133 |
+
border-color: var(--accent);
|
| 134 |
+
color: #ffffff;
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
+
.gradio-container .accordion {
|
| 138 |
+
border: 1px solid var(--line);
|
| 139 |
+
border-radius: var(--radius);
|
| 140 |
+
}
|
| 141 |
+
"""
|
| 142 |
+
|
| 143 |
+
|
| 144 |
# =============================
|
| 145 |
# Defaults
|
| 146 |
# =============================
|
|
|
|
| 360 |
if not texts:
|
| 361 |
return []
|
| 362 |
|
| 363 |
+
if TfidfVectorizer is None:
|
| 364 |
+
selected_idx: List[int] = []
|
| 365 |
+
for q in queries:
|
| 366 |
+
q_tokens = set([w for w in re.findall(r"[a-zA-Z0-9\\-]+", (q or "").lower()) if len(w) >= 3])
|
| 367 |
+
scored = []
|
| 368 |
+
for i, t in enumerate(texts):
|
| 369 |
+
tl = t.lower()
|
| 370 |
+
scored.append((sum(1 for tok in q_tokens if tok in tl), i))
|
| 371 |
+
scored.sort(key=lambda x: x[0], reverse=True)
|
| 372 |
+
for _, i in scored[:top_per_query]:
|
| 373 |
+
if i not in selected_idx:
|
| 374 |
+
selected_idx.append(i)
|
| 375 |
+
if not selected_idx:
|
| 376 |
+
selected_idx = list(range(min(len(chunks), max_chunks)))
|
| 377 |
+
return [chunks[i] for i in selected_idx[:max_chunks]]
|
| 378 |
+
|
| 379 |
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), max_features=20000)
|
| 380 |
X = vectorizer.fit_transform(texts)
|
| 381 |
|
|
|
|
| 1037 |
if not files:
|
| 1038 |
return (
|
| 1039 |
"<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#666;'>Upload PDFs to run extraction.</div></div>",
|
| 1040 |
+
pd.DataFrame(), None, None, None, "Upload one or more PDFs.",
|
| 1041 |
gr.update(choices=[], value=None),
|
| 1042 |
[], [], pd.DataFrame(columns=["Field","Value"]), ""
|
| 1043 |
)
|
|
|
|
| 1047 |
except Exception as e:
|
| 1048 |
return (
|
| 1049 |
"<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#b00;'>Invalid vocab JSON.</div></div>",
|
| 1050 |
+
pd.DataFrame(), None, None, None, f"Controlled vocab JSON invalid: {e}",
|
| 1051 |
gr.update(choices=[], value=None),
|
| 1052 |
[], [], pd.DataFrame(columns=["Field","Value"]), ""
|
| 1053 |
)
|
|
|
|
| 1056 |
if not field_props:
|
| 1057 |
return (
|
| 1058 |
"<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#b00;'>No columns defined.</div></div>",
|
| 1059 |
+
pd.DataFrame(), None, None, None, "No extraction fields are defined. (Check selected endpoints or admin field spec.)",
|
| 1060 |
gr.update(choices=[], value=None),
|
| 1061 |
[], [], pd.DataFrame(columns=["Field","Value"]), ""
|
| 1062 |
)
|
|
|
|
| 1076 |
except Exception as e:
|
| 1077 |
return (
|
| 1078 |
"<div style='border:1px solid #eee;padding:14px;border-radius:10px;'><b>Executive Summary</b><div style='margin-top:8px;color:#b00;'>Missing API key.</div></div>",
|
| 1079 |
+
pd.DataFrame(), None, None, None, str(e),
|
| 1080 |
gr.update(choices=[], value=None),
|
| 1081 |
[], [], pd.DataFrame(columns=["Field","Value"]), ""
|
| 1082 |
)
|
| 1083 |
|
| 1084 |
paper_details: List[Dict[str, Any]] = []
|
| 1085 |
output_rows: List[Dict[str, Any]] = []
|
| 1086 |
+
nlp_diagnostics: List[Dict[str, Any]] = []
|
| 1087 |
|
| 1088 |
tmpdir = Path(tempfile.mkdtemp(prefix="tox_extract_"))
|
| 1089 |
|
|
|
|
| 1104 |
"extracted": {k: ([] if field_props[k].get("type") == "array" else "") for k in field_props.keys()},
|
| 1105 |
"evidence": []
|
| 1106 |
}
|
| 1107 |
+
nlp_diagnostics.append(
|
| 1108 |
+
{
|
| 1109 |
+
"file": filename,
|
| 1110 |
+
"ranking_method": "unavailable_no_text",
|
| 1111 |
+
"selected_indices": [],
|
| 1112 |
+
"coverage_by_query_family": {},
|
| 1113 |
+
"coverage_score": 0.0,
|
| 1114 |
+
}
|
| 1115 |
+
)
|
| 1116 |
else:
|
| 1117 |
chunks = chunk_pages(pages, target_chars=int(chunk_chars))
|
| 1118 |
+
base_queries = [
|
|
|
|
| 1119 |
"regulatory acceptability risk hazard concern conclusion uncertainty evidence NOAEL LOAEL BMD",
|
| 1120 |
"chemical name CAS number",
|
| 1121 |
]
|
| 1122 |
+
extra_terms = [ins if ins else k for k, ins in field_instr.items()]
|
| 1123 |
+
queries, families = expand_regulatory_queries(
|
| 1124 |
+
base_queries=base_queries,
|
| 1125 |
+
endpoint_modules=selected_endpoints or [],
|
| 1126 |
+
frameworks=["FDA CTP", "EPA"],
|
| 1127 |
+
extra_terms=extra_terms,
|
| 1128 |
+
)
|
| 1129 |
|
| 1130 |
+
emb_mat = None
|
| 1131 |
+
qemb = None
|
| 1132 |
+
try:
|
| 1133 |
+
texts = [c.get("text", "") for c in chunks]
|
| 1134 |
+
if texts:
|
| 1135 |
+
emb_mat = embed_texts(client, DEFAULT_EMBEDDING_MODEL, texts)
|
| 1136 |
+
qemb = embed_texts(client, DEFAULT_EMBEDDING_MODEL, [" ".join(queries[:20])])[0]
|
| 1137 |
+
except Exception:
|
| 1138 |
+
emb_mat = None
|
| 1139 |
+
qemb = None
|
| 1140 |
+
|
| 1141 |
+
selected, diag = hybrid_rank_text_items(
|
| 1142 |
+
items=chunks,
|
| 1143 |
+
query=" ".join(queries[:20]),
|
| 1144 |
+
families=families,
|
| 1145 |
+
top_k=12,
|
| 1146 |
+
item_embeddings=emb_mat,
|
| 1147 |
+
query_embedding=qemb,
|
| 1148 |
+
)
|
| 1149 |
+
nlp_diagnostics.append(dict({"file": filename}, **diag))
|
| 1150 |
+
span_blocks: List[str] = []
|
| 1151 |
+
chars = 0
|
| 1152 |
+
for c in selected:
|
| 1153 |
+
span = extract_evidence_span(c.get("text", ""), " ".join(queries[:20]), page=None, n_sentences=5)
|
| 1154 |
+
snippet = span.get("text", "") or c.get("text", "")
|
| 1155 |
+
block = f"[pages {c.get('pages','')}]\n{snippet}\n"
|
| 1156 |
+
if chars + len(block) > int(max_context_chars):
|
| 1157 |
+
break
|
| 1158 |
+
span_blocks.append(block)
|
| 1159 |
+
chars += len(block)
|
| 1160 |
+
context = "\n".join(span_blocks).strip()
|
| 1161 |
+
if not context:
|
| 1162 |
+
context = build_context(selected, max_chars=int(max_context_chars))
|
| 1163 |
|
| 1164 |
ex = openai_structured_extract(
|
| 1165 |
client=client,
|
|
|
|
| 1243 |
csv_path = tmpdir / "extraction_table.csv"
|
| 1244 |
json_path = tmpdir / "extraction_details.json"
|
| 1245 |
df.to_csv(csv_path, index=False)
|
| 1246 |
+
details_payload = {
|
| 1247 |
+
"papers": paper_details,
|
| 1248 |
+
"toxra_extensions": {
|
| 1249 |
+
"nlp_diagnostics": nlp_diagnostics,
|
| 1250 |
+
"regulatory_gap_assessment": {},
|
| 1251 |
+
"risk_calculation_refs": [],
|
| 1252 |
+
},
|
| 1253 |
+
}
|
| 1254 |
+
json_path.write_text(json.dumps(details_payload, indent=2), encoding="utf-8")
|
| 1255 |
+
prefilled_template_path = export_prefilled_cancer_risk_template(records)
|
| 1256 |
|
| 1257 |
choices = [r.get("record_id") for r in records if r.get("record_id")]
|
| 1258 |
default = choices[0] if choices else None
|
|
|
|
| 1275 |
overview,
|
| 1276 |
str(csv_path),
|
| 1277 |
str(json_path),
|
| 1278 |
+
str(prefilled_template_path),
|
| 1279 |
status,
|
| 1280 |
gr.update(choices=choices, value=default),
|
| 1281 |
records,
|
|
|
|
| 1338 |
return str(path), "Reviewed CSV ready to download."
|
| 1339 |
|
| 1340 |
|
| 1341 |
+
# =============================
|
| 1342 |
+
# New modules: template, mapping, MCP batch
|
| 1343 |
+
# =============================
|
| 1344 |
+
def _load_extraction_payload(file_obj: Any) -> Tuple[Any, List[Dict[str, Any]], Dict[str, Any]]:
|
| 1345 |
+
if file_obj is None:
|
| 1346 |
+
raise ValueError("Upload extraction_details.json first.")
|
| 1347 |
+
payload = json.loads(Path(file_obj.name).read_text(encoding="utf-8"))
|
| 1348 |
+
if isinstance(payload, list):
|
| 1349 |
+
return payload, payload, {}
|
| 1350 |
+
if isinstance(payload, dict):
|
| 1351 |
+
papers = payload.get("papers", [])
|
| 1352 |
+
if not isinstance(papers, list):
|
| 1353 |
+
raise ValueError("Invalid extraction_details.json format: papers must be a list.")
|
| 1354 |
+
ext = payload.get("toxra_extensions", {})
|
| 1355 |
+
return payload, papers, (ext if isinstance(ext, dict) else {})
|
| 1356 |
+
raise ValueError("Unsupported extraction_details.json format.")
|
| 1357 |
+
|
| 1358 |
+
|
| 1359 |
+
def export_blank_cancer_risk_template():
|
| 1360 |
+
tmpdir = Path(tempfile.mkdtemp(prefix="tox_template_"))
|
| 1361 |
+
path = tmpdir / "cancer_risk_input_template.csv"
|
| 1362 |
+
pd.DataFrame(columns=CANCER_RISK_TEMPLATE_COLUMNS).to_csv(path, index=False)
|
| 1363 |
+
return str(path), "Blank cancer risk template ready."
|
| 1364 |
+
|
| 1365 |
+
|
| 1366 |
+
def export_prefilled_cancer_risk_template(records: List[Dict[str, Any]]):
|
| 1367 |
+
tmpdir = Path(tempfile.mkdtemp(prefix="tox_template_prefilled_"))
|
| 1368 |
+
path = tmpdir / "cancer_risk_input_template_prefilled.csv"
|
| 1369 |
+
if not records:
|
| 1370 |
+
pd.DataFrame(columns=CANCER_RISK_TEMPLATE_COLUMNS).to_csv(path, index=False)
|
| 1371 |
+
return str(path)
|
| 1372 |
+
|
| 1373 |
+
rows: List[Dict[str, Any]] = []
|
| 1374 |
+
seen = set()
|
| 1375 |
+
for r in records:
|
| 1376 |
+
rid = str(r.get("record_id", "")).strip()
|
| 1377 |
+
if not rid or rid in seen:
|
| 1378 |
+
continue
|
| 1379 |
+
seen.add(rid)
|
| 1380 |
+
route = str(r.get("exposure_route", "")).strip().lower()
|
| 1381 |
+
if route not in {"oral", "inhalation"}:
|
| 1382 |
+
route = ""
|
| 1383 |
+
casn = str(r.get("cas_numbers", "")).split(";")[0].strip()
|
| 1384 |
+
rows.append(
|
| 1385 |
+
{
|
| 1386 |
+
"record_id": rid,
|
| 1387 |
+
"chemical_name": str(r.get("chemical", "")).strip(),
|
| 1388 |
+
"casrn": casn,
|
| 1389 |
+
"route": route,
|
| 1390 |
+
"exposure_value": "",
|
| 1391 |
+
"exposure_unit": "",
|
| 1392 |
+
"body_weight_kg": "",
|
| 1393 |
+
"csf_value": "",
|
| 1394 |
+
"csf_unit": "",
|
| 1395 |
+
"iur_value": "",
|
| 1396 |
+
"air_conc_value": "",
|
| 1397 |
+
"air_conc_unit": "",
|
| 1398 |
+
"source_reference": str(r.get("file", "")).strip(),
|
| 1399 |
+
}
|
| 1400 |
+
)
|
| 1401 |
+
|
| 1402 |
+
df = pd.DataFrame(rows, columns=CANCER_RISK_TEMPLATE_COLUMNS)
|
| 1403 |
+
df.to_csv(path, index=False)
|
| 1404 |
+
return str(path)
|
| 1405 |
+
|
| 1406 |
+
|
| 1407 |
+
def run_regulatory_gap_assessment(extraction_json_file, framework: str, override_notes: str):
|
| 1408 |
+
if extraction_json_file is None:
|
| 1409 |
+
return pd.DataFrame(), "Upload extraction_details.json first.", None, None, "No input file."
|
| 1410 |
+
try:
|
| 1411 |
+
payload, _, _ = _load_extraction_payload(extraction_json_file)
|
| 1412 |
+
matrix_df, report, report_md = map_extraction_to_framework(
|
| 1413 |
+
extraction_payload=payload,
|
| 1414 |
+
framework=framework,
|
| 1415 |
+
catalog_dir="regulatory_catalog",
|
| 1416 |
+
override_notes=override_notes or "",
|
| 1417 |
+
)
|
| 1418 |
+
except Exception as e:
|
| 1419 |
+
return pd.DataFrame(), f"(assessment unavailable: {e})", None, None, str(e)
|
| 1420 |
+
|
| 1421 |
+
run_dir = make_run_dir(base_dir="runs")
|
| 1422 |
+
matrix_path = write_dataframe_csv(run_dir / "regulatory_gap_matrix.csv", matrix_df)
|
| 1423 |
+
report_path = write_json(run_dir / "regulatory_gap_report.json", report)
|
| 1424 |
+
write_markdown(run_dir / "regulatory_gap_report.md", report_md)
|
| 1425 |
+
|
| 1426 |
+
md = "### Regulatory Gap Summary\n" + report_md
|
| 1427 |
+
status = f"✅ Gap assessment complete. Covered={report.get('summary', {}).get('covered', 0)} | Missing={report.get('summary', {}).get('missing', 0)}"
|
| 1428 |
+
return matrix_df, md, str(matrix_path), str(report_path), status
|
| 1429 |
+
|
| 1430 |
+
|
| 1431 |
+
def run_cancer_risk_batch_ui(input_csv_file):
    """Gradio handler: run the deterministic cancer-risk batch over an uploaded input CSV.

    Returns a 5-tuple matching the tab's outputs:
    (results DataFrame, results CSV path, JSONL log path, markdown report path, status text).
    """
    # Guard clause: nothing uploaded yet.
    if input_csv_file is None:
        return pd.DataFrame(), None, None, None, "Upload a populated cancer risk input CSV."

    try:
        frame = pd.read_csv(input_csv_file.name)
    except Exception as e:
        return pd.DataFrame(), None, None, None, f"Could not read CSV: {e}"

    # Validate the template schema before doing any work.
    missing = [col for col in CANCER_RISK_TEMPLATE_COLUMNS if col not in frame.columns]
    if missing:
        return pd.DataFrame(), None, None, None, f"Missing required columns: {missing}"

    run_dir = make_run_dir(base_dir="runs")
    input_rows = frame.fillna("").to_dict("records")

    try:
        result = run_batch_cancer_risk(input_rows, run_dir=str(run_dir))
    except MCPClientError as e:
        # The calculator lives behind a local MCP server; report connectivity distinctly.
        return pd.DataFrame(), None, None, None, f"MCP server unavailable: {e}"
    except Exception as e:
        return pd.DataFrame(), None, None, None, f"Calculation failed: {e}"

    raw_rows = result.get("rows", [])
    out_df = pd.DataFrame(raw_rows if isinstance(raw_rows, list) else [])
    result_csv_path = write_dataframe_csv(run_dir / "cancer_risk_results.csv", out_df)
    write_json(run_dir / "cancer_risk_results.json", result)

    # Fall back to the conventional paths when the server did not report artifacts.
    artifacts = result.get("artifacts", {}) if isinstance(result, dict) else {}
    log_path = artifacts.get("log_jsonl", str(run_dir / "cancer_risk_log.jsonl"))
    report_path = artifacts.get("report_md", str(run_dir / "cancer_risk_report.md"))

    summ = result.get("summary", {})
    status = (
        f"✅ Batch complete. total={summ.get('total_rows', 0)} "
        f"ok={summ.get('ok_rows', 0)} error={summ.get('error_rows', 0)}"
    )
    return out_df, str(result_csv_path), str(log_path), str(report_path), status
|
| 1468 |
+
|
| 1469 |
+
|
| 1470 |
# =============================
|
| 1471 |
# Synthesis tab handler
|
| 1472 |
# =============================
|
|
|
|
| 1477 |
client = get_openai_client(api_key)
|
| 1478 |
except Exception as e:
|
| 1479 |
return str(e)
|
| 1480 |
+
payload = json.loads(Path(extraction_json_file.name).read_text(encoding="utf-8"))
|
| 1481 |
+
rows = payload.get("papers", payload) if isinstance(payload, dict) else payload
|
| 1482 |
+
if not isinstance(rows, list):
|
| 1483 |
+
return "Invalid extraction JSON format for synthesis."
|
| 1484 |
return openai_synthesize_across_papers(client, model, rows)
|
| 1485 |
|
| 1486 |
|
|
|
|
| 1498 |
# =============================
|
| 1499 |
# Gradio UI
|
| 1500 |
# =============================
|
| 1501 |
+
with gr.Blocks(title="Toxicology PDF → Grounded Extractor", css=APP_CSS) as demo:
|
| 1502 |
+
gr.HTML(
|
| 1503 |
+
"""
|
| 1504 |
+
<div class="hero">
|
| 1505 |
+
<div class="hero-left">
|
| 1506 |
+
<div class="hero-title">TOXRA.AI</div>
|
| 1507 |
+
<div class="hero-sub">Grounded toxicology extraction & literature exploration</div>
|
| 1508 |
+
<div class="hero-pills">
|
| 1509 |
+
<span class="hero-pill">Text-based PDFs only</span>
|
| 1510 |
+
<span class="hero-pill">Results-first reporting</span>
|
| 1511 |
+
<span class="hero-pill">Admin-configurable extraction</span>
|
| 1512 |
+
</div>
|
| 1513 |
+
</div>
|
| 1514 |
+
<div class="hero-right">
|
| 1515 |
+
<span class="hero-status">Production · Beta</span>
|
| 1516 |
+
</div>
|
| 1517 |
+
</div>
|
| 1518 |
+
"""
|
| 1519 |
)
|
| 1520 |
|
| 1521 |
state_records = gr.State([])
|
|
|
|
| 1527 |
vocab_json = gr.Textbox(visible=False, interactive=False, lines=8)
|
| 1528 |
|
| 1529 |
with gr.Tab("Extract"):
|
| 1530 |
+
with gr.Row(elem_classes="split-row"):
|
| 1531 |
+
with gr.Column(scale=4, min_width=320, elem_classes="left-rail"):
|
| 1532 |
+
with gr.Group(elem_classes="card"):
|
| 1533 |
+
gr.Markdown("Extract setup", elem_classes="section-title")
|
| 1534 |
+
files = gr.File(label="Upload toxicology PDFs", file_types=[".pdf"], file_count="multiple")
|
| 1535 |
+
|
| 1536 |
+
with gr.Row():
|
| 1537 |
+
api_key = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
|
| 1538 |
+
model = gr.Dropdown(label="Model", choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"], value="gpt-4o-2024-08-06")
|
| 1539 |
+
|
| 1540 |
+
with gr.Row():
|
| 1541 |
+
endpoint_preset = gr.Dropdown(
|
| 1542 |
+
label="Endpoint preset",
|
| 1543 |
+
choices=list(ENDPOINT_PRESETS.keys()),
|
| 1544 |
+
value="Required – Safety Assessor"
|
| 1545 |
+
)
|
| 1546 |
+
endpoints = gr.Dropdown(
|
| 1547 |
+
label="Endpoints to extract (Core included automatically)",
|
| 1548 |
+
choices=list(ENDPOINT_MODULES.keys()),
|
| 1549 |
+
multiselect=True,
|
| 1550 |
+
value=ENDPOINT_PRESETS["Required – Safety Assessor"]
|
| 1551 |
+
)
|
| 1552 |
+
|
| 1553 |
+
extract_btn = gr.Button("Run Extraction", variant="primary")
|
| 1554 |
+
status = gr.Textbox(label="Status", interactive=False)
|
| 1555 |
+
|
| 1556 |
+
with gr.Accordion("Advanced runtime settings", open=False, elem_classes="card"):
|
| 1557 |
+
with gr.Row():
|
| 1558 |
+
max_pages = gr.Slider(0, 250, value=0, step=1, label="Max pages to read (0 = all)")
|
| 1559 |
+
chunk_chars = gr.Slider(1200, 9000, value=3200, step=100, label="Chunk size (chars)")
|
| 1560 |
+
max_context_chars = gr.Slider(5000, 45000, value=20000, step=1000, label="Max context sent to GPT (chars)")
|
| 1561 |
+
|
| 1562 |
+
with gr.Accordion("Admin tools (taxonomy + custom columns)", open=False, elem_classes="card"):
|
| 1563 |
+
admin_mode = gr.Checkbox(label="Enable Admin mode", value=False)
|
| 1564 |
+
|
| 1565 |
+
admin_group = gr.Group(visible=False)
|
| 1566 |
+
admin_vocab_group = gr.Group(visible=False)
|
| 1567 |
+
admin_fields_group = gr.Group(visible=False)
|
| 1568 |
+
|
| 1569 |
+
with admin_group:
|
| 1570 |
+
gr.Markdown("### Admin: Configure extraction taxonomy + custom columns.")
|
| 1571 |
+
|
| 1572 |
+
with admin_vocab_group:
|
| 1573 |
+
gr.Markdown("### Controlled vocabulary (lists only)")
|
| 1574 |
+
vocab_category = gr.Dropdown(label="Category (lists only)", choices=[], value=None)
|
| 1575 |
+
vocab_search = gr.Textbox(label="Search terms", placeholder="Type to filter (e.g., 471, AMES, comet)", lines=1)
|
| 1576 |
+
|
| 1577 |
+
with gr.Row():
|
| 1578 |
+
vocab_term_add = gr.Textbox(label="Add term", placeholder="type term and click Add")
|
| 1579 |
+
vocab_add_btn = gr.Button("Add")
|
| 1580 |
+
with gr.Row():
|
| 1581 |
+
vocab_term_remove = gr.Textbox(label="Remove term", placeholder="type exact term and click Remove")
|
| 1582 |
+
vocab_remove_btn = gr.Button("Remove")
|
| 1583 |
+
vocab_apply_btn = gr.Button("Apply full list to category")
|
| 1584 |
+
vocab_reset_btn = gr.Button("Reset vocab to defaults")
|
| 1585 |
+
|
| 1586 |
+
vocab_terms_df = gr.Dataframe(headers=["term"], label="Terms (full list; edit directly)", interactive=True, wrap=True)
|
| 1587 |
+
vocab_terms_filtered = gr.Dataframe(headers=["term"], label="Filtered preview (read-only)", interactive=False, wrap=True)
|
| 1588 |
+
vocab_status = gr.Textbox(label="Vocab status", interactive=False)
|
| 1589 |
+
|
| 1590 |
+
with gr.Accordion("Raw vocab JSON (auto-generated)", open=False):
|
| 1591 |
+
vocab_json_admin = gr.Textbox(label="Controlled vocab JSON", lines=12, interactive=False)
|
| 1592 |
+
|
| 1593 |
+
with admin_fields_group:
|
| 1594 |
+
gr.Markdown("### Custom columns (Field Builder)")
|
| 1595 |
+
gr.Markdown("Tip: Use endpoint selection to start, then tweak fields.")
|
| 1596 |
+
|
| 1597 |
+
with gr.Row():
|
| 1598 |
+
admin_apply_endpoints_btn = gr.Button("Load selected endpoints into builder (Replace)", variant="secondary")
|
| 1599 |
+
fields_apply_btn = gr.Button("Apply builder table")
|
| 1600 |
+
|
| 1601 |
+
with gr.Row():
|
| 1602 |
+
field_name_in = gr.Textbox(label="Field name", placeholder="e.g., genotoxicity_result")
|
| 1603 |
+
field_type_in = gr.Dropdown(label="Type", choices=TYPE_CHOICES, value="str")
|
| 1604 |
+
|
| 1605 |
+
enum_values_in = gr.Textbox(label="Enum values (comma-separated; for enum/list[enum])", placeholder="a,b,c", lines=2)
|
| 1606 |
+
instructions_in = gr.Textbox(label="Instructions", placeholder="Tell the extractor exactly what to pull.", lines=2)
|
| 1607 |
+
|
| 1608 |
+
add_update_field_btn = gr.Button("Add/Update field")
|
| 1609 |
+
|
| 1610 |
+
fields_df = gr.Dataframe(
|
| 1611 |
+
label="Fields (edit and click Apply)",
|
| 1612 |
+
headers=["field","type","enum_values","instructions"],
|
| 1613 |
+
interactive=True,
|
| 1614 |
+
wrap=True
|
| 1615 |
+
)
|
| 1616 |
+
|
| 1617 |
+
fields_status = gr.Textbox(label="Field builder status", interactive=False)
|
| 1618 |
+
|
| 1619 |
+
with gr.Column(scale=7, min_width=480, elem_classes="right-panel"):
|
| 1620 |
+
with gr.Tabs(elem_classes="report-tabs"):
|
| 1621 |
+
with gr.Tab("Overview"):
|
| 1622 |
+
with gr.Group(elem_classes="card"):
|
| 1623 |
+
gr.Markdown("Report overview", elem_classes="section-title")
|
| 1624 |
+
summary_card = gr.HTML(render_summary_card("", []))
|
| 1625 |
+
with gr.Group(elem_classes="card"):
|
| 1626 |
+
overview_df = gr.Dataframe(
|
| 1627 |
+
label="Batch Overview",
|
| 1628 |
+
interactive=False,
|
| 1629 |
+
wrap=True,
|
| 1630 |
+
show_row_numbers=True
|
| 1631 |
+
)
|
| 1632 |
+
|
| 1633 |
+
with gr.Tab("Record"):
|
| 1634 |
+
with gr.Group(elem_classes="card"):
|
| 1635 |
+
record_pick = gr.Dropdown(label="Select record", choices=[], value=None)
|
| 1636 |
+
with gr.Row():
|
| 1637 |
+
review_mode = gr.Checkbox(label="Review mode (enable editing)", value=False)
|
| 1638 |
+
save_btn = gr.Button("Save edits")
|
| 1639 |
+
export_btn = gr.Button("Export reviewed CSV")
|
| 1640 |
+
review_status = gr.Textbox(label="Review status", interactive=False)
|
| 1641 |
+
with gr.Group(elem_classes="card"):
|
| 1642 |
+
vertical_view = gr.Dataframe(
|
| 1643 |
+
headers=["Field", "Value"],
|
| 1644 |
+
interactive=False,
|
| 1645 |
+
wrap=True,
|
| 1646 |
+
show_row_numbers=False,
|
| 1647 |
+
label="Extracted fields (vertical)"
|
| 1648 |
+
)
|
| 1649 |
+
|
| 1650 |
+
with gr.Tab("Evidence"):
|
| 1651 |
+
with gr.Group(elem_classes="card"):
|
| 1652 |
+
evidence_md = gr.Markdown()
|
| 1653 |
+
|
| 1654 |
+
with gr.Tab("Exports"):
|
| 1655 |
+
with gr.Group(elem_classes="card"):
|
| 1656 |
+
out_csv = gr.File(label="Download: extraction_table.csv")
|
| 1657 |
+
out_json = gr.File(label="Download: extraction_details.json (evidence + structured data)")
|
| 1658 |
+
risk_template_prefilled = gr.File(label="Download: cancer_risk_input_template_prefilled.csv (record_id linked)")
|
| 1659 |
+
reviewed_csv = gr.File(label="Download: reviewed_extraction_table.csv")
|
| 1660 |
|
| 1661 |
# --- Wiring ---
|
| 1662 |
admin_mode.change(
|
|
|
|
| 1680 |
extract_btn.click(
|
| 1681 |
fn=run_extraction,
|
| 1682 |
inputs=[files, api_key, model, endpoints, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars, admin_mode],
|
| 1683 |
+
outputs=[summary_card, overview_df, out_csv, out_json, risk_template_prefilled, status, record_pick, state_records, state_details, vertical_view, evidence_md]
|
| 1684 |
)
|
| 1685 |
|
| 1686 |
record_pick.change(
|
|
|
|
| 1804 |
build_literature_explorer_tab()
|
| 1805 |
|
| 1806 |
with gr.Tab("Cross-paper Synthesis"):
|
| 1807 |
+
with gr.Group(elem_classes="card"):
|
| 1808 |
+
gr.Markdown("Upload `extraction_details.json` from Extract tab. Synthesis is based strictly on grounded extractions.")
|
| 1809 |
+
api_key2 = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
|
| 1810 |
+
model2 = gr.Dropdown(label="Model", choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"], value="gpt-4o-2024-08-06")
|
| 1811 |
+
extraction_json_file = gr.File(label="Upload extraction_details.json", file_types=[".json"], file_count="single")
|
| 1812 |
+
synth_btn = gr.Button("Synthesize Across Papers")
|
| 1813 |
+
synth_md = gr.Markdown()
|
| 1814 |
+
synth_btn.click(fn=run_synthesis, inputs=[api_key2, model2, extraction_json_file], outputs=[synth_md])
|
| 1815 |
+
|
| 1816 |
+
with gr.Tab("Regulatory Gap Assessment"):
|
| 1817 |
+
with gr.Group(elem_classes="card"):
|
| 1818 |
+
gr.Markdown(
|
| 1819 |
+
"Run clause-level mapping against regulatory catalogs. "
|
| 1820 |
+
"Use `extraction_details.json` from Extract tab."
|
| 1821 |
+
)
|
| 1822 |
+
with gr.Row():
|
| 1823 |
+
reg_extraction_json = gr.File(label="Upload extraction_details.json", file_types=[".json"], file_count="single")
|
| 1824 |
+
reg_framework = gr.Dropdown(label="Framework profile", choices=["FDA CTP", "EPA"], value="FDA CTP")
|
| 1825 |
+
reg_override_notes = gr.Textbox(
|
| 1826 |
+
label="Override notes (optional)",
|
| 1827 |
+
lines=2,
|
| 1828 |
+
placeholder="Context to include in gap prompts."
|
| 1829 |
+
)
|
| 1830 |
+
reg_run_btn = gr.Button("Run Regulatory Gap Assessment", variant="primary")
|
| 1831 |
+
reg_status = gr.Textbox(label="Status", interactive=False)
|
| 1832 |
+
reg_summary_md = gr.Markdown()
|
| 1833 |
+
reg_matrix_df = gr.Dataframe(label="Clause-level gap matrix", interactive=False, wrap=True)
|
| 1834 |
+
reg_matrix_file = gr.File(label="Download: regulatory_gap_matrix.csv")
|
| 1835 |
+
reg_report_file = gr.File(label="Download: regulatory_gap_report.json")
|
| 1836 |
+
|
| 1837 |
+
reg_run_btn.click(
|
| 1838 |
+
fn=run_regulatory_gap_assessment,
|
| 1839 |
+
inputs=[reg_extraction_json, reg_framework, reg_override_notes],
|
| 1840 |
+
outputs=[reg_matrix_df, reg_summary_md, reg_matrix_file, reg_report_file, reg_status]
|
| 1841 |
+
)
|
| 1842 |
+
|
| 1843 |
+
with gr.Tab("Cancer Risk Calculator"):
|
| 1844 |
+
with gr.Group(elem_classes="card"):
|
| 1845 |
+
gr.Markdown(
|
| 1846 |
+
"Deterministic FDA/EPA cancer risk calculations routed through a dedicated local MCP server. "
|
| 1847 |
+
"Use `record_id` values from extraction outputs for traceability."
|
| 1848 |
+
)
|
| 1849 |
+
with gr.Row():
|
| 1850 |
+
template_btn = gr.Button("Download Blank CSV Template")
|
| 1851 |
+
template_file = gr.File(label="Download: cancer_risk_input_template.csv")
|
| 1852 |
+
template_status = gr.Textbox(label="Template status", interactive=False)
|
| 1853 |
+
template_btn.click(fn=export_blank_cancer_risk_template, inputs=None, outputs=[template_file, template_status])
|
| 1854 |
+
|
| 1855 |
+
risk_input_csv = gr.File(label="Upload populated cancer risk input CSV", file_types=[".csv"], file_count="single")
|
| 1856 |
+
risk_run_btn = gr.Button("Run Cancer Risk Batch", variant="primary")
|
| 1857 |
+
risk_status = gr.Textbox(label="Status", interactive=False)
|
| 1858 |
+
risk_results_df = gr.Dataframe(label="Cancer risk results", interactive=False, wrap=True)
|
| 1859 |
+
risk_results_csv = gr.File(label="Download: cancer_risk_results.csv")
|
| 1860 |
+
risk_log_file = gr.File(label="Download: cancer_risk_log.jsonl")
|
| 1861 |
+
risk_report_file = gr.File(label="Download: cancer_risk_report.md")
|
| 1862 |
+
|
| 1863 |
+
risk_run_btn.click(
|
| 1864 |
+
fn=run_cancer_risk_batch_ui,
|
| 1865 |
+
inputs=[risk_input_csv],
|
| 1866 |
+
outputs=[risk_results_df, risk_results_csv, risk_log_file, risk_report_file, risk_status]
|
| 1867 |
+
)
|
| 1868 |
|
| 1869 |
if __name__ == "__main__":
|
| 1870 |
port = int(os.environ.get("PORT", "7860"))
|
| 1871 |
+
demo.queue().launch(server_name="0.0.0.0", server_port=port)
|
cancer_risk_input_template.csv
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
record_id,chemical_name,casrn,route,exposure_value,exposure_unit,body_weight_kg,csf_value,csf_unit,iur_value,air_conc_value,air_conc_unit,source_reference
|
literature_explorer.py
CHANGED
|
@@ -9,6 +9,11 @@ import numpy as np
|
|
| 9 |
import pandas as pd
|
| 10 |
from pypdf import PdfReader
|
| 11 |
from openai import OpenAI
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
|
| 14 |
# =============================
|
|
@@ -454,39 +459,41 @@ def search(
|
|
| 454 |
if not filtered_idx:
|
| 455 |
return pd.DataFrame(), "### Grounded mini-summary\n(No pages match your filters)", "### Evidence used\n"
|
| 456 |
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
if idx.get("has_embeddings") and idx.get("embeddings") is not None:
|
| 461 |
try:
|
| 462 |
client = get_client(api_key)
|
| 463 |
qemb = embed_texts(client, embedding_model or idx.get("embedding_model") or DEFAULT_EMBEDDING_MODEL, [query])[0]
|
| 464 |
-
|
| 465 |
-
scores = mat @ qemb
|
| 466 |
-
order = np.argsort(scores)[::-1][:max(1, int(top_k))]
|
| 467 |
-
for j in order:
|
| 468 |
-
page_i = filtered_idx[int(j)]
|
| 469 |
-
ranked.append((float(scores[int(j)]), pages[page_i]))
|
| 470 |
except Exception:
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 483 |
|
| 484 |
rows = []
|
| 485 |
evidence = []
|
| 486 |
-
for
|
| 487 |
pid = r["paper_id"]
|
| 488 |
org = (papers.get(pid, {}) or {}).get("organ", "unknown")
|
| 489 |
-
|
|
|
|
| 490 |
ctx_wrapped = hard_wrap(ctx, width=110)
|
| 491 |
|
| 492 |
preview = ctx.strip()
|
|
@@ -495,7 +502,7 @@ def search(
|
|
| 495 |
rows.append({
|
| 496 |
"file": r.get("file",""),
|
| 497 |
"page": r.get("page",""),
|
| 498 |
-
"score": round(
|
| 499 |
"organ": org,
|
| 500 |
"endpoints": "; ".join(r.get("endpoints") or []),
|
| 501 |
"enzymes": "; ".join((r.get("enzymes") or [])[:12]),
|
|
@@ -530,6 +537,12 @@ def search(
|
|
| 530 |
except Exception as e:
|
| 531 |
mini_summary = f"(mini-summary unavailable: {e})"
|
| 532 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 533 |
mini_md = "### Grounded mini-summary\n" + mini_summary
|
| 534 |
return results_df, mini_md, evidence_md
|
| 535 |
|
|
@@ -551,8 +564,8 @@ def on_select_result(df: pd.DataFrame, idx: dict, query: str, evt: gr.SelectData
|
|
| 551 |
meta = f"**{citation}**"
|
| 552 |
return meta, citation, "(page text not found)", ""
|
| 553 |
|
| 554 |
-
|
| 555 |
-
ctx = hard_wrap(
|
| 556 |
full_txt = hard_wrap(rec.get("text",""), width=110)
|
| 557 |
|
| 558 |
meta = f"**{citation}** | organ: **{r.get('organ','')}** | score: **{r.get('score','')}**"
|
|
@@ -646,4 +659,4 @@ def build_literature_explorer_tab():
|
|
| 646 |
fn=citation_ready,
|
| 647 |
inputs=[citation_box],
|
| 648 |
outputs=[copy_status]
|
| 649 |
-
)
|
|
|
|
| 9 |
import pandas as pd
|
| 10 |
from pypdf import PdfReader
|
| 11 |
from openai import OpenAI
|
| 12 |
+
from toxra_core.nlp_pipeline import (
|
| 13 |
+
expand_regulatory_queries,
|
| 14 |
+
extract_evidence_span,
|
| 15 |
+
hybrid_rank_text_items,
|
| 16 |
+
)
|
| 17 |
|
| 18 |
|
| 19 |
# =============================
|
|
|
|
| 459 |
if not filtered_idx:
|
| 460 |
return pd.DataFrame(), "### Grounded mini-summary\n(No pages match your filters)", "### Evidence used\n"
|
| 461 |
|
| 462 |
+
filtered_pages = [pages[i] for i in filtered_idx]
|
| 463 |
+
emb_mat = None
|
| 464 |
+
qemb = None
|
| 465 |
if idx.get("has_embeddings") and idx.get("embeddings") is not None:
|
| 466 |
try:
|
| 467 |
client = get_client(api_key)
|
| 468 |
qemb = embed_texts(client, embedding_model or idx.get("embedding_model") or DEFAULT_EMBEDDING_MODEL, [query])[0]
|
| 469 |
+
emb_mat = idx["embeddings"][filtered_idx, :]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 470 |
except Exception:
|
| 471 |
+
emb_mat = None
|
| 472 |
+
qemb = None
|
| 473 |
+
|
| 474 |
+
_, query_families = expand_regulatory_queries(
|
| 475 |
+
base_queries=[query],
|
| 476 |
+
endpoint_modules=endpoint_filter or [],
|
| 477 |
+
frameworks=["FDA CTP", "EPA"],
|
| 478 |
+
extra_terms=[],
|
| 479 |
+
)
|
| 480 |
+
|
| 481 |
+
ranked_pages, rank_diag = hybrid_rank_text_items(
|
| 482 |
+
items=filtered_pages,
|
| 483 |
+
query=query,
|
| 484 |
+
families=query_families,
|
| 485 |
+
top_k=max(1, int(top_k)),
|
| 486 |
+
item_embeddings=emb_mat,
|
| 487 |
+
query_embedding=qemb,
|
| 488 |
+
)
|
| 489 |
|
| 490 |
rows = []
|
| 491 |
evidence = []
|
| 492 |
+
for r in ranked_pages:
|
| 493 |
pid = r["paper_id"]
|
| 494 |
org = (papers.get(pid, {}) or {}).get("organ", "unknown")
|
| 495 |
+
span = extract_evidence_span(r.get("text", ""), query, page=r.get("page"), n_sentences=5)
|
| 496 |
+
ctx = span.get("text", "")
|
| 497 |
ctx_wrapped = hard_wrap(ctx, width=110)
|
| 498 |
|
| 499 |
preview = ctx.strip()
|
|
|
|
| 502 |
rows.append({
|
| 503 |
"file": r.get("file",""),
|
| 504 |
"page": r.get("page",""),
|
| 505 |
+
"score": round(float(r.get("_nlp_rrf_score", 0.0)), 4),
|
| 506 |
"organ": org,
|
| 507 |
"endpoints": "; ".join(r.get("endpoints") or []),
|
| 508 |
"enzymes": "; ".join((r.get("enzymes") or [])[:12]),
|
|
|
|
| 537 |
except Exception as e:
|
| 538 |
mini_summary = f"(mini-summary unavailable: {e})"
|
| 539 |
|
| 540 |
+
if rank_diag:
|
| 541 |
+
mini_summary = (
|
| 542 |
+
f"{mini_summary}\n\n"
|
| 543 |
+
f"_NLP diagnostics: method={rank_diag.get('ranking_method','')}, "
|
| 544 |
+
f"coverage={rank_diag.get('coverage_score', 0.0)}._"
|
| 545 |
+
)
|
| 546 |
mini_md = "### Grounded mini-summary\n" + mini_summary
|
| 547 |
return results_df, mini_md, evidence_md
|
| 548 |
|
|
|
|
| 564 |
meta = f"**{citation}**"
|
| 565 |
return meta, citation, "(page text not found)", ""
|
| 566 |
|
| 567 |
+
span = extract_evidence_span(rec.get("text",""), query, page=page, n_sentences=5)
|
| 568 |
+
ctx = hard_wrap(span.get("text", ""), width=110)
|
| 569 |
full_txt = hard_wrap(rec.get("text",""), width=110)
|
| 570 |
|
| 571 |
meta = f"**{citation}** | organ: **{r.get('organ','')}** | score: **{r.get('score','')}**"
|
|
|
|
| 659 |
fn=citation_ready,
|
| 660 |
inputs=[citation_box],
|
| 661 |
outputs=[copy_status]
|
| 662 |
+
)
|
requirements.txt
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
-
gradio>=
|
|
|
|
| 2 |
pandas>=2.0.0
|
| 3 |
-
|
| 4 |
-
pypdf>=5.0.0
|
| 5 |
scikit-learn>=1.4.0
|
| 6 |
-
openai>=1.
|
|
|
|
|
|
| 1 |
+
gradio>=4.0.0
|
| 2 |
+
numpy>=1.26.0
|
| 3 |
pandas>=2.0.0
|
| 4 |
+
pypdf>=4.0.0
|
|
|
|
| 5 |
scikit-learn>=1.4.0
|
| 6 |
+
openai>=1.40.0
|
| 7 |
+
pytest>=8.0.0
|
runtime.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
python-3.11
|