Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
import os
|
| 2 |
import re
|
| 3 |
import json
|
| 4 |
-
import math
|
| 5 |
import tempfile
|
| 6 |
from pathlib import Path
|
| 7 |
from typing import Dict, List, Tuple, Any
|
|
@@ -13,39 +12,52 @@ import pandas as pd
|
|
| 13 |
from pypdf import PdfReader
|
| 14 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 15 |
|
| 16 |
-
from openai import OpenAI
|
| 17 |
|
| 18 |
|
| 19 |
-
#
|
| 20 |
# Defaults
|
| 21 |
-
#
|
| 22 |
DEFAULT_CONTROLLED_VOCAB_JSON = """{
|
| 23 |
"risk_stance_enum": ["acceptable","acceptable_with_uncertainty","not_acceptable","insufficient_data"],
|
|
|
|
| 24 |
"study_type_enum": ["in_vivo","in_vitro","epidemiology","in_silico","review","methodology","other"],
|
| 25 |
"exposure_route_enum": ["oral","inhalation","dermal","parenteral","multiple","not_reported"],
|
| 26 |
"species_enum": ["human","rat","mouse","rabbit","dog","non_human_primate","cell_line","other","not_reported"],
|
| 27 |
-
|
| 28 |
-
"
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
}"""
|
| 31 |
|
| 32 |
-
DEFAULT_FIELD_SPEC = """# One field per line:
|
| 33 |
# types: str, num, bool, list[str], list[num], enum[a,b,c]
|
| 34 |
Chemical(s) | list[str] | Primary chemical(s) studied; include common name + abbreviation if present.
|
| 35 |
CAS_numbers | list[str] | Extract any CAS numbers mentioned.
|
| 36 |
Study_type | enum[in_vivo,in_vitro,epidemiology,in_silico,review,methodology,other] | Choose the best match.
|
| 37 |
Exposure_route | enum[oral,inhalation,dermal,parenteral,multiple,not_reported] | Choose best match.
|
| 38 |
Species | enum[human,rat,mouse,rabbit,dog,non_human_primate,cell_line,other,not_reported] | Choose best match.
|
| 39 |
-
|
| 40 |
-
|
| 41 |
Dose_metrics | list[str] | Include any reported NOAEL/LOAEL/BMD/BMDL/LD50/LC50 etc with units if available.
|
|
|
|
| 42 |
Conclusion | str | What does the paper conclude about safety/risk?
|
| 43 |
"""
|
| 44 |
|
| 45 |
|
| 46 |
-
#
|
| 47 |
-
# PDF extraction (
|
| 48 |
-
#
|
| 49 |
def extract_pages_from_pdf(pdf_path: str, max_pages: int = 0) -> Tuple[List[Tuple[int, str]], int]:
|
| 50 |
reader = PdfReader(pdf_path)
|
| 51 |
page_count = len(reader.pages)
|
|
@@ -57,8 +69,7 @@ def extract_pages_from_pdf(pdf_path: str, max_pages: int = 0) -> Tuple[List[Tupl
|
|
| 57 |
t = reader.pages[i].extract_text() or ""
|
| 58 |
except Exception:
|
| 59 |
t = ""
|
| 60 |
-
|
| 61 |
-
pages.append((i + 1, t))
|
| 62 |
return pages, page_count
|
| 63 |
|
| 64 |
|
|
@@ -70,9 +81,6 @@ def clean_text(t: str) -> str:
|
|
| 70 |
|
| 71 |
|
| 72 |
def chunk_pages(pages: List[Tuple[int, str]], target_chars: int = 3000) -> List[Dict[str, Any]]:
|
| 73 |
-
"""
|
| 74 |
-
Build chunks with page ranges, roughly target_chars each.
|
| 75 |
-
"""
|
| 76 |
chunks = []
|
| 77 |
buf = []
|
| 78 |
start_page = None
|
|
@@ -85,12 +93,10 @@ def chunk_pages(pages: List[Tuple[int, str]], target_chars: int = 3000) -> List[
|
|
| 85 |
if start_page is None:
|
| 86 |
start_page = pno
|
| 87 |
|
| 88 |
-
# If adding this page exceeds chunk size, flush
|
| 89 |
if cur_len + len(txt) + 1 > target_chars and buf:
|
| 90 |
-
end_page =
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
)
|
| 94 |
buf = [txt]
|
| 95 |
start_page = pno
|
| 96 |
cur_len = len(txt)
|
|
@@ -99,16 +105,21 @@ def chunk_pages(pages: List[Tuple[int, str]], target_chars: int = 3000) -> List[
|
|
| 99 |
cur_len += len(txt) + 1
|
| 100 |
|
| 101 |
if buf and start_page is not None:
|
| 102 |
-
end_page = pages[-1][0]
|
| 103 |
chunks.append({"pages": f"{start_page}-{end_page}", "text": " ".join(buf)})
|
| 104 |
|
| 105 |
return chunks
|
| 106 |
|
| 107 |
|
| 108 |
-
#
|
| 109 |
# Lightweight retrieval (TF-IDF) to select relevant excerpts
|
| 110 |
-
#
|
| 111 |
-
def select_relevant_chunks(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
texts = [c["text"] for c in chunks]
|
| 113 |
if not texts:
|
| 114 |
return []
|
|
@@ -116,24 +127,22 @@ def select_relevant_chunks(chunks: List[Dict[str, Any]], queries: List[str], top
|
|
| 116 |
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), max_features=20000)
|
| 117 |
X = vectorizer.fit_transform(texts)
|
| 118 |
|
| 119 |
-
selected_idx = []
|
| 120 |
for q in queries:
|
| 121 |
q = (q or "").strip()
|
| 122 |
if not q:
|
| 123 |
continue
|
| 124 |
qv = vectorizer.transform([q])
|
| 125 |
-
sims = (X @ qv.T).toarray().ravel()
|
| 126 |
idx = np.argsort(sims)[::-1]
|
| 127 |
for i in idx[:top_per_query]:
|
| 128 |
if i not in selected_idx:
|
| 129 |
selected_idx.append(i)
|
| 130 |
|
| 131 |
-
# fallback: if nothing selected, take first few chunks
|
| 132 |
if not selected_idx:
|
| 133 |
selected_idx = list(range(min(len(chunks), max_chunks)))
|
| 134 |
|
| 135 |
-
|
| 136 |
-
return selected
|
| 137 |
|
| 138 |
|
| 139 |
def build_context(selected_chunks: List[Dict[str, Any]], max_chars: int = 20000) -> str:
|
|
@@ -148,9 +157,9 @@ def build_context(selected_chunks: List[Dict[str, Any]], max_chars: int = 20000)
|
|
| 148 |
return "\n".join(parts).strip()
|
| 149 |
|
| 150 |
|
| 151 |
-
#
|
| 152 |
# User-defined extraction spec -> JSON Schema
|
| 153 |
-
#
|
| 154 |
def slugify_field(name: str) -> str:
|
| 155 |
name = name.strip()
|
| 156 |
name = re.sub(r"[^\w\s-]", "", name)
|
|
@@ -158,14 +167,13 @@ def slugify_field(name: str) -> str:
|
|
| 158 |
return name[:60] if name else "field"
|
| 159 |
|
| 160 |
|
| 161 |
-
def parse_field_spec(spec: str) -> Tuple[Dict[str, Any],
|
| 162 |
"""
|
| 163 |
spec lines: Field Name | type | instructions
|
| 164 |
-
Returns: properties dict,
|
| 165 |
"""
|
| 166 |
-
props = {}
|
| 167 |
-
|
| 168 |
-
instr = {}
|
| 169 |
|
| 170 |
for raw_line in (spec or "").splitlines():
|
| 171 |
line = raw_line.strip()
|
|
@@ -180,15 +188,10 @@ def parse_field_spec(spec: str) -> Tuple[Dict[str, Any], List[str], Dict[str, st
|
|
| 180 |
ftype = parts[1]
|
| 181 |
finstr = parts[2] if len(parts) >= 3 else ""
|
| 182 |
|
| 183 |
-
is_required = False
|
| 184 |
-
if field_name.startswith("*"):
|
| 185 |
-
is_required = True
|
| 186 |
-
field_name = field_name[1:].strip()
|
| 187 |
-
|
| 188 |
key = slugify_field(field_name)
|
| 189 |
instr[key] = finstr
|
| 190 |
|
| 191 |
-
schema = {"type": "string"}
|
| 192 |
|
| 193 |
if ftype == "str":
|
| 194 |
schema = {"type": "string"}
|
|
@@ -208,20 +211,20 @@ def parse_field_spec(spec: str) -> Tuple[Dict[str, Any], List[str], Dict[str, st
|
|
| 208 |
schema = {"type": "string"}
|
| 209 |
|
| 210 |
props[key] = schema
|
| 211 |
-
if is_required:
|
| 212 |
-
required.append(key)
|
| 213 |
|
| 214 |
-
|
| 215 |
-
return props, required, instr
|
| 216 |
|
| 217 |
|
| 218 |
-
def build_extraction_schema(field_props: Dict[str, Any],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
risk_enum = vocab.get(
|
| 220 |
"risk_stance_enum",
|
| 221 |
["acceptable", "acceptable_with_uncertainty", "not_acceptable", "insufficient_data"]
|
| 222 |
)
|
| 223 |
|
| 224 |
-
# IMPORTANT: strict schema requires required == all property keys
|
| 225 |
all_field_keys = list(field_props.keys())
|
| 226 |
|
| 227 |
schema = {
|
|
@@ -236,7 +239,7 @@ def build_extraction_schema(field_props: Dict[str, Any], required_fields: List[s
|
|
| 236 |
"type": "object",
|
| 237 |
"additionalProperties": False,
|
| 238 |
"properties": field_props,
|
| 239 |
-
"required": all_field_keys #
|
| 240 |
},
|
| 241 |
"evidence": {
|
| 242 |
"type": "array",
|
|
@@ -257,13 +260,13 @@ def build_extraction_schema(field_props: Dict[str, Any], required_fields: List[s
|
|
| 257 |
return schema
|
| 258 |
|
| 259 |
|
| 260 |
-
#
|
| 261 |
-
# OpenAI
|
| 262 |
-
#
|
| 263 |
def get_openai_client(api_key: str) -> OpenAI:
|
| 264 |
key = (api_key or "").strip() or os.getenv("OPENAI_API_KEY", "").strip()
|
| 265 |
if not key:
|
| 266 |
-
raise ValueError("Missing OpenAI API key. Provide it in the UI or set OPENAI_API_KEY.")
|
| 267 |
return OpenAI(api_key=key)
|
| 268 |
|
| 269 |
|
|
@@ -275,25 +278,20 @@ def openai_structured_extract(
|
|
| 275 |
field_instructions: Dict[str, str],
|
| 276 |
context: str
|
| 277 |
) -> Dict[str, Any]:
|
| 278 |
-
|
| 279 |
-
# Build instruction text for the model
|
| 280 |
field_instr_lines = []
|
| 281 |
for k, v in field_instructions.items():
|
| 282 |
-
if v
|
| 283 |
-
field_instr_lines.append(f"- {k}: {v}")
|
| 284 |
-
else:
|
| 285 |
-
field_instr_lines.append(f"- {k}: (no extra instructions)")
|
| 286 |
|
| 287 |
vocab_text = json.dumps(controlled_vocab, indent=2)
|
| 288 |
|
| 289 |
system_msg = (
|
| 290 |
"You are a toxicology research paper data-extraction assistant.\n"
|
| 291 |
-
"
|
| 292 |
-
"1) Use ONLY the provided excerpts; do
|
| 293 |
-
"2) If a value is not stated,
|
| 294 |
-
"3)
|
| 295 |
-
"4) risk_stance
|
| 296 |
-
"5) Prefer controlled
|
| 297 |
)
|
| 298 |
|
| 299 |
user_msg = (
|
|
@@ -302,7 +300,7 @@ def openai_structured_extract(
|
|
| 302 |
"FIELD INSTRUCTIONS:\n"
|
| 303 |
+ "\n".join(field_instr_lines)
|
| 304 |
+ "\n\n"
|
| 305 |
-
"EXCERPTS:\n"
|
| 306 |
f"{context}\n"
|
| 307 |
)
|
| 308 |
|
|
@@ -321,103 +319,104 @@ def openai_structured_extract(
|
|
| 321 |
}
|
| 322 |
}
|
| 323 |
)
|
| 324 |
-
|
| 325 |
-
# Structured outputs: JSON is in output_text
|
| 326 |
-
out = resp.output_text
|
| 327 |
-
return json.loads(out)
|
| 328 |
|
| 329 |
|
| 330 |
def openai_synthesize_across_papers(client: OpenAI, model: str, rows: List[Dict[str, Any]]) -> str:
|
| 331 |
system_msg = (
|
| 332 |
"You are a senior toxicology scientist summarizing multiple papers.\n"
|
| 333 |
-
"
|
| 334 |
-
"Base
|
| 335 |
)
|
| 336 |
user_msg = "EXTRACTED_ROWS_JSON:\n" + json.dumps(rows, indent=2)
|
| 337 |
|
| 338 |
-
resp = client.responses.create(
|
| 339 |
-
model=model,
|
| 340 |
-
input=[
|
| 341 |
-
{"role": "system", "content": system_msg},
|
| 342 |
-
{"role": "user", "content": user_msg}
|
| 343 |
-
]
|
| 344 |
-
)
|
| 345 |
-
return resp.output_text
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
def openai_suggest_vocab_additions(client: OpenAI, model: str, current_vocab: Dict[str, Any], context: str) -> Dict[str, Any]:
|
| 349 |
-
schema = {
|
| 350 |
-
"type": "object",
|
| 351 |
-
"additionalProperties": False,
|
| 352 |
-
"properties": {
|
| 353 |
-
"additions": {
|
| 354 |
-
"type": "object",
|
| 355 |
-
"additionalProperties": {
|
| 356 |
-
"type": "array",
|
| 357 |
-
"items": {"type": "string"}
|
| 358 |
-
}
|
| 359 |
-
},
|
| 360 |
-
"notes": {"type": "string"}
|
| 361 |
-
},
|
| 362 |
-
"required": ["additions", "notes"]
|
| 363 |
-
}
|
| 364 |
-
|
| 365 |
-
system_msg = (
|
| 366 |
-
"You propose controlled-vocabulary additions for toxicology paper extraction.\n"
|
| 367 |
-
"Return only new candidate terms grouped under keys that already exist or new keys if needed.\n"
|
| 368 |
-
"Avoid duplicates already in current vocab.\n"
|
| 369 |
-
)
|
| 370 |
-
user_msg = (
|
| 371 |
-
"CURRENT_VOCAB_JSON:\n"
|
| 372 |
-
+ json.dumps(current_vocab, indent=2)
|
| 373 |
-
+ "\n\n"
|
| 374 |
-
"EXCERPTS:\n"
|
| 375 |
-
+ context
|
| 376 |
-
)
|
| 377 |
-
|
| 378 |
resp = client.responses.create(
|
| 379 |
model=model,
|
| 380 |
input=[
|
| 381 |
{"role": "system", "content": system_msg},
|
| 382 |
{"role": "user", "content": user_msg}
|
| 383 |
],
|
| 384 |
-
text={
|
| 385 |
-
"format": {
|
| 386 |
-
"type": "json_schema",
|
| 387 |
-
"name": "vocab_additions",
|
| 388 |
-
"schema": schema,
|
| 389 |
-
"strict": True
|
| 390 |
-
}
|
| 391 |
-
}
|
| 392 |
)
|
| 393 |
-
return
|
| 394 |
|
| 395 |
|
| 396 |
-
#
|
| 397 |
-
#
|
| 398 |
-
#
|
| 399 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 400 |
if not files:
|
| 401 |
-
return None, None, None, "Upload one or more PDFs."
|
| 402 |
|
|
|
|
| 403 |
try:
|
| 404 |
vocab = json.loads(vocab_json or DEFAULT_CONTROLLED_VOCAB_JSON)
|
| 405 |
except Exception as e:
|
| 406 |
-
return None, None, None, f"Controlled vocab JSON is invalid: {e}"
|
| 407 |
|
| 408 |
-
|
|
|
|
| 409 |
if not field_props:
|
| 410 |
-
return None, None, None, "Field spec produced no fields. Add lines like: Field | str | instructions"
|
| 411 |
|
| 412 |
-
schema = build_extraction_schema(field_props,
|
| 413 |
|
|
|
|
| 414 |
try:
|
| 415 |
client = get_openai_client(api_key)
|
| 416 |
except Exception as e:
|
| 417 |
-
return None, None, None, str(e)
|
| 418 |
|
| 419 |
-
results = []
|
| 420 |
-
flat_rows = []
|
| 421 |
|
| 422 |
tmpdir = Path(tempfile.mkdtemp(prefix="tox_extract_"))
|
| 423 |
|
|
@@ -426,46 +425,51 @@ def run_extraction(files, api_key, model, field_spec, vocab_json, max_pages, chu
|
|
| 426 |
filename = os.path.basename(pdf_path)
|
| 427 |
|
| 428 |
pages, page_count = extract_pages_from_pdf(pdf_path, max_pages=int(max_pages))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 429 |
chunks = chunk_pages(pages, target_chars=int(chunk_chars))
|
| 430 |
|
| 431 |
-
#
|
| 432 |
-
queries = [
|
| 433 |
-
"risk stance hazard risk conclusion adverse effect noael loael bmd bmdl ld50 lc50 safety concern",
|
| 434 |
-
]
|
| 435 |
for k, ins in field_instr.items():
|
| 436 |
-
if ins
|
| 437 |
-
queries.append(ins)
|
| 438 |
-
else:
|
| 439 |
-
queries.append(k)
|
| 440 |
|
| 441 |
selected = select_relevant_chunks(chunks, queries, top_per_query=2, max_chunks=12)
|
| 442 |
context = build_context(selected, max_chars=int(max_context_chars))
|
| 443 |
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
"evidence": []
|
| 453 |
-
}
|
| 454 |
-
else:
|
| 455 |
-
extracted = openai_structured_extract(
|
| 456 |
-
client=client,
|
| 457 |
-
model=model,
|
| 458 |
-
schema=schema,
|
| 459 |
-
controlled_vocab=vocab,
|
| 460 |
-
field_instructions=field_instr,
|
| 461 |
-
context=context
|
| 462 |
-
)
|
| 463 |
|
| 464 |
extracted["_file"] = filename
|
| 465 |
extracted["_pages_in_pdf"] = page_count
|
| 466 |
results.append(extracted)
|
| 467 |
|
| 468 |
-
#
|
| 469 |
row = {
|
| 470 |
"file": filename,
|
| 471 |
"paper_title": extracted.get("paper_title", ""),
|
|
@@ -473,12 +477,15 @@ def run_extraction(files, api_key, model, field_spec, vocab_json, max_pages, chu
|
|
| 473 |
"risk_confidence": extracted.get("risk_confidence", ""),
|
| 474 |
"risk_summary": extracted.get("risk_summary", "")
|
| 475 |
}
|
|
|
|
|
|
|
| 476 |
for k in field_props.keys():
|
| 477 |
-
v =
|
| 478 |
if isinstance(v, list):
|
| 479 |
row[k] = "; ".join([str(x) for x in v])
|
| 480 |
else:
|
| 481 |
row[k] = v
|
|
|
|
| 482 |
flat_rows.append(row)
|
| 483 |
|
| 484 |
df = pd.DataFrame(flat_rows)
|
|
@@ -488,120 +495,208 @@ def run_extraction(files, api_key, model, field_spec, vocab_json, max_pages, chu
|
|
| 488 |
df.to_csv(csv_path, index=False)
|
| 489 |
json_path.write_text(json.dumps(results, indent=2), encoding="utf-8")
|
| 490 |
|
| 491 |
-
|
| 492 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 493 |
|
| 494 |
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
|
|
|
|
|
|
| 498 |
|
| 499 |
-
try:
|
| 500 |
-
client = get_openai_client(api_key)
|
| 501 |
-
except Exception as e:
|
| 502 |
-
return str(e)
|
| 503 |
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
return
|
| 507 |
|
| 508 |
|
| 509 |
-
def
|
| 510 |
-
|
| 511 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 512 |
|
|
|
|
| 513 |
try:
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 517 |
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
|
| 521 |
-
return vocab_json, f"Controlled vocab JSON is invalid: {e}"
|
| 522 |
|
| 523 |
-
# Build a small context from the first 1-2 docs
|
| 524 |
-
contexts = []
|
| 525 |
-
for f in files[:2]:
|
| 526 |
-
pages, _ = extract_pages_from_pdf(f.name, max_pages=int(max_pages))
|
| 527 |
-
chunks = chunk_pages(pages, target_chars=int(chunk_chars))
|
| 528 |
-
selected = select_relevant_chunks(
|
| 529 |
-
chunks,
|
| 530 |
-
queries=["toxicology endpoints noael loael bmd genotoxicity carcinogenicity endocrine exposure route species"],
|
| 531 |
-
top_per_query=2,
|
| 532 |
-
max_chunks=8
|
| 533 |
-
)
|
| 534 |
-
ctx = build_context(selected, max_chars=int(max_context_chars))
|
| 535 |
-
if ctx:
|
| 536 |
-
contexts.append(ctx)
|
| 537 |
|
| 538 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 539 |
|
| 540 |
-
additions = openai_suggest_vocab_additions(client, model, vocab, combined)
|
| 541 |
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
if k not in merged:
|
| 549 |
-
merged[k] = []
|
| 550 |
-
if isinstance(merged[k], list):
|
| 551 |
-
for term in arr:
|
| 552 |
-
if term not in merged[k]:
|
| 553 |
-
merged[k].append(term)
|
| 554 |
|
| 555 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 556 |
|
| 557 |
|
| 558 |
-
#
|
| 559 |
# Gradio UI
|
| 560 |
-
#
|
| 561 |
-
with gr.Blocks(title="Toxicology PDF → Table Extractor
|
| 562 |
-
gr.Markdown(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 563 |
|
| 564 |
with gr.Tab("Extract to Table"):
|
| 565 |
files = gr.File(label="Upload toxicology research PDFs", file_types=[".pdf"], file_count="multiple")
|
| 566 |
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
|
|
|
| 573 |
|
| 574 |
with gr.Row():
|
| 575 |
-
max_pages = gr.Slider(0,
|
| 576 |
-
chunk_chars = gr.Slider(1200,
|
| 577 |
-
max_context_chars = gr.Slider(5000,
|
| 578 |
|
| 579 |
-
vocab_json = gr.Textbox(label="Controlled vocabulary (JSON)", value=DEFAULT_CONTROLLED_VOCAB_JSON, lines=
|
| 580 |
-
field_spec = gr.Textbox(label="Extraction spec (you control
|
| 581 |
|
| 582 |
-
|
| 583 |
-
vocab_btn = gr.Button("Suggest vocab additions from PDFs")
|
| 584 |
-
extract_btn = gr.Button("Run Extraction (Table)")
|
| 585 |
status = gr.Textbox(label="Status", interactive=False)
|
| 586 |
|
| 587 |
-
table = gr.Dataframe(
|
| 588 |
-
|
| 589 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 590 |
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 595 |
)
|
|
|
|
| 596 |
|
|
|
|
|
|
|
|
|
|
| 597 |
extract_btn.click(
|
| 598 |
fn=run_extraction,
|
| 599 |
inputs=[files, api_key, model, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars],
|
| 600 |
-
outputs=[table, out_csv, out_json, status]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 601 |
)
|
| 602 |
|
| 603 |
with gr.Tab("Cross-paper Synthesis"):
|
| 604 |
-
gr.Markdown("Upload the `extraction_details.json`
|
| 605 |
api_key2 = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
|
| 606 |
model2 = gr.Dropdown(
|
| 607 |
label="Model",
|
|
@@ -618,6 +713,32 @@ with gr.Blocks(title="Toxicology PDF → Table Extractor (GPT-4o)") as demo:
|
|
| 618 |
outputs=[synth_md]
|
| 619 |
)
|
| 620 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 621 |
if __name__ == "__main__":
|
| 622 |
port = int(os.environ.get("PORT", "7860"))
|
| 623 |
demo.queue().launch(server_name="0.0.0.0", server_port=port)
|
|
|
|
| 1 |
import os
|
| 2 |
import re
|
| 3 |
import json
|
|
|
|
| 4 |
import tempfile
|
| 5 |
from pathlib import Path
|
| 6 |
from typing import Dict, List, Tuple, Any
|
|
|
|
| 12 |
from pypdf import PdfReader
|
| 13 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 14 |
|
| 15 |
+
from openai import OpenAI
|
| 16 |
|
| 17 |
|
| 18 |
+
# =============================
|
| 19 |
# Defaults
|
| 20 |
+
# =============================
|
| 21 |
DEFAULT_CONTROLLED_VOCAB_JSON = """{
|
| 22 |
"risk_stance_enum": ["acceptable","acceptable_with_uncertainty","not_acceptable","insufficient_data"],
|
| 23 |
+
|
| 24 |
"study_type_enum": ["in_vivo","in_vitro","epidemiology","in_silico","review","methodology","other"],
|
| 25 |
"exposure_route_enum": ["oral","inhalation","dermal","parenteral","multiple","not_reported"],
|
| 26 |
"species_enum": ["human","rat","mouse","rabbit","dog","non_human_primate","cell_line","other","not_reported"],
|
| 27 |
+
|
| 28 |
+
"oecd_endpoints": [
|
| 29 |
+
"acute_toxicity","subacute_toxicity","subchronic_toxicity","chronic_toxicity",
|
| 30 |
+
"carcinogenicity","genotoxicity","reproductive_toxicity","developmental_toxicity",
|
| 31 |
+
"neurotoxicity","immunotoxicity","endocrine_activity","sensitization","irritation_corrosion"
|
| 32 |
+
],
|
| 33 |
+
|
| 34 |
+
"meddra_like_terms": [
|
| 35 |
+
"hepatic_disorder","renal_disorder","nervous_system_disorder","respiratory_disorder",
|
| 36 |
+
"skin_and_subcutaneous_tissue_disorder","reproductive_system_disorder",
|
| 37 |
+
"immune_system_disorder","blood_and_lymphatic_system_disorder"
|
| 38 |
+
],
|
| 39 |
+
|
| 40 |
+
"dose_metric_terms": ["noael","loael","bmd","bmdl","ld50","lc50","ec50","ic50"]
|
| 41 |
}"""
|
| 42 |
|
| 43 |
+
DEFAULT_FIELD_SPEC = """# One field per line: Field Name | type | instructions
|
| 44 |
# types: str, num, bool, list[str], list[num], enum[a,b,c]
|
| 45 |
Chemical(s) | list[str] | Primary chemical(s) studied; include common name + abbreviation if present.
|
| 46 |
CAS_numbers | list[str] | Extract any CAS numbers mentioned.
|
| 47 |
Study_type | enum[in_vivo,in_vitro,epidemiology,in_silico,review,methodology,other] | Choose the best match.
|
| 48 |
Exposure_route | enum[oral,inhalation,dermal,parenteral,multiple,not_reported] | Choose best match.
|
| 49 |
Species | enum[human,rat,mouse,rabbit,dog,non_human_primate,cell_line,other,not_reported] | Choose best match.
|
| 50 |
+
OECD_endpoints | list[str] | Extract endpoints; prefer controlled vocab 'oecd_endpoints' when applicable.
|
| 51 |
+
MedDRA_like_terms | list[str] | Extract effects; prefer controlled vocab 'meddra_like_terms' when applicable.
|
| 52 |
Dose_metrics | list[str] | Include any reported NOAEL/LOAEL/BMD/BMDL/LD50/LC50 etc with units if available.
|
| 53 |
+
Key_findings | str | 2-4 bullet-like sentences summarizing the main findings.
|
| 54 |
Conclusion | str | What does the paper conclude about safety/risk?
|
| 55 |
"""
|
| 56 |
|
| 57 |
|
| 58 |
+
# =============================
|
| 59 |
+
# PDF extraction (text-based PDFs only)
|
| 60 |
+
# =============================
|
| 61 |
def extract_pages_from_pdf(pdf_path: str, max_pages: int = 0) -> Tuple[List[Tuple[int, str]], int]:
|
| 62 |
reader = PdfReader(pdf_path)
|
| 63 |
page_count = len(reader.pages)
|
|
|
|
| 69 |
t = reader.pages[i].extract_text() or ""
|
| 70 |
except Exception:
|
| 71 |
t = ""
|
| 72 |
+
pages.append((i + 1, t or ""))
|
|
|
|
| 73 |
return pages, page_count
|
| 74 |
|
| 75 |
|
|
|
|
| 81 |
|
| 82 |
|
| 83 |
def chunk_pages(pages: List[Tuple[int, str]], target_chars: int = 3000) -> List[Dict[str, Any]]:
|
|
|
|
|
|
|
|
|
|
| 84 |
chunks = []
|
| 85 |
buf = []
|
| 86 |
start_page = None
|
|
|
|
| 93 |
if start_page is None:
|
| 94 |
start_page = pno
|
| 95 |
|
|
|
|
| 96 |
if cur_len + len(txt) + 1 > target_chars and buf:
|
| 97 |
+
end_page = pno - 1
|
| 98 |
+
end_page = end_page if end_page >= start_page else start_page
|
| 99 |
+
chunks.append({"pages": f"{start_page}-{end_page}", "text": " ".join(buf)})
|
|
|
|
| 100 |
buf = [txt]
|
| 101 |
start_page = pno
|
| 102 |
cur_len = len(txt)
|
|
|
|
| 105 |
cur_len += len(txt) + 1
|
| 106 |
|
| 107 |
if buf and start_page is not None:
|
| 108 |
+
end_page = pages[-1][0] if pages else start_page
|
| 109 |
chunks.append({"pages": f"{start_page}-{end_page}", "text": " ".join(buf)})
|
| 110 |
|
| 111 |
return chunks
|
| 112 |
|
| 113 |
|
| 114 |
+
# =============================
|
| 115 |
# Lightweight retrieval (TF-IDF) to select relevant excerpts
|
| 116 |
+
# =============================
|
| 117 |
+
def select_relevant_chunks(
|
| 118 |
+
chunks: List[Dict[str, Any]],
|
| 119 |
+
queries: List[str],
|
| 120 |
+
top_per_query: int = 2,
|
| 121 |
+
max_chunks: int = 12
|
| 122 |
+
) -> List[Dict[str, Any]]:
|
| 123 |
texts = [c["text"] for c in chunks]
|
| 124 |
if not texts:
|
| 125 |
return []
|
|
|
|
| 127 |
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), max_features=20000)
|
| 128 |
X = vectorizer.fit_transform(texts)
|
| 129 |
|
| 130 |
+
selected_idx: List[int] = []
|
| 131 |
for q in queries:
|
| 132 |
q = (q or "").strip()
|
| 133 |
if not q:
|
| 134 |
continue
|
| 135 |
qv = vectorizer.transform([q])
|
| 136 |
+
sims = (X @ qv.T).toarray().ravel()
|
| 137 |
idx = np.argsort(sims)[::-1]
|
| 138 |
for i in idx[:top_per_query]:
|
| 139 |
if i not in selected_idx:
|
| 140 |
selected_idx.append(i)
|
| 141 |
|
|
|
|
| 142 |
if not selected_idx:
|
| 143 |
selected_idx = list(range(min(len(chunks), max_chunks)))
|
| 144 |
|
| 145 |
+
return [chunks[i] for i in selected_idx[:max_chunks]]
|
|
|
|
| 146 |
|
| 147 |
|
| 148 |
def build_context(selected_chunks: List[Dict[str, Any]], max_chars: int = 20000) -> str:
|
|
|
|
| 157 |
return "\n".join(parts).strip()
|
| 158 |
|
| 159 |
|
| 160 |
+
# =============================
|
| 161 |
# User-defined extraction spec -> JSON Schema
|
| 162 |
+
# =============================
|
| 163 |
def slugify_field(name: str) -> str:
|
| 164 |
name = name.strip()
|
| 165 |
name = re.sub(r"[^\w\s-]", "", name)
|
|
|
|
| 167 |
return name[:60] if name else "field"
|
| 168 |
|
| 169 |
|
| 170 |
+
def parse_field_spec(spec: str) -> Tuple[Dict[str, Any], Dict[str, str]]:
|
| 171 |
"""
|
| 172 |
spec lines: Field Name | type | instructions
|
| 173 |
+
Returns: properties dict, instructions map (field_key -> instruction)
|
| 174 |
"""
|
| 175 |
+
props: Dict[str, Any] = {}
|
| 176 |
+
instr: Dict[str, str] = {}
|
|
|
|
| 177 |
|
| 178 |
for raw_line in (spec or "").splitlines():
|
| 179 |
line = raw_line.strip()
|
|
|
|
| 188 |
ftype = parts[1]
|
| 189 |
finstr = parts[2] if len(parts) >= 3 else ""
|
| 190 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
key = slugify_field(field_name)
|
| 192 |
instr[key] = finstr
|
| 193 |
|
| 194 |
+
schema: Dict[str, Any] = {"type": "string"}
|
| 195 |
|
| 196 |
if ftype == "str":
|
| 197 |
schema = {"type": "string"}
|
|
|
|
| 211 |
schema = {"type": "string"}
|
| 212 |
|
| 213 |
props[key] = schema
|
|
|
|
|
|
|
| 214 |
|
| 215 |
+
return props, instr
|
|
|
|
| 216 |
|
| 217 |
|
| 218 |
+
def build_extraction_schema(field_props: Dict[str, Any], vocab: Dict[str, Any]) -> Dict[str, Any]:
|
| 219 |
+
"""
|
| 220 |
+
IMPORTANT: Structured Outputs (strict=True) requires that for every object:
|
| 221 |
+
required must exist and include every key in properties.
|
| 222 |
+
"""
|
| 223 |
risk_enum = vocab.get(
|
| 224 |
"risk_stance_enum",
|
| 225 |
["acceptable", "acceptable_with_uncertainty", "not_acceptable", "insufficient_data"]
|
| 226 |
)
|
| 227 |
|
|
|
|
| 228 |
all_field_keys = list(field_props.keys())
|
| 229 |
|
| 230 |
schema = {
|
|
|
|
| 239 |
"type": "object",
|
| 240 |
"additionalProperties": False,
|
| 241 |
"properties": field_props,
|
| 242 |
+
"required": all_field_keys # strict requirement
|
| 243 |
},
|
| 244 |
"evidence": {
|
| 245 |
"type": "array",
|
|
|
|
| 260 |
return schema
|
| 261 |
|
| 262 |
|
| 263 |
+
# =============================
|
| 264 |
+
# OpenAI client + extraction
|
| 265 |
+
# =============================
|
| 266 |
def get_openai_client(api_key: str) -> OpenAI:
|
| 267 |
key = (api_key or "").strip() or os.getenv("OPENAI_API_KEY", "").strip()
|
| 268 |
if not key:
|
| 269 |
+
raise ValueError("Missing OpenAI API key. Provide it in the UI or set OPENAI_API_KEY secret in Hugging Face.")
|
| 270 |
return OpenAI(api_key=key)
|
| 271 |
|
| 272 |
|
|
|
|
| 278 |
field_instructions: Dict[str, str],
|
| 279 |
context: str
|
| 280 |
) -> Dict[str, Any]:
|
|
|
|
|
|
|
| 281 |
field_instr_lines = []
|
| 282 |
for k, v in field_instructions.items():
|
| 283 |
+
field_instr_lines.append(f"- {k}: {v if v else '(no extra instructions)'}")
|
|
|
|
|
|
|
|
|
|
| 284 |
|
| 285 |
vocab_text = json.dumps(controlled_vocab, indent=2)
|
| 286 |
|
| 287 |
system_msg = (
|
| 288 |
"You are a toxicology research paper data-extraction assistant.\n"
|
| 289 |
+
"Grounding rules (must follow):\n"
|
| 290 |
+
"1) Use ONLY the provided excerpts; do NOT invent details.\n"
|
| 291 |
+
"2) If a value is not explicitly stated, output empty string or empty list (or an allowed enum like 'not_reported').\n"
|
| 292 |
+
"3) Provide evidence quotes + page ranges for extracted fields.\n"
|
| 293 |
+
"4) risk_stance is regulatory: acceptable / acceptable_with_uncertainty / not_acceptable / insufficient_data.\n"
|
| 294 |
+
"5) Prefer controlled vocab terms when applicable.\n"
|
| 295 |
)
|
| 296 |
|
| 297 |
user_msg = (
|
|
|
|
| 300 |
"FIELD INSTRUCTIONS:\n"
|
| 301 |
+ "\n".join(field_instr_lines)
|
| 302 |
+ "\n\n"
|
| 303 |
+
"EXCERPTS (with page ranges):\n"
|
| 304 |
f"{context}\n"
|
| 305 |
)
|
| 306 |
|
|
|
|
| 319 |
}
|
| 320 |
}
|
| 321 |
)
|
| 322 |
+
return json.loads(resp.output_text)
|
|
|
|
|
|
|
|
|
|
| 323 |
|
| 324 |
|
| 325 |
def openai_synthesize_across_papers(client: OpenAI, model: str, rows: List[Dict[str, Any]]) -> str:
|
| 326 |
system_msg = (
|
| 327 |
"You are a senior toxicology scientist summarizing multiple papers.\n"
|
| 328 |
+
"Create a concise synthesis: consensus, disagreements, data gaps, and actionable next steps.\n"
|
| 329 |
+
"Base strictly on the provided extracted JSON (which is evidence-backed).\n"
|
| 330 |
)
|
| 331 |
user_msg = "EXTRACTED_ROWS_JSON:\n" + json.dumps(rows, indent=2)
|
| 332 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
resp = client.responses.create(
|
| 334 |
model=model,
|
| 335 |
input=[
|
| 336 |
{"role": "system", "content": system_msg},
|
| 337 |
{"role": "user", "content": user_msg}
|
| 338 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 339 |
)
|
| 340 |
+
return resp.output_text
|
| 341 |
|
| 342 |
|
| 343 |
+
# =============================
|
| 344 |
+
# Grounding helpers (UI)
|
| 345 |
+
# =============================
|
| 346 |
+
def _make_vertical(records: List[Dict[str, Any]], file_name: str) -> pd.DataFrame:
|
| 347 |
+
if not records or not file_name:
|
| 348 |
+
return pd.DataFrame(columns=["Field", "Value"])
|
| 349 |
+
row = next((r for r in records if r.get("file") == file_name), None)
|
| 350 |
+
if not row:
|
| 351 |
+
return pd.DataFrame(columns=["Field", "Value"])
|
| 352 |
+
return pd.DataFrame({"Field": list(row.keys()), "Value": [row[k] for k in row.keys()]})
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
def _render_evidence(details: List[Dict[str, Any]], file_name: str, max_items: int = 80) -> str:
|
| 356 |
+
if not details or not file_name:
|
| 357 |
+
return ""
|
| 358 |
+
d = next((x for x in details if x.get("_file") == file_name), None)
|
| 359 |
+
if not d:
|
| 360 |
+
return ""
|
| 361 |
+
ev = d.get("evidence", []) or []
|
| 362 |
+
lines = []
|
| 363 |
+
for e in ev[:max_items]:
|
| 364 |
+
quote = (e.get("quote", "") or "").strip()
|
| 365 |
+
pages = (e.get("pages", "") or "").strip()
|
| 366 |
+
field = (e.get("field", "") or "").strip()
|
| 367 |
+
if quote:
|
| 368 |
+
if len(quote) > 280:
|
| 369 |
+
quote = quote[:280] + "…"
|
| 370 |
+
lines.append(f"- **{field}** (pages {pages}): “{quote}”")
|
| 371 |
+
header = "### Evidence (grounding)\n"
|
| 372 |
+
if not lines:
|
| 373 |
+
lines = ["- (no evidence returned)"]
|
| 374 |
+
return header + "\n".join(lines) + "\n\n> Review note: evidence reflects the original extraction. If you change values, re-run extraction to refresh evidence."
|
| 375 |
+
|
| 376 |
+
|
| 377 |
+
def _text_based_pdf_warning(pages: List[Tuple[int, str]]) -> bool:
    """Return True when the PDF yields almost no extractable text.

    Used to flag scanned/image PDFs: cleans each page's text and joins the
    non-empty results; if fewer than 200 characters survive, the file is
    treated as non-text (heuristic threshold).

    Fix: the original evaluated ``clean_text(t)`` twice per page (once in the
    comprehension filter, once in the element); the cleaned value is now
    computed once and reused.
    """
    cleaned = [clean_text(text) for _, text in pages]
    joined = " ".join(text for text in cleaned if text)
    return len(joined.strip()) < 200  # heuristic threshold
|
| 381 |
+
|
| 382 |
+
|
| 383 |
+
# =============================
|
| 384 |
+
# Main extraction handler
|
| 385 |
+
# =============================
|
| 386 |
+
def run_extraction(
|
| 387 |
+
files,
|
| 388 |
+
api_key,
|
| 389 |
+
model,
|
| 390 |
+
field_spec,
|
| 391 |
+
vocab_json,
|
| 392 |
+
max_pages,
|
| 393 |
+
chunk_chars,
|
| 394 |
+
max_context_chars
|
| 395 |
+
):
|
| 396 |
if not files:
|
| 397 |
+
return None, None, None, "Upload one or more PDFs.", gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
|
| 398 |
|
| 399 |
+
# vocab
|
| 400 |
try:
|
| 401 |
vocab = json.loads(vocab_json or DEFAULT_CONTROLLED_VOCAB_JSON)
|
| 402 |
except Exception as e:
|
| 403 |
+
return None, None, None, f"Controlled vocab JSON is invalid: {e}", gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
|
| 404 |
|
| 405 |
+
# field spec
|
| 406 |
+
field_props, field_instr = parse_field_spec(field_spec or DEFAULT_FIELD_SPEC)
|
| 407 |
if not field_props:
|
| 408 |
+
return None, None, None, "Field spec produced no fields. Add lines like: Field | str | instructions", gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
|
| 409 |
|
| 410 |
+
schema = build_extraction_schema(field_props, vocab)
|
| 411 |
|
| 412 |
+
# OpenAI
|
| 413 |
try:
|
| 414 |
client = get_openai_client(api_key)
|
| 415 |
except Exception as e:
|
| 416 |
+
return None, None, None, str(e), gr.update(choices=[], value=None), [], [], pd.DataFrame(columns=["Field","Value"]), ""
|
| 417 |
|
| 418 |
+
results: List[Dict[str, Any]] = []
|
| 419 |
+
flat_rows: List[Dict[str, Any]] = []
|
| 420 |
|
| 421 |
tmpdir = Path(tempfile.mkdtemp(prefix="tox_extract_"))
|
| 422 |
|
|
|
|
| 425 |
filename = os.path.basename(pdf_path)
|
| 426 |
|
| 427 |
pages, page_count = extract_pages_from_pdf(pdf_path, max_pages=int(max_pages))
|
| 428 |
+
|
| 429 |
+
# enforce text-based PDFs note
|
| 430 |
+
if _text_based_pdf_warning(pages):
|
| 431 |
+
# create an "empty" record with warning
|
| 432 |
+
row = {"file": filename, "paper_title": "", "risk_stance": "insufficient_data", "risk_confidence": 0.0, "risk_summary": "No extractable text found. This app supports text-based PDFs only."}
|
| 433 |
+
for k, sch in field_props.items():
|
| 434 |
+
row[k] = "" if sch.get("type") != "array" else ""
|
| 435 |
+
flat_rows.append(row)
|
| 436 |
+
|
| 437 |
+
results.append({
|
| 438 |
+
"_file": filename,
|
| 439 |
+
"_pages_in_pdf": page_count,
|
| 440 |
+
"paper_title": "",
|
| 441 |
+
"risk_stance": "insufficient_data",
|
| 442 |
+
"risk_confidence": 0.0,
|
| 443 |
+
"risk_summary": "No extractable text found. This app supports text-based PDFs only.",
|
| 444 |
+
"extracted": {k: ([] if field_props[k].get("type") == "array" else "") for k in field_props.keys()},
|
| 445 |
+
"evidence": []
|
| 446 |
+
})
|
| 447 |
+
continue
|
| 448 |
+
|
| 449 |
chunks = chunk_pages(pages, target_chars=int(chunk_chars))
|
| 450 |
|
| 451 |
+
# Queries: risk stance + each field instruction (or field key)
|
| 452 |
+
queries = ["regulatory acceptability risk hazard concern conclusion noael loael bmd bmdl adverse effect uncertainty"]
|
|
|
|
|
|
|
| 453 |
for k, ins in field_instr.items():
|
| 454 |
+
queries.append(ins if ins else k)
|
|
|
|
|
|
|
|
|
|
| 455 |
|
| 456 |
selected = select_relevant_chunks(chunks, queries, top_per_query=2, max_chunks=12)
|
| 457 |
context = build_context(selected, max_chars=int(max_context_chars))
|
| 458 |
|
| 459 |
+
extracted = openai_structured_extract(
|
| 460 |
+
client=client,
|
| 461 |
+
model=model,
|
| 462 |
+
schema=schema,
|
| 463 |
+
controlled_vocab=vocab,
|
| 464 |
+
field_instructions=field_instr,
|
| 465 |
+
context=context
|
| 466 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 467 |
|
| 468 |
extracted["_file"] = filename
|
| 469 |
extracted["_pages_in_pdf"] = page_count
|
| 470 |
results.append(extracted)
|
| 471 |
|
| 472 |
+
# flatten to table (wide)
|
| 473 |
row = {
|
| 474 |
"file": filename,
|
| 475 |
"paper_title": extracted.get("paper_title", ""),
|
|
|
|
| 477 |
"risk_confidence": extracted.get("risk_confidence", ""),
|
| 478 |
"risk_summary": extracted.get("risk_summary", "")
|
| 479 |
}
|
| 480 |
+
|
| 481 |
+
ext = extracted.get("extracted") or {}
|
| 482 |
for k in field_props.keys():
|
| 483 |
+
v = ext.get(k, "" if field_props[k].get("type") != "array" else [])
|
| 484 |
if isinstance(v, list):
|
| 485 |
row[k] = "; ".join([str(x) for x in v])
|
| 486 |
else:
|
| 487 |
row[k] = v
|
| 488 |
+
|
| 489 |
flat_rows.append(row)
|
| 490 |
|
| 491 |
df = pd.DataFrame(flat_rows)
|
|
|
|
| 495 |
df.to_csv(csv_path, index=False)
|
| 496 |
json_path.write_text(json.dumps(results, indent=2), encoding="utf-8")
|
| 497 |
|
| 498 |
+
records = df.to_dict("records")
|
| 499 |
+
choices = [r["file"] for r in records if "file" in r]
|
| 500 |
+
default = choices[0] if choices else None
|
| 501 |
+
vertical = _make_vertical(records, default)
|
| 502 |
+
evidence = _render_evidence(results, default)
|
| 503 |
+
|
| 504 |
+
status = "Done. Use the vertical view to read cleanly. Enable Review Mode to edit and export a reviewed CSV."
|
| 505 |
+
|
| 506 |
+
return (
|
| 507 |
+
df,
|
| 508 |
+
str(csv_path),
|
| 509 |
+
str(json_path),
|
| 510 |
+
status,
|
| 511 |
+
gr.update(choices=choices, value=default),
|
| 512 |
+
records,
|
| 513 |
+
results,
|
| 514 |
+
vertical,
|
| 515 |
+
evidence
|
| 516 |
+
)
|
| 517 |
|
| 518 |
|
| 519 |
+
# =============================
|
| 520 |
+
# Review mode handlers
|
| 521 |
+
# =============================
|
| 522 |
+
def on_pick(file_name: str, records: List[Dict[str, Any]], details: List[Dict[str, Any]]):
    """Refresh the vertical record view and the evidence pane for the selection."""
    vertical = _make_vertical(records, file_name)
    evidence = _render_evidence(details, file_name)
    return vertical, evidence
|
| 524 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 525 |
|
| 526 |
+
def toggle_review_mode(is_on: bool):
    """Make the vertical table editable exactly when review mode is enabled."""
    editable = bool(is_on)
    return gr.update(interactive=editable)
|
| 529 |
|
| 530 |
|
| 531 |
+
def save_review_changes(file_name: str, vertical_df: Any, records: List[Dict[str, Any]]):
    """Merge edits from the vertical Field/Value table back into the session records.

    Args:
        file_name: ``file`` value of the record being edited.
        vertical_df: the edited table from ``gr.Dataframe`` — typically a
            pandas DataFrame or a list of [Field, Value] rows.
        records: current wide-table rows (list of dicts), one per file.

    Returns:
        (wide DataFrame rebuilt from the updated records or None,
         updated records list, status message).

    Fix: the original only wrapped the DataFrame conversion in try/except;
    ``dropna(subset=["Field"])`` and the updates comprehension ran outside it,
    so a payload with unexpected column names raised an uncaught KeyError
    instead of returning the "Could not parse" message. All parsing is now
    inside the try, and columns are realigned by position when Gradio renames
    them.
    """
    if not file_name or not records:
        return None, records, "Nothing to save."

    # Parse the edited vertical table into a {Field: Value} mapping.
    try:
        if isinstance(vertical_df, pd.DataFrame):
            dfv = vertical_df
        else:
            # gradio may pass list-of-lists
            dfv = pd.DataFrame(vertical_df, columns=["Field", "Value"])
        if list(dfv.columns) != ["Field", "Value"]:
            # Gradio can rename or extend headers; realign by position.
            dfv = dfv.iloc[:, :2]
            dfv.columns = ["Field", "Value"]
        dfv = dfv.dropna(subset=["Field"])
        updates = {str(r["Field"]): r["Value"] for _, r in dfv.iterrows() if str(r["Field"]).strip()}
    except Exception:
        return None, records, "Could not parse edited vertical table."

    # Apply the updates to the matching record, leaving the rest untouched.
    new_records = []
    updated = False
    for r in records:
        if r.get("file") == file_name:
            rr = dict(r)
            for k, v in updates.items():
                rr[k] = v
            new_records.append(rr)
            updated = True
        else:
            new_records.append(r)

    df_wide = pd.DataFrame(new_records) if new_records else pd.DataFrame()
    msg = "Saved changes into session table. Export reviewed CSV to download." if updated else "Record not found."
    return df_wide, new_records, msg
|
|
|
|
| 568 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 569 |
|
| 570 |
+
def export_reviewed_csv(records: List[Dict[str, Any]]):
    """Write the session's reviewed records to a temp CSV and return (path, status).

    Returns (None, message) when there is nothing to export.
    """
    if not records:
        return None, "No reviewed data to export."
    out_dir = Path(tempfile.mkdtemp(prefix="tox_review_"))
    out_path = out_dir / "reviewed_extraction_table.csv"
    pd.DataFrame(records).to_csv(out_path, index=False)
    return str(out_path), "Reviewed CSV ready to download."
|
| 577 |
|
|
|
|
| 578 |
|
| 579 |
+
# =============================
|
| 580 |
+
# Synthesis tab handler
|
| 581 |
+
# =============================
|
| 582 |
+
def run_synthesis(api_key, model, extraction_json_file):
    """Run cross-paper synthesis from an uploaded extraction_details.json.

    Returns a Markdown string: either the synthesis, or an error/usage
    message when no file was uploaded or the OpenAI client cannot be built.
    """
    if extraction_json_file is None:
        return "Upload the extraction_details.json produced by the Extract tab first."

    try:
        client = get_openai_client(api_key)
    except Exception as e:
        return str(e)

    payload = Path(extraction_json_file.name).read_text(encoding="utf-8")
    rows = json.loads(payload)
    return openai_synthesize_across_papers(client, model, rows)
|
| 593 |
|
| 594 |
|
| 595 |
+
# =============================
|
| 596 |
# Gradio UI
|
| 597 |
+
# =============================
|
| 598 |
+
with gr.Blocks(title="Toxicology PDF → Grounded Table Extractor") as demo:
|
| 599 |
+
gr.Markdown(
|
| 600 |
+
"# Toxicology PDF → Grounded Table Extractor (GPT-4o)\n\n"
|
| 601 |
+
"**Important:** This app supports **text-based PDFs only** (not scanned/image PDFs). If a PDF has no extractable text, it will be flagged as insufficient_data.\n\n"
|
| 602 |
+
"You control *what* to extract using the **Extraction spec**. Outputs are grounded by evidence quotes + page ranges."
|
| 603 |
+
)
|
| 604 |
+
|
| 605 |
+
# State stores for review mode
|
| 606 |
+
state_records = gr.State([]) # wide table rows: list[dict]
|
| 607 |
+
state_details = gr.State([]) # extraction details JSON: list[dict]
|
| 608 |
|
| 609 |
with gr.Tab("Extract to Table"):
|
| 610 |
files = gr.File(label="Upload toxicology research PDFs", file_types=[".pdf"], file_count="multiple")
|
| 611 |
|
| 612 |
+
with gr.Row():
|
| 613 |
+
api_key = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
|
| 614 |
+
model = gr.Dropdown(
|
| 615 |
+
label="Model",
|
| 616 |
+
choices=["gpt-4o-2024-08-06", "gpt-4o", "gpt-4o-mini"],
|
| 617 |
+
value="gpt-4o-2024-08-06"
|
| 618 |
+
)
|
| 619 |
|
| 620 |
with gr.Row():
|
| 621 |
+
max_pages = gr.Slider(0, 250, value=0, step=1, label="Max pages to read (0 = all)")
|
| 622 |
+
chunk_chars = gr.Slider(1200, 9000, value=3200, step=100, label="Chunk size (chars)")
|
| 623 |
+
max_context_chars = gr.Slider(5000, 45000, value=20000, step=1000, label="Max context sent to GPT (chars)")
|
| 624 |
|
| 625 |
+
vocab_json = gr.Textbox(label="Controlled vocabulary (JSON)", value=DEFAULT_CONTROLLED_VOCAB_JSON, lines=10)
|
| 626 |
+
field_spec = gr.Textbox(label="Extraction spec (you control the columns)", value=DEFAULT_FIELD_SPEC, lines=10)
|
| 627 |
|
| 628 |
+
extract_btn = gr.Button("Run Extraction (Grounded)")
|
|
|
|
|
|
|
| 629 |
status = gr.Textbox(label="Status", interactive=False)
|
| 630 |
|
| 631 |
+
table = gr.Dataframe(
|
| 632 |
+
label="Wide Table (download-friendly)",
|
| 633 |
+
interactive=False,
|
| 634 |
+
wrap=True,
|
| 635 |
+
show_row_numbers=True,
|
| 636 |
+
buttons=["fullscreen", "copy"]
|
| 637 |
+
)
|
| 638 |
+
with gr.Row():
|
| 639 |
+
out_csv = gr.File(label="Download: extraction_table.csv")
|
| 640 |
+
out_json = gr.File(label="Download: extraction_details.json (evidence + structured data)")
|
| 641 |
|
| 642 |
+
gr.Markdown("## Readable view (vertical) + evidence")
|
| 643 |
+
record_pick = gr.Dropdown(label="Select record", choices=[], value=None)
|
| 644 |
+
|
| 645 |
+
with gr.Row():
|
| 646 |
+
review_mode = gr.Checkbox(label="Review mode (enable editing)", value=False)
|
| 647 |
+
save_btn = gr.Button("Save changes to session table")
|
| 648 |
+
export_btn = gr.Button("Export reviewed CSV")
|
| 649 |
+
|
| 650 |
+
review_status = gr.Textbox(label="Review status", interactive=False)
|
| 651 |
+
|
| 652 |
+
vertical_view = gr.Dataframe(
|
| 653 |
+
headers=["Field", "Value"],
|
| 654 |
+
interactive=False,
|
| 655 |
+
wrap=True,
|
| 656 |
+
show_row_numbers=False,
|
| 657 |
+
label="Vertical record view (Field → Value)"
|
| 658 |
)
|
| 659 |
+
evidence_md = gr.Markdown()
|
| 660 |
|
| 661 |
+
reviewed_csv = gr.File(label="Download: reviewed_extraction_table.csv")
|
| 662 |
+
|
| 663 |
+
# Run extraction
|
| 664 |
extract_btn.click(
|
| 665 |
fn=run_extraction,
|
| 666 |
inputs=[files, api_key, model, field_spec, vocab_json, max_pages, chunk_chars, max_context_chars],
|
| 667 |
+
outputs=[table, out_csv, out_json, status, record_pick, state_records, state_details, vertical_view, evidence_md]
|
| 668 |
+
)
|
| 669 |
+
|
| 670 |
+
# On select record
|
| 671 |
+
record_pick.change(
|
| 672 |
+
fn=on_pick,
|
| 673 |
+
inputs=[record_pick, state_records, state_details],
|
| 674 |
+
outputs=[vertical_view, evidence_md]
|
| 675 |
+
)
|
| 676 |
+
|
| 677 |
+
# Toggle review mode editing
|
| 678 |
+
review_mode.change(
|
| 679 |
+
fn=toggle_review_mode,
|
| 680 |
+
inputs=[review_mode],
|
| 681 |
+
outputs=[vertical_view]
|
| 682 |
+
)
|
| 683 |
+
|
| 684 |
+
# Save edits back to wide table + state
|
| 685 |
+
save_btn.click(
|
| 686 |
+
fn=save_review_changes,
|
| 687 |
+
inputs=[record_pick, vertical_view, state_records],
|
| 688 |
+
outputs=[table, state_records, review_status]
|
| 689 |
+
)
|
| 690 |
+
|
| 691 |
+
# Export reviewed CSV
|
| 692 |
+
export_btn.click(
|
| 693 |
+
fn=export_reviewed_csv,
|
| 694 |
+
inputs=[state_records],
|
| 695 |
+
outputs=[reviewed_csv, review_status]
|
| 696 |
)
|
| 697 |
|
| 698 |
with gr.Tab("Cross-paper Synthesis"):
|
| 699 |
+
gr.Markdown("Upload the `extraction_details.json` from the Extract tab. Synthesis is based strictly on those grounded extractions.")
|
| 700 |
api_key2 = gr.Textbox(label="OpenAI API key (optional if set as OPENAI_API_KEY secret)", type="password")
|
| 701 |
model2 = gr.Dropdown(
|
| 702 |
label="Model",
|
|
|
|
| 713 |
outputs=[synth_md]
|
| 714 |
)
|
| 715 |
|
| 716 |
+
with gr.Tab("Pending tasks"):
|
| 717 |
+
gr.Markdown(
|
| 718 |
+
"## Product roadmap (pending tasks)\n\n"
|
| 719 |
+
"### 1) Granular data model (one row per chemical–endpoint pair)\n"
|
| 720 |
+
"- Change schema to return `records: [ {chemical, endpoint, ...} ]`\n"
|
| 721 |
+
"- Flatten into wide table; vertical viewer targets a single record\n\n"
|
| 722 |
+
"### 2) Stronger grounding & verification\n"
|
| 723 |
+
"- Require evidence per field (already)\n"
|
| 724 |
+
"- Add automatic evidence verification (quote must exist in excerpt)\n"
|
| 725 |
+
"- Add `UNVERIFIED` flags + force empty values when evidence fails\n\n"
|
| 726 |
+
"### 3) Controlled vocab expansion & mapping\n"
|
| 727 |
+
"- Add synonym lists and preferred terms\n"
|
| 728 |
+
"- Map extracted terms into: FDA taxonomy / OECD endpoints / MedDRA-like groupings\n"
|
| 729 |
+
"- Add a vocab editor + import/export vocab JSON\n\n"
|
| 730 |
+
"### 4) Column transforms (structured parsing)\n"
|
| 731 |
+
"- Parse dose metrics into `{metric, value, unit, route, duration}`\n"
|
| 732 |
+
"- Normalize units (e.g., mg/kg/day)\n"
|
| 733 |
+
"- Auto-split multi-chemical text into canonical list\n\n"
|
| 734 |
+
"### 5) Multi-document compare mode\n"
|
| 735 |
+
"- Compare by chemical or endpoint\n"
|
| 736 |
+
"- Create a consensus + disagreements table\n\n"
|
| 737 |
+
"### 6) PDF limitations\n"
|
| 738 |
+
"- Current: **text-based PDFs only**\n"
|
| 739 |
+
"- Optional future: OCR for scanned PDFs (adds heavy dependencies)\n"
|
| 740 |
+
)
|
| 741 |
+
|
| 742 |
# Script entry point: honor a platform-provided PORT (e.g. container/Space
# routing) and fall back to Gradio's default 7860; bind on all interfaces so
# the app is reachable from outside the container.
if __name__ == "__main__":
    port = int(os.environ.get("PORT", "7860"))
    demo.queue().launch(server_name="0.0.0.0", server_port=port)
|