hchevva commited on
Commit
8b1e2a5
·
verified ·
1 Parent(s): e46893f

Create pipeline.py

Browse files
Files changed (1) hide show
  1. toxra_core/pipeline.py +619 -0
toxra_core/pipeline.py ADDED
@@ -0,0 +1,619 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ toxra_core.pipeline — robust grounded extraction core for TOXRA.AI
3
+
4
+ Implements:
5
+ - PDF text extraction (text-based PDFs only)
6
+ - Page-aware chunking with overlap
7
+ - Keyword-based chunk selection to fit context limits
8
+ - OpenAI Responses API structured extraction (json_schema)
9
+ - Rich schema builder from Field Spec + Controlled Vocab
10
+ - Endpoint filtering: families + specific OECD TGs
11
+ - Row-mode logic: one_row_per_paper vs one_row_per_chemical_endpoint (policy + heuristics)
12
+ - Evidence management: per-field quote + page; verification against provided context
13
+ - Post-processing: normalize records, clamp confidence, cap runaway outputs
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import os
19
+ import re
20
+ import json
21
+ import time
22
+ import hashlib
23
+ from dataclasses import dataclass
24
+ from typing import Any, Dict, List, Tuple, Optional
25
+
26
+ import pandas as pd
27
+ from pypdf import PdfReader
28
+
29
+ try:
30
+ from openai import OpenAI
31
+ except Exception: # pragma: no cover
32
+ OpenAI = None # type: ignore
33
+
34
+
35
# =============================
# Tunables (env overrides)
# =============================
# Character length of each page-text chunk fed to selection.
DEFAULT_CHUNK_SIZE = int(os.getenv("TOXRA_CHUNK_SIZE", "3200"))
# Characters of context carried over between consecutive chunks of one page.
DEFAULT_CHUNK_OVERLAP = int(os.getenv("TOXRA_CHUNK_OVERLAP", "250"))
# Hard cap on records kept per PDF (guards against runaway model output).
DEFAULT_MAX_RECORDS_PER_PDF = int(os.getenv("TOXRA_MAX_RECORDS_PER_PDF", "120"))
ENABLE_CHEM_SCAN = os.getenv("TOXRA_ENABLE_CHEM_SCAN", "1").strip() == "1"  # robust but costs extra call

# CAS registry numbers: 2-7 digits, 2 digits, 1 check digit (e.g. 50-00-0).
CAS_RE = re.compile(r"\b\d{2,7}-\d{2}-\d\b")
# Any whitespace run; used to collapse extracted text to single spaces.
WS_RE = re.compile(r"\s+")

# Closed vocabulary for the overall risk_stance field; out-of-vocabulary
# values are coerced to "insufficient_data" during normalization.
RISK_STANCE_ENUM = ["acceptable", "acceptable_with_uncertainty", "not_acceptable", "insufficient_data"]
47
+
48
+
49
@dataclass
class Chunk:
    """A contiguous slice of whitespace-normalized text from one PDF page."""

    # Stable identifier: page number plus either a content hash (whole-page
    # chunks) or the start/end character offsets (windowed chunks).
    chunk_id: str
    # 1-based page number the text came from.
    page: int
    # The chunk's text content.
    text: str
54
+
55
+
56
+ # =============================
57
+ # Utility
58
+ # =============================
59
+ def _clean_text(t: str) -> str:
60
+ t = (t or "").replace("\x00", " ")
61
+ t = WS_RE.sub(" ", t).strip()
62
+ return t
63
+
64
+
65
+ def _sha1(s: str) -> str:
66
+ return hashlib.sha1((s or "").encode("utf-8", errors="ignore")).hexdigest()[:12]
67
+
68
+
69
+ def _safe_json_loads(s: str, fallback: Any) -> Any:
70
+ try:
71
+ return json.loads(s) if s else fallback
72
+ except Exception:
73
+ return fallback
74
+
75
+
76
+ # =============================
77
+ # PDF extraction
78
+ # =============================
79
def extract_pages(pdf_path: str, max_pages: int) -> Tuple[List[Tuple[int, str]], int]:
    """Read text from up to *max_pages* pages of a PDF.

    Returns ([(1-based page number, raw extracted text), ...], total page
    count of the document). A page whose extraction fails contributes an
    empty string instead of aborting the whole file.
    """
    reader = PdfReader(pdf_path)
    total = len(reader.pages)
    # max_pages <= 0 (or falsy) means "no page limit".
    n = min(total, max_pages) if max_pages and max_pages > 0 else total
    out: List[Tuple[int, str]] = []
    for i in range(n):
        try:
            txt = reader.pages[i].extract_text() or ""
        except Exception:
            # Keep going: is_text_based() later decides if the PDF is usable.
            txt = ""
        out.append((i + 1, txt))
    return out, total
91
+
92
+
93
def is_text_based(pages: List[Tuple[int, str]]) -> bool:
    """Heuristic: the PDF counts as text-based when its pages together yield
    at least 200 characters of whitespace-normalized text."""
    cleaned: List[str] = []
    for _, raw in pages:
        norm = re.sub(r"\s+", " ", (raw or "").replace("\x00", " ")).strip()
        if norm:
            cleaned.append(norm)
    return len(" ".join(cleaned)) >= 200
96
+
97
+
98
def chunk_pages(
    pages: List[Tuple[int, str]],
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    overlap: int = DEFAULT_CHUNK_OVERLAP,
) -> List[Chunk]:
    """Split each page's cleaned text into (possibly overlapping) chunks.

    Pages that fit in one chunk get a hash-based chunk_id; longer pages are
    windowed with *overlap* characters of context carried between windows.

    Fix: the original loop made no forward-progress guarantee — an
    overlap >= chunk_size (or chunk_size <= 0) looped forever, and a
    negative overlap silently skipped text. Both are clamped here; the
    defaults (3200/250) are unaffected.
    """
    chunk_size = max(1, int(chunk_size))
    overlap = min(max(0, int(overlap)), chunk_size - 1)

    chunks: List[Chunk] = []
    for pno, raw in pages:
        txt = _clean_text(raw)
        if not txt:
            continue  # image-only or empty page
        if len(txt) <= chunk_size:
            chunks.append(Chunk(chunk_id=f"p{pno}_{_sha1(txt)}", page=pno, text=txt))
            continue

        start = 0
        while start < len(txt):
            end = min(len(txt), start + chunk_size)
            chunks.append(Chunk(chunk_id=f"p{pno}_{start}_{end}", page=pno, text=txt[start:end]))
            if end >= len(txt):
                break
            # Step forward by chunk_size - overlap (> 0 by the clamps above).
            start = max(0, end - overlap)
    return chunks
121
+
122
+
123
def select_chunks(
    chunks: List[Chunk],
    max_context_chars: int,
    query_terms: List[str],
    always_take_first_page: bool = True,
) -> Tuple[List[Chunk], Dict[str, Any]]:
    """Pick a subset of chunks that fits the character budget.

    Chunks are ranked by the number of query terms they contain; the first
    page is (optionally) force-included first so title/abstract context is
    always present. Returns (selected chunks, debug-info dict).
    """
    if not chunks:
        return [], {"reason": "no_chunks"}

    q = [t.lower() for t in (query_terms or []) if t and t.strip()]

    # Score each chunk by how many query terms appear in it (substring match).
    scored = []
    for ch in chunks:
        t = ch.text.lower()
        score = 0
        for term in q:
            if term in t:
                score += 1
        scored.append((score, ch))

    # Highest score first; on ties, -len with reverse=True puts the SHORTER
    # chunk first, so more distinct chunks fit into the budget.
    scored.sort(key=lambda x: (x[0], -len(x[1].text)), reverse=True)

    selected: List[Chunk] = []
    used = 0  # budget consumed, including ~60 chars of [PAGE ...] header each

    if always_take_first_page:
        first = [c for c in chunks if c.page == 1]
        if first:
            c0 = first[0]
            if used + len(c0.text) + 60 <= max_context_chars:
                selected.append(c0)
                used += len(c0.text) + 60

    for score, ch in scored:
        if ch in selected:
            continue  # already force-included above
        block_len = len(ch.text) + 60
        if used + block_len > max_context_chars:
            continue  # too big for the remaining budget; a smaller one may still fit
        selected.append(ch)
        used += block_len
        if used >= max_context_chars:
            break

    # Degenerate budget: nothing fit — fall back to a clipped first chunk.
    if not selected and chunks:
        ch = chunks[0]
        clip = ch.text[: max(0, max_context_chars - 60)]
        selected = [Chunk(chunk_id=ch.chunk_id, page=ch.page, text=clip)]

    debug = {
        "max_context_chars": max_context_chars,
        "query_terms": query_terms,
        "selected_count": len(selected),
        "selected_pages": sorted(list({c.page for c in selected})),
    }
    return selected, debug
179
+
180
+
181
def build_context(selected_chunks: List[Chunk], file_name: str) -> str:
    """Assemble the selected chunks into one prompt-context string, ordered
    by (page, chunk_id) and tagged with [FILE]/[PAGE ...] markers."""
    ordered = sorted(selected_chunks, key=lambda c: (c.page, c.chunk_id))
    blocks = [f"[FILE] {file_name}"]
    blocks.extend(f"\n[PAGE {ch.page} | {ch.chunk_id}]\n{ch.text}\n" for ch in ordered)
    return "\n".join(blocks).strip()
187
+
188
+
189
+ # =============================
190
+ # Admin JSON parsing + schema building
191
+ # =============================
192
def parse_admin_json(vocab_json: str, spec_json: str) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
    """Parse the admin-supplied controlled-vocab and field-spec JSON strings.

    Malformed or wrongly-typed input falls back to a minimal default vocab
    and an empty spec instead of raising.
    """
    default_vocab: Dict[str, Any] = {
        "risk_stance_enum": RISK_STANCE_ENUM,
        "approach_enum": ["in_vivo", "in_vitro", "in_silico", "nams", "mixed", "not_reported"],
        "genotoxicity_oecd_tg_in_vitro_enum": [],
        "genotoxicity_oecd_tg_in_vivo_enum": [],
    }

    vocab = _safe_json_loads(vocab_json, default_vocab)
    if not isinstance(vocab, dict):
        vocab = default_vocab

    spec = _safe_json_loads(spec_json, [])
    if not isinstance(spec, list):
        spec = []

    return vocab, spec
207
+
208
+
209
+ def _resolve_enum_list(vocab: Dict[str, Any], enum_values: str) -> List[str]:
210
+ enum_values = (enum_values or "").strip()
211
+ if not enum_values:
212
+ return []
213
+ if enum_values in vocab and isinstance(vocab[enum_values], list):
214
+ return [str(x) for x in vocab[enum_values]]
215
+ return [x.strip() for x in enum_values.split(",") if x.strip()]
216
+
217
+
218
def build_output_schema(vocab: Dict[str, Any], spec: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Strict JSON schema for OpenAI Responses API structured outputs.
    NOTE: required MUST include every property key (OpenAI validator requirement).

    Fix: an enum field whose vocab list is missing or empty previously
    produced {"enum": []}, which is unsatisfiable per JSON Schema (enum
    must be a non-empty array) and gets rejected by strict validators.
    Such fields now degrade to a free nullable string / string array.
    """

    def field_schema(f: Dict[str, Any]) -> Dict[str, Any]:
        # Map one Field Spec row to a JSON-schema fragment. All fields are
        # nullable so the model can express "not reported".
        ftype = (f.get("type") or "str").strip()
        enum_values = (f.get("enum_values") or "").strip()

        if ftype == "str":
            return {"type": ["string", "null"]}
        if ftype == "num":
            return {"type": ["number", "null"]}
        if ftype == "bool":
            return {"type": ["boolean", "null"]}
        if ftype == "list[str]":
            return {"type": ["array", "null"], "items": {"type": "string"}}
        if ftype == "list[num]":
            return {"type": ["array", "null"], "items": {"type": "number"}}
        if ftype == "enum":
            enum_list = _resolve_enum_list(vocab, enum_values)
            if not enum_list:
                return {"type": ["string", "null"]}  # empty enum is unsatisfiable
            return {"type": ["string", "null"], "enum": enum_list}
        if ftype == "list[enum]":
            enum_list = _resolve_enum_list(vocab, enum_values)
            if not enum_list:
                return {"type": ["array", "null"], "items": {"type": "string"}}
            return {"type": ["array", "null"], "items": {"type": "string", "enum": enum_list}}
        # Unknown type labels fall back to nullable string.
        return {"type": ["string", "null"]}

    # Core per-record keys, always present regardless of the Field Spec.
    record_props: Dict[str, Any] = {
        "file": {"type": "string"},
        "row_mode": {"type": "string", "enum": ["one_row_per_paper", "one_row_per_chemical_endpoint"]},
        "chemical": {"type": ["string", "null"]},
        "endpoint": {"type": ["string", "null"]},
    }

    for f in spec:
        name = (f.get("field") or "").strip()
        if not name:
            continue
        record_props[name] = field_schema(f)

    # OpenAI strict mode: every declared property must be listed as required.
    required_keys = list(record_props.keys())

    schema = {
        "type": "object",
        "properties": {
            "records": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": record_props,
                    "required": required_keys,
                    "additionalProperties": False,
                },
            },
            "evidence": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "record_index": {"type": "integer"},
                        "field": {"type": "string"},
                        "page": {"type": "integer"},
                        "quote": {"type": "string"},
                    },
                    "required": ["record_index", "field", "page", "quote"],
                    "additionalProperties": False,
                },
            },
            "notes": {"type": "string"},
        },
        "required": ["records", "evidence", "notes"],
        "additionalProperties": False,
    }
    return schema
293
+
294
+
295
+ # =============================
296
+ # Selection → query term expansion (for chunk selection)
297
+ # =============================
298
def keyword_terms_for_selection(endpoint_families: List[str], oecd_tgs: List[str], vocab: Dict[str, Any]) -> List[str]:
    """Build the keyword list used to rank chunks during selection.

    Combines endpoint family labels, OECD TG labels (plus their bare
    three-digit numbers), fixed NAM/in-silico cues, and up to 25 terms from
    each genotoxicity TG vocab list. Deduplicated case-insensitively with
    first occurrence winning.
    """
    raw: List[str] = list(endpoint_families or [])

    for tg in oecd_tgs or []:
        raw.append(tg)
        digits = re.search(r"\b(\d{3})\b", tg)
        if digits:
            raw.append(digits.group(1))

    # NAMs/in silico cues
    raw += ["in silico", "QSAR", "read-across", "NAMs", "NAMS", "AOP", "pathway", "transcript", "omics"]

    # Pull common TG vocab terms to help ranking
    for key in ("genotoxicity_oecd_tg_in_vitro_enum", "genotoxicity_oecd_tg_in_vivo_enum"):
        values = vocab.get(key)
        if isinstance(values, list):
            raw.extend(str(v) for v in values[:25])

    # Case-insensitive dedupe, preserving order and original casing.
    deduped: List[str] = []
    seen = set()
    for term in raw:
        term = (term or "").strip()
        if not term:
            continue
        folded = term.lower()
        if folded not in seen:
            seen.add(folded)
            deduped.append(term)
    return deduped
331
+
332
+
333
+ # =============================
334
+ # OpenAI client + calls
335
+ # =============================
336
def get_openai_client(api_key: str) -> OpenAI:
    """Build an OpenAI client.

    Prefers the explicitly supplied key; falls back to the OPENAI_API_KEY
    environment variable.

    Raises:
        RuntimeError: if the openai package failed to import at module load.
        ValueError: if no API key is available from either source.
    """
    if OpenAI is None:
        raise RuntimeError("openai package not installed in toxra_core runtime.")
    key = (api_key or "").strip() or os.getenv("OPENAI_API_KEY", "").strip()
    if not key:
        raise ValueError("Missing OpenAI API key. Provide it or set OPENAI_API_KEY.")
    return OpenAI(api_key=key)
343
+
344
+
345
def openai_structured_extract(
    client: OpenAI,
    model: str,
    schema: Dict[str, Any],
    system_prompt: str,
    user_prompt: str,
) -> Dict[str, Any]:
    """Run one structured-output call against the OpenAI Responses API and
    return the parsed JSON object.

    Fix: `responses.create` has no `response_format` parameter (that is the
    Chat Completions argument); structured outputs on the Responses API are
    configured via `text={"format": {...}}` with name/schema/strict at the
    format level.

    Raises json.JSONDecodeError if the model output is not valid JSON
    (should not happen under strict json_schema formatting).
    """
    resp = client.responses.create(
        model=model,
        input=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}],
        text={
            "format": {
                "type": "json_schema",
                "name": "toxra_extraction",
                "schema": schema,
                "strict": True,
            }
        },
    )
    txt = (resp.output_text or "").strip()
    return json.loads(txt)
359
+
360
+
361
+ # =============================
362
+ # Optional: quick chemical scan (robust row-mode seed)
363
+ # =============================
364
def quick_chem_scan(
    client: OpenAI,
    model: str,
    context: str,
) -> Dict[str, Any]:
    """Cheap pre-pass: ask the model which chemicals the text mentions.

    Used only to seed the row-mode heuristic (multiple chemicals implies
    one_row_per_chemical_endpoint). Returns {"chemicals": [...], "notes": str};
    a non-dict model response is replaced by an empty result so callers can
    rely on the shape.
    """
    # Minimal strict schema: a list of {name, cas} pairs plus free-form notes.
    chem_schema = {
        "type": "object",
        "properties": {
            "chemicals": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {"name": {"type": "string"}, "cas": {"type": ["string", "null"]}},
                    "required": ["name", "cas"],
                    "additionalProperties": False,
                },
            },
            "notes": {"type": "string"},
        },
        "required": ["chemicals", "notes"],
        "additionalProperties": False,
    }

    sys = (
        "Extract primary chemical names mentioned in the provided text. "
        "Return up to 10 chemicals. Stay grounded; if unsure, omit."
    )
    user = f"TEXT:\n{context}\n\nReturn JSON per schema."

    out = openai_structured_extract(client, model, chem_schema, sys, user)
    # Defensive: guarantee a dict with the expected keys.
    return out if isinstance(out, dict) else {"chemicals": [], "notes": "invalid"}
395
+
396
+
397
+ # =============================
398
+ # Evidence verification
399
+ # =============================
400
def verify_evidence_quotes(evidence: List[Dict[str, Any]], selected_chunks: List[Chunk]) -> Dict[str, Any]:
    """Count evidence quotes that cannot be found verbatim (case-insensitive)
    in the context that was actually shown to the model."""
    haystack = "\n".join(c.text for c in selected_chunks).lower()
    unverified = sum(
        1
        for item in evidence
        if (quote := (item.get("quote") or "").strip().lower()) and quote not in haystack
    )
    return {"evidence_items": len(evidence), "unverified_quotes": unverified}
408
+
409
+
410
+ # =============================
411
+ # Normalization / post-processing
412
+ # =============================
413
def normalize_record(rec: Dict[str, Any], file_name: str, fallback_row_mode: str) -> Dict[str, Any]:
    """Post-process one model-produced record.

    Fills file/row_mode defaults, ensures chemical/endpoint keys exist,
    validates risk_stance against the controlled vocabulary, clamps
    risk_confidence into [0, 1], and converts literal "null" strings to None.
    """
    out = dict(rec or {})
    out["file"] = out.get("file") or file_name

    mode = out.get("row_mode") or fallback_row_mode
    if mode not in ("one_row_per_paper", "one_row_per_chemical_endpoint"):
        mode = fallback_row_mode
    out["row_mode"] = mode

    # Guarantee the core identity keys exist (values may legitimately be None).
    out.setdefault("chemical", None)
    out.setdefault("endpoint", None)

    stance = out.get("risk_stance")
    if stance is not None and stance not in RISK_STANCE_ENUM:
        out["risk_stance"] = "insufficient_data"

    conf = out.get("risk_confidence")
    if conf is not None:
        try:
            out["risk_confidence"] = max(0.0, min(1.0, float(conf)))
        except Exception:
            out["risk_confidence"] = None

    # Convert literal "null" strings into real nulls.
    for key, value in list(out.items()):
        if isinstance(value, str) and value.strip().lower() == "null":
            out[key] = None

    return out
444
+
445
+
446
def cap_records(records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Truncate runaway model output to DEFAULT_MAX_RECORDS_PER_PDF records;
    lists already under the cap are returned unchanged."""
    if len(records) > DEFAULT_MAX_RECORDS_PER_PDF:
        return records[:DEFAULT_MAX_RECORDS_PER_PDF]
    return records
448
+
449
+
450
def build_overview_df(records: List[Dict[str, Any]]) -> pd.DataFrame:
    """Project the extracted records onto the key overview columns.

    Falls back to the first 50 raw rows when none of the preferred columns
    are present in the data.
    """
    preferred = ["file", "paper_title", "risk_stance", "risk_confidence", "row_mode", "chemical", "endpoint"]
    if not records:
        return pd.DataFrame(columns=preferred)
    df = pd.DataFrame(records)
    cols = [name for name in preferred if name in df.columns]
    # Some outputs carry a plural "chemicals" column instead of "chemical".
    if "chemicals" in df.columns and "chemical" not in cols:
        cols.append("chemicals")
    return df[cols].copy() if cols else df.head(50)
458
+
459
+
460
+ # =============================
461
+ # Entrypoint called by app.py
462
+ # =============================
463
def run_extraction(
    files,
    api_key: str,
    model: str,
    max_pages: int,
    max_context_chars: int,
    endpoint_families: List[str],
    oecd_tgs: List[str],
    vocab_json: str,
    spec_json: str,
) -> Tuple[Dict[str, Any], str, pd.DataFrame, str, str]:
    """
    End-to-end extraction over a batch of uploaded PDFs (entrypoint for app.py).

    Per file: page extraction -> text-based check -> chunking -> keyword chunk
    selection -> (optional) chemical scan -> structured LLM extraction ->
    record normalization + evidence verification.

    Returns:
      run_state (dict), status (str), overview_df (pd.DataFrame), csv_path (str), details_path (str)
    """
    if not files:
        empty = {"records": [], "evidence": [], "details": []}
        return empty, "Upload at least one PDF.", build_overview_df([]), "", ""

    vocab, spec = parse_admin_json(vocab_json, spec_json)
    schema = build_output_schema(vocab, spec)
    client = get_openai_client(api_key)

    query_terms = keyword_terms_for_selection(endpoint_families, oecd_tgs, vocab)

    system_prompt = (
        "You are a toxicology literature extraction assistant for an industry safety assessor.\n"
        "Hard rules:\n"
        "1) Stay strictly grounded to provided PAGE text. If missing, use null or 'not_reported'.\n"
        "2) Neutral synthesis only; do not over-interpret.\n"
        "3) Row-mode policy:\n"
        "   - If paper focuses on a single primary chemical => one_row_per_paper.\n"
        "   - If multiple chemicals => one_row_per_chemical_endpoint.\n"
        "4) Endpoint filtering:\n"
        "   - Only include endpoint-related records for user-selected endpoint families / OECD TGs.\n"
        "   - If TGs are provided, prefer them; do not invent TG numbers.\n"
        "5) Evidence:\n"
        "   - Provide evidence quotes with page numbers for key fields (risk_stance, risk_summary, key_findings, conclusion, OECD TG fields).\n"
        "6) For one_row_per_chemical_endpoint:\n"
        "   - Each record should map to ONE chemical and ONE endpoint (family or TG).\n"
        "   - Put the chemical name in 'chemical' and the endpoint label in 'endpoint'.\n"
        "7) For one_row_per_paper:\n"
        "   - Use 'chemical' only if the primary chemical is explicit; 'endpoint' may be null.\n"
    )

    all_records: List[Dict[str, Any]] = []
    all_evidence: List[Dict[str, Any]] = []
    details: List[Dict[str, Any]] = []

    for f in files:
        # Upload objects expose the temp path via .name (gradio-style uploads).
        pdf_path = f.name
        file_name = os.path.basename(pdf_path)

        pages, total_pages = extract_pages(pdf_path, max_pages=max_pages)
        if not is_text_based(pages):
            # Scanned/image-only PDF: emit one placeholder row so the file
            # still appears in the output, then skip the LLM entirely.
            rec = {"file": file_name, "row_mode": "one_row_per_paper", "chemical": None, "endpoint": None}
            for row in spec:
                fld = (row.get("field") or "").strip()
                if not fld:
                    continue
                rec[fld] = "insufficient_data" if fld == "risk_stance" else None
            all_records.append(rec)
            details.append({"file": file_name, "text_based": False, "pages_total": total_pages, "pages_indexed": 0, "reason": "no_extractable_text"})
            continue

        chunks = chunk_pages(pages, chunk_size=DEFAULT_CHUNK_SIZE, overlap=DEFAULT_CHUNK_OVERLAP)
        selected_chunks, sel_debug = select_chunks(chunks, max_context_chars=max_context_chars, query_terms=query_terms)
        context = build_context(selected_chunks, file_name=file_name)

        # heuristic seed using CAS hits: >1 distinct CAS number suggests a
        # multi-chemical paper
        cas_hits = sorted(list({m.group(0) for _, t in pages for m in CAS_RE.finditer(t or "")}))
        fallback_row_mode = "one_row_per_paper" if len(cas_hits) <= 1 else "one_row_per_chemical_endpoint"

        # optional LLM chem scan to seed row-mode more robustly
        chem_scan = {"chemicals": [], "notes": "disabled"}
        if ENABLE_CHEM_SCAN:
            # keep scan context smaller
            scan_ctx = context[: min(len(context), 12000)]
            try:
                chem_scan = quick_chem_scan(client, model, scan_ctx)
                names = [c.get("name") for c in (chem_scan.get("chemicals") or []) if isinstance(c, dict)]
                names = [n for n in names if isinstance(n, str) and n.strip()]
                if len(names) > 1:
                    fallback_row_mode = "one_row_per_chemical_endpoint"
            except Exception as e:
                # best-effort: a failed scan must not abort the extraction
                chem_scan = {"chemicals": [], "notes": f"scan_failed: {e}"}

        user_prompt = (
            f"USER_SELECTED_ENDPOINTS:\n{json.dumps({'families': endpoint_families or [], 'oecd_tgs': oecd_tgs or []}, indent=2)}\n\n"
            f"CHEM_SCAN_HINT:\n{json.dumps(chem_scan, indent=2)}\n\n"
            f"FIELD_SPEC:\n{json.dumps(spec, indent=2)}\n\n"
            f"PAGE_TEXT:\n{context}\n\n"
            "Return JSON matching the schema."
        )

        t0 = time.time()
        parsed = openai_structured_extract(client, model, schema, system_prompt, user_prompt)
        dt = time.time() - t0

        recs = cap_records([(r or {}) for r in (parsed.get("records") or [])])
        ev = (parsed.get("evidence") or []) if isinstance(parsed.get("evidence"), list) else []

        recs2 = [normalize_record(r, file_name, fallback_row_mode) for r in recs]

        # record_index from the model is per-file; shift it so it indexes
        # into the batch-wide all_records list.
        base_index = len(all_records)
        all_records.extend(recs2)

        ev2: List[Dict[str, Any]] = []
        for e in ev:
            if not isinstance(e, dict):
                continue
            try:
                ridx = int(e.get("record_index", 0))
            except Exception:
                ridx = 0
            e2 = dict(e)
            # clamp the model's index to this file's records, then shift
            e2["record_index"] = base_index + max(0, min(ridx, len(recs2) - 1 if recs2 else 0))
            try:
                e2["page"] = int(e2.get("page", 0))
            except Exception:
                e2["page"] = 0
            e2["field"] = str(e2.get("field", ""))
            e2["quote"] = str(e2.get("quote", "")).strip()
            if e2["quote"]:
                # evidence without a usable quote is dropped
                ev2.append(e2)

        all_evidence.extend(ev2)
        ver = verify_evidence_quotes(ev2, selected_chunks)

        details.append({
            "file": file_name,
            "text_based": True,
            "pages_total": total_pages,
            "pages_indexed": min(total_pages, max_pages) if max_pages and max_pages > 0 else total_pages,
            "chunk_size": DEFAULT_CHUNK_SIZE,
            "chunk_overlap": DEFAULT_CHUNK_OVERLAP,
            "selection": sel_debug,
            "runtime_s": round(dt, 2),
            "cas_hits": cas_hits[:30],
            "chem_scan_notes": chem_scan.get("notes", ""),
            "evidence_verification": ver,
            "notes": parsed.get("notes", ""),
        })

    overview_df = build_overview_df(all_records)

    # Persist outputs to /tmp so the UI can offer them as downloads.
    ts = int(time.time())
    csv_path = f"/tmp/toxra_extraction_{ts}.csv"
    details_path = f"/tmp/toxra_details_{ts}.json"

    pd.DataFrame(all_records).to_csv(csv_path, index=False)
    with open(details_path, "w", encoding="utf-8") as f:
        json.dump({"records": all_records, "evidence": all_evidence, "details": details}, f, indent=2)

    status = f"✅ Extracted {len(all_records)} record(s) from {len(files)} PDF(s)."
    run_state = {"records": all_records, "evidence": all_evidence, "details": details, "csv_path": csv_path, "details_path": details_path}
    return run_state, status, overview_df, csv_path, details_path