hchevva commited on
Commit
630d650
·
verified ·
1 Parent(s): 4bf9d97

Upload 43 files

Browse files
Files changed (43) hide show
  1. mcp_tox_calc/__init__.py +1 -0
  2. mcp_tox_calc/__pycache__/__init__.cpython-314.pyc +0 -0
  3. mcp_tox_calc/__pycache__/equations.cpython-314.pyc +0 -0
  4. mcp_tox_calc/__pycache__/logging.cpython-314.pyc +0 -0
  5. mcp_tox_calc/__pycache__/server.cpython-314.pyc +0 -0
  6. mcp_tox_calc/__pycache__/units.cpython-314.pyc +0 -0
  7. mcp_tox_calc/equations.py +275 -0
  8. mcp_tox_calc/logging.py +58 -0
  9. mcp_tox_calc/server.py +183 -0
  10. mcp_tox_calc/units.py +152 -0
  11. regulatory_catalog/epa_cancer_v2005.json +70 -0
  12. regulatory_catalog/fda_ctp_v2024_06.json +70 -0
  13. scripts/__pycache__/replay_calc_log.cpython-314.pyc +0 -0
  14. scripts/__pycache__/run_cancer_risk_batch.cpython-314.pyc +0 -0
  15. scripts/__pycache__/run_mcp_calc_server.cpython-314.pyc +0 -0
  16. scripts/replay_calc_log.py +37 -0
  17. scripts/run_cancer_risk_batch.py +57 -0
  18. scripts/run_mcp_calc_server.py +13 -0
  19. tests/__pycache__/conftest.cpython-314.pyc +0 -0
  20. tests/__pycache__/test_equations.cpython-314.pyc +0 -0
  21. tests/__pycache__/test_mcp_tools.cpython-314.pyc +0 -0
  22. tests/__pycache__/test_nlp_pipeline.cpython-314.pyc +0 -0
  23. tests/__pycache__/test_regulatory_mapper.cpython-314.pyc +0 -0
  24. tests/__pycache__/test_units.cpython-314.pyc +0 -0
  25. tests/conftest.py +6 -0
  26. tests/fixtures/extraction_sample.json +41 -0
  27. tests/test_equations.py +86 -0
  28. tests/test_mcp_tools.py +38 -0
  29. tests/test_nlp_pipeline.py +46 -0
  30. tests/test_regulatory_mapper.py +16 -0
  31. tests/test_units.py +31 -0
  32. toxra_core/__init__.py +1 -0
  33. toxra_core/__pycache__/__init__.cpython-314.pyc +0 -0
  34. toxra_core/__pycache__/artifacts.cpython-314.pyc +0 -0
  35. toxra_core/__pycache__/calculation_client.cpython-314.pyc +0 -0
  36. toxra_core/__pycache__/contracts.cpython-314.pyc +0 -0
  37. toxra_core/__pycache__/nlp_pipeline.cpython-314.pyc +0 -0
  38. toxra_core/__pycache__/regulatory_mapper.cpython-314.pyc +0 -0
  39. toxra_core/artifacts.py +51 -0
  40. toxra_core/calculation_client.py +150 -0
  41. toxra_core/contracts.py +67 -0
  42. toxra_core/nlp_pipeline.py +284 -0
  43. toxra_core/regulatory_mapper.py +254 -0
mcp_tox_calc/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Deterministic toxicology calculation engine exposed through a local MCP server."""
mcp_tox_calc/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (217 Bytes). View file
 
mcp_tox_calc/__pycache__/equations.cpython-314.pyc ADDED
Binary file (13.3 kB). View file
 
mcp_tox_calc/__pycache__/logging.cpython-314.pyc ADDED
Binary file (4.89 kB). View file
 
mcp_tox_calc/__pycache__/server.cpython-314.pyc ADDED
Binary file (8.62 kB). View file
 
mcp_tox_calc/__pycache__/units.cpython-314.pyc ADDED
Binary file (6.81 kB). View file
 
mcp_tox_calc/equations.py ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Dict, List, Optional
2
+
3
+ from toxra_core.contracts import classify_risk_tier
4
+
5
+ from .units import (
6
+ UnitError,
7
+ normalize_air_concentration,
8
+ normalize_csf,
9
+ normalize_iur,
10
+ normalize_oral_exposure,
11
+ normalize_route,
12
+ )
13
+
14
+ FORMULA_VERSION = "1.0.0"
15
+
16
+
17
class CalculationError(ValueError):
    """Raised when a deterministic risk calculation cannot be completed (bad inputs/units)."""

    pass
19
+
20
+
21
def _base_result(formula_id: str) -> Dict[str, Any]:
    """Return a fresh result envelope shared by every calculation tool."""
    envelope: Dict[str, Any] = dict(
        formula_id=formula_id,
        formula_version=FORMULA_VERSION,
        inputs_normalized={},
        unit_conversions=[],
        result_value=None,
        risk_tier="unknown",
        warnings=[],
        log_ref="",
    )
    return envelope
32
+
33
+
34
def validate_risk_input(payload: Dict[str, Any]) -> Dict[str, Any]:
    """Check that a row payload carries enough data for a deterministic ELCR run.

    Returns the standard result envelope extended with `valid` (bool) and
    `errors` (list of messages); result_value is 1.0 when valid, else 0.0.
    """
    out = _base_result("validate_risk_input")
    problems: List[str] = []
    notes: List[str] = []

    try:
        route = normalize_route(payload.get("route"))
    except Exception as exc:  # report normalization failure instead of raising
        route = ""
        problems.append(str(exc))

    def _present(key: str) -> bool:
        return payload.get(key) not in (None, "")

    has_csf = _present("csf_value") and _present("exposure_value")
    has_iur = _present("iur_value") and _present("air_conc_value")

    if route == "oral" and not has_csf:
        problems.append("Oral route requires csf_value and exposure_value for CSF pathway.")

    if route == "inhalation" and not (has_iur or has_csf):
        problems.append("Inhalation route requires iur_value+air_conc_value or csf_value+exposure_value.")

    if has_csf and not _present("csf_unit"):
        notes.append("csf_unit missing; assuming standard per (mg/kg-day).")

    if has_iur and not _present("air_conc_unit"):
        notes.append("air_conc_unit missing; assuming ug/m3.")

    out["warnings"] = notes
    out["valid"] = len(problems) == 0
    out["errors"] = problems
    out["result_value"] = 1.0 if out["valid"] else 0.0
    out["risk_tier"] = "unknown"
    return out
66
+
67
+
68
def calculate_epa_elcr_csf(payload: Dict[str, Any]) -> Dict[str, Any]:
    """EPA oral pathway: ELCR = CDI (mg/kg-day) * CSF ((mg/kg-day)^-1).

    Raises CalculationError when route, exposure, or CSF inputs cannot be
    normalized to canonical units.
    """
    out = _base_result("calculate_epa_elcr_csf")
    try:
        normalize_route(payload.get("route"))
        exposure = normalize_oral_exposure(
            payload.get("exposure_value"),
            payload.get("exposure_unit"),
            payload.get("body_weight_kg"),
        )
        slope = normalize_csf(payload.get("csf_value"), payload.get("csf_unit"))

        cdi = exposure["value_mg_per_kg_day"]
        potency = slope["value_per_mg_per_kg_day"]
        out["inputs_normalized"] = {
            "cdi_mg_per_kg_day": cdi,
            "csf_per_mg_per_kg_day": potency,
        }
        out["unit_conversions"] = exposure["conversions"] + slope["conversions"]
        out["result_value"] = float(cdi * potency)
        out["risk_tier"] = classify_risk_tier(out["result_value"])
        return out
    except (UnitError, ValueError) as exc:
        raise CalculationError(str(exc)) from exc
90
+
91
+
92
def calculate_epa_elcr_iur(payload: Dict[str, Any]) -> Dict[str, Any]:
    """EPA inhalation pathway: ELCR = air concentration (ug/m3) * IUR ((ug/m3)^-1).

    A non-inhalation route is allowed but produces a warning. Raises
    CalculationError when inputs cannot be normalized.
    """
    out = _base_result("calculate_epa_elcr_iur")
    try:
        if normalize_route(payload.get("route")) != "inhalation":
            out["warnings"].append("IUR calculation is generally applicable to inhalation route.")

        conc = normalize_air_concentration(payload.get("air_conc_value"), payload.get("air_conc_unit"))
        unit_risk = normalize_iur(payload.get("iur_value"), payload.get("iur_unit"))

        out["inputs_normalized"] = {
            "air_conc_ug_per_m3": conc["value_ug_per_m3"],
            "iur_per_ug_per_m3": unit_risk["value_per_ug_per_m3"],
        }
        out["unit_conversions"] = conc["conversions"] + unit_risk["conversions"]
        out["result_value"] = float(conc["value_ug_per_m3"] * unit_risk["value_per_ug_per_m3"])
        out["risk_tier"] = classify_risk_tier(out["result_value"])
        return out
    except (UnitError, ValueError) as exc:
        raise CalculationError(str(exc)) from exc
113
+
114
+
115
def calculate_fda_ctp_elcr(payload: Dict[str, Any]) -> Dict[str, Any]:
    """FDA CTP-style profile ELCR: sum of per-constituent EPA pathway results.

    Accepts either a single-row payload or a multi-constituent list under
    "constituents". Each constituent contributes a CSF-pathway term, an
    IUR-pathway term, or both, depending on which inputs are present.
    """
    out = _base_result("calculate_fda_ctp_elcr")

    constituents = payload.get("constituents")
    if isinstance(constituents, list) and constituents:
        rows: List[Dict[str, Any]] = [item for item in constituents if isinstance(item, dict)]
    else:
        rows = [payload]

    components: List[Dict[str, Any]] = []
    grand_total = 0.0

    for row in rows:
        entry: Dict[str, Any] = {
            "chemical_name": row.get("chemical_name", ""),
            "route": row.get("route", ""),
            "csf_result": None,
            "iur_result": None,
            "component_total": 0.0,
        }

        subtotal = 0.0
        # CSF pathway only when both potency and exposure are supplied.
        if row.get("csf_value") not in (None, "") and row.get("exposure_value") not in (None, ""):
            entry["csf_result"] = calculate_epa_elcr_csf(row)
            subtotal += float(entry["csf_result"]["result_value"] or 0.0)

        # IUR pathway only when both unit risk and air concentration are supplied.
        if row.get("iur_value") not in (None, "") and row.get("air_conc_value") not in (None, ""):
            entry["iur_result"] = calculate_epa_elcr_iur(row)
            subtotal += float(entry["iur_result"]["result_value"] or 0.0)

        entry["component_total"] = subtotal
        grand_total += subtotal
        components.append(entry)

    out["inputs_normalized"] = {"component_count": len(components)}
    out["unit_conversions"] = []
    out["result_value"] = float(grand_total)
    out["risk_tier"] = classify_risk_tier(out["result_value"])
    out["component_results"] = components
    return out
156
+
157
+
158
def get_formula_catalog() -> Dict[str, Any]:
    """Describe the deterministic formulas this engine exposes, with version info."""
    catalog_entries = [
        {
            "id": "calculate_epa_elcr_csf",
            "equation": "ELCR = CDI (mg/kg-day) * CSF ((mg/kg-day)^-1)",
            "notes": "Oral pathway using cancer slope factor.",
        },
        {
            "id": "calculate_epa_elcr_iur",
            "equation": "ELCR = Air Concentration (ug/m3) * IUR ((ug/m3)^-1)",
            "notes": "Inhalation pathway using inhalation unit risk.",
        },
        {
            "id": "calculate_fda_ctp_elcr",
            "equation": "ELCR_total = sum(component ELCR)",
            "notes": "Constituent-level aggregation wrapper for CTP-style profile assessment.",
        },
    ]
    # Same envelope shape as the calculation tools, with the catalog attached.
    return {
        "formula_id": "get_formula_catalog",
        "formula_version": FORMULA_VERSION,
        "inputs_normalized": {},
        "unit_conversions": [],
        "result_value": None,
        "risk_tier": "unknown",
        "warnings": [],
        "log_ref": "",
        "formulas": catalog_entries,
    }
186
+
187
+
188
def run_batch_cancer_risk(payload: Dict[str, Any]) -> Dict[str, Any]:
    """Run validation plus all ELCR calculations over payload["rows"].

    Each row is independently validated and computed; failures are captured
    as error rows rather than aborting the batch. Returns the standard
    result envelope with `rows` (per-row outcomes) and `summary` (counts);
    result_value is the number of successful rows.

    Raises:
        CalculationError: if payload["rows"] is present but not a list.
    """
    out = _base_result("run_batch_cancer_risk")
    # Non-dict payloads degrade to an empty batch rather than raising.
    rows = payload.get("rows", []) if isinstance(payload, dict) else []
    if not isinstance(rows, list):
        raise CalculationError("rows must be a list of objects")

    row_results: List[Dict[str, Any]] = []
    n_ok = 0
    n_err = 0

    for i, row in enumerate(rows):
        # Reject non-object rows up front with a row-level error.
        if not isinstance(row, dict):
            row_results.append(
                {
                    "row_index": i,
                    "status": "error",
                    "error": "row must be an object",
                }
            )
            n_err += 1
            continue

        # Validation failures become error rows carrying the validator output.
        v = validate_risk_input(row)
        if not v.get("valid", False):
            row_results.append(
                {
                    "row_index": i,
                    "record_id": row.get("record_id", ""),
                    "chemical_name": row.get("chemical_name", ""),
                    "status": "error",
                    "errors": v.get("errors", []),
                    "warnings": v.get("warnings", []),
                }
            )
            n_err += 1
            continue

        try:
            # Individual pathway results are reported alongside the combined
            # FDA CTP total, when their inputs are present.
            csf_res: Optional[Dict[str, Any]] = None
            iur_res: Optional[Dict[str, Any]] = None
            if row.get("csf_value") not in (None, "") and row.get("exposure_value") not in (None, ""):
                csf_res = calculate_epa_elcr_csf(row)
            if row.get("iur_value") not in (None, "") and row.get("air_conc_value") not in (None, ""):
                iur_res = calculate_epa_elcr_iur(row)

            fda_res = calculate_fda_ctp_elcr(row)

            row_out = {
                "row_index": i,
                "record_id": row.get("record_id", ""),
                "chemical_name": row.get("chemical_name", ""),
                "casrn": row.get("casrn", ""),
                "route": row.get("route", ""),
                "status": "ok",
                # Missing pathway results serialize as "" rather than None.
                "epa_elcr_csf": (csf_res or {}).get("result_value", ""),
                "epa_elcr_iur": (iur_res or {}).get("result_value", ""),
                "fda_ctp_elcr": fda_res.get("result_value", ""),
                "risk_tier": fda_res.get("risk_tier", "unknown"),
                "formula_id": fda_res.get("formula_id", "calculate_fda_ctp_elcr"),
                "formula_version": fda_res.get("formula_version", FORMULA_VERSION),
                "inputs_normalized": fda_res.get("inputs_normalized", {}),
                "unit_conversions": fda_res.get("unit_conversions", []),
                "warnings": (v.get("warnings", []) + fda_res.get("warnings", [])),
                "log_ref": "",
            }
            row_results.append(row_out)
            n_ok += 1
        except Exception as exc:
            # Any calculation failure is contained to this row.
            row_results.append(
                {
                    "row_index": i,
                    "record_id": row.get("record_id", ""),
                    "chemical_name": row.get("chemical_name", ""),
                    "status": "error",
                    "errors": [str(exc)],
                }
            )
            n_err += 1

    out["rows"] = row_results
    out["summary"] = {
        "total_rows": len(rows),
        "ok_rows": n_ok,
        "error_rows": n_err,
    }
    out["result_value"] = float(n_ok)
    out["risk_tier"] = "unknown"
    return out
mcp_tox_calc/logging.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime as dt
2
+ import json
3
+ import uuid
4
+ from pathlib import Path
5
+ from typing import Any, Dict, List
6
+
7
+
8
class RunLogger:
    """Append-only JSONL event log plus a Markdown summary report for one run.

    Each tool call is appended to ``cancer_risk_log.jsonl``; ``write_report``
    renders a human-readable summary to ``cancer_risk_report.md``.
    """

    def __init__(self, run_dir: str):
        self.run_dir = Path(run_dir)
        # Create the run directory eagerly so the first append cannot fail.
        self.run_dir.mkdir(parents=True, exist_ok=True)
        self.log_path = self.run_dir / "cancer_risk_log.jsonl"
        self.report_path = self.run_dir / "cancer_risk_report.md"

    def log_event(self, tool_name: str, request_args: Dict[str, Any], response: Dict[str, Any]) -> str:
        """Append one tool-call event to the JSONL log and return its event id."""
        event_id = f"evt_{uuid.uuid4().hex[:10]}"
        # datetime.utcnow() is deprecated (naive; removed path in 3.12+): use an
        # aware UTC timestamp and keep the original trailing-"Z" string format.
        timestamp = dt.datetime.now(dt.timezone.utc).isoformat().replace("+00:00", "Z")
        row = {
            "event_id": event_id,
            "timestamp_utc": timestamp,
            "tool_name": tool_name,
            "request": request_args,
            "response": response,
        }
        with self.log_path.open("a", encoding="utf-8") as f:
            f.write(json.dumps(row, ensure_ascii=True) + "\n")
        return event_id

    def write_report(self, summary: Dict[str, Any], rows: List[Dict[str, Any]]) -> Path:
        """Render a Markdown run report (summary, high-priority rows, data-quality alerts)."""
        lines = [
            "# Cancer Risk Calculation Report",
            "",
            "## Run Summary",
            f"- Total rows: {summary.get('total_rows', 0)}",
            f"- Successful rows: {summary.get('ok_rows', 0)}",
            f"- Error rows: {summary.get('error_rows', 0)}",
            "",
            "## High Priority Rows",
        ]

        high = [r for r in rows if str(r.get("risk_tier", "")).strip().lower() == "high_priority"]
        if high:
            # Cap at 100 entries to keep the report readable for large batches.
            for r in high[:100]:
                lines.append(
                    f"- record_id={r.get('record_id','')} chemical={r.get('chemical_name','')} fda_ctp_elcr={r.get('fda_ctp_elcr','')}"
                )
        else:
            lines.append("- None")

        lines += ["", "## Data Quality Alerts"]
        errors = [r for r in rows if r.get("status") == "error"]
        if errors:
            for r in errors[:100]:
                lines.append(f"- row_index={r.get('row_index')} errors={r.get('errors', [])}")
        else:
            lines.append("- No row-level errors.")

        self.report_path.write_text("\n".join(lines), encoding="utf-8")
        return self.report_path
mcp_tox_calc/server.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import sys
4
+ from pathlib import Path
5
+ from typing import Any, Callable, Dict
6
+
7
+ from mcp_tox_calc.equations import (
8
+ CalculationError,
9
+ calculate_epa_elcr_csf,
10
+ calculate_epa_elcr_iur,
11
+ calculate_fda_ctp_elcr,
12
+ get_formula_catalog,
13
+ run_batch_cancer_risk,
14
+ validate_risk_input,
15
+ )
16
+ from mcp_tox_calc.logging import RunLogger
17
+
18
+
19
+ ToolFn = Callable[[Dict[str, Any]], Dict[str, Any]]
20
+
21
+
22
class ToxCalcMCPServer:
    """Minimal MCP-style JSON-RPC server exposing the deterministic calc tools.

    Handles `initialize`, `tools/list`, and `tools/call`; every tool call is
    logged through RunLogger, and batch runs additionally produce a report.
    """

    def __init__(self, run_dir: str):
        self.run_dir = str(Path(run_dir))
        self.logger = RunLogger(self.run_dir)
        # Tool registry: name -> description, JSON schema, and handler callable.
        self.tools: Dict[str, Dict[str, Any]] = {
            "validate_risk_input": {
                "description": "Validate a row payload for deterministic cancer risk calculations.",
                "inputSchema": {"type": "object", "properties": {}, "additionalProperties": True},
                "fn": validate_risk_input,
            },
            "calculate_epa_elcr_csf": {
                "description": "Compute ELCR using EPA CSF pathway.",
                "inputSchema": {"type": "object", "properties": {}, "additionalProperties": True},
                "fn": calculate_epa_elcr_csf,
            },
            "calculate_epa_elcr_iur": {
                "description": "Compute ELCR using EPA IUR pathway.",
                "inputSchema": {"type": "object", "properties": {}, "additionalProperties": True},
                "fn": calculate_epa_elcr_iur,
            },
            "calculate_fda_ctp_elcr": {
                "description": "Compute ELCR profile using FDA CTP-style constituent aggregation.",
                "inputSchema": {"type": "object", "properties": {}, "additionalProperties": True},
                "fn": calculate_fda_ctp_elcr,
            },
            "run_batch_cancer_risk": {
                "description": "Run deterministic cancer risk calculations across a batch of rows.",
                "inputSchema": {
                    "type": "object",
                    "properties": {"rows": {"type": "array", "items": {"type": "object"}}},
                    "required": ["rows"],
                    "additionalProperties": True,
                },
                "fn": run_batch_cancer_risk,
            },
            "get_formula_catalog": {
                "description": "Return available formula catalog and version.",
                "inputSchema": {"type": "object", "properties": {}, "additionalProperties": True},
                # Catalog takes no arguments; adapt to the common (args) -> dict shape.
                "fn": lambda _args: get_formula_catalog(),
            },
        }

    def handle_request(self, req: Dict[str, Any]) -> Dict[str, Any]:
        """Dispatch one JSON-RPC request dict and return the response dict.

        Known methods: initialize, tools/list, tools/call. Unknown methods
        and unknown tool names return JSON-RPC error objects rather than raising.
        """
        method = req.get("method")
        req_id = req.get("id")

        if method == "initialize":
            return {
                "jsonrpc": "2.0",
                "id": req_id,
                "result": {
                    "protocolVersion": "2024-11-05",
                    "serverInfo": {"name": "toxra-calc-mcp", "version": "0.1.0"},
                    "capabilities": {"tools": {}},
                },
            }

        if method == "tools/list":
            # Expose only the public metadata; never the handler callables.
            tools = []
            for name, meta in self.tools.items():
                tools.append(
                    {
                        "name": name,
                        "description": meta["description"],
                        "inputSchema": meta["inputSchema"],
                    }
                )
            return {"jsonrpc": "2.0", "id": req_id, "result": {"tools": tools}}

        if method == "tools/call":
            params = req.get("params", {}) or {}
            name = params.get("name")
            args = params.get("arguments", {}) or {}

            if name not in self.tools:
                return {
                    "jsonrpc": "2.0",
                    "id": req_id,
                    "error": {"code": -32602, "message": f"Unknown tool: {name}"},
                }

            fn: ToolFn = self.tools[name]["fn"]
            try:
                result = fn(args)
                if not isinstance(result, dict):
                    result = {"value": result}

                # Attach structured log reference per tool call.
                log_ref = self.logger.log_event(name, args, result)
                result.setdefault("log_ref", log_ref)

                if name == "run_batch_cancer_risk":
                    # Backfill the standard envelope fields on each row and
                    # write the Markdown report artifact for the batch.
                    rows = result.get("rows", []) if isinstance(result.get("rows", []), list) else []
                    for row in rows:
                        if isinstance(row, dict):
                            row.setdefault("formula_id", "calculate_fda_ctp_elcr")
                            row.setdefault("formula_version", result.get("formula_version", "1.0.0"))
                            row.setdefault("inputs_normalized", {})
                            row.setdefault("unit_conversions", [])
                            row.setdefault("result_value", row.get("fda_ctp_elcr", ""))
                            row.setdefault("risk_tier", row.get("risk_tier", "unknown"))
                            row.setdefault("warnings", row.get("warnings", []))
                            row.setdefault("log_ref", log_ref)
                    report_path = self.logger.write_report(result.get("summary", {}), rows)
                    result["artifacts"] = {
                        "run_dir": self.run_dir,
                        "log_jsonl": str(self.logger.log_path),
                        "report_md": str(report_path),
                    }

                # NOTE(review): content items use {"type": "json"}; confirm the
                # connected MCP clients accept this content type.
                content = [{"type": "json", "json": result}]
                return {"jsonrpc": "2.0", "id": req_id, "result": {"content": content}}
            except CalculationError as exc:
                # Expected calculation failures get a dedicated error code.
                return {
                    "jsonrpc": "2.0",
                    "id": req_id,
                    "error": {"code": -32001, "message": str(exc)},
                }
            except Exception as exc:
                return {
                    "jsonrpc": "2.0",
                    "id": req_id,
                    "error": {"code": -32099, "message": f"Unexpected tool error: {exc}"},
                }

        return {
            "jsonrpc": "2.0",
            "id": req_id,
            "error": {"code": -32601, "message": f"Method not found: {method}"},
        }
152
+
153
+
154
def _serve_stdio(server: ToxCalcMCPServer) -> None:
    """Run a line-delimited JSON-RPC loop over stdin/stdout until EOF."""
    for raw in sys.stdin:
        message = raw.strip()
        if not message:
            continue  # skip blank lines
        try:
            reply = server.handle_request(json.loads(message))
        except Exception as exc:  # malformed JSON or dispatch failure
            reply = {
                "jsonrpc": "2.0",
                "id": None,
                "error": {"code": -32700, "message": f"Parse/dispatch error: {exc}"},
            }
        sys.stdout.write(json.dumps(reply) + "\n")
        sys.stdout.flush()
170
+
171
+
172
def main() -> None:
    """CLI entry point: construct the calc server and serve JSON-RPC over stdio.

    NOTE(review): the --stdio flag is parsed but never checked — the stdio
    loop always runs regardless of whether it is passed. Confirm whether
    other transports were planned before honoring the flag.
    """
    parser = argparse.ArgumentParser(description="Local MCP server for deterministic toxicology calculations")
    parser.add_argument("--stdio", action="store_true", default=False, help="Run stdio JSON-RPC loop")
    parser.add_argument("--run-dir", default="runs/mcp_server", help="Run artifact directory")
    args = parser.parse_args()

    server = ToxCalcMCPServer(run_dir=args.run_dir)
    _serve_stdio(server)
180
+
181
+
182
+ if __name__ == "__main__":
183
+ main()
mcp_tox_calc/units.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import Any, Dict
3
+
4
+
5
class UnitError(ValueError):
    """Raised when a value or unit string cannot be normalized to canonical units."""

    pass
7
+
8
+
9
+ def _to_float(v: Any, field: str) -> float:
10
+ try:
11
+ x = float(v)
12
+ except Exception as exc:
13
+ raise UnitError(f"{field} must be numeric.") from exc
14
+ return x
15
+
16
+
17
+ def _norm_unit(unit: Any) -> str:
18
+ s = str(unit or "").strip().lower()
19
+ s = s.replace(" ", "")
20
+ s = s.replace("μ", "u")
21
+ return s
22
+
23
+
24
def normalize_route(route: Any) -> str:
    """Return the canonical exposure route ('oral' or 'inhalation') or raise UnitError."""
    canonical = str(route or "").strip().lower()
    if canonical not in {"oral", "inhalation"}:
        raise UnitError("route must be oral or inhalation")
    return canonical
29
+
30
+
31
def normalize_oral_exposure(exposure_value: Any, exposure_unit: Any, body_weight_kg: Any) -> Dict[str, Any]:
    """Convert an oral exposure to canonical mg/kg-day (chronic daily intake).

    Per-day units (mg/day, ug/day) require a positive body_weight_kg.
    Returns the normalized value, canonical unit label, and conversions applied.
    """
    amount = _to_float(exposure_value, "exposure_value")
    unit = _norm_unit(exposure_unit)
    weight = _to_float(body_weight_kg, "body_weight_kg") if body_weight_kg not in (None, "") else None

    applied = []

    if unit in {"mg/kg-day", "mg/kg/d", "mg/kgday", "mgkgday"}:
        cdi = amount  # already canonical
    elif unit in {"ug/kg-day", "ug/kg/d", "ug/kgday", "ugkgday"}:
        cdi = amount / 1000.0
        applied.append("exposure ug/kg-day -> mg/kg-day")
    elif unit in {"mg/day", "mg/d", "mgday"}:
        if weight is None or weight <= 0:
            raise UnitError("body_weight_kg is required for exposure unit mg/day")
        cdi = amount / weight
        applied.append("exposure mg/day -> mg/kg-day")
    elif unit in {"ug/day", "ug/d", "ugday"}:
        if weight is None or weight <= 0:
            raise UnitError("body_weight_kg is required for exposure unit ug/day")
        cdi = (amount / 1000.0) / weight
        applied.append("exposure ug/day -> mg/kg-day")
    else:
        raise UnitError(f"Unsupported oral exposure unit: {exposure_unit}")

    return {
        "value_mg_per_kg_day": cdi,
        "unit": "mg/kg-day",
        "conversions": applied,
    }
61
+
62
+
63
def normalize_air_concentration(air_conc_value: Any, air_conc_unit: Any) -> Dict[str, Any]:
    """Convert an air concentration to canonical ug/m3."""
    amount = _to_float(air_conc_value, "air_conc_value")
    unit = _norm_unit(air_conc_unit)

    if unit in {"ug/m3", "ugm3", "ug/m^3"}:
        value, applied = amount, []
    elif unit in {"mg/m3", "mgm3", "mg/m^3"}:
        value, applied = amount * 1000.0, ["air concentration mg/m3 -> ug/m3"]
    elif unit in {"ng/m3", "ngm3", "ng/m^3"}:
        value, applied = amount / 1000.0, ["air concentration ng/m3 -> ug/m3"]
    else:
        raise UnitError(f"Unsupported air concentration unit: {air_conc_unit}")

    return {
        "value_ug_per_m3": value,
        "unit": "ug/m3",
        "conversions": applied,
    }
84
+
85
+
86
def normalize_csf(csf_value: Any, csf_unit: Any) -> Dict[str, Any]:
    """Convert a cancer slope factor to canonical (mg/kg-day)^-1.

    A blank or 'na' unit is accepted as already being in canonical units.
    """
    potency = _to_float(csf_value, "csf_value")
    unit = _norm_unit(csf_unit)

    mg_based = {
        "(mg/kg-day)^-1",
        "1/(mg/kg-day)",
        "per(mg/kg-day)",
        "permg/kg-day",
        "(mgkgday)^-1",
        "1/mgkgday",
    }
    ug_based = {
        "(ug/kg-day)^-1",
        "1/(ug/kg-day)",
        "per(ug/kg-day)",
        "(ugkgday)^-1",
        "1/ugkgday",
    }

    if unit in mg_based or unit in {"", "na", "n/a"}:
        normalized, applied = potency, []
    elif unit in ug_based:
        normalized, applied = potency * 1000.0, ["CSF per (ug/kg-day) -> per (mg/kg-day)"]
    else:
        raise UnitError(f"Unsupported csf unit: {csf_unit}")

    return {
        "value_per_mg_per_kg_day": normalized,
        "unit": "(mg/kg-day)^-1",
        "conversions": applied,
    }
119
+
120
+
121
def normalize_iur(iur_value: Any, iur_unit: Any) -> Dict[str, Any]:
    """Convert an inhalation unit risk to canonical (ug/m3)^-1.

    A blank or 'na' unit is accepted as already being in canonical units.
    """
    risk = _to_float(iur_value, "iur_value")
    unit = _norm_unit(iur_unit)

    ug_based = {
        "(ug/m3)^-1",
        "1/(ug/m3)",
        "per(ug/m3)",
        "1/ugm3",
        "(ugm3)^-1",
    }
    mg_based = {
        "(mg/m3)^-1",
        "1/(mg/m3)",
        "per(mg/m3)",
        "1/mgm3",
        "(mgm3)^-1",
    }

    if unit in ug_based or unit in {"", "na", "n/a"}:
        normalized, applied = risk, []
    elif unit in mg_based:
        normalized, applied = risk / 1000.0, ["IUR per (mg/m3) -> per (ug/m3)"]
    else:
        raise UnitError(f"Unsupported iur unit: {iur_unit}")

    return {
        "value_per_ug_per_m3": normalized,
        "unit": "(ug/m3)^-1",
        "conversions": applied,
    }
regulatory_catalog/epa_cancer_v2005.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "framework": "EPA",
3
+ "version": "2005",
4
+ "source": "EPA Guidelines for Carcinogen Risk Assessment (2005)",
5
+ "clauses": [
6
+ {
7
+ "clause_id": "EPA.CANCER.CSF.001",
8
+ "framework": "EPA",
9
+ "title": "Oral cancer slope factor applicability",
10
+ "description": "Assessment should include oral-dose based evidence and potency context for CSF application.",
11
+ "required_fields": [
12
+ "dose_metrics",
13
+ "exposure_route",
14
+ "carcinogenicity_result"
15
+ ],
16
+ "required_evidence_terms": [
17
+ "cancer slope factor",
18
+ "mg/kg-day",
19
+ "oral",
20
+ "dose"
21
+ ],
22
+ "acceptance_rule": "any_required_fields",
23
+ "applicability": {},
24
+ "source_reference": "EPA Guidelines for Carcinogen Risk Assessment (2005)"
25
+ },
26
+ {
27
+ "clause_id": "EPA.CANCER.IUR.001",
28
+ "framework": "EPA",
29
+ "title": "Inhalation unit risk applicability",
30
+ "description": "Assessment should include inhalation exposure metrics suitable for IUR-based risk quantification.",
31
+ "required_fields": [
32
+ "exposure_route",
33
+ "dose_metrics",
34
+ "carcinogenicity_notes"
35
+ ],
36
+ "required_evidence_terms": [
37
+ "inhalation unit risk",
38
+ "ug/m3",
39
+ "inhalation",
40
+ "air concentration"
41
+ ],
42
+ "acceptance_rule": "any_required_fields",
43
+ "applicability": {
44
+ "field": "exposure_route",
45
+ "equals": "inhalation"
46
+ },
47
+ "source_reference": "EPA Guidelines for Carcinogen Risk Assessment (2005)"
48
+ },
49
+ {
50
+ "clause_id": "EPA.CANCER.WOE.001",
51
+ "framework": "EPA",
52
+ "title": "Weight-of-evidence integration",
53
+ "description": "Narrative should integrate evidence quality, uncertainty, and plausibility of carcinogenic potential.",
54
+ "required_fields": [
55
+ "key_findings",
56
+ "conclusion",
57
+ "risk_summary"
58
+ ],
59
+ "required_evidence_terms": [
60
+ "weight of evidence",
61
+ "uncertainty",
62
+ "mode of action",
63
+ "cancer"
64
+ ],
65
+ "acceptance_rule": "any_required_fields",
66
+ "applicability": {},
67
+ "source_reference": "EPA Guidelines for Carcinogen Risk Assessment (2005)"
68
+ }
69
+ ]
70
+ }
regulatory_catalog/fda_ctp_v2024_06.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "framework": "FDA CTP",
3
+ "version": "2024-06",
4
+ "source": "FDA CTP Regulatory Science Policy Memoranda (June 3, 2024)",
5
+ "clauses": [
6
+ {
7
+ "clause_id": "FDA.CTP.GENOTOX.001",
8
+ "framework": "FDA CTP",
9
+ "title": "Genotoxicity hazard identification evidence",
10
+ "description": "Evidence should characterize in vitro and in vivo genotoxicity evidence and integrated interpretation.",
11
+ "required_fields": [
12
+ "genotox_oecd_tg_in_vitro",
13
+ "genotox_oecd_tg_in_vivo",
14
+ "genotoxicity_result",
15
+ "genotoxicity_result_notes"
16
+ ],
17
+ "required_evidence_terms": [
18
+ "genotoxic",
19
+ "ames",
20
+ "micronucleus",
21
+ "comet",
22
+ "oecd tg"
23
+ ],
24
+ "acceptance_rule": "all_required_fields",
25
+ "applicability": {},
26
+ "source_reference": "FDA CTP Genotoxicity Hazard Identification and Carcinogenicity Tiering memo (June 3, 2024)"
27
+ },
28
+ {
29
+ "clause_id": "FDA.CTP.CARCIN.001",
30
+ "framework": "FDA CTP",
31
+ "title": "Carcinogenicity tiering narrative support",
32
+ "description": "Carcinogenicity conclusions should be supported by study findings and risk narrative.",
33
+ "required_fields": [
34
+ "carcinogenicity_result",
35
+ "carcinogenicity_notes",
36
+ "key_findings",
37
+ "conclusion"
38
+ ],
39
+ "required_evidence_terms": [
40
+ "carcinogenic",
41
+ "tumor",
42
+ "cancer",
43
+ "risk"
44
+ ],
45
+ "acceptance_rule": "all_required_fields",
46
+ "applicability": {},
47
+ "source_reference": "FDA CTP Genotoxicity Hazard Identification and Carcinogenicity Tiering memo (June 3, 2024)"
48
+ },
49
+ {
50
+ "clause_id": "FDA.CTP.ELCR.001",
51
+ "framework": "FDA CTP",
52
+ "title": "ELCR-ready quantitative evidence elements",
53
+ "description": "ELCR computations require quantitative exposure and potency anchors with transparent assumptions.",
54
+ "required_fields": [
55
+ "dose_metrics",
56
+ "risk_summary",
57
+ "exposure_route"
58
+ ],
59
+ "required_evidence_terms": [
60
+ "excess lifetime cancer risk",
61
+ "slope factor",
62
+ "unit risk",
63
+ "exposure"
64
+ ],
65
+ "acceptance_rule": "any_required_fields",
66
+ "applicability": {},
67
+ "source_reference": "FDA CTP Calculating Excess Lifetime Cancer Risk in ENDS PMTAs memo (June 3, 2024)"
68
+ }
69
+ ]
70
+ }
scripts/__pycache__/replay_calc_log.cpython-314.pyc ADDED
Binary file (2.24 kB). View file
 
scripts/__pycache__/run_cancer_risk_batch.cpython-314.pyc ADDED
Binary file (3.36 kB). View file
 
scripts/__pycache__/run_mcp_calc_server.cpython-314.pyc ADDED
Binary file (252 Bytes). View file
 
scripts/replay_calc_log.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import json
4
+ from collections import Counter
5
+ from pathlib import Path
6
+
7
+
8
def main() -> None:
    """Summarize an MCP calculation JSONL log: per-tool event counts and last event."""
    parser = argparse.ArgumentParser(description="Replay and summarize cancer risk MCP log JSONL")
    parser.add_argument("--log-jsonl", required=True, help="Path to cancer_risk_log.jsonl")
    args = parser.parse_args()

    log_file = Path(args.log_jsonl)
    if not log_file.exists():
        raise FileNotFoundError(f"Log file not found: {log_file}")

    # Parse every non-blank line as one JSON event.
    events = [
        json.loads(stripped)
        for stripped in (line.strip() for line in log_file.read_text(encoding="utf-8").splitlines())
        if stripped
    ]

    tool_counts = Counter(event.get("tool_name", "unknown") for event in events)

    print("# MCP Calculation Log Replay")
    print(f"events={len(events)}")
    for tool, n in sorted(tool_counts.items()):
        print(f"- {tool}: {n}")

    if events:
        print("\nlast_event=")
        print(json.dumps(events[-1], indent=2))
34
+
35
+
36
+ if __name__ == "__main__":
37
+ main()
scripts/run_cancer_risk_batch.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Run a deterministic cancer-risk batch through the local MCP server.

Reads an input CSV matching CANCER_RISK_TEMPLATE_COLUMNS, executes the
batch via the MCP calculation client, and writes CSV/JSON artifacts into
a fresh run directory.
"""
import argparse
import json
import sys
from pathlib import Path

import pandas as pd

# Make the repository root importable when this file is run as a script.
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from toxra_core.artifacts import make_run_dir, write_dataframe_csv, write_json
from toxra_core.calculation_client import run_batch_cancer_risk
from toxra_core.contracts import CANCER_RISK_TEMPLATE_COLUMNS


def main() -> None:
    """CLI entry point: validate the input CSV, run the batch, persist artifacts.

    Raises:
        FileNotFoundError: if the input CSV is missing.
        ValueError: if required template columns are absent.
    """
    parser = argparse.ArgumentParser(description="Run deterministic cancer risk batch using local MCP server")
    parser.add_argument("--input-csv", required=True, help="Path to cancer risk input CSV")
    parser.add_argument("--run-id", default="", help="Optional run ID")
    parser.add_argument("--runs-dir", default="runs", help="Runs base directory")
    args = parser.parse_args()

    inp = Path(args.input_csv)
    if not inp.exists():
        raise FileNotFoundError(f"Input CSV not found: {inp}")

    df = pd.read_csv(inp)
    missing = [c for c in CANCER_RISK_TEMPLATE_COLUMNS if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    run_dir = make_run_dir(run_id=args.run_id or None, base_dir=args.runs_dir)
    # Empty strings (not NaN) keep downstream JSON serialization clean.
    rows = df.fillna("").to_dict("records")

    result = run_batch_cancer_risk(rows, run_dir=str(run_dir))
    # Fetch once and type-check, instead of calling .get twice in a ternary.
    res_rows = result.get("rows", [])
    if not isinstance(res_rows, list):
        res_rows = []
    out_df = pd.DataFrame(res_rows)

    out_csv = run_dir / "cancer_risk_results.csv"
    out_json = run_dir / "cancer_risk_results.json"
    write_dataframe_csv(out_csv, out_df)
    write_json(out_json, result)

    print(json.dumps({
        "run_dir": str(run_dir),
        "results_csv": str(out_csv),
        "results_json": str(out_json),
        "log_jsonl": result.get("artifacts", {}).get("log_jsonl", ""),
        "report_md": result.get("artifacts", {}).get("report_md", ""),
        "summary": result.get("summary", {}),
    }, indent=2))


if __name__ == "__main__":
    main()
scripts/run_mcp_calc_server.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# Thin launcher: put the repository root on sys.path, then delegate to the
# MCP calculation server's main() so the package can be started as a script.
import sys
from pathlib import Path

# Repository root is one directory above scripts/.
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from mcp_tox_calc.server import main


if __name__ == "__main__":
    main()
tests/__pycache__/conftest.cpython-314.pyc ADDED
Binary file (492 Bytes). View file
 
tests/__pycache__/test_equations.cpython-314.pyc ADDED
Binary file (2.4 kB). View file
 
tests/__pycache__/test_mcp_tools.cpython-314.pyc ADDED
Binary file (1.88 kB). View file
 
tests/__pycache__/test_nlp_pipeline.cpython-314.pyc ADDED
Binary file (2.43 kB). View file
 
tests/__pycache__/test_regulatory_mapper.cpython-314.pyc ADDED
Binary file (982 Bytes). View file
 
tests/__pycache__/test_units.cpython-314.pyc ADDED
Binary file (1.79 kB). View file
 
tests/conftest.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
# Pytest bootstrap: make the repository root importable so the test suite can
# import toxra_core and mcp_tox_calc without installing the package.
import sys
from pathlib import Path

ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
tests/fixtures/extraction_sample.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "papers": [
3
+ {
4
+ "_file": "paper_a.pdf",
5
+ "paper_title": "Test Paper A",
6
+ "risk_stance": "acceptable_with_uncertainty",
7
+ "risk_confidence": 0.72,
8
+ "risk_summary": "Contains partial carcinogenicity evidence.",
9
+ "extracted": {
10
+ "chemicals": ["Nicotine"],
11
+ "genotox_oecd_tg_in_vitro": ["OECD_TG_471_Bacterial Reverse mutation test(AMES test)"],
12
+ "genotox_oecd_tg_in_vivo": ["not_reported"],
13
+ "genotoxicity_result": "equivocal",
14
+ "genotoxicity_result_notes": "AMES mixed outcomes.",
15
+ "carcinogenicity_result": "insufficient_data",
16
+ "carcinogenicity_notes": "Long-term bioassay absent.",
17
+ "dose_metrics": ["NOAEL 10 mg/kg-day"],
18
+ "exposure_route": "oral",
19
+ "key_findings": "Potential DNA response observed.",
20
+ "conclusion": "Needs additional testing."
21
+ },
22
+ "evidence": [
23
+ {
24
+ "field": "genotoxicity_result",
25
+ "quote": "The AMES assay showed equivocal mutagenicity outcomes.",
26
+ "pages": "4-5"
27
+ },
28
+ {
29
+ "field": "dose_metrics",
30
+ "quote": "NOAEL was reported at 10 mg/kg-day.",
31
+ "pages": "6"
32
+ }
33
+ ]
34
+ }
35
+ ],
36
+ "toxra_extensions": {
37
+ "nlp_diagnostics": [],
38
+ "regulatory_gap_assessment": {},
39
+ "risk_calculation_refs": []
40
+ }
41
+ }
tests/test_equations.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from mcp_tox_calc.equations import (
    calculate_epa_elcr_csf,
    calculate_epa_elcr_iur,
    calculate_fda_ctp_elcr,
    run_batch_cancer_risk,
)


def test_calculate_epa_elcr_csf_basic():
    """Oral ELCR = intake (mg/kg-day) x CSF: 0.01 * 1.5 = 0.015."""
    payload = {
        "route": "oral",
        "exposure_value": 0.01,
        "exposure_unit": "mg/kg-day",
        "body_weight_kg": 70,
        "csf_value": 1.5,
        "csf_unit": "(mg/kg-day)^-1",
    }
    result = calculate_epa_elcr_csf(payload)
    assert round(result["result_value"], 8) == 0.015


def test_calculate_epa_elcr_iur_basic():
    """Inhalation ELCR = concentration (ug/m3) x IUR: 100 * 1e-6 = 1e-4."""
    payload = {
        "route": "inhalation",
        "air_conc_value": 100,
        "air_conc_unit": "ug/m3",
        "iur_value": 1e-6,
        "iur_unit": "(ug/m3)^-1",
    }
    result = calculate_epa_elcr_iur(payload)
    assert round(result["result_value"], 8) == 0.0001


def test_fda_wrapper_aggregates_components():
    """The FDA CTP wrapper sums per-constituent risks and reports each one."""
    oral = {
        "route": "oral",
        "exposure_value": 0.01,
        "exposure_unit": "mg/kg-day",
        "body_weight_kg": 70,
        "csf_value": 1.0,
        "csf_unit": "(mg/kg-day)^-1",
    }
    inhalation = {
        "route": "inhalation",
        "air_conc_value": 50,
        "air_conc_unit": "ug/m3",
        "iur_value": 1e-6,
        "iur_unit": "(ug/m3)^-1",
    }
    result = calculate_fda_ctp_elcr({"constituents": [oral, inhalation]})
    assert result["result_value"] > 0
    assert len(result["component_results"]) == 2


def test_batch_handles_mixed_rows():
    """A batch with one complete and one incomplete row reports both outcomes."""
    complete = {
        "record_id": "r1",
        "chemical_name": "ChemA",
        "route": "oral",
        "exposure_value": 0.02,
        "exposure_unit": "mg/kg-day",
        "body_weight_kg": 70,
        "csf_value": 1.1,
        "csf_unit": "(mg/kg-day)^-1",
    }
    incomplete = {"record_id": "r2", "chemical_name": "ChemB", "route": "oral"}
    result = run_batch_cancer_risk({"rows": [complete, incomplete]})
    summary = result["summary"]
    assert summary["total_rows"] == 2
    assert summary["ok_rows"] == 1
    assert summary["error_rows"] == 1
tests/test_mcp_tools.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from pathlib import Path

from toxra_core.artifacts import make_run_dir
from toxra_core.calculation_client import MCPCalculationClient


def test_mcp_client_lists_tools_and_runs_batch(tmp_path):
    """End-to-end: the stdio client discovers the batch tool and runs it."""
    run_dir = make_run_dir(run_id="test_mcp", base_dir=str(tmp_path))
    row = {
        "record_id": "r1",
        "chemical_name": "ChemA",
        "route": "oral",
        "exposure_value": 0.01,
        "exposure_unit": "mg/kg-day",
        "body_weight_kg": 70,
        "csf_value": 1.2,
        "csf_unit": "(mg/kg-day)^-1",
        "iur_value": "",
        "air_conc_value": "",
        "air_conc_unit": "",
    }

    with MCPCalculationClient(run_dir=str(run_dir)) as client:
        tool_names = {tool.get("name") for tool in client.list_tools()}
        assert "run_batch_cancer_risk" in tool_names

        result = client.call_tool("run_batch_cancer_risk", {"rows": [row]})

    summary = result["summary"]
    assert summary["total_rows"] == 1
    assert summary["ok_rows"] == 1
    # Both artifacts must have been written to disk by the server.
    assert Path(result["artifacts"]["log_jsonl"]).exists()
    assert Path(result["artifacts"]["report_md"]).exists()
tests/test_nlp_pipeline.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import numpy as np

from toxra_core.nlp_pipeline import (
    expand_regulatory_queries,
    extract_evidence_span,
    hybrid_rank_text_items,
)


def test_expand_regulatory_queries_adds_families():
    """Endpoint/framework hints should expand the base query list."""
    expanded, families = expand_regulatory_queries(
        base_queries=["genotoxicity risk"],
        endpoint_modules=["Genotoxicity (OECD TG)"],
        frameworks=["FDA CTP"],
    )
    assert len(expanded) > 1
    assert "endpoint" in families
    assert families["endpoint"]


def test_extract_evidence_span_hit_and_fallback():
    """A matching token yields a hit window; no match falls back to lead sentences."""
    page = "Sentence one. AMES test showed equivocal response. Sentence three. Sentence four."
    span = extract_evidence_span(page, "AMES")
    assert "AMES" in span["text"]

    fallback = extract_evidence_span("Alpha. Beta.", "nonexistenttoken")
    assert fallback["text"]


def test_hybrid_rank_text_items_lexical_only():
    """Without embeddings the ranker still selects via lexical similarity."""
    candidates = [
        {"text": "This section discusses liver toxicity and NOAEL values."},
        {"text": "Completely unrelated formulation text."},
    ]
    chosen, diagnostics = hybrid_rank_text_items(candidates, query="NOAEL liver")
    assert chosen
    assert diagnostics["ranking_method"] in {"lexical_only", "hybrid_rrf"}


def test_hybrid_rank_text_items_with_embeddings():
    """Supplying embeddings switches the ranker to hybrid RRF fusion."""
    candidates = [{"text": "A"}, {"text": "B"}, {"text": "C"}]
    vectors = np.array([[1.0, 0.0], [0.5, 0.5], [0.0, 1.0]], dtype=np.float32)
    query_vec = np.array([1.0, 0.0], dtype=np.float32)
    chosen, diagnostics = hybrid_rank_text_items(
        candidates, query="A", item_embeddings=vectors, query_embedding=query_vec
    )
    assert chosen
    assert diagnostics["ranking_method"] == "hybrid_rrf"
tests/test_regulatory_mapper.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
from pathlib import Path

from toxra_core.regulatory_mapper import map_extraction_to_framework


def test_regulatory_mapping_outputs_matrix_and_report():
    """Mapping the sample extraction to FDA CTP yields a matrix, report, and markdown."""
    sample_path = Path("tests/fixtures/extraction_sample.json")
    extraction = json.loads(sample_path.read_text(encoding="utf-8"))

    matrix, report, markdown = map_extraction_to_framework(extraction, framework="FDA CTP")

    assert not matrix.empty
    assert "clause_id" in matrix.columns
    assert report["framework"] == "FDA CTP"
    assert "Status Summary" in markdown
tests/test_units.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from mcp_tox_calc.units import (
    normalize_air_concentration,
    normalize_csf,
    normalize_iur,
    normalize_oral_exposure,
)


def test_normalize_oral_exposure_ug_per_kg_day():
    """2500 ug/kg-day converts to 2.5 mg/kg-day."""
    result = normalize_oral_exposure(2500, "ug/kg-day", 70)
    assert round(result["value_mg_per_kg_day"], 6) == 2.5


def test_normalize_oral_exposure_mg_day_with_bw():
    """7 mg/day over a 70 kg body weight is 0.1 mg/kg-day."""
    result = normalize_oral_exposure(7, "mg/day", 70)
    assert round(result["value_mg_per_kg_day"], 6) == 0.1


def test_normalize_air_concentration_mg_to_ug():
    """0.2 mg/m3 converts to 200 ug/m3."""
    result = normalize_air_concentration(0.2, "mg/m3")
    assert round(result["value_ug_per_m3"], 6) == 200.0


def test_normalize_csf_from_ug_basis():
    """0.001 per ug/kg-day equals 1.0 per mg/kg-day."""
    result = normalize_csf(0.001, "(ug/kg-day)^-1")
    assert round(result["value_per_mg_per_kg_day"], 6) == 1.0


def test_normalize_iur_from_mg_basis():
    """0.002 per mg/m3 equals 2e-6 per ug/m3."""
    result = normalize_iur(0.002, "(mg/m3)^-1")
    assert round(result["value_per_ug_per_m3"], 9) == 0.000002
toxra_core/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Core modules for Toxra extraction, mapping, NLP, and calculation orchestration."""
toxra_core/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (215 Bytes). View file
 
toxra_core/__pycache__/artifacts.cpython-314.pyc ADDED
Binary file (4.62 kB). View file
 
toxra_core/__pycache__/calculation_client.cpython-314.pyc ADDED
Binary file (8.73 kB). View file
 
toxra_core/__pycache__/contracts.cpython-314.pyc ADDED
Binary file (4.04 kB). View file
 
toxra_core/__pycache__/nlp_pipeline.cpython-314.pyc ADDED
Binary file (17.9 kB). View file
 
toxra_core/__pycache__/regulatory_mapper.cpython-314.pyc ADDED
Binary file (13.6 kB). View file
 
toxra_core/artifacts.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from pathlib import Path
3
+ from typing import Any, Dict, Iterable, Optional
4
+
5
+ import pandas as pd
6
+
7
+ from .contracts import default_run_id
8
+
9
+
10
def make_run_dir(run_id: Optional[str] = None, base_dir: str = "runs") -> Path:
    """Create (if needed) and return <base_dir>/<run_id>, generating an ID when absent."""
    target = Path(base_dir) / (run_id or default_run_id("run"))
    target.mkdir(parents=True, exist_ok=True)
    return target
15
+
16
+
17
def write_json(path: Path, data: Any) -> Path:
    """Serialize *data* as pretty-printed JSON at *path*, creating parent dirs."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(json.dumps(data, indent=2))
    return path
21
+
22
+
23
def write_markdown(path: Path, text: str) -> Path:
    """Write *text* (empty string when falsy) to *path*, creating parent dirs."""
    path.parent.mkdir(parents=True, exist_ok=True)
    content = text if text else ""
    path.write_text(content, encoding="utf-8")
    return path
27
+
28
+
29
def append_jsonl(path: Path, row: Dict[str, Any]) -> Path:
    """Append *row* as one ASCII-safe JSON line to *path*, creating parent dirs."""
    path.parent.mkdir(parents=True, exist_ok=True)
    line = json.dumps(row, ensure_ascii=True)
    with path.open("a", encoding="utf-8") as handle:
        handle.write(line + "\n")
    return path
34
+
35
+
36
def write_jsonl(path: Path, rows: Iterable[Dict[str, Any]]) -> Path:
    """Overwrite *path* with one ASCII-safe JSON line per row, creating parent dirs."""
    path.parent.mkdir(parents=True, exist_ok=True)
    lines = (json.dumps(row, ensure_ascii=True) + "\n" for row in rows)
    with path.open("w", encoding="utf-8") as handle:
        handle.writelines(lines)
    return path
42
+
43
+
44
def write_dataframe_csv(path: Path, df: pd.DataFrame) -> Path:
    """Write *df* to *path* as CSV without the index column, creating parent dirs."""
    path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(path, index=False)
    return path
48
+
49
+
50
def load_json(path: Path) -> Any:
    """Parse and return the JSON document stored at *path* (UTF-8)."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)
toxra_core/calculation_client.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import subprocess
3
+ import sys
4
+ from typing import Any, Dict, List, Optional
5
+
6
+
7
class MCPClientError(RuntimeError):
    """Raised when the MCP server subprocess cannot be reached or replies with an error."""
    pass
9
+
10
+
11
class MCPCalculationClient:
    """Minimal JSON-RPC 2.0 client for the mcp_tox_calc stdio server.

    Spawns the server as a subprocess (either a given script or
    ``python -m mcp_tox_calc.server``) and exchanges newline-delimited
    JSON messages over its stdin/stdout pipes. Usable as a context
    manager: ``with MCPCalculationClient(run_dir=...) as client: ...``.
    """

    def __init__(
        self,
        run_dir: str,
        python_executable: Optional[str] = None,
        server_script: Optional[str] = None,
    ):
        # Directory passed to the server via --run-dir for its artifacts.
        self.run_dir = str(run_dir)
        self.python_executable = python_executable or sys.executable
        if server_script:
            self.server_script = str(server_script)
        else:
            self.server_script = None
        self._proc: Optional[subprocess.Popen] = None  # live server process, if started
        self._id = 0  # monotonically increasing JSON-RPC request id

    def start(self) -> None:
        """Spawn the server subprocess (idempotent) and send the initialize handshake."""
        if self._proc is not None:
            return

        if self.server_script:
            cmd = [
                self.python_executable,
                self.server_script,
                "--stdio",
                "--run-dir",
                self.run_dir,
            ]
        else:
            # No explicit script: run the packaged server module.
            cmd = [
                self.python_executable,
                "-m",
                "mcp_tox_calc.server",
                "--stdio",
                "--run-dir",
                self.run_dir,
            ]

        self._proc = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            bufsize=1,  # line-buffered: the protocol is one JSON message per line
        )

        self._request("initialize", {"protocolVersion": "2024-11-05", "clientInfo": {"name": "toxra-app", "version": "0.1.0"}})

    def stop(self) -> None:
        """Shut the server down, escalating terminate -> kill; never raises."""
        if self._proc is None:
            return
        try:
            if self._proc.stdin:
                self._proc.stdin.close()  # closing stdin lets the server exit cleanly
            self._proc.terminate()
            self._proc.wait(timeout=3)
        except Exception:
            try:
                self._proc.kill()
            except Exception:
                pass
        finally:
            self._proc = None

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc, tb):
        self.stop()

    def _request(self, method: str, params: Dict[str, Any]) -> Dict[str, Any]:
        """Send one JSON-RPC request and block until the matching response arrives.

        Raises:
            MCPClientError: if the server is not running, the pipe closes
                before a response arrives, or the response carries "error".
        """
        if self._proc is None:
            raise MCPClientError("MCP server not started.")
        if self._proc.stdin is None or self._proc.stdout is None:
            raise MCPClientError("MCP server pipes unavailable.")

        self._id += 1
        req_id = self._id
        request = {
            "jsonrpc": "2.0",
            "id": req_id,
            "method": method,
            "params": params,
        }

        self._proc.stdin.write(json.dumps(request) + "\n")
        self._proc.stdin.flush()

        while True:
            line = self._proc.stdout.readline()
            if line == "":
                # EOF: the server died; surface a tail of stderr for diagnosis.
                err = ""
                if self._proc.stderr is not None:
                    try:
                        err = self._proc.stderr.read()[-1500:]
                    except Exception:
                        err = ""
                raise MCPClientError(f"No response from MCP server. stderr={err}")

            line = line.strip()
            if not line:
                continue

            try:
                resp = json.loads(line)
            except Exception:
                # Ignore non-JSON noise (e.g. stray log lines) on stdout.
                continue

            if resp.get("id") != req_id:
                # Skip messages that do not answer this request.
                continue

            if "error" in resp:
                raise MCPClientError(str(resp["error"]))

            result = resp.get("result", {})
            if not isinstance(result, dict):
                return {"result": result}
            return result

    def list_tools(self) -> List[Dict[str, Any]]:
        """Return the server's tool descriptors (empty list on a malformed reply)."""
        result = self._request("tools/list", {})
        tools = result.get("tools", [])
        return tools if isinstance(tools, list) else []

    def call_tool(self, name: str, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Invoke a named tool; unwrap a leading {"type": "json"} content item if present."""
        result = self._request("tools/call", {"name": name, "arguments": arguments})
        content = result.get("content", []) if isinstance(result, dict) else []
        if isinstance(content, list) and content:
            first = content[0]
            if isinstance(first, dict) and first.get("type") == "json":
                data = first.get("json", {})
                return data if isinstance(data, dict) else {"value": data}
        return result if isinstance(result, dict) else {"value": result}
146
+
147
+
148
def run_batch_cancer_risk(rows: List[Dict[str, Any]], run_dir: str) -> Dict[str, Any]:
    """Convenience wrapper: start a short-lived MCP client, run one batch, tear down."""
    with MCPCalculationClient(run_dir=run_dir) as client:
        return client.call_tool("run_batch_cancer_risk", {"rows": rows})
toxra_core/contracts.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime as _dt
2
+ import uuid
3
+ from dataclasses import dataclass, field
4
+ from typing import Any, Dict, List, Optional
5
+
6
+
7
+ CANCER_RISK_TEMPLATE_COLUMNS: List[str] = [
8
+ "record_id",
9
+ "chemical_name",
10
+ "casrn",
11
+ "route",
12
+ "exposure_value",
13
+ "exposure_unit",
14
+ "body_weight_kg",
15
+ "csf_value",
16
+ "csf_unit",
17
+ "iur_value",
18
+ "air_conc_value",
19
+ "air_conc_unit",
20
+ "source_reference",
21
+ ]
22
+
23
+
24
def default_run_id(prefix: str = "run") -> str:
    """Return a unique run ID of the form <prefix>_<YYYYmmdd_HHMMSS>_<8 hex chars>.

    Uses a timezone-aware UTC timestamp: datetime.utcnow() is deprecated
    since Python 3.12 and returns a naive datetime.
    """
    ts = _dt.datetime.now(_dt.timezone.utc).strftime("%Y%m%d_%H%M%S")
    return f"{prefix}_{ts}_{uuid.uuid4().hex[:8]}"
27
+
28
+
29
@dataclass
class RegulatoryClause:
    """A single requirement clause loaded from a regulatory framework catalog."""

    clause_id: str  # stable identifier within the catalog
    framework: str  # owning framework name (catalog files exist for "FDA CTP" and "EPA")
    title: str
    description: str
    required_fields: List[str] = field(default_factory=list)  # extraction fields the clause needs
    required_evidence_terms: List[str] = field(default_factory=list)  # terms to look for in evidence
    acceptance_rule: str = "all_required_fields"  # catalogs also use "any_required_fields"
    applicability: Dict[str, Any] = field(default_factory=dict)
    source_reference: str = ""  # citation for where the clause comes from
40
+
41
+
42
@dataclass
class ClauseEvaluation:
    """Outcome of checking one RegulatoryClause against extracted paper data."""

    clause_id: str  # clause that was evaluated
    framework: str  # framework the clause belongs to
    status: str  # evaluation outcome label
    fields_present: List[str] = field(default_factory=list)  # required fields found populated
    missing_fields: List[str] = field(default_factory=list)  # required fields absent or empty
    evidence_hits: List[str] = field(default_factory=list)  # evidence terms that matched
    prompt: str = ""
    reason: str = ""  # human-readable justification for the status
52
+
53
+
54
+ RISK_TIER_THRESHOLDS = {
55
+ "low": 1e-6,
56
+ "moderate": 1e-4,
57
+ }
58
+
59
+
60
def classify_risk_tier(value: Optional[float]) -> str:
    """Map an excess-lifetime-cancer-risk value onto a triage tier.

    None -> "unknown"; below the "low" threshold (1e-6) -> "de_minimis";
    up to and including the "moderate" threshold (1e-4) -> "monitor";
    anything larger -> "high_priority".
    """
    if value is None:
        return "unknown"
    if value < RISK_TIER_THRESHOLDS["low"]:
        return "de_minimis"
    return "monitor" if value <= RISK_TIER_THRESHOLDS["moderate"] else "high_priority"
toxra_core/nlp_pipeline.py ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import Any, Dict, List, Optional, Tuple
3
+
4
+ import numpy as np
5
+ try:
6
+ from sklearn.feature_extraction.text import TfidfVectorizer
7
+ except Exception: # pragma: no cover - fallback path for minimal runtime
8
+ TfidfVectorizer = None
9
+
10
+
11
+ ENDPOINT_QUERY_HINTS: Dict[str, List[str]] = {
12
+ "Genotoxicity (OECD TG)": [
13
+ "genotoxicity",
14
+ "mutagenicity",
15
+ "AMES",
16
+ "micronucleus",
17
+ "comet assay",
18
+ "chromosomal aberration",
19
+ "OECD TG 471 473 476 487 490 474 489",
20
+ ],
21
+ "NAMs / In Silico": [
22
+ "in silico",
23
+ "QSAR",
24
+ "read-across",
25
+ "AOP",
26
+ "PBPK",
27
+ "high-throughput",
28
+ "omics",
29
+ "organ-on-chip",
30
+ "microphysiological",
31
+ ],
32
+ "Acute toxicity": ["acute toxicity", "LD50", "LC50", "single dose", "lethality", "mortality"],
33
+ "Repeated dose toxicity": [
34
+ "repeated dose",
35
+ "subchronic",
36
+ "chronic",
37
+ "NOAEL",
38
+ "LOAEL",
39
+ "target organ",
40
+ "90-day",
41
+ "28-day",
42
+ ],
43
+ "Irritation / Sensitization": ["skin irritation", "eye irritation", "sensitization", "LLNA", "Draize"],
44
+ "Repro / Developmental": ["reproductive toxicity", "fertility", "developmental toxicity", "teratogenic", "prenatal", "postnatal"],
45
+ "Carcinogenicity": ["carcinogenicity", "tumor", "neoplasm", "cancer", "two-year bioassay"],
46
+ }
47
+
48
+ FRAMEWORK_QUERY_HINTS: Dict[str, List[str]] = {
49
+ "FDA CTP": [
50
+ "genotoxicity hazard identification",
51
+ "carcinogenicity tiering",
52
+ "excess lifetime cancer risk",
53
+ "constituent comparison",
54
+ "weight of evidence",
55
+ ],
56
+ "EPA": [
57
+ "cancer slope factor",
58
+ "inhalation unit risk",
59
+ "lifetime excess cancer risk",
60
+ "mode of action",
61
+ "weight of evidence descriptors",
62
+ ],
63
+ }
64
+
65
+ EQUATION_INPUT_HINTS: List[str] = [
66
+ "exposure concentration",
67
+ "daily intake",
68
+ "mg/kg-day",
69
+ "ug/m3",
70
+ "cancer slope factor",
71
+ "inhalation unit risk",
72
+ "body weight",
73
+ ]
74
+
75
+
76
def clean_text(t: str) -> str:
    """Strip NUL bytes and collapse every whitespace run into a single space."""
    without_nul = (t or "").replace("\x00", " ")
    collapsed = re.sub(r"\s+", " ", without_nul)
    return collapsed.strip()
79
+
80
+
81
def split_sentences(text: str) -> List[str]:
    """Split cleaned text on sentence-ending punctuation followed by whitespace."""
    normalized = clean_text(text)
    if not normalized:
        return []
    pieces = re.split(r"(?<=[\.!\?])\s+", normalized)
    return [piece.strip() for piece in pieces if piece.strip()]
86
+
87
+
88
+ def _tokenize(s: str) -> List[str]:
89
+ return [w for w in re.findall(r"[a-zA-Z0-9\-]+", (s or "").lower()) if len(w) >= 3]
90
+
91
+
92
def extract_evidence_span(page_text: str, query: str, page: Optional[int] = None, n_sentences: int = 5) -> Dict[str, Any]:
    """Return a small sentence window around the first query-term hit.

    Modes: "hit" (2 sentences before through 2 after the matching
    sentence), "fallback" (the leading *n_sentences* when no query token
    matches), or "empty" when the page yields no sentences at all.
    """
    sentences = split_sentences(page_text)
    if not sentences:
        return {"text": "", "page": page, "start_sentence": 0, "mode": "empty"}

    query_tokens = _tokenize(query)
    match_index = next(
        (i for i, s in enumerate(sentences) if any(tok in s.lower() for tok in query_tokens)),
        None,
    )

    if match_index is None:
        return {
            "text": " ".join(sentences[:n_sentences]),
            "page": page,
            "start_sentence": 0,
            "mode": "fallback",
        }

    window_start = max(0, match_index - 2)
    window_end = min(len(sentences), match_index + 3)
    return {
        "text": " ".join(sentences[window_start:window_end]),
        "page": page,
        "start_sentence": window_start,
        "mode": "hit",
    }
113
+
114
+
115
def build_query_families(
    base_queries: List[str],
    endpoint_modules: Optional[List[str]] = None,
    frameworks: Optional[List[str]] = None,
) -> Dict[str, List[str]]:
    """Group retrieval queries into named families: base, endpoint, framework, equation_inputs."""
    endpoint_terms: List[str] = []
    for module in endpoint_modules or []:
        endpoint_terms.extend(ENDPOINT_QUERY_HINTS.get(module, []))

    framework_terms: List[str] = []
    for framework in frameworks or []:
        framework_terms.extend(FRAMEWORK_QUERY_HINTS.get(framework, []))

    return {
        "base": [q for q in base_queries if (q or "").strip()],
        "endpoint": endpoint_terms,
        "framework": framework_terms,
        "equation_inputs": EQUATION_INPUT_HINTS,
    }
137
+
138
+
139
def expand_regulatory_queries(
    base_queries: List[str],
    endpoint_modules: Optional[List[str]] = None,
    frameworks: Optional[List[str]] = None,
    extra_terms: Optional[List[str]] = None,
) -> Tuple[List[str], Dict[str, List[str]]]:
    """Expand base queries with endpoint/framework hints and dedupe case-insensitively.

    Returns (deduplicated query list preserving first-seen order, the
    query families dict used to build it).
    """
    families = build_query_families(base_queries, endpoint_modules, frameworks)

    candidates: List[str] = []
    for family_terms in families.values():
        candidates.extend(family_terms)
    candidates.extend(extra_terms or [])

    seen: set = set()
    deduped: List[str] = []
    for candidate in candidates:
        stripped = (candidate or "").strip()
        if not stripped:
            continue
        key = stripped.lower()
        if key not in seen:
            seen.add(key)
            deduped.append(stripped)

    return deduped, families
164
+
165
+
166
def _lexical_ranks(texts: List[str], query: str) -> Tuple[List[int], np.ndarray]:
    """Rank *texts* against *query* lexically.

    Returns (order, scores): indices sorted best-first plus the raw
    per-text scores. Uses TF-IDF similarity when scikit-learn imported
    successfully; otherwise falls back to counting how many distinct
    query tokens each text contains as substrings.
    """
    if not texts:
        return [], np.array([], dtype=np.float32)
    if TfidfVectorizer is None:
        # Fallback: score = number of distinct query tokens present in the text.
        q_tokens = set(_tokenize(query))
        sims = []
        for t in texts:
            tl = t.lower()
            sims.append(float(sum(1 for tok in q_tokens if tok in tl)))
        arr = np.array(sims, dtype=np.float32)
        order = list(np.argsort(arr)[::-1])
        return order, arr

    vec = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), max_features=25000)
    x = vec.fit_transform(texts)
    qv = vec.transform([query])
    # Rows are l2-normalized by TfidfVectorizer's default, so the dot
    # product with the query vector acts as a cosine similarity.
    sims = (x @ qv.T).toarray().ravel().astype(np.float32)
    order = list(np.argsort(sims)[::-1])
    return order, sims
185
+
186
+
187
+ def _embedding_ranks(item_embeddings: np.ndarray, query_embedding: np.ndarray) -> Tuple[List[int], np.ndarray]:
188
+ if item_embeddings.size == 0:
189
+ return [], np.array([], dtype=np.float32)
190
+ q = np.asarray(query_embedding, dtype=np.float32)
191
+ qn = np.linalg.norm(q) + 1e-12
192
+ q = q / qn
193
+ mat = np.asarray(item_embeddings, dtype=np.float32)
194
+ norms = np.linalg.norm(mat, axis=1, keepdims=True) + 1e-12
195
+ mat = mat / norms
196
+ sims = (mat @ q).astype(np.float32)
197
+ order = list(np.argsort(sims)[::-1])
198
+ return order, sims
199
+
200
+
201
+ def _rrf_score(rank_lists: List[List[int]], k: int = 60) -> Dict[int, float]:
202
+ out: Dict[int, float] = {}
203
+ for rank_list in rank_lists:
204
+ for rank_pos, idx in enumerate(rank_list):
205
+ out[idx] = out.get(idx, 0.0) + (1.0 / (k + rank_pos + 1.0))
206
+ return out
207
+
208
+
209
def _family_coverage_score(texts: List[str], families: Dict[str, List[str]]) -> Dict[str, float]:
    """Per family: fraction of its queries with at least one token present in *texts*."""
    merged = " ".join(clean_text(t).lower() for t in texts)
    coverage: Dict[str, float] = {}
    for family, queries in families.items():
        if not queries:
            coverage[family] = 0.0
            continue
        hits = sum(
            1 for q in queries if any(tok in merged for tok in _tokenize(q))
        )
        coverage[family] = round(hits / max(1, len(queries)), 4)
    return coverage
223
+
224
+
225
def hybrid_rank_text_items(
    items: List[Dict[str, Any]],
    query: str,
    families: Optional[Dict[str, List[str]]] = None,
    top_k: int = 12,
    item_embeddings: Optional[np.ndarray] = None,
    query_embedding: Optional[np.ndarray] = None,
) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
    """Rank text items for a query, optionally fusing lexical and embedding ranks.

    Each item is a dict with a "text" key. Lexical ranking always runs;
    when both embedding arguments are supplied, embedding ranks are fused
    with the lexical ranks via Reciprocal Rank Fusion. Returns (selected
    items annotated with _nlp_* scores, diagnostics dict).
    """
    if not items:
        return [], {
            "ranking_method": "empty",
            "selected_indices": [],
            "coverage_by_query_family": families or {},
            "coverage_score": 0.0,
            "component_scores": {},
        }

    texts = [clean_text(i.get("text", "")) for i in items]

    lex_order, lex_scores = _lexical_ranks(texts, query)
    rank_lists = [lex_order]
    method = "lexical_only"

    emb_scores = None
    if item_embeddings is not None and query_embedding is not None:
        try:
            emb_order, emb_scores = _embedding_ranks(item_embeddings, query_embedding)
            rank_lists.append(emb_order)
            method = "hybrid_rrf"
        except Exception:
            # Embedding failure degrades gracefully to lexical-only ranking.
            emb_scores = None

    rrf = _rrf_score(rank_lists)
    final_order = sorted(rrf.keys(), key=lambda idx: rrf[idx], reverse=True)
    selected_indices = final_order[: max(1, int(top_k))]  # always keep at least one item

    selected: List[Dict[str, Any]] = []
    for idx in selected_indices:
        row = dict(items[idx])  # shallow copy so callers' items stay unmodified
        row["_nlp_rrf_score"] = float(rrf.get(idx, 0.0))
        row["_nlp_lex_score"] = float(lex_scores[idx]) if len(lex_scores) > idx else 0.0
        if emb_scores is not None and len(emb_scores) > idx:
            row["_nlp_emb_score"] = float(emb_scores[idx])
        selected.append(row)

    # Coverage: how well the selected snippets span each query family.
    fam = families or {"base": [query]}
    cov = _family_coverage_score([x.get("text", "") for x in selected], fam)
    cov_score = round(float(np.mean(list(cov.values()))) if cov else 0.0, 4)

    diagnostics = {
        "ranking_method": method,
        "selected_indices": selected_indices,
        "coverage_by_query_family": cov,
        "coverage_score": cov_score,
        "component_scores": {
            "lexical": [float(lex_scores[i]) for i in selected_indices if len(lex_scores) > i],
            "embedding": [float(emb_scores[i]) for i in selected_indices if emb_scores is not None and len(emb_scores) > i],
        },
    }
    return selected, diagnostics
toxra_core/regulatory_mapper.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from pathlib import Path
3
+ from typing import Any, Dict, List, Optional, Tuple
4
+
5
+ import pandas as pd
6
+
7
+ from .contracts import ClauseEvaluation, RegulatoryClause
8
+
9
+
10
+ FRAMEWORK_TO_FILE = {
11
+ "FDA CTP": "fda_ctp_v2024_06.json",
12
+ "EPA": "epa_cancer_v2005.json",
13
+ }
14
+
15
+ EMPTY_STRINGS = {"", "not_reported", "insufficient_data", "none", "na", "n/a", "null"}
16
+
17
+
18
def _is_non_empty(v: Any) -> bool:
    """Return True when *v* carries a real value rather than a placeholder.

    Scalars are stringified, stripped, and compared case-insensitively
    against the EMPTY_STRINGS sentinels. A list is non-empty when at least
    one element survives stripping and is not a sentinel.
    """
    if v is None:
        return False
    if isinstance(v, list):
        candidates = [s for s in (str(item).strip() for item in v) if s]
        # any(...) over an empty list is False, matching "no usable elements".
        return any(s.lower() not in EMPTY_STRINGS for s in candidates)
    text = str(v).strip()
    return bool(text) and text.lower() not in EMPTY_STRINGS
30
+
31
+
32
+ def _normalize_payload(extraction_payload: Any) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
33
+ if isinstance(extraction_payload, dict):
34
+ papers = extraction_payload.get("papers", [])
35
+ if isinstance(papers, list):
36
+ ext = extraction_payload.get("toxra_extensions", {})
37
+ return papers, (ext if isinstance(ext, dict) else {})
38
+ if isinstance(extraction_payload, list):
39
+ return extraction_payload, {}
40
+ raise ValueError("Unsupported extraction payload format. Expected list or object with papers.")
41
+
42
+
43
def load_framework_catalog(framework: str, catalog_dir: str = "regulatory_catalog") -> List[RegulatoryClause]:
    """Read the JSON clause catalog for *framework* and build RegulatoryClause rows.

    Raises ValueError for an unknown framework and FileNotFoundError when
    the mapped catalog file is absent from *catalog_dir*.
    """
    fname = FRAMEWORK_TO_FILE.get(framework)
    if fname is None:
        raise ValueError(f"Unsupported framework: {framework}")
    path = Path(catalog_dir) / fname
    if not path.exists():
        raise FileNotFoundError(f"Catalog not found: {path}")

    raw = json.loads(path.read_text(encoding="utf-8"))
    clause_dicts = raw.get("clauses", []) if isinstance(raw, dict) else []

    def _build(c: Dict[str, Any]) -> RegulatoryClause:
        # Defensive coercion: the catalog is hand-maintained JSON, so every
        # field is normalized to its expected type before construction.
        return RegulatoryClause(
            clause_id=str(c.get("clause_id", "")).strip(),
            framework=str(c.get("framework", framework)).strip(),
            title=str(c.get("title", "")).strip(),
            description=str(c.get("description", "")).strip(),
            required_fields=list(c.get("required_fields", []) or []),
            required_evidence_terms=list(c.get("required_evidence_terms", []) or []),
            acceptance_rule=str(c.get("acceptance_rule", "all_required_fields")).strip(),
            applicability=dict(c.get("applicability", {}) or {}),
            source_reference=str(c.get("source_reference", "")).strip(),
        )

    return [_build(c) for c in clause_dicts]
70
+
71
+
72
def _clause_applicable(extracted: Dict[str, Any], clause: RegulatoryClause) -> bool:
    """Decide whether a clause's applicability condition matches *extracted*.

    The condition is a {"field": ..., "equals": ...} mapping; an absent
    condition (or one without a field name) means the clause always applies.
    Comparison is case-insensitive, and list-valued fields match when any
    element equals the expected value.
    """
    condition = clause.applicability or {}
    field = str(condition.get("field", "")).strip()
    if not condition or not field:
        return True
    expected = str(condition.get("equals", None)).strip().lower()
    actual = extracted.get(field)
    if isinstance(actual, list):
        return expected in (str(item).strip().lower() for item in actual)
    return str(actual).strip().lower() == expected
85
+
86
+
87
def _evaluate_clause(
    extracted: Dict[str, Any],
    evidence: List[Dict[str, Any]],
    clause: RegulatoryClause,
    override_notes: str = "",
) -> ClauseEvaluation:
    """Score a single clause against one paper's extracted fields and evidence.

    Status semantics:
      - "not_applicable": applicability condition failed;
      - "covered": field rule satisfied and (if terms required) evidence found;
      - "partial": some fields or evidence present but requirements unmet;
      - "missing": nothing relevant found.
    A remediation prompt is attached for partial/missing statuses.
    """
    if not _clause_applicable(extracted, clause):
        return ClauseEvaluation(
            clause_id=clause.clause_id,
            framework=clause.framework,
            status="not_applicable",
            reason="Applicability condition not met.",
        )

    # Partition the required fields into present vs. missing, preserving order.
    present: List[str] = []
    missing: List[str] = []
    for field_name in clause.required_fields:
        bucket = present if _is_non_empty(extracted.get(field_name)) else missing
        bucket.append(field_name)

    # Substring search over the concatenated, lowercased evidence quotes.
    corpus = " ".join(str(item.get("quote", "")) for item in evidence).lower()
    evidence_hits: List[str] = []
    for term in clause.required_evidence_terms:
        needle = str(term).strip().lower()
        if needle and needle in corpus:
            evidence_hits.append(term)

    if not clause.required_fields:
        field_ok = True
    elif clause.acceptance_rule == "any_required_fields":
        field_ok = bool(present)
    else:
        # Default rule: every required field must be populated.
        field_ok = not missing

    evidence_ok = bool(evidence_hits) if clause.required_evidence_terms else True

    if field_ok and evidence_ok:
        status = "covered"
    elif present or evidence_hits:
        status = "partial"
    else:
        status = "missing"

    missing_prompt = ""
    if status != "covered":
        need_fields = ", ".join(missing) if missing else "additional corroborating evidence"
        missing_prompt = (
            f"Provide evidence for clause {clause.clause_id} ({clause.title}). "
            f"Missing: {need_fields}."
        )
        notes = override_notes.strip()
        if notes:
            missing_prompt += f" Notes: {notes}"

    return ClauseEvaluation(
        clause_id=clause.clause_id,
        framework=clause.framework,
        status=status,
        fields_present=present,
        missing_fields=missing,
        evidence_hits=evidence_hits,
        prompt=missing_prompt,
        reason="",
    )
155
+
156
+
157
+ def _paper_record_id(paper: Dict[str, Any]) -> str:
158
+ file_name = str(paper.get("_file", "unknown.pdf"))
159
+ extracted = paper.get("extracted", {}) or {}
160
+ chems = extracted.get("chemicals", [])
161
+ chem = "-"
162
+ if isinstance(chems, list) and chems:
163
+ chem = str(chems[0]).strip() or "-"
164
+ return f"{file_name} | {chem} | Paper"
165
+
166
+
167
def map_extraction_to_framework(
    extraction_payload: Any,
    framework: str,
    catalog_dir: str = "regulatory_catalog",
    override_notes: str = "",
) -> Tuple[pd.DataFrame, Dict[str, Any], str]:
    """Evaluate every paper in the payload against every clause of *framework*.

    Parameters
    ----------
    extraction_payload:
        A list of paper records, or a dict with "papers" (and optionally
        "toxra_extensions") keys.
    framework:
        Key into FRAMEWORK_TO_FILE selecting the clause catalog.
    catalog_dir:
        Directory holding the catalog JSON files.
    override_notes:
        Free-text appended to every generated gap prompt.

    Returns
    -------
    A (dataframe, report, markdown) triple: one DataFrame row per
    (paper, clause) evaluation, a machine-readable summary dict, and a
    human-readable gap-assessment markdown document.
    """
    papers, existing_ext = _normalize_payload(extraction_payload)
    clauses = load_framework_catalog(framework, catalog_dir=catalog_dir)

    status_counts: Dict[str, int] = {"covered": 0, "partial": 0, "missing": 0, "not_applicable": 0}
    prompts: List[str] = []
    rows: List[Dict[str, Any]] = []

    for paper in papers:
        extracted = paper.get("extracted", {}) or {}
        evidence = paper.get("evidence", []) or []
        record_id = _paper_record_id(paper)
        file_name = str(paper.get("_file", ""))
        paper_title = str(paper.get("paper_title", ""))

        for clause in clauses:
            result = _evaluate_clause(extracted, evidence, clause, override_notes=override_notes)
            status_counts[result.status] = status_counts.get(result.status, 0) + 1
            if result.prompt:
                prompts.append(result.prompt)

            rows.append(
                {
                    "framework": framework,
                    "clause_id": clause.clause_id,
                    "clause_title": clause.title,
                    "file": file_name,
                    "paper_title": paper_title,
                    "record_id": record_id,
                    "status": result.status,
                    "fields_present": "; ".join(result.fields_present),
                    "missing_fields": "; ".join(result.missing_fields),
                    "evidence_hits": "; ".join(result.evidence_hits),
                    "prompt": result.prompt,
                    "source_reference": clause.source_reference,
                }
            )

    # Explicit column order keeps the frame stable even when rows is empty.
    column_order = [
        "framework",
        "clause_id",
        "clause_title",
        "file",
        "paper_title",
        "record_id",
        "status",
        "fields_present",
        "missing_fields",
        "evidence_hits",
        "prompt",
        "source_reference",
    ]
    df = pd.DataFrame(rows, columns=column_order)

    report = {
        "framework": framework,
        "summary": status_counts,
        "missing_prompts": prompts,
        "existing_toxra_extensions": existing_ext,
    }

    md_lines = [
        f"# {framework} Regulatory Gap Assessment",
        "",
        "## Status Summary",
        f"- Covered: {status_counts.get('covered', 0)}",
        f"- Partial: {status_counts.get('partial', 0)}",
        f"- Missing: {status_counts.get('missing', 0)}",
        f"- Not applicable: {status_counts.get('not_applicable', 0)}",
        "",
        "## Priority Data Gaps",
    ]
    if prompts:
        # Cap the document at 50 prompts to keep it readable.
        md_lines.extend(f"- {p}" for p in prompts[:50])
    else:
        md_lines.append("- No immediate gaps identified.")

    return df, report, "\n".join(md_lines)