Spaces:
Running
Running
Upload 43 files
Browse files- mcp_tox_calc/__init__.py +1 -0
- mcp_tox_calc/__pycache__/__init__.cpython-314.pyc +0 -0
- mcp_tox_calc/__pycache__/equations.cpython-314.pyc +0 -0
- mcp_tox_calc/__pycache__/logging.cpython-314.pyc +0 -0
- mcp_tox_calc/__pycache__/server.cpython-314.pyc +0 -0
- mcp_tox_calc/__pycache__/units.cpython-314.pyc +0 -0
- mcp_tox_calc/equations.py +275 -0
- mcp_tox_calc/logging.py +58 -0
- mcp_tox_calc/server.py +183 -0
- mcp_tox_calc/units.py +152 -0
- regulatory_catalog/epa_cancer_v2005.json +70 -0
- regulatory_catalog/fda_ctp_v2024_06.json +70 -0
- scripts/__pycache__/replay_calc_log.cpython-314.pyc +0 -0
- scripts/__pycache__/run_cancer_risk_batch.cpython-314.pyc +0 -0
- scripts/__pycache__/run_mcp_calc_server.cpython-314.pyc +0 -0
- scripts/replay_calc_log.py +37 -0
- scripts/run_cancer_risk_batch.py +57 -0
- scripts/run_mcp_calc_server.py +13 -0
- tests/__pycache__/conftest.cpython-314.pyc +0 -0
- tests/__pycache__/test_equations.cpython-314.pyc +0 -0
- tests/__pycache__/test_mcp_tools.cpython-314.pyc +0 -0
- tests/__pycache__/test_nlp_pipeline.cpython-314.pyc +0 -0
- tests/__pycache__/test_regulatory_mapper.cpython-314.pyc +0 -0
- tests/__pycache__/test_units.cpython-314.pyc +0 -0
- tests/conftest.py +6 -0
- tests/fixtures/extraction_sample.json +41 -0
- tests/test_equations.py +86 -0
- tests/test_mcp_tools.py +38 -0
- tests/test_nlp_pipeline.py +46 -0
- tests/test_regulatory_mapper.py +16 -0
- tests/test_units.py +31 -0
- toxra_core/__init__.py +1 -0
- toxra_core/__pycache__/__init__.cpython-314.pyc +0 -0
- toxra_core/__pycache__/artifacts.cpython-314.pyc +0 -0
- toxra_core/__pycache__/calculation_client.cpython-314.pyc +0 -0
- toxra_core/__pycache__/contracts.cpython-314.pyc +0 -0
- toxra_core/__pycache__/nlp_pipeline.cpython-314.pyc +0 -0
- toxra_core/__pycache__/regulatory_mapper.cpython-314.pyc +0 -0
- toxra_core/artifacts.py +51 -0
- toxra_core/calculation_client.py +150 -0
- toxra_core/contracts.py +67 -0
- toxra_core/nlp_pipeline.py +284 -0
- toxra_core/regulatory_mapper.py +254 -0
mcp_tox_calc/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic toxicology calculation engine exposed through a local MCP server."""
|
mcp_tox_calc/__pycache__/__init__.cpython-314.pyc
ADDED
|
Binary file (217 Bytes). View file
|
|
|
mcp_tox_calc/__pycache__/equations.cpython-314.pyc
ADDED
|
Binary file (13.3 kB). View file
|
|
|
mcp_tox_calc/__pycache__/logging.cpython-314.pyc
ADDED
|
Binary file (4.89 kB). View file
|
|
|
mcp_tox_calc/__pycache__/server.cpython-314.pyc
ADDED
|
Binary file (8.62 kB). View file
|
|
|
mcp_tox_calc/__pycache__/units.cpython-314.pyc
ADDED
|
Binary file (6.81 kB). View file
|
|
|
mcp_tox_calc/equations.py
ADDED
|
@@ -0,0 +1,275 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any, Dict, List, Optional
|
| 2 |
+
|
| 3 |
+
from toxra_core.contracts import classify_risk_tier
|
| 4 |
+
|
| 5 |
+
from .units import (
|
| 6 |
+
UnitError,
|
| 7 |
+
normalize_air_concentration,
|
| 8 |
+
normalize_csf,
|
| 9 |
+
normalize_iur,
|
| 10 |
+
normalize_oral_exposure,
|
| 11 |
+
normalize_route,
|
| 12 |
+
)
|
| 13 |
+
|
| 14 |
+
FORMULA_VERSION = "1.0.0"
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class CalculationError(ValueError):
    """Raised when a deterministic calculation cannot be completed.

    Wraps lower-level UnitError/ValueError failures so callers (the MCP
    server layer) can catch one stable exception type.
    """

    pass
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _base_result(formula_id: str) -> Dict[str, Any]:
    """Return a fresh copy of the shared result envelope for *formula_id*.

    Every tool fills in this same skeleton so downstream consumers can rely
    on a uniform response shape.
    """
    envelope: Dict[str, Any] = dict(
        formula_id=formula_id,
        formula_version=FORMULA_VERSION,
        inputs_normalized={},
        unit_conversions=[],
        result_value=None,
        risk_tier="unknown",
        warnings=[],
        log_ref="",
    )
    return envelope
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def validate_risk_input(payload: Dict[str, Any]) -> Dict[str, Any]:
    """Validate one calculation row before running deterministic risk math.

    Checks that the exposure route is recognized and that the route-specific
    value pairs are present (oral: CSF + exposure; inhalation: IUR + air
    concentration, or the CSF pair as a fallback). Missing unit fields are
    reported as warnings, not errors.

    Returns the standard result envelope extended with ``valid`` (bool) and
    ``errors`` (list); ``result_value`` is 1.0 when valid, else 0.0.
    """
    result = _base_result("validate_risk_input")
    errors: List[str] = []
    warnings: List[str] = []

    try:
        route = normalize_route(payload.get("route"))
    except UnitError as exc:
        # An invalid/missing route is a validation finding, not a crash.
        # (normalize_route only raises UnitError, so catching the broad
        # Exception here would just mask unrelated bugs.)
        route = ""
        errors.append(str(exc))

    # A pathway is usable only when both its potency factor and its
    # exposure term are supplied.
    has_csf = payload.get("csf_value") not in (None, "") and payload.get("exposure_value") not in (None, "")
    has_iur = payload.get("iur_value") not in (None, "") and payload.get("air_conc_value") not in (None, "")

    if route == "oral" and not has_csf:
        errors.append("Oral route requires csf_value and exposure_value for CSF pathway.")

    if route == "inhalation" and not (has_iur or has_csf):
        errors.append("Inhalation route requires iur_value+air_conc_value or csf_value+exposure_value.")

    if has_csf and payload.get("csf_unit") in (None, ""):
        warnings.append("csf_unit missing; assuming standard per (mg/kg-day).")

    if has_iur and payload.get("air_conc_unit") in (None, ""):
        warnings.append("air_conc_unit missing; assuming ug/m3.")

    result["warnings"] = warnings
    result["valid"] = len(errors) == 0
    result["errors"] = errors
    result["result_value"] = 1.0 if result["valid"] else 0.0
    result["risk_tier"] = "unknown"
    return result
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def calculate_epa_elcr_csf(payload: Dict[str, Any]) -> Dict[str, Any]:
    """Compute excess lifetime cancer risk via the EPA oral CSF pathway.

    ELCR = CDI (mg/kg-day) * CSF ((mg/kg-day)^-1). Unit or value problems
    surface as CalculationError.
    """
    out = _base_result("calculate_epa_elcr_csf")
    try:
        # Route is validated for its side effect (raises on bad input).
        normalize_route(payload.get("route"))
        exposure = normalize_oral_exposure(
            payload.get("exposure_value"),
            payload.get("exposure_unit"),
            payload.get("body_weight_kg"),
        )
        slope = normalize_csf(payload.get("csf_value"), payload.get("csf_unit"))

        cdi = exposure["value_mg_per_kg_day"]
        potency = slope["value_per_mg_per_kg_day"]
        out["inputs_normalized"] = {
            "cdi_mg_per_kg_day": cdi,
            "csf_per_mg_per_kg_day": potency,
        }
        out["unit_conversions"] = exposure["conversions"] + slope["conversions"]
        out["result_value"] = float(cdi * potency)
        out["risk_tier"] = classify_risk_tier(out["result_value"])
        return out
    except (UnitError, ValueError) as exc:
        raise CalculationError(str(exc)) from exc
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def calculate_epa_elcr_iur(payload: Dict[str, Any]) -> Dict[str, Any]:
    """Compute excess lifetime cancer risk via the EPA inhalation IUR pathway.

    ELCR = air concentration (ug/m3) * IUR ((ug/m3)^-1). A non-inhalation
    route is allowed but flagged with a warning.
    """
    out = _base_result("calculate_epa_elcr_iur")
    try:
        if normalize_route(payload.get("route")) != "inhalation":
            out["warnings"].append("IUR calculation is generally applicable to inhalation route.")

        concentration = normalize_air_concentration(payload.get("air_conc_value"), payload.get("air_conc_unit"))
        unit_risk = normalize_iur(payload.get("iur_value"), payload.get("iur_unit"))

        conc = concentration["value_ug_per_m3"]
        iur = unit_risk["value_per_ug_per_m3"]
        out["inputs_normalized"] = {
            "air_conc_ug_per_m3": conc,
            "iur_per_ug_per_m3": iur,
        }
        out["unit_conversions"] = concentration["conversions"] + unit_risk["conversions"]
        out["result_value"] = float(conc * iur)
        out["risk_tier"] = classify_risk_tier(out["result_value"])
        return out
    except (UnitError, ValueError) as exc:
        raise CalculationError(str(exc)) from exc
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def calculate_fda_ctp_elcr(payload: Dict[str, Any]) -> Dict[str, Any]:
    """Aggregate per-constituent ELCR values (FDA CTP-style profile total).

    Accepts either a single inline row or a list of rows under
    ``constituents``. Each constituent contributes its CSF-pathway and/or
    IUR-pathway ELCR (whichever inputs are present); the total is their sum.
    """
    result = _base_result("calculate_fda_ctp_elcr")

    raw = payload.get("constituents")
    if isinstance(raw, list) and raw:
        rows = [entry for entry in raw if isinstance(entry, dict)]
    else:
        # Fall back to treating the payload itself as one constituent row.
        rows = [payload]

    components: List[Dict[str, Any]] = []
    grand_total = 0.0

    for row in rows:
        component: Dict[str, Any] = {
            "chemical_name": row.get("chemical_name", ""),
            "route": row.get("route", ""),
            "csf_result": None,
            "iur_result": None,
            "component_total": 0.0,
        }
        subtotal = 0.0

        csf_ready = row.get("csf_value") not in (None, "") and row.get("exposure_value") not in (None, "")
        if csf_ready:
            csf_res = calculate_epa_elcr_csf(row)
            component["csf_result"] = csf_res
            subtotal += float(csf_res["result_value"] or 0.0)

        iur_ready = row.get("iur_value") not in (None, "") and row.get("air_conc_value") not in (None, "")
        if iur_ready:
            iur_res = calculate_epa_elcr_iur(row)
            component["iur_result"] = iur_res
            subtotal += float(iur_res["result_value"] or 0.0)

        component["component_total"] = subtotal
        grand_total += subtotal
        components.append(component)

    result["inputs_normalized"] = {"component_count": len(components)}
    result["unit_conversions"] = []
    result["result_value"] = float(grand_total)
    result["risk_tier"] = classify_risk_tier(result["result_value"])
    result["component_results"] = components
    return result
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def get_formula_catalog() -> Dict[str, Any]:
    """Describe the available deterministic formulas and the engine version.

    The catalog is wrapped in the standard result envelope so it can flow
    through the same MCP response path as the calculation tools.
    """
    catalog: Dict[str, Any] = {
        "formula_id": "get_formula_catalog",
        "formula_version": FORMULA_VERSION,
        "inputs_normalized": {},
        "unit_conversions": [],
        "result_value": None,
        "risk_tier": "unknown",
        "warnings": [],
        "log_ref": "",
    }
    catalog["formulas"] = [
        {
            "id": "calculate_epa_elcr_csf",
            "equation": "ELCR = CDI (mg/kg-day) * CSF ((mg/kg-day)^-1)",
            "notes": "Oral pathway using cancer slope factor.",
        },
        {
            "id": "calculate_epa_elcr_iur",
            "equation": "ELCR = Air Concentration (ug/m3) * IUR ((ug/m3)^-1)",
            "notes": "Inhalation pathway using inhalation unit risk.",
        },
        {
            "id": "calculate_fda_ctp_elcr",
            "equation": "ELCR_total = sum(component ELCR)",
            "notes": "Constituent-level aggregation wrapper for CTP-style profile assessment.",
        },
    ]
    return catalog
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def run_batch_cancer_risk(payload: Dict[str, Any]) -> Dict[str, Any]:
    """Run the full deterministic cancer-risk pipeline over a batch of rows.

    For each dict in ``payload["rows"]``: validate, then compute the EPA CSF
    and/or IUR pathway results (when their inputs are present) plus the FDA
    CTP aggregate. Per-row failures are captured as error entries instead of
    aborting the batch. Returns the standard envelope with ``rows`` and
    ``summary`` added; ``result_value`` is the count of successful rows.

    Raises:
        CalculationError: if ``rows`` is present but not a list.
    """
    out = _base_result("run_batch_cancer_risk")
    rows = payload.get("rows", []) if isinstance(payload, dict) else []
    if not isinstance(rows, list):
        raise CalculationError("rows must be a list of objects")

    row_results: List[Dict[str, Any]] = []
    n_ok = 0
    n_err = 0

    for i, row in enumerate(rows):
        # Non-dict rows are rejected early with a minimal error record.
        if not isinstance(row, dict):
            row_results.append(
                {
                    "row_index": i,
                    "status": "error",
                    "error": "row must be an object",
                }
            )
            n_err += 1
            continue

        # Validation failures are reported per-row, preserving warnings.
        v = validate_risk_input(row)
        if not v.get("valid", False):
            row_results.append(
                {
                    "row_index": i,
                    "record_id": row.get("record_id", ""),
                    "chemical_name": row.get("chemical_name", ""),
                    "status": "error",
                    "errors": v.get("errors", []),
                    "warnings": v.get("warnings", []),
                }
            )
            n_err += 1
            continue

        try:
            # Pathway-specific results are computed only when both members
            # of the input pair exist; fda_ctp aggregates them regardless.
            csf_res: Optional[Dict[str, Any]] = None
            iur_res: Optional[Dict[str, Any]] = None
            if row.get("csf_value") not in (None, "") and row.get("exposure_value") not in (None, ""):
                csf_res = calculate_epa_elcr_csf(row)
            if row.get("iur_value") not in (None, "") and row.get("air_conc_value") not in (None, ""):
                iur_res = calculate_epa_elcr_iur(row)

            fda_res = calculate_fda_ctp_elcr(row)

            # Flattened per-row output; missing pathway values become "".
            row_out = {
                "row_index": i,
                "record_id": row.get("record_id", ""),
                "chemical_name": row.get("chemical_name", ""),
                "casrn": row.get("casrn", ""),
                "route": row.get("route", ""),
                "status": "ok",
                "epa_elcr_csf": (csf_res or {}).get("result_value", ""),
                "epa_elcr_iur": (iur_res or {}).get("result_value", ""),
                "fda_ctp_elcr": fda_res.get("result_value", ""),
                "risk_tier": fda_res.get("risk_tier", "unknown"),
                "formula_id": fda_res.get("formula_id", "calculate_fda_ctp_elcr"),
                "formula_version": fda_res.get("formula_version", FORMULA_VERSION),
                "inputs_normalized": fda_res.get("inputs_normalized", {}),
                "unit_conversions": fda_res.get("unit_conversions", []),
                "warnings": (v.get("warnings", []) + fda_res.get("warnings", [])),
                "log_ref": "",
            }
            row_results.append(row_out)
            n_ok += 1
        except Exception as exc:
            # Batch boundary: any calculation failure becomes a row-level
            # error record so remaining rows still get processed.
            row_results.append(
                {
                    "row_index": i,
                    "record_id": row.get("record_id", ""),
                    "chemical_name": row.get("chemical_name", ""),
                    "status": "error",
                    "errors": [str(exc)],
                }
            )
            n_err += 1

    out["rows"] = row_results
    out["summary"] = {
        "total_rows": len(rows),
        "ok_rows": n_ok,
        "error_rows": n_err,
    }
    out["result_value"] = float(n_ok)
    out["risk_tier"] = "unknown"
    return out
|
mcp_tox_calc/logging.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import datetime as dt
|
| 2 |
+
import json
|
| 3 |
+
import uuid
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Any, Dict, List
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class RunLogger:
    """Append-only JSONL event logger plus Markdown report writer for one run.

    Artifacts live under ``run_dir``:
    ``cancer_risk_log.jsonl`` (one event per tool call) and
    ``cancer_risk_report.md`` (batch summary report).
    """

    def __init__(self, run_dir: str):
        """Create *run_dir* if needed and fix the log/report paths inside it."""
        self.run_dir = Path(run_dir)
        self.run_dir.mkdir(parents=True, exist_ok=True)
        self.log_path = self.run_dir / "cancer_risk_log.jsonl"
        self.report_path = self.run_dir / "cancer_risk_report.md"

    def log_event(self, tool_name: str, request_args: Dict[str, Any], response: Dict[str, Any]) -> str:
        """Append one tool-call event to the JSONL log and return its event id."""
        event_id = f"evt_{uuid.uuid4().hex[:10]}"
        # datetime.utcnow() is deprecated (Python 3.12+); use an aware UTC
        # "now" and strip tzinfo so the on-disk format stays the original
        # naive ISO-8601 string with a trailing "Z".
        now_utc = dt.datetime.now(dt.timezone.utc).replace(tzinfo=None)
        row = {
            "event_id": event_id,
            "timestamp_utc": now_utc.isoformat() + "Z",
            "tool_name": tool_name,
            "request": request_args,
            "response": response,
        }
        with self.log_path.open("a", encoding="utf-8") as f:
            f.write(json.dumps(row, ensure_ascii=True) + "\n")
        return event_id

    def write_report(self, summary: Dict[str, Any], rows: List[Dict[str, Any]]) -> Path:
        """Write the Markdown batch report and return its path.

        Lists up to 100 high-priority rows and up to 100 row-level errors.
        """
        lines = [
            "# Cancer Risk Calculation Report",
            "",
            "## Run Summary",
            f"- Total rows: {summary.get('total_rows', 0)}",
            f"- Successful rows: {summary.get('ok_rows', 0)}",
            f"- Error rows: {summary.get('error_rows', 0)}",
            "",
            "## High Priority Rows",
        ]

        high = [r for r in rows if str(r.get("risk_tier", "")).strip().lower() == "high_priority"]
        if high:
            for r in high[:100]:
                lines.append(
                    f"- record_id={r.get('record_id','')} chemical={r.get('chemical_name','')} fda_ctp_elcr={r.get('fda_ctp_elcr','')}"
                )
        else:
            lines.append("- None")

        lines += ["", "## Data Quality Alerts"]
        errors = [r for r in rows if r.get("status") == "error"]
        if errors:
            for r in errors[:100]:
                lines.append(f"- row_index={r.get('row_index')} errors={r.get('errors', [])}")
        else:
            lines.append("- No row-level errors.")

        self.report_path.write_text("\n".join(lines), encoding="utf-8")
        return self.report_path
|
mcp_tox_calc/server.py
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import json
|
| 3 |
+
import sys
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Any, Callable, Dict
|
| 6 |
+
|
| 7 |
+
from mcp_tox_calc.equations import (
|
| 8 |
+
CalculationError,
|
| 9 |
+
calculate_epa_elcr_csf,
|
| 10 |
+
calculate_epa_elcr_iur,
|
| 11 |
+
calculate_fda_ctp_elcr,
|
| 12 |
+
get_formula_catalog,
|
| 13 |
+
run_batch_cancer_risk,
|
| 14 |
+
validate_risk_input,
|
| 15 |
+
)
|
| 16 |
+
from mcp_tox_calc.logging import RunLogger
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
ToolFn = Callable[[Dict[str, Any]], Dict[str, Any]]
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class ToxCalcMCPServer:
    """Minimal MCP-style JSON-RPC server exposing the deterministic tox tools.

    Handles ``initialize``, ``tools/list`` and ``tools/call``. Every tool
    call is appended to a JSONL run log via RunLogger; batch runs also emit
    a Markdown report.
    """

    def __init__(self, run_dir: str):
        # run_dir: directory for all run artifacts (JSONL log, report).
        self.run_dir = str(Path(run_dir))
        self.logger = RunLogger(self.run_dir)
        # Tool registry: name -> {description, inputSchema, fn}. Schemas are
        # deliberately permissive (additionalProperties: True); real input
        # validation happens inside each tool function.
        self.tools: Dict[str, Dict[str, Any]] = {
            "validate_risk_input": {
                "description": "Validate a row payload for deterministic cancer risk calculations.",
                "inputSchema": {"type": "object", "properties": {}, "additionalProperties": True},
                "fn": validate_risk_input,
            },
            "calculate_epa_elcr_csf": {
                "description": "Compute ELCR using EPA CSF pathway.",
                "inputSchema": {"type": "object", "properties": {}, "additionalProperties": True},
                "fn": calculate_epa_elcr_csf,
            },
            "calculate_epa_elcr_iur": {
                "description": "Compute ELCR using EPA IUR pathway.",
                "inputSchema": {"type": "object", "properties": {}, "additionalProperties": True},
                "fn": calculate_epa_elcr_iur,
            },
            "calculate_fda_ctp_elcr": {
                "description": "Compute ELCR profile using FDA CTP-style constituent aggregation.",
                "inputSchema": {"type": "object", "properties": {}, "additionalProperties": True},
                "fn": calculate_fda_ctp_elcr,
            },
            "run_batch_cancer_risk": {
                "description": "Run deterministic cancer risk calculations across a batch of rows.",
                "inputSchema": {
                    "type": "object",
                    "properties": {"rows": {"type": "array", "items": {"type": "object"}}},
                    "required": ["rows"],
                    "additionalProperties": True,
                },
                "fn": run_batch_cancer_risk,
            },
            "get_formula_catalog": {
                "description": "Return available formula catalog and version.",
                "inputSchema": {"type": "object", "properties": {}, "additionalProperties": True},
                # Catalog takes no arguments; the lambda adapts it to ToolFn.
                "fn": lambda _args: get_formula_catalog(),
            },
        }

    def handle_request(self, req: Dict[str, Any]) -> Dict[str, Any]:
        """Dispatch one JSON-RPC request dict and return the response dict.

        Never raises: tool failures are converted into JSON-RPC error
        responses (-32001 for CalculationError, -32099 for anything else,
        -32601 for unknown methods, -32602 for unknown tools).
        """
        method = req.get("method")
        req_id = req.get("id")

        if method == "initialize":
            return {
                "jsonrpc": "2.0",
                "id": req_id,
                "result": {
                    "protocolVersion": "2024-11-05",
                    "serverInfo": {"name": "toxra-calc-mcp", "version": "0.1.0"},
                    "capabilities": {"tools": {}},
                },
            }

        if method == "tools/list":
            tools = []
            for name, meta in self.tools.items():
                tools.append(
                    {
                        "name": name,
                        "description": meta["description"],
                        "inputSchema": meta["inputSchema"],
                    }
                )
            return {"jsonrpc": "2.0", "id": req_id, "result": {"tools": tools}}

        if method == "tools/call":
            params = req.get("params", {}) or {}
            name = params.get("name")
            args = params.get("arguments", {}) or {}

            if name not in self.tools:
                return {
                    "jsonrpc": "2.0",
                    "id": req_id,
                    "error": {"code": -32602, "message": f"Unknown tool: {name}"},
                }

            fn: ToolFn = self.tools[name]["fn"]
            try:
                result = fn(args)
                if not isinstance(result, dict):
                    # Normalize non-dict tool returns into the envelope shape.
                    result = {"value": result}

                # Attach structured log reference per tool call.
                log_ref = self.logger.log_event(name, args, result)
                result.setdefault("log_ref", log_ref)

                if name == "run_batch_cancer_risk":
                    # Batch rows get the standard envelope keys backfilled so
                    # each row is self-describing, then a report is written.
                    rows = result.get("rows", []) if isinstance(result.get("rows", []), list) else []
                    for row in rows:
                        if isinstance(row, dict):
                            row.setdefault("formula_id", "calculate_fda_ctp_elcr")
                            row.setdefault("formula_version", result.get("formula_version", "1.0.0"))
                            row.setdefault("inputs_normalized", {})
                            row.setdefault("unit_conversions", [])
                            row.setdefault("result_value", row.get("fda_ctp_elcr", ""))
                            row.setdefault("risk_tier", row.get("risk_tier", "unknown"))
                            row.setdefault("warnings", row.get("warnings", []))
                            row.setdefault("log_ref", log_ref)
                    report_path = self.logger.write_report(result.get("summary", {}), rows)
                    result["artifacts"] = {
                        "run_dir": self.run_dir,
                        "log_jsonl": str(self.logger.log_path),
                        "report_md": str(report_path),
                    }

                content = [{"type": "json", "json": result}]
                return {"jsonrpc": "2.0", "id": req_id, "result": {"content": content}}
            except CalculationError as exc:
                return {
                    "jsonrpc": "2.0",
                    "id": req_id,
                    "error": {"code": -32001, "message": str(exc)},
                }
            except Exception as exc:
                # Last-resort boundary: surface unexpected failures as errors
                # instead of killing the serve loop.
                return {
                    "jsonrpc": "2.0",
                    "id": req_id,
                    "error": {"code": -32099, "message": f"Unexpected tool error: {exc}"},
                }

        return {
            "jsonrpc": "2.0",
            "id": req_id,
            "error": {"code": -32601, "message": f"Method not found: {method}"},
        }
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
def _serve_stdio(server: ToxCalcMCPServer) -> None:
    """Blocking JSON-RPC loop: one request per stdin line, one response per stdout line."""
    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        try:
            req = json.loads(line)
            resp = server.handle_request(req)
        except Exception as exc:
            # Malformed JSON (or an unexpected dispatch failure) still gets a
            # JSON-RPC error response so the client is never left waiting.
            resp = {
                "jsonrpc": "2.0",
                "id": None,
                "error": {"code": -32700, "message": f"Parse/dispatch error: {exc}"},
            }
        sys.stdout.write(json.dumps(resp) + "\n")
        # Flush per response: clients block on line-delimited replies.
        sys.stdout.flush()
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
def main() -> None:
    """CLI entry point: parse arguments and run the stdio JSON-RPC server."""
    parser = argparse.ArgumentParser(description="Local MCP server for deterministic toxicology calculations")
    # NOTE(review): --stdio is accepted but never checked below — the server
    # unconditionally runs the stdio loop. Confirm whether other transports
    # are planned or the flag is kept only for launcher compatibility.
    parser.add_argument("--stdio", action="store_true", default=False, help="Run stdio JSON-RPC loop")
    parser.add_argument("--run-dir", default="runs/mcp_server", help="Run artifact directory")
    args = parser.parse_args()

    server = ToxCalcMCPServer(run_dir=args.run_dir)
    _serve_stdio(server)
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
if __name__ == "__main__":
|
| 183 |
+
main()
|
mcp_tox_calc/units.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from typing import Any, Dict
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class UnitError(ValueError):
    """Raised when a value is non-numeric or a unit string is unsupported."""

    pass
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def _to_float(v: Any, field: str) -> float:
|
| 10 |
+
try:
|
| 11 |
+
x = float(v)
|
| 12 |
+
except Exception as exc:
|
| 13 |
+
raise UnitError(f"{field} must be numeric.") from exc
|
| 14 |
+
return x
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _norm_unit(unit: Any) -> str:
|
| 18 |
+
s = str(unit or "").strip().lower()
|
| 19 |
+
s = s.replace(" ", "")
|
| 20 |
+
s = s.replace("μ", "u")
|
| 21 |
+
return s
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def normalize_route(route: Any) -> str:
    """Return the canonical route name, one of 'oral' or 'inhalation'.

    Raises UnitError for anything else (including None/blank).
    """
    candidate = str(route or "").strip().lower()
    if candidate not in {"oral", "inhalation"}:
        raise UnitError("route must be oral or inhalation")
    return candidate
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def normalize_oral_exposure(exposure_value: Any, exposure_unit: Any, body_weight_kg: Any) -> Dict[str, Any]:
    """Convert an oral exposure into chronic daily intake (mg/kg-day).

    Per-body-mass units convert directly; daily-mass units (mg/day, ug/day)
    additionally require a positive body_weight_kg. Raises UnitError for
    unsupported units or missing body weight.
    """
    value = _to_float(exposure_value, "exposure_value")
    unit = _norm_unit(exposure_unit)
    bw = None if body_weight_kg in (None, "") else _to_float(body_weight_kg, "body_weight_kg")

    conversions: List[str] = []

    # Accepted spellings per unit family (already lowercased/despaced).
    mg_per_kg_day = {"mg/kg-day", "mg/kg/d", "mg/kgday", "mgkgday"}
    ug_per_kg_day = {"ug/kg-day", "ug/kg/d", "ug/kgday", "ugkgday"}
    mg_per_day = {"mg/day", "mg/d", "mgday"}
    ug_per_day = {"ug/day", "ug/d", "ugday"}

    if unit in mg_per_kg_day:
        cdi = value
    elif unit in ug_per_kg_day:
        cdi = value / 1000.0
        conversions.append("exposure ug/kg-day -> mg/kg-day")
    elif unit in mg_per_day:
        if bw is None or bw <= 0:
            raise UnitError("body_weight_kg is required for exposure unit mg/day")
        cdi = value / bw
        conversions.append("exposure mg/day -> mg/kg-day")
    elif unit in ug_per_day:
        if bw is None or bw <= 0:
            raise UnitError("body_weight_kg is required for exposure unit ug/day")
        cdi = (value / 1000.0) / bw
        conversions.append("exposure ug/day -> mg/kg-day")
    else:
        raise UnitError(f"Unsupported oral exposure unit: {exposure_unit}")

    return {
        "value_mg_per_kg_day": cdi,
        "unit": "mg/kg-day",
        "conversions": conversions,
    }
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def normalize_air_concentration(air_conc_value: Any, air_conc_unit: Any) -> Dict[str, Any]:
    """Convert an air concentration into ug/m3 (accepts mg/m3 and ng/m3 forms)."""
    value = _to_float(air_conc_value, "air_conc_value")
    unit = _norm_unit(air_conc_unit)
    conversions: List[str] = []

    ug_forms = {"ug/m3", "ugm3", "ug/m^3"}
    mg_forms = {"mg/m3", "mgm3", "mg/m^3"}
    ng_forms = {"ng/m3", "ngm3", "ng/m^3"}

    if unit in ug_forms:
        micrograms = value
    elif unit in mg_forms:
        micrograms = value * 1000.0
        conversions.append("air concentration mg/m3 -> ug/m3")
    elif unit in ng_forms:
        micrograms = value / 1000.0
        conversions.append("air concentration ng/m3 -> ug/m3")
    else:
        raise UnitError(f"Unsupported air concentration unit: {air_conc_unit}")

    return {
        "value_ug_per_m3": micrograms,
        "unit": "ug/m3",
        "conversions": conversions,
    }
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def normalize_csf(csf_value: Any, csf_unit: Any) -> Dict[str, Any]:
    """Normalize a cancer slope factor to (mg/kg-day)^-1.

    Blank/"na" units are passed through unchanged (assumed already standard);
    per-(ug/kg-day) forms are scaled by 1000. Raises UnitError otherwise.
    """
    value = _to_float(csf_value, "csf_value")
    unit = _norm_unit(csf_unit)
    conversions: List[str] = []

    per_mg_forms = {
        "(mg/kg-day)^-1",
        "1/(mg/kg-day)",
        "per(mg/kg-day)",
        "permg/kg-day",
        "(mgkgday)^-1",
        "1/mgkgday",
    }
    per_ug_forms = {
        "(ug/kg-day)^-1",
        "1/(ug/kg-day)",
        "per(ug/kg-day)",
        "(ugkgday)^-1",
        "1/ugkgday",
    }

    if unit in per_mg_forms or unit in {"", "na", "n/a"}:
        normalized = value
    elif unit in per_ug_forms:
        normalized = value * 1000.0
        conversions.append("CSF per (ug/kg-day) -> per (mg/kg-day)")
    else:
        raise UnitError(f"Unsupported csf unit: {csf_unit}")

    return {
        "value_per_mg_per_kg_day": normalized,
        "unit": "(mg/kg-day)^-1",
        "conversions": conversions,
    }
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def normalize_iur(iur_value: Any, iur_unit: Any) -> Dict[str, Any]:
    """Normalize an inhalation unit risk to the canonical (ug/m3)^-1 basis.

    Values on a per-(mg/m3) basis are divided by 1000; an empty or
    "na"/"n/a" unit is passed through unchanged (the value is assumed to
    already be on the canonical basis).

    Raises:
        UnitError: if the unit is not a recognized unit-risk unit.
    """
    risk = _to_float(iur_value, "iur_value")
    canonical = _norm_unit(iur_unit)
    applied = []

    ug_basis = {
        "(ug/m3)^-1",
        "1/(ug/m3)",
        "per(ug/m3)",
        "1/ugm3",
        "(ugm3)^-1",
    }
    mg_basis = {
        "(mg/m3)^-1",
        "1/(mg/m3)",
        "per(mg/m3)",
        "1/mgm3",
        "(mgm3)^-1",
    }
    missing = {"", "na", "n/a"}

    if canonical in ug_basis or canonical in missing:
        normalized = risk
    elif canonical in mg_basis:
        # per (mg/m3) -> per (ug/m3): divide by 1000.
        normalized = risk / 1000.0
        applied.append("IUR per (mg/m3) -> per (ug/m3)")
    else:
        raise UnitError(f"Unsupported iur unit: {iur_unit}")

    return {
        "value_per_ug_per_m3": normalized,
        "unit": "(ug/m3)^-1",
        "conversions": applied,
    }
|
regulatory_catalog/epa_cancer_v2005.json
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"framework": "EPA",
|
| 3 |
+
"version": "2005",
|
| 4 |
+
"source": "EPA Guidelines for Carcinogen Risk Assessment (2005)",
|
| 5 |
+
"clauses": [
|
| 6 |
+
{
|
| 7 |
+
"clause_id": "EPA.CANCER.CSF.001",
|
| 8 |
+
"framework": "EPA",
|
| 9 |
+
"title": "Oral cancer slope factor applicability",
|
| 10 |
+
"description": "Assessment should include oral-dose based evidence and potency context for CSF application.",
|
| 11 |
+
"required_fields": [
|
| 12 |
+
"dose_metrics",
|
| 13 |
+
"exposure_route",
|
| 14 |
+
"carcinogenicity_result"
|
| 15 |
+
],
|
| 16 |
+
"required_evidence_terms": [
|
| 17 |
+
"cancer slope factor",
|
| 18 |
+
"mg/kg-day",
|
| 19 |
+
"oral",
|
| 20 |
+
"dose"
|
| 21 |
+
],
|
| 22 |
+
"acceptance_rule": "any_required_fields",
|
| 23 |
+
"applicability": {},
|
| 24 |
+
"source_reference": "EPA Guidelines for Carcinogen Risk Assessment (2005)"
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"clause_id": "EPA.CANCER.IUR.001",
|
| 28 |
+
"framework": "EPA",
|
| 29 |
+
"title": "Inhalation unit risk applicability",
|
| 30 |
+
"description": "Assessment should include inhalation exposure metrics suitable for IUR-based risk quantification.",
|
| 31 |
+
"required_fields": [
|
| 32 |
+
"exposure_route",
|
| 33 |
+
"dose_metrics",
|
| 34 |
+
"carcinogenicity_notes"
|
| 35 |
+
],
|
| 36 |
+
"required_evidence_terms": [
|
| 37 |
+
"inhalation unit risk",
|
| 38 |
+
"ug/m3",
|
| 39 |
+
"inhalation",
|
| 40 |
+
"air concentration"
|
| 41 |
+
],
|
| 42 |
+
"acceptance_rule": "any_required_fields",
|
| 43 |
+
"applicability": {
|
| 44 |
+
"field": "exposure_route",
|
| 45 |
+
"equals": "inhalation"
|
| 46 |
+
},
|
| 47 |
+
"source_reference": "EPA Guidelines for Carcinogen Risk Assessment (2005)"
|
| 48 |
+
},
|
| 49 |
+
{
|
| 50 |
+
"clause_id": "EPA.CANCER.WOE.001",
|
| 51 |
+
"framework": "EPA",
|
| 52 |
+
"title": "Weight-of-evidence integration",
|
| 53 |
+
"description": "Narrative should integrate evidence quality, uncertainty, and plausibility of carcinogenic potential.",
|
| 54 |
+
"required_fields": [
|
| 55 |
+
"key_findings",
|
| 56 |
+
"conclusion",
|
| 57 |
+
"risk_summary"
|
| 58 |
+
],
|
| 59 |
+
"required_evidence_terms": [
|
| 60 |
+
"weight of evidence",
|
| 61 |
+
"uncertainty",
|
| 62 |
+
"mode of action",
|
| 63 |
+
"cancer"
|
| 64 |
+
],
|
| 65 |
+
"acceptance_rule": "any_required_fields",
|
| 66 |
+
"applicability": {},
|
| 67 |
+
"source_reference": "EPA Guidelines for Carcinogen Risk Assessment (2005)"
|
| 68 |
+
}
|
| 69 |
+
]
|
| 70 |
+
}
|
regulatory_catalog/fda_ctp_v2024_06.json
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"framework": "FDA CTP",
|
| 3 |
+
"version": "2024-06",
|
| 4 |
+
"source": "FDA CTP Regulatory Science Policy Memoranda (June 3, 2024)",
|
| 5 |
+
"clauses": [
|
| 6 |
+
{
|
| 7 |
+
"clause_id": "FDA.CTP.GENOTOX.001",
|
| 8 |
+
"framework": "FDA CTP",
|
| 9 |
+
"title": "Genotoxicity hazard identification evidence",
|
| 10 |
+
"description": "Evidence should characterize in vitro and in vivo genotoxicity evidence and integrated interpretation.",
|
| 11 |
+
"required_fields": [
|
| 12 |
+
"genotox_oecd_tg_in_vitro",
|
| 13 |
+
"genotox_oecd_tg_in_vivo",
|
| 14 |
+
"genotoxicity_result",
|
| 15 |
+
"genotoxicity_result_notes"
|
| 16 |
+
],
|
| 17 |
+
"required_evidence_terms": [
|
| 18 |
+
"genotoxic",
|
| 19 |
+
"ames",
|
| 20 |
+
"micronucleus",
|
| 21 |
+
"comet",
|
| 22 |
+
"oecd tg"
|
| 23 |
+
],
|
| 24 |
+
"acceptance_rule": "all_required_fields",
|
| 25 |
+
"applicability": {},
|
| 26 |
+
"source_reference": "FDA CTP Genotoxicity Hazard Identification and Carcinogenicity Tiering memo (June 3, 2024)"
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"clause_id": "FDA.CTP.CARCIN.001",
|
| 30 |
+
"framework": "FDA CTP",
|
| 31 |
+
"title": "Carcinogenicity tiering narrative support",
|
| 32 |
+
"description": "Carcinogenicity conclusions should be supported by study findings and risk narrative.",
|
| 33 |
+
"required_fields": [
|
| 34 |
+
"carcinogenicity_result",
|
| 35 |
+
"carcinogenicity_notes",
|
| 36 |
+
"key_findings",
|
| 37 |
+
"conclusion"
|
| 38 |
+
],
|
| 39 |
+
"required_evidence_terms": [
|
| 40 |
+
"carcinogenic",
|
| 41 |
+
"tumor",
|
| 42 |
+
"cancer",
|
| 43 |
+
"risk"
|
| 44 |
+
],
|
| 45 |
+
"acceptance_rule": "all_required_fields",
|
| 46 |
+
"applicability": {},
|
| 47 |
+
"source_reference": "FDA CTP Genotoxicity Hazard Identification and Carcinogenicity Tiering memo (June 3, 2024)"
|
| 48 |
+
},
|
| 49 |
+
{
|
| 50 |
+
"clause_id": "FDA.CTP.ELCR.001",
|
| 51 |
+
"framework": "FDA CTP",
|
| 52 |
+
"title": "ELCR-ready quantitative evidence elements",
|
| 53 |
+
"description": "ELCR computations require quantitative exposure and potency anchors with transparent assumptions.",
|
| 54 |
+
"required_fields": [
|
| 55 |
+
"dose_metrics",
|
| 56 |
+
"risk_summary",
|
| 57 |
+
"exposure_route"
|
| 58 |
+
],
|
| 59 |
+
"required_evidence_terms": [
|
| 60 |
+
"excess lifetime cancer risk",
|
| 61 |
+
"slope factor",
|
| 62 |
+
"unit risk",
|
| 63 |
+
"exposure"
|
| 64 |
+
],
|
| 65 |
+
"acceptance_rule": "any_required_fields",
|
| 66 |
+
"applicability": {},
|
| 67 |
+
"source_reference": "FDA CTP Calculating Excess Lifetime Cancer Risk in ENDS PMTAs memo (June 3, 2024)"
|
| 68 |
+
}
|
| 69 |
+
]
|
| 70 |
+
}
|
scripts/__pycache__/replay_calc_log.cpython-314.pyc
ADDED
|
Binary file (2.24 kB). View file
|
|
|
scripts/__pycache__/run_cancer_risk_batch.cpython-314.pyc
ADDED
|
Binary file (3.36 kB). View file
|
|
|
scripts/__pycache__/run_mcp_calc_server.cpython-314.pyc
ADDED
|
Binary file (252 Bytes). View file
|
|
|
scripts/replay_calc_log.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""Replay a cancer-risk MCP JSONL calculation log and print a per-tool summary."""
import argparse
import json
from collections import Counter
from pathlib import Path


def _load_events(path: Path) -> list:
    """Parse every non-blank line of the JSONL file at *path* into a list."""
    raw_lines = path.read_text(encoding="utf-8").splitlines()
    return [json.loads(stripped) for stripped in (ln.strip() for ln in raw_lines) if stripped]


def main() -> None:
    """CLI entry point: load the log, count events per tool, show the last event."""
    parser = argparse.ArgumentParser(description="Replay and summarize cancer risk MCP log JSONL")
    parser.add_argument("--log-jsonl", required=True, help="Path to cancer_risk_log.jsonl")
    args = parser.parse_args()

    log_path = Path(args.log_jsonl)
    if not log_path.exists():
        raise FileNotFoundError(f"Log file not found: {log_path}")

    events = _load_events(log_path)
    # Events without a tool_name are bucketed under "unknown".
    tool_counts = Counter(evt.get("tool_name", "unknown") for evt in events)

    print("# MCP Calculation Log Replay")
    print(f"events={len(events)}")
    for tool, n in sorted(tool_counts.items()):
        print(f"- {tool}: {n}")

    if events:
        print("\nlast_event=")
        print(json.dumps(events[-1], indent=2))


if __name__ == "__main__":
    main()
|
scripts/run_cancer_risk_batch.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
import argparse
|
| 3 |
+
import json
|
| 4 |
+
import sys
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import pandas as pd
|
| 8 |
+
|
| 9 |
+
ROOT = Path(__file__).resolve().parents[1]
|
| 10 |
+
if str(ROOT) not in sys.path:
|
| 11 |
+
sys.path.insert(0, str(ROOT))
|
| 12 |
+
|
| 13 |
+
from toxra_core.artifacts import make_run_dir, write_dataframe_csv, write_json
|
| 14 |
+
from toxra_core.calculation_client import run_batch_cancer_risk
|
| 15 |
+
from toxra_core.contracts import CANCER_RISK_TEMPLATE_COLUMNS
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def main() -> None:
    """CLI entry point: validate the input CSV, run the batch via the local
    MCP server, and persist CSV/JSON result artifacts under the run dir."""
    parser = argparse.ArgumentParser(description="Run deterministic cancer risk batch using local MCP server")
    parser.add_argument("--input-csv", required=True, help="Path to cancer risk input CSV")
    parser.add_argument("--run-id", default="", help="Optional run ID")
    parser.add_argument("--runs-dir", default="runs", help="Runs base directory")
    args = parser.parse_args()

    input_path = Path(args.input_csv)
    if not input_path.exists():
        raise FileNotFoundError(f"Input CSV not found: {input_path}")

    frame = pd.read_csv(input_path)
    # Fail fast if any template column is absent before spawning the server.
    missing = [c for c in CANCER_RISK_TEMPLATE_COLUMNS if c not in frame.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    run_dir = make_run_dir(run_id=args.run_id or None, base_dir=args.runs_dir)
    # NaNs become empty strings so downstream unit parsing sees "" not nan.
    input_rows = frame.fillna("").to_dict("records")

    result = run_batch_cancer_risk(input_rows, run_dir=str(run_dir))
    result_rows = result.get("rows", [])
    if not isinstance(result_rows, list):
        result_rows = []
    results_frame = pd.DataFrame(result_rows)

    out_csv = run_dir / "cancer_risk_results.csv"
    out_json = run_dir / "cancer_risk_results.json"
    write_dataframe_csv(out_csv, results_frame)
    write_json(out_json, result)

    artifacts = result.get("artifacts", {})
    summary_payload = {
        "run_dir": str(run_dir),
        "results_csv": str(out_csv),
        "results_json": str(out_json),
        "log_jsonl": artifacts.get("log_jsonl", ""),
        "report_md": artifacts.get("report_md", ""),
        "summary": result.get("summary", {}),
    }
    print(json.dumps(summary_payload, indent=2))


if __name__ == "__main__":
    main()
|
scripts/run_mcp_calc_server.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
# Thin launcher: put the repository root on sys.path, then hand off to the
# MCP tox-calc server's own entry point.
import sys
from pathlib import Path

# Repository root is one directory above scripts/.
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from mcp_tox_calc.server import main


if __name__ == "__main__":
    main()
|
tests/__pycache__/conftest.cpython-314.pyc
ADDED
|
Binary file (492 Bytes). View file
|
|
|
tests/__pycache__/test_equations.cpython-314.pyc
ADDED
|
Binary file (2.4 kB). View file
|
|
|
tests/__pycache__/test_mcp_tools.cpython-314.pyc
ADDED
|
Binary file (1.88 kB). View file
|
|
|
tests/__pycache__/test_nlp_pipeline.cpython-314.pyc
ADDED
|
Binary file (2.43 kB). View file
|
|
|
tests/__pycache__/test_regulatory_mapper.cpython-314.pyc
ADDED
|
Binary file (982 Bytes). View file
|
|
|
tests/__pycache__/test_units.cpython-314.pyc
ADDED
|
Binary file (1.79 kB). View file
|
|
|
tests/conftest.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Pytest bootstrap: ensure the repository root is importable so tests can use
# `toxra_core` and `mcp_tox_calc` without installing the package.
import sys
from pathlib import Path

# Repository root is the parent of tests/.
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
|
tests/fixtures/extraction_sample.json
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"papers": [
|
| 3 |
+
{
|
| 4 |
+
"_file": "paper_a.pdf",
|
| 5 |
+
"paper_title": "Test Paper A",
|
| 6 |
+
"risk_stance": "acceptable_with_uncertainty",
|
| 7 |
+
"risk_confidence": 0.72,
|
| 8 |
+
"risk_summary": "Contains partial carcinogenicity evidence.",
|
| 9 |
+
"extracted": {
|
| 10 |
+
"chemicals": ["Nicotine"],
|
| 11 |
+
"genotox_oecd_tg_in_vitro": ["OECD_TG_471_Bacterial Reverse mutation test(AMES test)"],
|
| 12 |
+
"genotox_oecd_tg_in_vivo": ["not_reported"],
|
| 13 |
+
"genotoxicity_result": "equivocal",
|
| 14 |
+
"genotoxicity_result_notes": "AMES mixed outcomes.",
|
| 15 |
+
"carcinogenicity_result": "insufficient_data",
|
| 16 |
+
"carcinogenicity_notes": "Long-term bioassay absent.",
|
| 17 |
+
"dose_metrics": ["NOAEL 10 mg/kg-day"],
|
| 18 |
+
"exposure_route": "oral",
|
| 19 |
+
"key_findings": "Potential DNA response observed.",
|
| 20 |
+
"conclusion": "Needs additional testing."
|
| 21 |
+
},
|
| 22 |
+
"evidence": [
|
| 23 |
+
{
|
| 24 |
+
"field": "genotoxicity_result",
|
| 25 |
+
"quote": "The AMES assay showed equivocal mutagenicity outcomes.",
|
| 26 |
+
"pages": "4-5"
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"field": "dose_metrics",
|
| 30 |
+
"quote": "NOAEL was reported at 10 mg/kg-day.",
|
| 31 |
+
"pages": "6"
|
| 32 |
+
}
|
| 33 |
+
]
|
| 34 |
+
}
|
| 35 |
+
],
|
| 36 |
+
"toxra_extensions": {
|
| 37 |
+
"nlp_diagnostics": [],
|
| 38 |
+
"regulatory_gap_assessment": {},
|
| 39 |
+
"risk_calculation_refs": []
|
| 40 |
+
}
|
| 41 |
+
}
|
tests/test_equations.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for the cancer-risk calculators in mcp_tox_calc.equations."""
from mcp_tox_calc.equations import (
    calculate_epa_elcr_csf,
    calculate_epa_elcr_iur,
    calculate_fda_ctp_elcr,
    run_batch_cancer_risk,
)


def test_calculate_epa_elcr_csf_basic():
    # Expected ELCR for this input: 0.01 mg/kg-day * 1.5 (mg/kg-day)^-1 = 0.015.
    out = calculate_epa_elcr_csf(
        {
            "route": "oral",
            "exposure_value": 0.01,
            "exposure_unit": "mg/kg-day",
            "body_weight_kg": 70,
            "csf_value": 1.5,
            "csf_unit": "(mg/kg-day)^-1",
        }
    )
    assert round(out["result_value"], 8) == 0.015


def test_calculate_epa_elcr_iur_basic():
    # Expected ELCR for this input: 100 ug/m3 * 1e-6 (ug/m3)^-1 = 1e-4.
    out = calculate_epa_elcr_iur(
        {
            "route": "inhalation",
            "air_conc_value": 100,
            "air_conc_unit": "ug/m3",
            "iur_value": 1e-6,
            "iur_unit": "(ug/m3)^-1",
        }
    )
    assert round(out["result_value"], 8) == 0.0001


def test_fda_wrapper_aggregates_components():
    # One oral (CSF-based) and one inhalation (IUR-based) constituent:
    # the wrapper should yield a positive total and one result per component.
    out = calculate_fda_ctp_elcr(
        {
            "constituents": [
                {
                    "route": "oral",
                    "exposure_value": 0.01,
                    "exposure_unit": "mg/kg-day",
                    "body_weight_kg": 70,
                    "csf_value": 1.0,
                    "csf_unit": "(mg/kg-day)^-1",
                },
                {
                    "route": "inhalation",
                    "air_conc_value": 50,
                    "air_conc_unit": "ug/m3",
                    "iur_value": 1e-6,
                    "iur_unit": "(ug/m3)^-1",
                },
            ]
        }
    )
    assert out["result_value"] > 0
    assert len(out["component_results"]) == 2


def test_batch_handles_mixed_rows():
    # r1 is fully specified; r2 lacks exposure/potency inputs. The batch must
    # report per-row status (1 ok, 1 error) instead of failing wholesale.
    out = run_batch_cancer_risk(
        {
            "rows": [
                {
                    "record_id": "r1",
                    "chemical_name": "ChemA",
                    "route": "oral",
                    "exposure_value": 0.02,
                    "exposure_unit": "mg/kg-day",
                    "body_weight_kg": 70,
                    "csf_value": 1.1,
                    "csf_unit": "(mg/kg-day)^-1",
                },
                {
                    "record_id": "r2",
                    "chemical_name": "ChemB",
                    "route": "oral"
                },
            ]
        }
    )
    assert out["summary"]["total_rows"] == 2
    assert out["summary"]["ok_rows"] == 1
    assert out["summary"]["error_rows"] == 1
|
tests/test_mcp_tools.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""End-to-end test: spawn the MCP calc server over stdio and run one batch."""
from pathlib import Path

from toxra_core.artifacts import make_run_dir
from toxra_core.calculation_client import MCPCalculationClient


def test_mcp_client_lists_tools_and_runs_batch(tmp_path):
    run_dir = make_run_dir(run_id="test_mcp", base_dir=str(tmp_path))
    # Context manager starts the server subprocess and stops it on exit.
    with MCPCalculationClient(run_dir=str(run_dir)) as client:
        tools = client.list_tools()
        names = {t.get("name") for t in tools}
        assert "run_batch_cancer_risk" in names

        # Single well-formed oral row; inhalation fields deliberately blank.
        result = client.call_tool(
            "run_batch_cancer_risk",
            {
                "rows": [
                    {
                        "record_id": "r1",
                        "chemical_name": "ChemA",
                        "route": "oral",
                        "exposure_value": 0.01,
                        "exposure_unit": "mg/kg-day",
                        "body_weight_kg": 70,
                        "csf_value": 1.2,
                        "csf_unit": "(mg/kg-day)^-1",
                        "iur_value": "",
                        "air_conc_value": "",
                        "air_conc_unit": "",
                    }
                ]
            },
        )

    # The row must succeed and the server must have written its artifacts.
    assert result["summary"]["total_rows"] == 1
    assert result["summary"]["ok_rows"] == 1
    assert Path(result["artifacts"]["log_jsonl"]).exists()
    assert Path(result["artifacts"]["report_md"]).exists()
|
tests/test_nlp_pipeline.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
|
| 3 |
+
from toxra_core.nlp_pipeline import (
|
| 4 |
+
expand_regulatory_queries,
|
| 5 |
+
extract_evidence_span,
|
| 6 |
+
hybrid_rank_text_items,
|
| 7 |
+
)
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def test_expand_regulatory_queries_adds_families():
|
| 11 |
+
queries, families = expand_regulatory_queries(
|
| 12 |
+
base_queries=["genotoxicity risk"],
|
| 13 |
+
endpoint_modules=["Genotoxicity (OECD TG)"],
|
| 14 |
+
frameworks=["FDA CTP"],
|
| 15 |
+
)
|
| 16 |
+
assert len(queries) > 1
|
| 17 |
+
assert "endpoint" in families
|
| 18 |
+
assert families["endpoint"]
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def test_extract_evidence_span_hit_and_fallback():
|
| 22 |
+
text = "Sentence one. AMES test showed equivocal response. Sentence three. Sentence four."
|
| 23 |
+
hit = extract_evidence_span(text, "AMES")
|
| 24 |
+
assert "AMES" in hit["text"]
|
| 25 |
+
|
| 26 |
+
fb = extract_evidence_span("Alpha. Beta.", "nonexistenttoken")
|
| 27 |
+
assert fb["text"]
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def test_hybrid_rank_text_items_lexical_only():
|
| 31 |
+
items = [
|
| 32 |
+
{"text": "This section discusses liver toxicity and NOAEL values."},
|
| 33 |
+
{"text": "Completely unrelated formulation text."},
|
| 34 |
+
]
|
| 35 |
+
selected, diag = hybrid_rank_text_items(items, query="NOAEL liver")
|
| 36 |
+
assert selected
|
| 37 |
+
assert diag["ranking_method"] in {"lexical_only", "hybrid_rrf"}
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def test_hybrid_rank_text_items_with_embeddings():
|
| 41 |
+
items = [{"text": "A"}, {"text": "B"}, {"text": "C"}]
|
| 42 |
+
emb = np.array([[1.0, 0.0], [0.5, 0.5], [0.0, 1.0]], dtype=np.float32)
|
| 43 |
+
q = np.array([1.0, 0.0], dtype=np.float32)
|
| 44 |
+
selected, diag = hybrid_rank_text_items(items, query="A", item_embeddings=emb, query_embedding=q)
|
| 45 |
+
assert selected
|
| 46 |
+
assert diag["ranking_method"] == "hybrid_rrf"
|
tests/test_regulatory_mapper.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
from toxra_core.regulatory_mapper import map_extraction_to_framework
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def test_regulatory_mapping_outputs_matrix_and_report():
|
| 8 |
+
fixture = Path("tests/fixtures/extraction_sample.json")
|
| 9 |
+
payload = json.loads(fixture.read_text(encoding="utf-8"))
|
| 10 |
+
|
| 11 |
+
df, report, md = map_extraction_to_framework(payload, framework="FDA CTP")
|
| 12 |
+
|
| 13 |
+
assert not df.empty
|
| 14 |
+
assert "clause_id" in df.columns
|
| 15 |
+
assert report["framework"] == "FDA CTP"
|
| 16 |
+
assert "Status Summary" in md
|
tests/test_units.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mcp_tox_calc.units import (
|
| 2 |
+
normalize_air_concentration,
|
| 3 |
+
normalize_csf,
|
| 4 |
+
normalize_iur,
|
| 5 |
+
normalize_oral_exposure,
|
| 6 |
+
)
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def test_normalize_oral_exposure_ug_per_kg_day():
|
| 10 |
+
out = normalize_oral_exposure(2500, "ug/kg-day", 70)
|
| 11 |
+
assert round(out["value_mg_per_kg_day"], 6) == 2.5
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def test_normalize_oral_exposure_mg_day_with_bw():
|
| 15 |
+
out = normalize_oral_exposure(7, "mg/day", 70)
|
| 16 |
+
assert round(out["value_mg_per_kg_day"], 6) == 0.1
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def test_normalize_air_concentration_mg_to_ug():
|
| 20 |
+
out = normalize_air_concentration(0.2, "mg/m3")
|
| 21 |
+
assert round(out["value_ug_per_m3"], 6) == 200.0
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def test_normalize_csf_from_ug_basis():
|
| 25 |
+
out = normalize_csf(0.001, "(ug/kg-day)^-1")
|
| 26 |
+
assert round(out["value_per_mg_per_kg_day"], 6) == 1.0
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def test_normalize_iur_from_mg_basis():
|
| 30 |
+
out = normalize_iur(0.002, "(mg/m3)^-1")
|
| 31 |
+
assert round(out["value_per_ug_per_m3"], 9) == 0.000002
|
toxra_core/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Core modules for Toxra extraction, mapping, NLP, and calculation orchestration."""
|
toxra_core/__pycache__/__init__.cpython-314.pyc
ADDED
|
Binary file (215 Bytes). View file
|
|
|
toxra_core/__pycache__/artifacts.cpython-314.pyc
ADDED
|
Binary file (4.62 kB). View file
|
|
|
toxra_core/__pycache__/calculation_client.cpython-314.pyc
ADDED
|
Binary file (8.73 kB). View file
|
|
|
toxra_core/__pycache__/contracts.cpython-314.pyc
ADDED
|
Binary file (4.04 kB). View file
|
|
|
toxra_core/__pycache__/nlp_pipeline.cpython-314.pyc
ADDED
|
Binary file (17.9 kB). View file
|
|
|
toxra_core/__pycache__/regulatory_mapper.cpython-314.pyc
ADDED
|
Binary file (13.6 kB). View file
|
|
|
toxra_core/artifacts.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from typing import Any, Dict, Iterable, Optional
|
| 4 |
+
|
| 5 |
+
import pandas as pd
|
| 6 |
+
|
| 7 |
+
from .contracts import default_run_id
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def make_run_dir(run_id: Optional[str] = None, base_dir: str = "runs") -> Path:
    """Create (if needed) and return the run directory ``<base_dir>/<run_id>``.

    When *run_id* is falsy a fresh identifier is generated via
    ``default_run_id("run")``.
    """
    directory = Path(base_dir) / (run_id or default_run_id("run"))
    directory.mkdir(parents=True, exist_ok=True)
    return directory
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def write_json(path: Path, data: Any) -> Path:
    """Serialize *data* as indented JSON to *path* (UTF-8), creating parents.

    Returns *path* for chaining.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(data, indent=2)
    path.write_text(serialized, encoding="utf-8")
    return path
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def write_markdown(path: Path, text: str) -> Path:
    """Write *text* to *path* (UTF-8), substituting "" for falsy input.

    Parent directories are created as needed; returns *path* for chaining.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    content = text if text else ""
    path.write_text(content, encoding="utf-8")
    return path
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def append_jsonl(path: Path, row: Dict[str, Any]) -> Path:
    """Append *row* as one ASCII-safe JSON line to *path*, creating parents.

    Returns *path* for chaining.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    line = json.dumps(row, ensure_ascii=True)
    with path.open("a", encoding="utf-8") as handle:
        handle.write(line + "\n")
    return path
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def write_jsonl(path: Path, rows: Iterable[Dict[str, Any]]) -> Path:
    """Write *rows* to *path* as JSON Lines (one ASCII-safe object per line).

    Overwrites any existing file; parent directories are created as needed.
    Returns *path* for chaining.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        handle.writelines(json.dumps(entry, ensure_ascii=True) + "\n" for entry in rows)
    return path
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def write_dataframe_csv(path: Path, df: pd.DataFrame) -> Path:
    """Persist *df* as an index-free CSV at *path*, creating parent dirs.

    Returns *path* for chaining.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(path, index=False)
    return path
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def load_json(path: Path) -> Any:
    """Parse and return the JSON document stored at *path* (UTF-8)."""
    raw = path.read_text(encoding="utf-8")
    return json.loads(raw)
|
toxra_core/calculation_client.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import subprocess
|
| 3 |
+
import sys
|
| 4 |
+
from typing import Any, Dict, List, Optional
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class MCPClientError(RuntimeError):
    """Raised when the MCP calculation server cannot be started, dies before
    responding, or returns a JSON-RPC error."""
    pass
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class MCPCalculationClient:
|
| 12 |
+
def __init__(
|
| 13 |
+
self,
|
| 14 |
+
run_dir: str,
|
| 15 |
+
python_executable: Optional[str] = None,
|
| 16 |
+
server_script: Optional[str] = None,
|
| 17 |
+
):
|
| 18 |
+
self.run_dir = str(run_dir)
|
| 19 |
+
self.python_executable = python_executable or sys.executable
|
| 20 |
+
if server_script:
|
| 21 |
+
self.server_script = str(server_script)
|
| 22 |
+
else:
|
| 23 |
+
self.server_script = None
|
| 24 |
+
self._proc: Optional[subprocess.Popen] = None
|
| 25 |
+
self._id = 0
|
| 26 |
+
|
| 27 |
+
def start(self) -> None:
|
| 28 |
+
if self._proc is not None:
|
| 29 |
+
return
|
| 30 |
+
|
| 31 |
+
if self.server_script:
|
| 32 |
+
cmd = [
|
| 33 |
+
self.python_executable,
|
| 34 |
+
self.server_script,
|
| 35 |
+
"--stdio",
|
| 36 |
+
"--run-dir",
|
| 37 |
+
self.run_dir,
|
| 38 |
+
]
|
| 39 |
+
else:
|
| 40 |
+
cmd = [
|
| 41 |
+
self.python_executable,
|
| 42 |
+
"-m",
|
| 43 |
+
"mcp_tox_calc.server",
|
| 44 |
+
"--stdio",
|
| 45 |
+
"--run-dir",
|
| 46 |
+
self.run_dir,
|
| 47 |
+
]
|
| 48 |
+
|
| 49 |
+
self._proc = subprocess.Popen(
|
| 50 |
+
cmd,
|
| 51 |
+
stdin=subprocess.PIPE,
|
| 52 |
+
stdout=subprocess.PIPE,
|
| 53 |
+
stderr=subprocess.PIPE,
|
| 54 |
+
text=True,
|
| 55 |
+
bufsize=1,
|
| 56 |
+
)
|
| 57 |
+
|
| 58 |
+
self._request("initialize", {"protocolVersion": "2024-11-05", "clientInfo": {"name": "toxra-app", "version": "0.1.0"}})
|
| 59 |
+
|
| 60 |
+
def stop(self) -> None:
|
| 61 |
+
if self._proc is None:
|
| 62 |
+
return
|
| 63 |
+
try:
|
| 64 |
+
if self._proc.stdin:
|
| 65 |
+
self._proc.stdin.close()
|
| 66 |
+
self._proc.terminate()
|
| 67 |
+
self._proc.wait(timeout=3)
|
| 68 |
+
except Exception:
|
| 69 |
+
try:
|
| 70 |
+
self._proc.kill()
|
| 71 |
+
except Exception:
|
| 72 |
+
pass
|
| 73 |
+
finally:
|
| 74 |
+
self._proc = None
|
| 75 |
+
|
| 76 |
+
def __enter__(self):
|
| 77 |
+
self.start()
|
| 78 |
+
return self
|
| 79 |
+
|
| 80 |
+
def __exit__(self, exc_type, exc, tb):
|
| 81 |
+
self.stop()
|
| 82 |
+
|
| 83 |
+
def _request(self, method: str, params: Dict[str, Any]) -> Dict[str, Any]:
|
| 84 |
+
if self._proc is None:
|
| 85 |
+
raise MCPClientError("MCP server not started.")
|
| 86 |
+
if self._proc.stdin is None or self._proc.stdout is None:
|
| 87 |
+
raise MCPClientError("MCP server pipes unavailable.")
|
| 88 |
+
|
| 89 |
+
self._id += 1
|
| 90 |
+
req_id = self._id
|
| 91 |
+
request = {
|
| 92 |
+
"jsonrpc": "2.0",
|
| 93 |
+
"id": req_id,
|
| 94 |
+
"method": method,
|
| 95 |
+
"params": params,
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
self._proc.stdin.write(json.dumps(request) + "\n")
|
| 99 |
+
self._proc.stdin.flush()
|
| 100 |
+
|
| 101 |
+
while True:
|
| 102 |
+
line = self._proc.stdout.readline()
|
| 103 |
+
if line == "":
|
| 104 |
+
err = ""
|
| 105 |
+
if self._proc.stderr is not None:
|
| 106 |
+
try:
|
| 107 |
+
err = self._proc.stderr.read()[-1500:]
|
| 108 |
+
except Exception:
|
| 109 |
+
err = ""
|
| 110 |
+
raise MCPClientError(f"No response from MCP server. stderr={err}")
|
| 111 |
+
|
| 112 |
+
line = line.strip()
|
| 113 |
+
if not line:
|
| 114 |
+
continue
|
| 115 |
+
|
| 116 |
+
try:
|
| 117 |
+
resp = json.loads(line)
|
| 118 |
+
except Exception:
|
| 119 |
+
continue
|
| 120 |
+
|
| 121 |
+
if resp.get("id") != req_id:
|
| 122 |
+
continue
|
| 123 |
+
|
| 124 |
+
if "error" in resp:
|
| 125 |
+
raise MCPClientError(str(resp["error"]))
|
| 126 |
+
|
| 127 |
+
result = resp.get("result", {})
|
| 128 |
+
if not isinstance(result, dict):
|
| 129 |
+
return {"result": result}
|
| 130 |
+
return result
|
| 131 |
+
|
| 132 |
+
def list_tools(self) -> List[Dict[str, Any]]:
    """Return the server's advertised tool descriptors (empty list on odd payloads)."""
    payload = self._request("tools/list", {})
    listed = payload.get("tools", [])
    if isinstance(listed, list):
        return listed
    return []
|
| 136 |
+
|
| 137 |
+
def call_tool(self, name: str, arguments: Dict[str, Any]) -> Dict[str, Any]:
    """Invoke tool *name* and unwrap a leading ``{"type": "json"}`` content item.

    Falls back to returning the raw result dict (or a ``{"value": ...}``
    wrapper) when no JSON content block is present.
    """
    outcome = self._request("tools/call", {"name": name, "arguments": arguments})
    blocks = outcome.get("content", []) if isinstance(outcome, dict) else []
    if isinstance(blocks, list) and blocks:
        head = blocks[0]
        if isinstance(head, dict) and head.get("type") == "json":
            payload = head.get("json", {})
            if isinstance(payload, dict):
                return payload
            return {"value": payload}
    if isinstance(outcome, dict):
        return outcome
    return {"value": outcome}
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def run_batch_cancer_risk(rows: List[Dict[str, Any]], run_dir: str) -> Dict[str, Any]:
    """Convenience wrapper: start a short-lived MCP client and run one batch."""
    client = MCPCalculationClient(run_dir=run_dir)
    with client:
        return client.call_tool("run_batch_cancer_risk", {"rows": rows})
|
toxra_core/contracts.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import datetime as _dt
|
| 2 |
+
import uuid
|
| 3 |
+
from dataclasses import dataclass, field
|
| 4 |
+
from typing import Any, Dict, List, Optional
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
# Canonical column order for the batch cancer-risk input template:
# record identity (record_id, chemical_name, casrn), exposure route and
# magnitude, toxicity factors (csf_* = cancer slope factor, iur_value =
# inhalation unit risk with its air concentration), and provenance.
CANCER_RISK_TEMPLATE_COLUMNS: List[str] = [
    "record_id",
    "chemical_name",
    "casrn",
    "route",
    "exposure_value",
    "exposure_unit",
    "body_weight_kg",
    "csf_value",
    "csf_unit",
    "iur_value",
    "air_conc_value",
    "air_conc_unit",
    "source_reference",
]
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def default_run_id(prefix: str = "run") -> str:
    """Return a unique run identifier: ``<prefix>_<UTC timestamp>_<8 hex chars>``.

    Uses timezone-aware UTC; ``datetime.utcnow`` is deprecated since
    Python 3.12. The formatted timestamp is byte-identical to the previous
    naive-UTC output.
    """
    ts = _dt.datetime.now(_dt.timezone.utc).strftime("%Y%m%d_%H%M%S")
    return f"{prefix}_{ts}_{uuid.uuid4().hex[:8]}"
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
@dataclass
class RegulatoryClause:
    """One requirement clause from a framework catalog JSON file.

    Parsed by ``regulatory_mapper.load_framework_catalog`` and evaluated
    against a paper's extracted fields and evidence quotes.
    """

    clause_id: str  # unique id within the framework catalog
    framework: str  # e.g. "FDA CTP" or "EPA"
    title: str
    description: str
    required_fields: List[str] = field(default_factory=list)  # extraction fields that must be populated
    required_evidence_terms: List[str] = field(default_factory=list)  # terms searched in evidence quotes
    acceptance_rule: str = "all_required_fields"  # or "any_required_fields"
    applicability: Dict[str, Any] = field(default_factory=dict)  # optional {"field": ..., "equals": ...} gate
    source_reference: str = ""  # citation for the clause text
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
@dataclass
class ClauseEvaluation:
    """Outcome of evaluating one RegulatoryClause against one paper's data."""

    clause_id: str
    framework: str
    status: str  # "covered" | "partial" | "missing" | "not_applicable"
    fields_present: List[str] = field(default_factory=list)
    missing_fields: List[str] = field(default_factory=list)
    evidence_hits: List[str] = field(default_factory=list)  # required terms found in evidence
    prompt: str = ""  # follow-up prompt when status is missing/partial
    reason: str = ""  # explanation, e.g. why not_applicable
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# Excess-lifetime-cancer-risk cut points used by classify_risk_tier:
# below "low" -> "de_minimis"; up to "moderate" -> "monitor"; above -> "high_priority".
RISK_TIER_THRESHOLDS = {
    "low": 1e-6,
    "moderate": 1e-4,
}
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def classify_risk_tier(value: Optional[float]) -> str:
    """Map an excess-lifetime-cancer-risk value to a screening tier.

    Returns:
        "unknown"       for None or NaN (NaN previously compared False against
                        both thresholds and was mislabelled "high_priority"),
        "de_minimis"    for value < RISK_TIER_THRESHOLDS["low"] (1e-6),
        "monitor"       for values up to RISK_TIER_THRESHOLDS["moderate"] (1e-4),
        "high_priority" otherwise.
    """
    if value is None:
        return "unknown"
    if value != value:  # NaN compares unequal to itself
        return "unknown"
    if value < RISK_TIER_THRESHOLDS["low"]:
        return "de_minimis"
    if value <= RISK_TIER_THRESHOLDS["moderate"]:
        return "monitor"
    return "high_priority"
|
toxra_core/nlp_pipeline.py
ADDED
|
@@ -0,0 +1,284 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 3 |
+
|
| 4 |
+
import numpy as np
|
| 5 |
+
try:
|
| 6 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 7 |
+
except Exception: # pragma: no cover - fallback path for minimal runtime
|
| 8 |
+
TfidfVectorizer = None
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# Per-endpoint seed terms used to expand retrieval queries (consumed by
# build_query_families). Keys are endpoint-module display names.
ENDPOINT_QUERY_HINTS: Dict[str, List[str]] = {
    "Genotoxicity (OECD TG)": [
        "genotoxicity",
        "mutagenicity",
        "AMES",
        "micronucleus",
        "comet assay",
        "chromosomal aberration",
        "OECD TG 471 473 476 487 490 474 489",
    ],
    "NAMs / In Silico": [
        "in silico",
        "QSAR",
        "read-across",
        "AOP",
        "PBPK",
        "high-throughput",
        "omics",
        "organ-on-chip",
        "microphysiological",
    ],
    "Acute toxicity": ["acute toxicity", "LD50", "LC50", "single dose", "lethality", "mortality"],
    "Repeated dose toxicity": [
        "repeated dose",
        "subchronic",
        "chronic",
        "NOAEL",
        "LOAEL",
        "target organ",
        "90-day",
        "28-day",
    ],
    "Irritation / Sensitization": ["skin irritation", "eye irritation", "sensitization", "LLNA", "Draize"],
    "Repro / Developmental": ["reproductive toxicity", "fertility", "developmental toxicity", "teratogenic", "prenatal", "postnatal"],
    "Carcinogenicity": ["carcinogenicity", "tumor", "neoplasm", "cancer", "two-year bioassay"],
}

# Framework-specific seed terms, keyed by the same framework display names
# used elsewhere in the project (e.g. "FDA CTP", "EPA").
FRAMEWORK_QUERY_HINTS: Dict[str, List[str]] = {
    "FDA CTP": [
        "genotoxicity hazard identification",
        "carcinogenicity tiering",
        "excess lifetime cancer risk",
        "constituent comparison",
        "weight of evidence",
    ],
    "EPA": [
        "cancer slope factor",
        "inhalation unit risk",
        "lifetime excess cancer risk",
        "mode of action",
        "weight of evidence descriptors",
    ],
}

# Terms describing quantitative inputs needed by the risk equations.
EQUATION_INPUT_HINTS: List[str] = [
    "exposure concentration",
    "daily intake",
    "mg/kg-day",
    "ug/m3",
    "cancer slope factor",
    "inhalation unit risk",
    "body weight",
]
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def clean_text(t: str) -> str:
    """Strip NUL bytes and collapse every run of whitespace into one space."""
    safe = (t or "").replace("\x00", " ")
    collapsed = re.sub(r"\s+", " ", safe)
    return collapsed.strip()
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def split_sentences(text: str) -> List[str]:
    """Split cleaned *text* after sentence-ending punctuation; [] for blank input."""
    normalized = clean_text(text)
    if not normalized:
        return []
    pieces = re.split(r"(?<=[\.!\?])\s+", normalized)
    return [piece.strip() for piece in pieces if piece.strip()]
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def _tokenize(s: str) -> List[str]:
|
| 89 |
+
return [w for w in re.findall(r"[a-zA-Z0-9\-]+", (s or "").lower()) if len(w) >= 3]
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def extract_evidence_span(page_text: str, query: str, page: Optional[int] = None, n_sentences: int = 5) -> Dict[str, Any]:
    """Return a small sentence window around the first query-term hit.

    Modes: "empty" (no sentences at all), "fallback" (no token of *query*
    matched; the first n_sentences are returned), "hit" (window from two
    sentences before through two after the first matching sentence).
    """
    sentences = split_sentences(page_text)
    if not sentences:
        return {"text": "", "page": page, "start_sentence": 0, "mode": "empty"}

    terms = _tokenize(query)
    match_idx = next(
        (i for i, s in enumerate(sentences) if any(t in s.lower() for t in terms)),
        None,
    )

    if match_idx is None:
        return {
            "text": " ".join(sentences[:n_sentences]),
            "page": page,
            "start_sentence": 0,
            "mode": "fallback",
        }

    lo = max(0, match_idx - 2)
    hi = min(len(sentences), match_idx + 3)
    return {
        "text": " ".join(sentences[lo:hi]),
        "page": page,
        "start_sentence": lo,
        "mode": "hit",
    }
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def build_query_families(
    base_queries: List[str],
    endpoint_modules: Optional[List[str]] = None,
    frameworks: Optional[List[str]] = None,
) -> Dict[str, List[str]]:
    """Group query terms into named families.

    Families: "base" (caller queries, blanks removed), "endpoint" (terms for
    each requested endpoint module), "framework" (terms for each requested
    framework), "equation_inputs" (fixed EQUATION_INPUT_HINTS list).
    """
    endpoint_terms: List[str] = []
    for module in (endpoint_modules or []):
        endpoint_terms.extend(ENDPOINT_QUERY_HINTS.get(module, []))

    framework_terms: List[str] = []
    for fw in (frameworks or []):
        framework_terms.extend(FRAMEWORK_QUERY_HINTS.get(fw, []))

    return {
        "base": [q for q in base_queries if (q or "").strip()],
        "endpoint": endpoint_terms,
        "framework": framework_terms,
        "equation_inputs": EQUATION_INPUT_HINTS,
    }
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def expand_regulatory_queries(
    base_queries: List[str],
    endpoint_modules: Optional[List[str]] = None,
    frameworks: Optional[List[str]] = None,
    extra_terms: Optional[List[str]] = None,
) -> Tuple[List[str], Dict[str, List[str]]]:
    """Flatten all query families plus extras into a case-insensitively deduped list.

    Returns (deduped_queries, families). First occurrence wins and its
    original casing is kept; blank entries are dropped.
    """
    families = build_query_families(base_queries, endpoint_modules, frameworks)

    candidates: List[str] = []
    for family_terms in families.values():
        candidates.extend(family_terms)
    candidates.extend(extra_terms or [])

    seen: set = set()
    unique: List[str] = []
    for raw in candidates:
        term = (raw or "").strip()
        if not term:
            continue
        key = term.lower()
        if key not in seen:
            seen.add(key)
            unique.append(term)

    return unique, families
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def _lexical_ranks(texts: List[str], query: str) -> Tuple[List[int], np.ndarray]:
    """Rank *texts* against *query* lexically.

    With scikit-learn available: TF-IDF (1-2 grams, English stop words)
    dot-product similarity. Fallback: count of distinct query tokens found in
    each text. Returns (indices sorted by descending score, raw scores).
    """
    if not texts:
        return [], np.array([], dtype=np.float32)

    if TfidfVectorizer is None:
        # Minimal-runtime fallback: token-overlap counting.
        query_tokens = set(_tokenize(query))
        counts = [
            float(sum(1 for tok in query_tokens if tok in text.lower()))
            for text in texts
        ]
        scores = np.array(counts, dtype=np.float32)
        return list(np.argsort(scores)[::-1]), scores

    vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), max_features=25000)
    doc_matrix = vectorizer.fit_transform(texts)
    query_vec = vectorizer.transform([query])
    scores = (doc_matrix @ query_vec.T).toarray().ravel().astype(np.float32)
    return list(np.argsort(scores)[::-1]), scores
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def _embedding_ranks(item_embeddings: np.ndarray, query_embedding: np.ndarray) -> Tuple[List[int], np.ndarray]:
|
| 188 |
+
if item_embeddings.size == 0:
|
| 189 |
+
return [], np.array([], dtype=np.float32)
|
| 190 |
+
q = np.asarray(query_embedding, dtype=np.float32)
|
| 191 |
+
qn = np.linalg.norm(q) + 1e-12
|
| 192 |
+
q = q / qn
|
| 193 |
+
mat = np.asarray(item_embeddings, dtype=np.float32)
|
| 194 |
+
norms = np.linalg.norm(mat, axis=1, keepdims=True) + 1e-12
|
| 195 |
+
mat = mat / norms
|
| 196 |
+
sims = (mat @ q).astype(np.float32)
|
| 197 |
+
order = list(np.argsort(sims)[::-1])
|
| 198 |
+
return order, sims
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
def _rrf_score(rank_lists: List[List[int]], k: int = 60) -> Dict[int, float]:
|
| 202 |
+
out: Dict[int, float] = {}
|
| 203 |
+
for rank_list in rank_lists:
|
| 204 |
+
for rank_pos, idx in enumerate(rank_list):
|
| 205 |
+
out[idx] = out.get(idx, 0.0) + (1.0 / (k + rank_pos + 1.0))
|
| 206 |
+
return out
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
def _family_coverage_score(texts: List[str], families: Dict[str, List[str]]) -> Dict[str, float]:
    """Per family, the fraction of its queries with any token in the merged texts.

    A query "hits" when at least one of its tokens appears as a substring of
    the concatenated, lowercased texts; scores are rounded to 4 places and an
    empty family scores 0.0.
    """
    corpus = " ".join(clean_text(t).lower() for t in texts)
    coverage: Dict[str, float] = {}
    for name, queries in families.items():
        if not queries:
            coverage[name] = 0.0
            continue
        matched = sum(
            1 for q in queries if any(tok in corpus for tok in _tokenize(q))
        )
        coverage[name] = round(matched / max(1, len(queries)), 4)
    return coverage
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
def hybrid_rank_text_items(
    items: List[Dict[str, Any]],
    query: str,
    families: Optional[Dict[str, List[str]]] = None,
    top_k: int = 12,
    item_embeddings: Optional[np.ndarray] = None,
    query_embedding: Optional[np.ndarray] = None,
) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
    """Rank text items against *query*, fusing lexical and embedding signals.

    Lexical ranks come from _lexical_ranks; when both embedding arguments are
    supplied (and ranking does not raise), embedding ranks are fused with them
    via reciprocal-rank fusion (_rrf_score). Returns (top_k selected items as
    shallow copies annotated with ``_nlp_*`` score keys, diagnostics dict with
    the ranking method, selected indices, per-family coverage and per-item
    component scores).
    """
    if not items:
        return [], {
            "ranking_method": "empty",
            "selected_indices": [],
            "coverage_by_query_family": families or {},
            "coverage_score": 0.0,
            "component_scores": {},
        }

    texts = [clean_text(i.get("text", "")) for i in items]

    lex_order, lex_scores = _lexical_ranks(texts, query)
    rank_lists = [lex_order]
    method = "lexical_only"

    emb_scores = None
    if item_embeddings is not None and query_embedding is not None:
        try:
            emb_order, emb_scores = _embedding_ranks(item_embeddings, query_embedding)
            rank_lists.append(emb_order)
            method = "hybrid_rrf"
        except Exception:
            # Embedding failure degrades gracefully to lexical-only ranking.
            emb_scores = None

    rrf = _rrf_score(rank_lists)
    final_order = sorted(rrf.keys(), key=lambda idx: rrf[idx], reverse=True)
    selected_indices = final_order[: max(1, int(top_k))]

    selected: List[Dict[str, Any]] = []
    for idx in selected_indices:
        # Shallow copy so score annotations don't mutate caller-owned dicts.
        row = dict(items[idx])
        row["_nlp_rrf_score"] = float(rrf.get(idx, 0.0))
        row["_nlp_lex_score"] = float(lex_scores[idx]) if len(lex_scores) > idx else 0.0
        if emb_scores is not None and len(emb_scores) > idx:
            row["_nlp_emb_score"] = float(emb_scores[idx])
        selected.append(row)

    # Coverage is measured over the selected snippets only, not all items.
    fam = families or {"base": [query]}
    cov = _family_coverage_score([x.get("text", "") for x in selected], fam)
    cov_score = round(float(np.mean(list(cov.values()))) if cov else 0.0, 4)

    diagnostics = {
        "ranking_method": method,
        "selected_indices": selected_indices,
        "coverage_by_query_family": cov,
        "coverage_score": cov_score,
        "component_scores": {
            "lexical": [float(lex_scores[i]) for i in selected_indices if len(lex_scores) > i],
            "embedding": [float(emb_scores[i]) for i in selected_indices if emb_scores is not None and len(emb_scores) > i],
        },
    }
    return selected, diagnostics
|
toxra_core/regulatory_mapper.py
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 4 |
+
|
| 5 |
+
import pandas as pd
|
| 6 |
+
|
| 7 |
+
from .contracts import ClauseEvaluation, RegulatoryClause
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
# Maps a framework display name to its clause-catalog JSON file inside the
# catalog directory (consumed by load_framework_catalog).
FRAMEWORK_TO_FILE = {
    "FDA CTP": "fda_ctp_v2024_06.json",
    "EPA": "epa_cancer_v2005.json",
}

# Lowercased placeholder strings treated as "no data" by _is_non_empty.
EMPTY_STRINGS = {"", "not_reported", "insufficient_data", "none", "na", "n/a", "null"}
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def _is_non_empty(v: Any) -> bool:
    """True when *v* carries real content.

    None, blank strings, and EMPTY_STRINGS placeholders count as empty; a
    list is non-empty when at least one stripped element is a non-placeholder.
    """
    if v is None:
        return False
    if isinstance(v, list):
        stripped = [str(item).strip() for item in v if str(item).strip()]
        if not stripped:
            return False
        return any(item.lower() not in EMPTY_STRINGS for item in stripped)
    text = str(v).strip()
    return bool(text) and text.lower() not in EMPTY_STRINGS
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def _normalize_payload(extraction_payload: Any) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
|
| 33 |
+
if isinstance(extraction_payload, dict):
|
| 34 |
+
papers = extraction_payload.get("papers", [])
|
| 35 |
+
if isinstance(papers, list):
|
| 36 |
+
ext = extraction_payload.get("toxra_extensions", {})
|
| 37 |
+
return papers, (ext if isinstance(ext, dict) else {})
|
| 38 |
+
if isinstance(extraction_payload, list):
|
| 39 |
+
return extraction_payload, {}
|
| 40 |
+
raise ValueError("Unsupported extraction payload format. Expected list or object with papers.")
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def load_framework_catalog(framework: str, catalog_dir: str = "regulatory_catalog") -> List[RegulatoryClause]:
    """Load and parse the JSON clause catalog for *framework*.

    Raises ValueError for an unknown framework and FileNotFoundError when the
    catalog file is absent. Every clause field is defensively coerced so a
    partially filled catalog entry still parses.
    """
    file_name = FRAMEWORK_TO_FILE.get(framework)
    if not file_name:
        raise ValueError(f"Unsupported framework: {framework}")

    catalog_path = Path(catalog_dir) / file_name
    if not catalog_path.exists():
        raise FileNotFoundError(f"Catalog not found: {catalog_path}")

    raw = json.loads(catalog_path.read_text(encoding="utf-8"))
    clause_dicts = raw.get("clauses", []) if isinstance(raw, dict) else []

    return [
        RegulatoryClause(
            clause_id=str(c.get("clause_id", "")).strip(),
            framework=str(c.get("framework", framework)).strip(),
            title=str(c.get("title", "")).strip(),
            description=str(c.get("description", "")).strip(),
            required_fields=list(c.get("required_fields", []) or []),
            required_evidence_terms=list(c.get("required_evidence_terms", []) or []),
            acceptance_rule=str(c.get("acceptance_rule", "all_required_fields")).strip(),
            applicability=dict(c.get("applicability", {}) or {}),
            source_reference=str(c.get("source_reference", "")).strip(),
        )
        for c in clause_dicts
    ]
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def _clause_applicable(extracted: Dict[str, Any], clause: RegulatoryClause) -> bool:
    """Check the clause's single-field applicability gate against *extracted*.

    Empty applicability (or a blank field name) means always applicable. A
    list value matches when any element equals the target, case-insensitively.
    """
    condition = clause.applicability or {}
    if not condition:
        return True

    field_name = str(condition.get("field", "")).strip()
    if not field_name:
        return True

    target = str(condition.get("equals", None)).strip().lower()
    actual = extracted.get(field_name)
    if isinstance(actual, list):
        return target in (str(item).strip().lower() for item in actual)
    return str(actual).strip().lower() == target
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def _evaluate_clause(
    extracted: Dict[str, Any],
    evidence: List[Dict[str, Any]],
    clause: RegulatoryClause,
    override_notes: str = "",
) -> ClauseEvaluation:
    """Evaluate one clause against a paper's extracted fields and evidence quotes.

    Status ladder:
      - "not_applicable" when the clause's applicability gate fails;
      - "covered" when the field rule passes and (if terms are required) at
        least one evidence term was found;
      - "partial" when some fields or terms matched but not enough;
      - "missing" when nothing matched.
    A follow-up prompt (optionally extended with *override_notes*) is attached
    for "missing"/"partial" results.
    """
    if not _clause_applicable(extracted, clause):
        return ClauseEvaluation(
            clause_id=clause.clause_id,
            framework=clause.framework,
            status="not_applicable",
            reason="Applicability condition not met.",
        )

    # Partition required fields into present vs. missing.
    present: List[str] = []
    missing: List[str] = []
    for f in clause.required_fields:
        if _is_non_empty(extracted.get(f)):
            present.append(f)
        else:
            missing.append(f)

    # Case-insensitive substring search of required terms across all quotes.
    evidence_hits: List[str] = []
    ev_text = " ".join([str(x.get("quote", "")) for x in evidence]).lower()
    for term in clause.required_evidence_terms:
        t = str(term).strip().lower()
        if t and t in ev_text:
            evidence_hits.append(term)

    # Field rule: "any_required_fields" needs one present; the default
    # ("all_required_fields") needs none missing.
    if clause.required_fields:
        if clause.acceptance_rule == "any_required_fields":
            field_ok = len(present) > 0
        else:
            field_ok = len(missing) == 0
    else:
        field_ok = True

    evidence_ok = True
    if clause.required_evidence_terms:
        evidence_ok = len(evidence_hits) > 0

    if field_ok and evidence_ok:
        status = "covered"
    elif present or evidence_hits:
        status = "partial"
    else:
        status = "missing"

    missing_prompt = ""
    if status in {"missing", "partial"}:
        need_fields = ", ".join(missing) if missing else "additional corroborating evidence"
        missing_prompt = (
            f"Provide evidence for clause {clause.clause_id} ({clause.title}). "
            f"Missing: {need_fields}."
        )
        if override_notes.strip():
            missing_prompt += f" Notes: {override_notes.strip()}"

    return ClauseEvaluation(
        clause_id=clause.clause_id,
        framework=clause.framework,
        status=status,
        fields_present=present,
        missing_fields=missing,
        evidence_hits=evidence_hits,
        prompt=missing_prompt,
        reason="",
    )
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def _paper_record_id(paper: Dict[str, Any]) -> str:
|
| 158 |
+
file_name = str(paper.get("_file", "unknown.pdf"))
|
| 159 |
+
extracted = paper.get("extracted", {}) or {}
|
| 160 |
+
chems = extracted.get("chemicals", [])
|
| 161 |
+
chem = "-"
|
| 162 |
+
if isinstance(chems, list) and chems:
|
| 163 |
+
chem = str(chems[0]).strip() or "-"
|
| 164 |
+
return f"{file_name} | {chem} | Paper"
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def map_extraction_to_framework(
    extraction_payload: Any,
    framework: str,
    catalog_dir: str = "regulatory_catalog",
    override_notes: str = "",
) -> Tuple[pd.DataFrame, Dict[str, Any], str]:
    """Evaluate every catalog clause against every paper and summarize the gaps.

    Returns a tuple of:
      - DataFrame with one row per (paper, clause) evaluation (fixed columns),
      - report dict with status counts, follow-up prompts, and any
        pre-existing "toxra_extensions" carried over from the payload,
      - a Markdown gap-assessment document (prompt list capped at 50 entries).
    Propagates ValueError/FileNotFoundError from payload or catalog loading.
    """
    papers, existing_ext = _normalize_payload(extraction_payload)
    clauses = load_framework_catalog(framework, catalog_dir=catalog_dir)

    rows: List[Dict[str, Any]] = []
    status_counts = {"covered": 0, "partial": 0, "missing": 0, "not_applicable": 0}
    prompts: List[str] = []

    for p in papers:
        extracted = p.get("extracted", {}) or {}
        evidence = p.get("evidence", []) or []
        rec_id = _paper_record_id(p)
        file_name = str(p.get("_file", ""))
        title = str(p.get("paper_title", ""))

        # Every clause is evaluated for every paper; prompts accumulate
        # across all missing/partial evaluations.
        for clause in clauses:
            ev = _evaluate_clause(extracted, evidence, clause, override_notes=override_notes)
            status_counts[ev.status] = status_counts.get(ev.status, 0) + 1
            if ev.prompt:
                prompts.append(ev.prompt)

            rows.append(
                {
                    "framework": framework,
                    "clause_id": clause.clause_id,
                    "clause_title": clause.title,
                    "file": file_name,
                    "paper_title": title,
                    "record_id": rec_id,
                    "status": ev.status,
                    "fields_present": "; ".join(ev.fields_present),
                    "missing_fields": "; ".join(ev.missing_fields),
                    "evidence_hits": "; ".join(ev.evidence_hits),
                    "prompt": ev.prompt,
                    "source_reference": clause.source_reference,
                }
            )

    # Explicit column list keeps the frame's schema stable even with no rows.
    df = pd.DataFrame(
        rows,
        columns=[
            "framework",
            "clause_id",
            "clause_title",
            "file",
            "paper_title",
            "record_id",
            "status",
            "fields_present",
            "missing_fields",
            "evidence_hits",
            "prompt",
            "source_reference",
        ],
    )

    report = {
        "framework": framework,
        "summary": status_counts,
        "missing_prompts": prompts,
        "existing_toxra_extensions": existing_ext,
    }

    md_lines = [
        f"# {framework} Regulatory Gap Assessment",
        "",
        "## Status Summary",
        f"- Covered: {status_counts.get('covered', 0)}",
        f"- Partial: {status_counts.get('partial', 0)}",
        f"- Missing: {status_counts.get('missing', 0)}",
        f"- Not applicable: {status_counts.get('not_applicable', 0)}",
        "",
        "## Priority Data Gaps",
    ]

    if prompts:
        # Cap the rendered gap list at 50 prompts to keep the report readable.
        for p in prompts[:50]:
            md_lines.append(f"- {p}")
    else:
        md_lines.append("- No immediate gaps identified.")

    markdown = "\n".join(md_lines)
    return df, report, markdown
|