hchevva commited on
Commit
630d650
·
verified ·
1 Parent(s): 4bf9d97

Upload 43 files

Browse files
Files changed (43) hide show
  1. mcp_tox_calc/__init__.py +1 -0
  2. mcp_tox_calc/__pycache__/__init__.cpython-314.pyc +0 -0
  3. mcp_tox_calc/__pycache__/equations.cpython-314.pyc +0 -0
  4. mcp_tox_calc/__pycache__/logging.cpython-314.pyc +0 -0
  5. mcp_tox_calc/__pycache__/server.cpython-314.pyc +0 -0
  6. mcp_tox_calc/__pycache__/units.cpython-314.pyc +0 -0
  7. mcp_tox_calc/equations.py +275 -0
  8. mcp_tox_calc/logging.py +58 -0
  9. mcp_tox_calc/server.py +183 -0
  10. mcp_tox_calc/units.py +152 -0
  11. regulatory_catalog/epa_cancer_v2005.json +70 -0
  12. regulatory_catalog/fda_ctp_v2024_06.json +70 -0
  13. scripts/__pycache__/replay_calc_log.cpython-314.pyc +0 -0
  14. scripts/__pycache__/run_cancer_risk_batch.cpython-314.pyc +0 -0
  15. scripts/__pycache__/run_mcp_calc_server.cpython-314.pyc +0 -0
  16. scripts/replay_calc_log.py +37 -0
  17. scripts/run_cancer_risk_batch.py +57 -0
  18. scripts/run_mcp_calc_server.py +13 -0
  19. tests/__pycache__/conftest.cpython-314.pyc +0 -0
  20. tests/__pycache__/test_equations.cpython-314.pyc +0 -0
  21. tests/__pycache__/test_mcp_tools.cpython-314.pyc +0 -0
  22. tests/__pycache__/test_nlp_pipeline.cpython-314.pyc +0 -0
  23. tests/__pycache__/test_regulatory_mapper.cpython-314.pyc +0 -0
  24. tests/__pycache__/test_units.cpython-314.pyc +0 -0
  25. tests/conftest.py +6 -0
  26. tests/fixtures/extraction_sample.json +41 -0
  27. tests/test_equations.py +86 -0
  28. tests/test_mcp_tools.py +38 -0
  29. tests/test_nlp_pipeline.py +46 -0
  30. tests/test_regulatory_mapper.py +16 -0
  31. tests/test_units.py +31 -0
  32. toxra_core/__init__.py +1 -0
  33. toxra_core/__pycache__/__init__.cpython-314.pyc +0 -0
  34. toxra_core/__pycache__/artifacts.cpython-314.pyc +0 -0
  35. toxra_core/__pycache__/calculation_client.cpython-314.pyc +0 -0
  36. toxra_core/__pycache__/contracts.cpython-314.pyc +0 -0
  37. toxra_core/__pycache__/nlp_pipeline.cpython-314.pyc +0 -0
  38. toxra_core/__pycache__/regulatory_mapper.cpython-314.pyc +0 -0
  39. toxra_core/artifacts.py +51 -0
  40. toxra_core/calculation_client.py +150 -0
  41. toxra_core/contracts.py +67 -0
  42. toxra_core/nlp_pipeline.py +284 -0
  43. toxra_core/regulatory_mapper.py +254 -0
mcp_tox_calc/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Deterministic toxicology calculation engine exposed through a local MCP server."""
mcp_tox_calc/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (217 Bytes). View file
 
mcp_tox_calc/__pycache__/equations.cpython-314.pyc ADDED
Binary file (13.3 kB). View file
 
mcp_tox_calc/__pycache__/logging.cpython-314.pyc ADDED
Binary file (4.89 kB). View file
 
mcp_tox_calc/__pycache__/server.cpython-314.pyc ADDED
Binary file (8.62 kB). View file
 
mcp_tox_calc/__pycache__/units.cpython-314.pyc ADDED
Binary file (6.81 kB). View file
 
mcp_tox_calc/equations.py ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Dict, List, Optional
2
+
3
+ from toxra_core.contracts import classify_risk_tier
4
+
5
+ from .units import (
6
+ UnitError,
7
+ normalize_air_concentration,
8
+ normalize_csf,
9
+ normalize_iur,
10
+ normalize_oral_exposure,
11
+ normalize_route,
12
+ )
13
+
14
+ FORMULA_VERSION = "1.0.0"
15
+
16
+
17
class CalculationError(ValueError):
    """Raised when a deterministic risk calculation cannot be completed (bad inputs/units)."""

    pass
19
+
20
+
21
def _base_result(formula_id: str) -> Dict[str, Any]:
    """Return a fresh result envelope shared by every calculation tool."""
    envelope: Dict[str, Any] = dict(
        formula_id=formula_id,
        formula_version=FORMULA_VERSION,
        inputs_normalized={},
        unit_conversions=[],
        result_value=None,
        risk_tier="unknown",
        warnings=[],
        log_ref="",
    )
    return envelope
32
+
33
+
34
def validate_risk_input(payload: Dict[str, Any]) -> Dict[str, Any]:
    """Check that a row payload carries enough data for a deterministic ELCR run.

    Returns the standard result envelope extended with `valid` (bool) and
    `errors` (list of messages); result_value is 1.0 when valid, else 0.0.
    """
    out = _base_result("validate_risk_input")
    problems: List[str] = []
    notes: List[str] = []

    try:
        route = normalize_route(payload.get("route"))
    except Exception as exc:  # report normalization failure instead of raising
        route = ""
        problems.append(str(exc))

    def _present(key: str) -> bool:
        return payload.get(key) not in (None, "")

    has_csf = _present("csf_value") and _present("exposure_value")
    has_iur = _present("iur_value") and _present("air_conc_value")

    if route == "oral" and not has_csf:
        problems.append("Oral route requires csf_value and exposure_value for CSF pathway.")

    if route == "inhalation" and not (has_iur or has_csf):
        problems.append("Inhalation route requires iur_value+air_conc_value or csf_value+exposure_value.")

    if has_csf and not _present("csf_unit"):
        notes.append("csf_unit missing; assuming standard per (mg/kg-day).")

    if has_iur and not _present("air_conc_unit"):
        notes.append("air_conc_unit missing; assuming ug/m3.")

    out["warnings"] = notes
    out["valid"] = len(problems) == 0
    out["errors"] = problems
    out["result_value"] = 1.0 if out["valid"] else 0.0
    out["risk_tier"] = "unknown"
    return out
66
+
67
+
68
def calculate_epa_elcr_csf(payload: Dict[str, Any]) -> Dict[str, Any]:
    """EPA oral pathway: ELCR = CDI (mg/kg-day) * CSF ((mg/kg-day)^-1).

    Raises CalculationError when route, exposure, or CSF inputs cannot be
    normalized to canonical units.
    """
    out = _base_result("calculate_epa_elcr_csf")
    try:
        normalize_route(payload.get("route"))
        exposure = normalize_oral_exposure(
            payload.get("exposure_value"),
            payload.get("exposure_unit"),
            payload.get("body_weight_kg"),
        )
        slope = normalize_csf(payload.get("csf_value"), payload.get("csf_unit"))

        cdi = exposure["value_mg_per_kg_day"]
        potency = slope["value_per_mg_per_kg_day"]
        out["inputs_normalized"] = {
            "cdi_mg_per_kg_day": cdi,
            "csf_per_mg_per_kg_day": potency,
        }
        out["unit_conversions"] = exposure["conversions"] + slope["conversions"]
        out["result_value"] = float(cdi * potency)
        out["risk_tier"] = classify_risk_tier(out["result_value"])
        return out
    except (UnitError, ValueError) as exc:
        raise CalculationError(str(exc)) from exc
90
+
91
+
92
def calculate_epa_elcr_iur(payload: Dict[str, Any]) -> Dict[str, Any]:
    """EPA inhalation pathway: ELCR = air concentration (ug/m3) * IUR ((ug/m3)^-1).

    A non-inhalation route is allowed but produces a warning. Raises
    CalculationError when inputs cannot be normalized.
    """
    out = _base_result("calculate_epa_elcr_iur")
    try:
        if normalize_route(payload.get("route")) != "inhalation":
            out["warnings"].append("IUR calculation is generally applicable to inhalation route.")

        conc = normalize_air_concentration(payload.get("air_conc_value"), payload.get("air_conc_unit"))
        unit_risk = normalize_iur(payload.get("iur_value"), payload.get("iur_unit"))

        out["inputs_normalized"] = {
            "air_conc_ug_per_m3": conc["value_ug_per_m3"],
            "iur_per_ug_per_m3": unit_risk["value_per_ug_per_m3"],
        }
        out["unit_conversions"] = conc["conversions"] + unit_risk["conversions"]
        out["result_value"] = float(conc["value_ug_per_m3"] * unit_risk["value_per_ug_per_m3"])
        out["risk_tier"] = classify_risk_tier(out["result_value"])
        return out
    except (UnitError, ValueError) as exc:
        raise CalculationError(str(exc)) from exc
113
+
114
+
115
def calculate_fda_ctp_elcr(payload: Dict[str, Any]) -> Dict[str, Any]:
    """FDA CTP-style profile ELCR: sum of per-constituent EPA pathway results.

    Accepts either a single-row payload or a multi-constituent list under
    "constituents". Each constituent contributes a CSF-pathway term, an
    IUR-pathway term, or both, depending on which inputs are present.
    """
    out = _base_result("calculate_fda_ctp_elcr")

    constituents = payload.get("constituents")
    if isinstance(constituents, list) and constituents:
        rows: List[Dict[str, Any]] = [item for item in constituents if isinstance(item, dict)]
    else:
        rows = [payload]

    components: List[Dict[str, Any]] = []
    grand_total = 0.0

    for row in rows:
        entry: Dict[str, Any] = {
            "chemical_name": row.get("chemical_name", ""),
            "route": row.get("route", ""),
            "csf_result": None,
            "iur_result": None,
            "component_total": 0.0,
        }

        subtotal = 0.0
        # CSF pathway only when both potency and exposure are supplied.
        if row.get("csf_value") not in (None, "") and row.get("exposure_value") not in (None, ""):
            entry["csf_result"] = calculate_epa_elcr_csf(row)
            subtotal += float(entry["csf_result"]["result_value"] or 0.0)

        # IUR pathway only when both unit risk and air concentration are supplied.
        if row.get("iur_value") not in (None, "") and row.get("air_conc_value") not in (None, ""):
            entry["iur_result"] = calculate_epa_elcr_iur(row)
            subtotal += float(entry["iur_result"]["result_value"] or 0.0)

        entry["component_total"] = subtotal
        grand_total += subtotal
        components.append(entry)

    out["inputs_normalized"] = {"component_count": len(components)}
    out["unit_conversions"] = []
    out["result_value"] = float(grand_total)
    out["risk_tier"] = classify_risk_tier(out["result_value"])
    out["component_results"] = components
    return out
156
+
157
+
158
def get_formula_catalog() -> Dict[str, Any]:
    """Describe the deterministic formulas this engine exposes, with version info."""
    catalog_entries = [
        {
            "id": "calculate_epa_elcr_csf",
            "equation": "ELCR = CDI (mg/kg-day) * CSF ((mg/kg-day)^-1)",
            "notes": "Oral pathway using cancer slope factor.",
        },
        {
            "id": "calculate_epa_elcr_iur",
            "equation": "ELCR = Air Concentration (ug/m3) * IUR ((ug/m3)^-1)",
            "notes": "Inhalation pathway using inhalation unit risk.",
        },
        {
            "id": "calculate_fda_ctp_elcr",
            "equation": "ELCR_total = sum(component ELCR)",
            "notes": "Constituent-level aggregation wrapper for CTP-style profile assessment.",
        },
    ]
    # Same envelope shape as the calculation tools, with the catalog attached.
    return {
        "formula_id": "get_formula_catalog",
        "formula_version": FORMULA_VERSION,
        "inputs_normalized": {},
        "unit_conversions": [],
        "result_value": None,
        "risk_tier": "unknown",
        "warnings": [],
        "log_ref": "",
        "formulas": catalog_entries,
    }
186
+
187
+
188
def run_batch_cancer_risk(payload: Dict[str, Any]) -> Dict[str, Any]:
    """Run validation plus all ELCR calculations over payload["rows"].

    Each row is independently validated and computed; failures are captured
    as error rows rather than aborting the batch. Returns the standard
    result envelope with `rows` (per-row outcomes) and `summary` (counts);
    result_value is the number of successful rows.

    Raises:
        CalculationError: if payload["rows"] is present but not a list.
    """
    out = _base_result("run_batch_cancer_risk")
    # Non-dict payloads degrade to an empty batch rather than raising.
    rows = payload.get("rows", []) if isinstance(payload, dict) else []
    if not isinstance(rows, list):
        raise CalculationError("rows must be a list of objects")

    row_results: List[Dict[str, Any]] = []
    n_ok = 0
    n_err = 0

    for i, row in enumerate(rows):
        # Reject non-object rows up front with a row-level error.
        if not isinstance(row, dict):
            row_results.append(
                {
                    "row_index": i,
                    "status": "error",
                    "error": "row must be an object",
                }
            )
            n_err += 1
            continue

        # Validation failures become error rows carrying the validator output.
        v = validate_risk_input(row)
        if not v.get("valid", False):
            row_results.append(
                {
                    "row_index": i,
                    "record_id": row.get("record_id", ""),
                    "chemical_name": row.get("chemical_name", ""),
                    "status": "error",
                    "errors": v.get("errors", []),
                    "warnings": v.get("warnings", []),
                }
            )
            n_err += 1
            continue

        try:
            # Individual pathway results are reported alongside the combined
            # FDA CTP total, when their inputs are present.
            csf_res: Optional[Dict[str, Any]] = None
            iur_res: Optional[Dict[str, Any]] = None
            if row.get("csf_value") not in (None, "") and row.get("exposure_value") not in (None, ""):
                csf_res = calculate_epa_elcr_csf(row)
            if row.get("iur_value") not in (None, "") and row.get("air_conc_value") not in (None, ""):
                iur_res = calculate_epa_elcr_iur(row)

            fda_res = calculate_fda_ctp_elcr(row)

            row_out = {
                "row_index": i,
                "record_id": row.get("record_id", ""),
                "chemical_name": row.get("chemical_name", ""),
                "casrn": row.get("casrn", ""),
                "route": row.get("route", ""),
                "status": "ok",
                # Missing pathway results serialize as "" rather than None.
                "epa_elcr_csf": (csf_res or {}).get("result_value", ""),
                "epa_elcr_iur": (iur_res or {}).get("result_value", ""),
                "fda_ctp_elcr": fda_res.get("result_value", ""),
                "risk_tier": fda_res.get("risk_tier", "unknown"),
                "formula_id": fda_res.get("formula_id", "calculate_fda_ctp_elcr"),
                "formula_version": fda_res.get("formula_version", FORMULA_VERSION),
                "inputs_normalized": fda_res.get("inputs_normalized", {}),
                "unit_conversions": fda_res.get("unit_conversions", []),
                "warnings": (v.get("warnings", []) + fda_res.get("warnings", [])),
                "log_ref": "",
            }
            row_results.append(row_out)
            n_ok += 1
        except Exception as exc:
            # Any calculation failure is contained to this row.
            row_results.append(
                {
                    "row_index": i,
                    "record_id": row.get("record_id", ""),
                    "chemical_name": row.get("chemical_name", ""),
                    "status": "error",
                    "errors": [str(exc)],
                }
            )
            n_err += 1

    out["rows"] = row_results
    out["summary"] = {
        "total_rows": len(rows),
        "ok_rows": n_ok,
        "error_rows": n_err,
    }
    out["result_value"] = float(n_ok)
    out["risk_tier"] = "unknown"
    return out
mcp_tox_calc/logging.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime as dt
2
+ import json
3
+ import uuid
4
+ from pathlib import Path
5
+ from typing import Any, Dict, List
6
+
7
+
8
class RunLogger:
    """Append-only JSONL event log plus a Markdown summary report for one run.

    Each tool call is appended to ``cancer_risk_log.jsonl``; ``write_report``
    renders a human-readable summary to ``cancer_risk_report.md``.
    """

    def __init__(self, run_dir: str):
        self.run_dir = Path(run_dir)
        # Create the run directory eagerly so the first append cannot fail.
        self.run_dir.mkdir(parents=True, exist_ok=True)
        self.log_path = self.run_dir / "cancer_risk_log.jsonl"
        self.report_path = self.run_dir / "cancer_risk_report.md"

    def log_event(self, tool_name: str, request_args: Dict[str, Any], response: Dict[str, Any]) -> str:
        """Append one tool-call event to the JSONL log and return its event id."""
        event_id = f"evt_{uuid.uuid4().hex[:10]}"
        # datetime.utcnow() is deprecated (naive; removed path in 3.12+): use an
        # aware UTC timestamp and keep the original trailing-"Z" string format.
        timestamp = dt.datetime.now(dt.timezone.utc).isoformat().replace("+00:00", "Z")
        row = {
            "event_id": event_id,
            "timestamp_utc": timestamp,
            "tool_name": tool_name,
            "request": request_args,
            "response": response,
        }
        with self.log_path.open("a", encoding="utf-8") as f:
            f.write(json.dumps(row, ensure_ascii=True) + "\n")
        return event_id

    def write_report(self, summary: Dict[str, Any], rows: List[Dict[str, Any]]) -> Path:
        """Render a Markdown run report (summary, high-priority rows, data-quality alerts)."""
        lines = [
            "# Cancer Risk Calculation Report",
            "",
            "## Run Summary",
            f"- Total rows: {summary.get('total_rows', 0)}",
            f"- Successful rows: {summary.get('ok_rows', 0)}",
            f"- Error rows: {summary.get('error_rows', 0)}",
            "",
            "## High Priority Rows",
        ]

        high = [r for r in rows if str(r.get("risk_tier", "")).strip().lower() == "high_priority"]
        if high:
            # Cap at 100 entries to keep the report readable for large batches.
            for r in high[:100]:
                lines.append(
                    f"- record_id={r.get('record_id','')} chemical={r.get('chemical_name','')} fda_ctp_elcr={r.get('fda_ctp_elcr','')}"
                )
        else:
            lines.append("- None")

        lines += ["", "## Data Quality Alerts"]
        errors = [r for r in rows if r.get("status") == "error"]
        if errors:
            for r in errors[:100]:
                lines.append(f"- row_index={r.get('row_index')} errors={r.get('errors', [])}")
        else:
            lines.append("- No row-level errors.")

        self.report_path.write_text("\n".join(lines), encoding="utf-8")
        return self.report_path
mcp_tox_calc/server.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import sys
4
+ from pathlib import Path
5
+ from typing import Any, Callable, Dict
6
+
7
+ from mcp_tox_calc.equations import (
8
+ CalculationError,
9
+ calculate_epa_elcr_csf,
10
+ calculate_epa_elcr_iur,
11
+ calculate_fda_ctp_elcr,
12
+ get_formula_catalog,
13
+ run_batch_cancer_risk,
14
+ validate_risk_input,
15
+ )
16
+ from mcp_tox_calc.logging import RunLogger
17
+
18
+
19
+ ToolFn = Callable[[Dict[str, Any]], Dict[str, Any]]
20
+
21
+
22
class ToxCalcMCPServer:
    """Minimal MCP-style JSON-RPC server exposing the deterministic calc tools.

    Handles `initialize`, `tools/list`, and `tools/call`; every tool call is
    logged through RunLogger, and batch runs additionally produce a report.
    """

    def __init__(self, run_dir: str):
        self.run_dir = str(Path(run_dir))
        self.logger = RunLogger(self.run_dir)
        # Tool registry: name -> description, JSON schema, and handler callable.
        self.tools: Dict[str, Dict[str, Any]] = {
            "validate_risk_input": {
                "description": "Validate a row payload for deterministic cancer risk calculations.",
                "inputSchema": {"type": "object", "properties": {}, "additionalProperties": True},
                "fn": validate_risk_input,
            },
            "calculate_epa_elcr_csf": {
                "description": "Compute ELCR using EPA CSF pathway.",
                "inputSchema": {"type": "object", "properties": {}, "additionalProperties": True},
                "fn": calculate_epa_elcr_csf,
            },
            "calculate_epa_elcr_iur": {
                "description": "Compute ELCR using EPA IUR pathway.",
                "inputSchema": {"type": "object", "properties": {}, "additionalProperties": True},
                "fn": calculate_epa_elcr_iur,
            },
            "calculate_fda_ctp_elcr": {
                "description": "Compute ELCR profile using FDA CTP-style constituent aggregation.",
                "inputSchema": {"type": "object", "properties": {}, "additionalProperties": True},
                "fn": calculate_fda_ctp_elcr,
            },
            "run_batch_cancer_risk": {
                "description": "Run deterministic cancer risk calculations across a batch of rows.",
                "inputSchema": {
                    "type": "object",
                    "properties": {"rows": {"type": "array", "items": {"type": "object"}}},
                    "required": ["rows"],
                    "additionalProperties": True,
                },
                "fn": run_batch_cancer_risk,
            },
            "get_formula_catalog": {
                "description": "Return available formula catalog and version.",
                "inputSchema": {"type": "object", "properties": {}, "additionalProperties": True},
                # Catalog takes no arguments; adapt to the common (args) -> dict shape.
                "fn": lambda _args: get_formula_catalog(),
            },
        }

    def handle_request(self, req: Dict[str, Any]) -> Dict[str, Any]:
        """Dispatch one JSON-RPC request dict and return the response dict.

        Known methods: initialize, tools/list, tools/call. Unknown methods
        and unknown tool names return JSON-RPC error objects rather than raising.
        """
        method = req.get("method")
        req_id = req.get("id")

        if method == "initialize":
            return {
                "jsonrpc": "2.0",
                "id": req_id,
                "result": {
                    "protocolVersion": "2024-11-05",
                    "serverInfo": {"name": "toxra-calc-mcp", "version": "0.1.0"},
                    "capabilities": {"tools": {}},
                },
            }

        if method == "tools/list":
            # Expose only the public metadata; never the handler callables.
            tools = []
            for name, meta in self.tools.items():
                tools.append(
                    {
                        "name": name,
                        "description": meta["description"],
                        "inputSchema": meta["inputSchema"],
                    }
                )
            return {"jsonrpc": "2.0", "id": req_id, "result": {"tools": tools}}

        if method == "tools/call":
            params = req.get("params", {}) or {}
            name = params.get("name")
            args = params.get("arguments", {}) or {}

            if name not in self.tools:
                return {
                    "jsonrpc": "2.0",
                    "id": req_id,
                    "error": {"code": -32602, "message": f"Unknown tool: {name}"},
                }

            fn: ToolFn = self.tools[name]["fn"]
            try:
                result = fn(args)
                if not isinstance(result, dict):
                    result = {"value": result}

                # Attach structured log reference per tool call.
                log_ref = self.logger.log_event(name, args, result)
                result.setdefault("log_ref", log_ref)

                if name == "run_batch_cancer_risk":
                    # Backfill the standard envelope fields on each row and
                    # write the Markdown report artifact for the batch.
                    rows = result.get("rows", []) if isinstance(result.get("rows", []), list) else []
                    for row in rows:
                        if isinstance(row, dict):
                            row.setdefault("formula_id", "calculate_fda_ctp_elcr")
                            row.setdefault("formula_version", result.get("formula_version", "1.0.0"))
                            row.setdefault("inputs_normalized", {})
                            row.setdefault("unit_conversions", [])
                            row.setdefault("result_value", row.get("fda_ctp_elcr", ""))
                            row.setdefault("risk_tier", row.get("risk_tier", "unknown"))
                            row.setdefault("warnings", row.get("warnings", []))
                            row.setdefault("log_ref", log_ref)
                    report_path = self.logger.write_report(result.get("summary", {}), rows)
                    result["artifacts"] = {
                        "run_dir": self.run_dir,
                        "log_jsonl": str(self.logger.log_path),
                        "report_md": str(report_path),
                    }

                # NOTE(review): content items use {"type": "json"}; confirm the
                # connected MCP clients accept this content type.
                content = [{"type": "json", "json": result}]
                return {"jsonrpc": "2.0", "id": req_id, "result": {"content": content}}
            except CalculationError as exc:
                # Expected calculation failures get a dedicated error code.
                return {
                    "jsonrpc": "2.0",
                    "id": req_id,
                    "error": {"code": -32001, "message": str(exc)},
                }
            except Exception as exc:
                return {
                    "jsonrpc": "2.0",
                    "id": req_id,
                    "error": {"code": -32099, "message": f"Unexpected tool error: {exc}"},
                }

        return {
            "jsonrpc": "2.0",
            "id": req_id,
            "error": {"code": -32601, "message": f"Method not found: {method}"},
        }
152
+
153
+
154
def _serve_stdio(server: ToxCalcMCPServer) -> None:
    """Run a line-delimited JSON-RPC loop over stdin/stdout until EOF."""
    for raw in sys.stdin:
        message = raw.strip()
        if not message:
            continue  # skip blank lines
        try:
            reply = server.handle_request(json.loads(message))
        except Exception as exc:  # malformed JSON or dispatch failure
            reply = {
                "jsonrpc": "2.0",
                "id": None,
                "error": {"code": -32700, "message": f"Parse/dispatch error: {exc}"},
            }
        sys.stdout.write(json.dumps(reply) + "\n")
        sys.stdout.flush()
170
+
171
+
172
def main() -> None:
    """CLI entry point: construct the calc server and serve JSON-RPC over stdio.

    NOTE(review): the --stdio flag is parsed but never checked — the stdio
    loop always runs regardless of whether it is passed. Confirm whether
    other transports were planned before honoring the flag.
    """
    parser = argparse.ArgumentParser(description="Local MCP server for deterministic toxicology calculations")
    parser.add_argument("--stdio", action="store_true", default=False, help="Run stdio JSON-RPC loop")
    parser.add_argument("--run-dir", default="runs/mcp_server", help="Run artifact directory")
    args = parser.parse_args()

    server = ToxCalcMCPServer(run_dir=args.run_dir)
    _serve_stdio(server)
180
+
181
+
182
+ if __name__ == "__main__":
183
+ main()
mcp_tox_calc/units.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import Any, Dict
3
+
4
+
5
class UnitError(ValueError):
    """Raised when a value or unit string cannot be normalized to canonical units."""

    pass
7
+
8
+
9
+ def _to_float(v: Any, field: str) -> float:
10
+ try:
11
+ x = float(v)
12
+ except Exception as exc:
13
+ raise UnitError(f"{field} must be numeric.") from exc
14
+ return x
15
+
16
+
17
+ def _norm_unit(unit: Any) -> str:
18
+ s = str(unit or "").strip().lower()
19
+ s = s.replace(" ", "")
20
+ s = s.replace("μ", "u")
21
+ return s
22
+
23
+
24
def normalize_route(route: Any) -> str:
    """Return the canonical exposure route ('oral' or 'inhalation') or raise UnitError."""
    canonical = str(route or "").strip().lower()
    if canonical not in {"oral", "inhalation"}:
        raise UnitError("route must be oral or inhalation")
    return canonical
29
+
30
+
31
def normalize_oral_exposure(exposure_value: Any, exposure_unit: Any, body_weight_kg: Any) -> Dict[str, Any]:
    """Convert an oral exposure to canonical mg/kg-day (chronic daily intake).

    Per-day units (mg/day, ug/day) require a positive body_weight_kg.
    Returns the normalized value, canonical unit label, and conversions applied.
    """
    amount = _to_float(exposure_value, "exposure_value")
    unit = _norm_unit(exposure_unit)
    weight = _to_float(body_weight_kg, "body_weight_kg") if body_weight_kg not in (None, "") else None

    applied = []

    if unit in {"mg/kg-day", "mg/kg/d", "mg/kgday", "mgkgday"}:
        cdi = amount  # already canonical
    elif unit in {"ug/kg-day", "ug/kg/d", "ug/kgday", "ugkgday"}:
        cdi = amount / 1000.0
        applied.append("exposure ug/kg-day -> mg/kg-day")
    elif unit in {"mg/day", "mg/d", "mgday"}:
        if weight is None or weight <= 0:
            raise UnitError("body_weight_kg is required for exposure unit mg/day")
        cdi = amount / weight
        applied.append("exposure mg/day -> mg/kg-day")
    elif unit in {"ug/day", "ug/d", "ugday"}:
        if weight is None or weight <= 0:
            raise UnitError("body_weight_kg is required for exposure unit ug/day")
        cdi = (amount / 1000.0) / weight
        applied.append("exposure ug/day -> mg/kg-day")
    else:
        raise UnitError(f"Unsupported oral exposure unit: {exposure_unit}")

    return {
        "value_mg_per_kg_day": cdi,
        "unit": "mg/kg-day",
        "conversions": applied,
    }
61
+
62
+
63
def normalize_air_concentration(air_conc_value: Any, air_conc_unit: Any) -> Dict[str, Any]:
    """Convert an air concentration to canonical ug/m3."""
    amount = _to_float(air_conc_value, "air_conc_value")
    unit = _norm_unit(air_conc_unit)

    if unit in {"ug/m3", "ugm3", "ug/m^3"}:
        value, applied = amount, []
    elif unit in {"mg/m3", "mgm3", "mg/m^3"}:
        value, applied = amount * 1000.0, ["air concentration mg/m3 -> ug/m3"]
    elif unit in {"ng/m3", "ngm3", "ng/m^3"}:
        value, applied = amount / 1000.0, ["air concentration ng/m3 -> ug/m3"]
    else:
        raise UnitError(f"Unsupported air concentration unit: {air_conc_unit}")

    return {
        "value_ug_per_m3": value,
        "unit": "ug/m3",
        "conversions": applied,
    }
84
+
85
+
86
def normalize_csf(csf_value: Any, csf_unit: Any) -> Dict[str, Any]:
    """Convert a cancer slope factor to canonical (mg/kg-day)^-1.

    A blank or 'na' unit is accepted as already being in canonical units.
    """
    potency = _to_float(csf_value, "csf_value")
    unit = _norm_unit(csf_unit)

    mg_based = {
        "(mg/kg-day)^-1",
        "1/(mg/kg-day)",
        "per(mg/kg-day)",
        "permg/kg-day",
        "(mgkgday)^-1",
        "1/mgkgday",
    }
    ug_based = {
        "(ug/kg-day)^-1",
        "1/(ug/kg-day)",
        "per(ug/kg-day)",
        "(ugkgday)^-1",
        "1/ugkgday",
    }

    if unit in mg_based or unit in {"", "na", "n/a"}:
        normalized, applied = potency, []
    elif unit in ug_based:
        normalized, applied = potency * 1000.0, ["CSF per (ug/kg-day) -> per (mg/kg-day)"]
    else:
        raise UnitError(f"Unsupported csf unit: {csf_unit}")

    return {
        "value_per_mg_per_kg_day": normalized,
        "unit": "(mg/kg-day)^-1",
        "conversions": applied,
    }
119
+
120
+
121
def normalize_iur(iur_value: Any, iur_unit: Any) -> Dict[str, Any]:
    """Convert an inhalation unit risk to canonical (ug/m3)^-1.

    A blank or 'na' unit is accepted as already being in canonical units.
    """
    risk = _to_float(iur_value, "iur_value")
    unit = _norm_unit(iur_unit)

    ug_based = {
        "(ug/m3)^-1",
        "1/(ug/m3)",
        "per(ug/m3)",
        "1/ugm3",
        "(ugm3)^-1",
    }
    mg_based = {
        "(mg/m3)^-1",
        "1/(mg/m3)",
        "per(mg/m3)",
        "1/mgm3",
        "(mgm3)^-1",
    }

    if unit in ug_based or unit in {"", "na", "n/a"}:
        normalized, applied = risk, []
    elif unit in mg_based:
        normalized, applied = risk / 1000.0, ["IUR per (mg/m3) -> per (ug/m3)"]
    else:
        raise UnitError(f"Unsupported iur unit: {iur_unit}")

    return {
        "value_per_ug_per_m3": normalized,
        "unit": "(ug/m3)^-1",
        "conversions": applied,
    }
regulatory_catalog/epa_cancer_v2005.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "framework": "EPA",
3
+ "version": "2005",
4
+ "source": "EPA Guidelines for Carcinogen Risk Assessment (2005)",
5
+ "clauses": [
6
+ {
7
+ "clause_id": "EPA.CANCER.CSF.001",
8
+ "framework": "EPA",
9
+ "title": "Oral cancer slope factor applicability",
10
+ "description": "Assessment should include oral-dose based evidence and potency context for CSF application.",
11
+ "required_fields": [
12
+ "dose_metrics",
13
+ "exposure_route",
14
+ "carcinogenicity_result"
15
+ ],
16
+ "required_evidence_terms": [
17
+ "cancer slope factor",
18
+ "mg/kg-day",
19
+ "oral",
20
+ "dose"
21
+ ],
22
+ "acceptance_rule": "any_required_fields",
23
+ "applicability": {},
24
+ "source_reference": "EPA Guidelines for Carcinogen Risk Assessment (2005)"
25
+ },
26
+ {
27
+ "clause_id": "EPA.CANCER.IUR.001",
28
+ "framework": "EPA",
29
+ "title": "Inhalation unit risk applicability",
30
+ "description": "Assessment should include inhalation exposure metrics suitable for IUR-based risk quantification.",
31
+ "required_fields": [
32
+ "exposure_route",
33
+ "dose_metrics",
34
+ "carcinogenicity_notes"
35
+ ],
36
+ "required_evidence_terms": [
37
+ "inhalation unit risk",
38
+ "ug/m3",
39
+ "inhalation",
40
+ "air concentration"
41
+ ],
42
+ "acceptance_rule": "any_required_fields",
43
+ "applicability": {
44
+ "field": "exposure_route",
45
+ "equals": "inhalation"
46
+ },
47
+ "source_reference": "EPA Guidelines for Carcinogen Risk Assessment (2005)"
48
+ },
49
+ {
50
+ "clause_id": "EPA.CANCER.WOE.001",
51
+ "framework": "EPA",
52
+ "title": "Weight-of-evidence integration",
53
+ "description": "Narrative should integrate evidence quality, uncertainty, and plausibility of carcinogenic potential.",
54
+ "required_fields": [
55
+ "key_findings",
56
+ "conclusion",
57
+ "risk_summary"
58
+ ],
59
+ "required_evidence_terms": [
60
+ "weight of evidence",
61
+ "uncertainty",
62
+ "mode of action",
63
+ "cancer"
64
+ ],
65
+ "acceptance_rule": "any_required_fields",
66
+ "applicability": {},
67
+ "source_reference": "EPA Guidelines for Carcinogen Risk Assessment (2005)"
68
+ }
69
+ ]
70
+ }
regulatory_catalog/fda_ctp_v2024_06.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "framework": "FDA CTP",
3
+ "version": "2024-06",
4
+ "source": "FDA CTP Regulatory Science Policy Memoranda (June 3, 2024)",
5
+ "clauses": [
6
+ {
7
+ "clause_id": "FDA.CTP.GENOTOX.001",
8
+ "framework": "FDA CTP",
9
+ "title": "Genotoxicity hazard identification evidence",
10
+ "description": "Evidence should characterize in vitro and in vivo genotoxicity evidence and integrated interpretation.",
11
+ "required_fields": [
12
+ "genotox_oecd_tg_in_vitro",
13
+ "genotox_oecd_tg_in_vivo",
14
+ "genotoxicity_result",
15
+ "genotoxicity_result_notes"
16
+ ],
17
+ "required_evidence_terms": [
18
+ "genotoxic",
19
+ "ames",
20
+ "micronucleus",
21
+ "comet",
22
+ "oecd tg"
23
+ ],
24
+ "acceptance_rule": "all_required_fields",
25
+ "applicability": {},
26
+ "source_reference": "FDA CTP Genotoxicity Hazard Identification and Carcinogenicity Tiering memo (June 3, 2024)"
27
+ },
28
+ {
29
+ "clause_id": "FDA.CTP.CARCIN.001",
30
+ "framework": "FDA CTP",
31
+ "title": "Carcinogenicity tiering narrative support",
32
+ "description": "Carcinogenicity conclusions should be supported by study findings and risk narrative.",
33
+ "required_fields": [
34
+ "carcinogenicity_result",
35
+ "carcinogenicity_notes",
36
+ "key_findings",
37
+ "conclusion"
38
+ ],
39
+ "required_evidence_terms": [
40
+ "carcinogenic",
41
+ "tumor",
42
+ "cancer",
43
+ "risk"
44
+ ],
45
+ "acceptance_rule": "all_required_fields",
46
+ "applicability": {},
47
+ "source_reference": "FDA CTP Genotoxicity Hazard Identification and Carcinogenicity Tiering memo (June 3, 2024)"
48
+ },
49
+ {
50
+ "clause_id": "FDA.CTP.ELCR.001",
51
+ "framework": "FDA CTP",
52
+ "title": "ELCR-ready quantitative evidence elements",
53
+ "description": "ELCR computations require quantitative exposure and potency anchors with transparent assumptions.",
54
+ "required_fields": [
55
+ "dose_metrics",
56
+ "risk_summary",
57
+ "exposure_route"
58
+ ],
59
+ "required_evidence_terms": [
60
+ "excess lifetime cancer risk",
61
+ "slope factor",
62
+ "unit risk",
63
+ "exposure"
64
+ ],
65
+ "acceptance_rule": "any_required_fields",
66
+ "applicability": {},
67
+ "source_reference": "FDA CTP Calculating Excess Lifetime Cancer Risk in ENDS PMTAs memo (June 3, 2024)"
68
+ }
69
+ ]
70
+ }
scripts/__pycache__/replay_calc_log.cpython-314.pyc ADDED
Binary file (2.24 kB). View file
 
scripts/__pycache__/run_cancer_risk_batch.cpython-314.pyc ADDED
Binary file (3.36 kB). View file
 
scripts/__pycache__/run_mcp_calc_server.cpython-314.pyc ADDED
Binary file (252 Bytes). View file
 
scripts/replay_calc_log.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import json
4
+ from collections import Counter
5
+ from pathlib import Path
6
+
7
+
8
def main() -> None:
    """Summarize an MCP calculation JSONL log: per-tool event counts and last event."""
    parser = argparse.ArgumentParser(description="Replay and summarize cancer risk MCP log JSONL")
    parser.add_argument("--log-jsonl", required=True, help="Path to cancer_risk_log.jsonl")
    args = parser.parse_args()

    log_file = Path(args.log_jsonl)
    if not log_file.exists():
        raise FileNotFoundError(f"Log file not found: {log_file}")

    # Parse every non-blank line as one JSON event.
    events = [
        json.loads(stripped)
        for stripped in (line.strip() for line in log_file.read_text(encoding="utf-8").splitlines())
        if stripped
    ]

    tool_counts = Counter(event.get("tool_name", "unknown") for event in events)

    print("# MCP Calculation Log Replay")
    print(f"events={len(events)}")
    for tool, n in sorted(tool_counts.items()):
        print(f"- {tool}: {n}")

    if events:
        print("\nlast_event=")
        print(json.dumps(events[-1], indent=2))
34
+
35
+
36
+ if __name__ == "__main__":
37
+ main()
scripts/run_cancer_risk_batch.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Run a deterministic cancer-risk batch through the local MCP server.

Reads an input CSV matching CANCER_RISK_TEMPLATE_COLUMNS, executes the
batch via the MCP calculation client, and writes CSV/JSON artifacts into
a fresh run directory.
"""
import argparse
import json
import sys
from pathlib import Path

import pandas as pd

# Make the repository root importable when this file is run as a script.
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from toxra_core.artifacts import make_run_dir, write_dataframe_csv, write_json
from toxra_core.calculation_client import run_batch_cancer_risk
from toxra_core.contracts import CANCER_RISK_TEMPLATE_COLUMNS


def main() -> None:
    """CLI entry point: validate the input CSV, run the batch, persist artifacts.

    Raises:
        FileNotFoundError: if the input CSV is missing.
        ValueError: if required template columns are absent.
    """
    parser = argparse.ArgumentParser(description="Run deterministic cancer risk batch using local MCP server")
    parser.add_argument("--input-csv", required=True, help="Path to cancer risk input CSV")
    parser.add_argument("--run-id", default="", help="Optional run ID")
    parser.add_argument("--runs-dir", default="runs", help="Runs base directory")
    args = parser.parse_args()

    inp = Path(args.input_csv)
    if not inp.exists():
        raise FileNotFoundError(f"Input CSV not found: {inp}")

    df = pd.read_csv(inp)
    missing = [c for c in CANCER_RISK_TEMPLATE_COLUMNS if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    run_dir = make_run_dir(run_id=args.run_id or None, base_dir=args.runs_dir)
    # Empty strings (not NaN) keep downstream JSON serialization clean.
    rows = df.fillna("").to_dict("records")

    result = run_batch_cancer_risk(rows, run_dir=str(run_dir))
    # Fetch once and type-check, instead of calling .get twice in a ternary.
    res_rows = result.get("rows", [])
    if not isinstance(res_rows, list):
        res_rows = []
    out_df = pd.DataFrame(res_rows)

    out_csv = run_dir / "cancer_risk_results.csv"
    out_json = run_dir / "cancer_risk_results.json"
    write_dataframe_csv(out_csv, out_df)
    write_json(out_json, result)

    print(json.dumps({
        "run_dir": str(run_dir),
        "results_csv": str(out_csv),
        "results_json": str(out_json),
        "log_jsonl": result.get("artifacts", {}).get("log_jsonl", ""),
        "report_md": result.get("artifacts", {}).get("report_md", ""),
        "summary": result.get("summary", {}),
    }, indent=2))


if __name__ == "__main__":
    main()
scripts/run_mcp_calc_server.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# Thin launcher: put the repository root on sys.path, then delegate to the
# MCP calculation server's main() so the package can be started as a script.
import sys
from pathlib import Path

# Repository root is one directory above scripts/.
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from mcp_tox_calc.server import main


if __name__ == "__main__":
    main()
tests/__pycache__/conftest.cpython-314.pyc ADDED
Binary file (492 Bytes). View file
 
tests/__pycache__/test_equations.cpython-314.pyc ADDED
Binary file (2.4 kB). View file
 
tests/__pycache__/test_mcp_tools.cpython-314.pyc ADDED
Binary file (1.88 kB). View file
 
tests/__pycache__/test_nlp_pipeline.cpython-314.pyc ADDED
Binary file (2.43 kB). View file
 
tests/__pycache__/test_regulatory_mapper.cpython-314.pyc ADDED
Binary file (982 Bytes). View file
 
tests/__pycache__/test_units.cpython-314.pyc ADDED
Binary file (1.79 kB). View file
 
tests/conftest.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
# Pytest bootstrap: make the repository root importable so the test suite can
# import toxra_core and mcp_tox_calc without installing the package.
import sys
from pathlib import Path

ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
tests/fixtures/extraction_sample.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "papers": [
3
+ {
4
+ "_file": "paper_a.pdf",
5
+ "paper_title": "Test Paper A",
6
+ "risk_stance": "acceptable_with_uncertainty",
7
+ "risk_confidence": 0.72,
8
+ "risk_summary": "Contains partial carcinogenicity evidence.",
9
+ "extracted": {
10
+ "chemicals": ["Nicotine"],
11
+ "genotox_oecd_tg_in_vitro": ["OECD_TG_471_Bacterial Reverse mutation test(AMES test)"],
12
+ "genotox_oecd_tg_in_vivo": ["not_reported"],
13
+ "genotoxicity_result": "equivocal",
14
+ "genotoxicity_result_notes": "AMES mixed outcomes.",
15
+ "carcinogenicity_result": "insufficient_data",
16
+ "carcinogenicity_notes": "Long-term bioassay absent.",
17
+ "dose_metrics": ["NOAEL 10 mg/kg-day"],
18
+ "exposure_route": "oral",
19
+ "key_findings": "Potential DNA response observed.",
20
+ "conclusion": "Needs additional testing."
21
+ },
22
+ "evidence": [
23
+ {
24
+ "field": "genotoxicity_result",
25
+ "quote": "The AMES assay showed equivocal mutagenicity outcomes.",
26
+ "pages": "4-5"
27
+ },
28
+ {
29
+ "field": "dose_metrics",
30
+ "quote": "NOAEL was reported at 10 mg/kg-day.",
31
+ "pages": "6"
32
+ }
33
+ ]
34
+ }
35
+ ],
36
+ "toxra_extensions": {
37
+ "nlp_diagnostics": [],
38
+ "regulatory_gap_assessment": {},
39
+ "risk_calculation_refs": []
40
+ }
41
+ }
tests/test_equations.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from mcp_tox_calc.equations import (
    calculate_epa_elcr_csf,
    calculate_epa_elcr_iur,
    calculate_fda_ctp_elcr,
    run_batch_cancer_risk,
)


def test_calculate_epa_elcr_csf_basic():
    """Oral ELCR = intake (mg/kg-day) x CSF: 0.01 * 1.5 = 0.015."""
    payload = {
        "route": "oral",
        "exposure_value": 0.01,
        "exposure_unit": "mg/kg-day",
        "body_weight_kg": 70,
        "csf_value": 1.5,
        "csf_unit": "(mg/kg-day)^-1",
    }
    result = calculate_epa_elcr_csf(payload)
    assert round(result["result_value"], 8) == 0.015


def test_calculate_epa_elcr_iur_basic():
    """Inhalation ELCR = concentration (ug/m3) x IUR: 100 * 1e-6 = 1e-4."""
    payload = {
        "route": "inhalation",
        "air_conc_value": 100,
        "air_conc_unit": "ug/m3",
        "iur_value": 1e-6,
        "iur_unit": "(ug/m3)^-1",
    }
    result = calculate_epa_elcr_iur(payload)
    assert round(result["result_value"], 8) == 0.0001


def test_fda_wrapper_aggregates_components():
    """The FDA CTP wrapper sums per-constituent risks and reports each one."""
    oral = {
        "route": "oral",
        "exposure_value": 0.01,
        "exposure_unit": "mg/kg-day",
        "body_weight_kg": 70,
        "csf_value": 1.0,
        "csf_unit": "(mg/kg-day)^-1",
    }
    inhalation = {
        "route": "inhalation",
        "air_conc_value": 50,
        "air_conc_unit": "ug/m3",
        "iur_value": 1e-6,
        "iur_unit": "(ug/m3)^-1",
    }
    result = calculate_fda_ctp_elcr({"constituents": [oral, inhalation]})
    assert result["result_value"] > 0
    assert len(result["component_results"]) == 2


def test_batch_handles_mixed_rows():
    """A batch with one complete and one incomplete row reports both outcomes."""
    complete = {
        "record_id": "r1",
        "chemical_name": "ChemA",
        "route": "oral",
        "exposure_value": 0.02,
        "exposure_unit": "mg/kg-day",
        "body_weight_kg": 70,
        "csf_value": 1.1,
        "csf_unit": "(mg/kg-day)^-1",
    }
    incomplete = {"record_id": "r2", "chemical_name": "ChemB", "route": "oral"}
    result = run_batch_cancer_risk({"rows": [complete, incomplete]})
    summary = result["summary"]
    assert summary["total_rows"] == 2
    assert summary["ok_rows"] == 1
    assert summary["error_rows"] == 1
tests/test_mcp_tools.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from pathlib import Path

from toxra_core.artifacts import make_run_dir
from toxra_core.calculation_client import MCPCalculationClient


def test_mcp_client_lists_tools_and_runs_batch(tmp_path):
    """End-to-end: the stdio client discovers the batch tool and runs it."""
    run_dir = make_run_dir(run_id="test_mcp", base_dir=str(tmp_path))
    row = {
        "record_id": "r1",
        "chemical_name": "ChemA",
        "route": "oral",
        "exposure_value": 0.01,
        "exposure_unit": "mg/kg-day",
        "body_weight_kg": 70,
        "csf_value": 1.2,
        "csf_unit": "(mg/kg-day)^-1",
        "iur_value": "",
        "air_conc_value": "",
        "air_conc_unit": "",
    }

    with MCPCalculationClient(run_dir=str(run_dir)) as client:
        tool_names = {tool.get("name") for tool in client.list_tools()}
        assert "run_batch_cancer_risk" in tool_names

        result = client.call_tool("run_batch_cancer_risk", {"rows": [row]})

    summary = result["summary"]
    assert summary["total_rows"] == 1
    assert summary["ok_rows"] == 1
    # Both artifacts must have been written to disk by the server.
    assert Path(result["artifacts"]["log_jsonl"]).exists()
    assert Path(result["artifacts"]["report_md"]).exists()
tests/test_nlp_pipeline.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import numpy as np

from toxra_core.nlp_pipeline import (
    expand_regulatory_queries,
    extract_evidence_span,
    hybrid_rank_text_items,
)


def test_expand_regulatory_queries_adds_families():
    """Endpoint/framework hints should expand the base query list."""
    expanded, families = expand_regulatory_queries(
        base_queries=["genotoxicity risk"],
        endpoint_modules=["Genotoxicity (OECD TG)"],
        frameworks=["FDA CTP"],
    )
    assert len(expanded) > 1
    assert "endpoint" in families
    assert families["endpoint"]


def test_extract_evidence_span_hit_and_fallback():
    """A matching token yields a hit window; no match falls back to lead sentences."""
    page = "Sentence one. AMES test showed equivocal response. Sentence three. Sentence four."
    span = extract_evidence_span(page, "AMES")
    assert "AMES" in span["text"]

    fallback = extract_evidence_span("Alpha. Beta.", "nonexistenttoken")
    assert fallback["text"]


def test_hybrid_rank_text_items_lexical_only():
    """Without embeddings the ranker still selects via lexical similarity."""
    candidates = [
        {"text": "This section discusses liver toxicity and NOAEL values."},
        {"text": "Completely unrelated formulation text."},
    ]
    chosen, diagnostics = hybrid_rank_text_items(candidates, query="NOAEL liver")
    assert chosen
    assert diagnostics["ranking_method"] in {"lexical_only", "hybrid_rrf"}


def test_hybrid_rank_text_items_with_embeddings():
    """Supplying embeddings switches the ranker to hybrid RRF fusion."""
    candidates = [{"text": "A"}, {"text": "B"}, {"text": "C"}]
    vectors = np.array([[1.0, 0.0], [0.5, 0.5], [0.0, 1.0]], dtype=np.float32)
    query_vec = np.array([1.0, 0.0], dtype=np.float32)
    chosen, diagnostics = hybrid_rank_text_items(
        candidates, query="A", item_embeddings=vectors, query_embedding=query_vec
    )
    assert chosen
    assert diagnostics["ranking_method"] == "hybrid_rrf"
tests/test_regulatory_mapper.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
from pathlib import Path

from toxra_core.regulatory_mapper import map_extraction_to_framework


def test_regulatory_mapping_outputs_matrix_and_report():
    """Mapping the sample extraction to FDA CTP yields a matrix, report, and markdown."""
    sample_path = Path("tests/fixtures/extraction_sample.json")
    extraction = json.loads(sample_path.read_text(encoding="utf-8"))

    matrix, report, markdown = map_extraction_to_framework(extraction, framework="FDA CTP")

    assert not matrix.empty
    assert "clause_id" in matrix.columns
    assert report["framework"] == "FDA CTP"
    assert "Status Summary" in markdown
tests/test_units.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from mcp_tox_calc.units import (
    normalize_air_concentration,
    normalize_csf,
    normalize_iur,
    normalize_oral_exposure,
)


def test_normalize_oral_exposure_ug_per_kg_day():
    """2500 ug/kg-day converts to 2.5 mg/kg-day."""
    result = normalize_oral_exposure(2500, "ug/kg-day", 70)
    assert round(result["value_mg_per_kg_day"], 6) == 2.5


def test_normalize_oral_exposure_mg_day_with_bw():
    """7 mg/day over a 70 kg body weight is 0.1 mg/kg-day."""
    result = normalize_oral_exposure(7, "mg/day", 70)
    assert round(result["value_mg_per_kg_day"], 6) == 0.1


def test_normalize_air_concentration_mg_to_ug():
    """0.2 mg/m3 converts to 200 ug/m3."""
    result = normalize_air_concentration(0.2, "mg/m3")
    assert round(result["value_ug_per_m3"], 6) == 200.0


def test_normalize_csf_from_ug_basis():
    """0.001 per ug/kg-day equals 1.0 per mg/kg-day."""
    result = normalize_csf(0.001, "(ug/kg-day)^-1")
    assert round(result["value_per_mg_per_kg_day"], 6) == 1.0


def test_normalize_iur_from_mg_basis():
    """0.002 per mg/m3 equals 2e-6 per ug/m3."""
    result = normalize_iur(0.002, "(mg/m3)^-1")
    assert round(result["value_per_ug_per_m3"], 9) == 0.000002
toxra_core/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Core modules for Toxra extraction, mapping, NLP, and calculation orchestration."""
toxra_core/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (215 Bytes). View file
 
toxra_core/__pycache__/artifacts.cpython-314.pyc ADDED
Binary file (4.62 kB). View file
 
toxra_core/__pycache__/calculation_client.cpython-314.pyc ADDED
Binary file (8.73 kB). View file
 
toxra_core/__pycache__/contracts.cpython-314.pyc ADDED
Binary file (4.04 kB). View file
 
toxra_core/__pycache__/nlp_pipeline.cpython-314.pyc ADDED
Binary file (17.9 kB). View file
 
toxra_core/__pycache__/regulatory_mapper.cpython-314.pyc ADDED
Binary file (13.6 kB). View file
 
toxra_core/artifacts.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from pathlib import Path
3
+ from typing import Any, Dict, Iterable, Optional
4
+
5
+ import pandas as pd
6
+
7
+ from .contracts import default_run_id
8
+
9
+
10
def make_run_dir(run_id: Optional[str] = None, base_dir: str = "runs") -> Path:
    """Create (if needed) and return <base_dir>/<run_id>, generating an ID when absent."""
    target = Path(base_dir) / (run_id or default_run_id("run"))
    target.mkdir(parents=True, exist_ok=True)
    return target
15
+
16
+
17
def write_json(path: Path, data: Any) -> Path:
    """Serialize *data* as pretty-printed JSON at *path*, creating parent dirs."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(json.dumps(data, indent=2))
    return path
21
+
22
+
23
def write_markdown(path: Path, text: str) -> Path:
    """Write *text* (empty string when falsy) to *path*, creating parent dirs."""
    path.parent.mkdir(parents=True, exist_ok=True)
    content = text if text else ""
    path.write_text(content, encoding="utf-8")
    return path
27
+
28
+
29
def append_jsonl(path: Path, row: Dict[str, Any]) -> Path:
    """Append *row* as one ASCII-safe JSON line to *path*, creating parent dirs."""
    path.parent.mkdir(parents=True, exist_ok=True)
    line = json.dumps(row, ensure_ascii=True)
    with path.open("a", encoding="utf-8") as handle:
        handle.write(line + "\n")
    return path
34
+
35
+
36
def write_jsonl(path: Path, rows: Iterable[Dict[str, Any]]) -> Path:
    """Overwrite *path* with one ASCII-safe JSON line per row, creating parent dirs."""
    path.parent.mkdir(parents=True, exist_ok=True)
    lines = (json.dumps(row, ensure_ascii=True) + "\n" for row in rows)
    with path.open("w", encoding="utf-8") as handle:
        handle.writelines(lines)
    return path
42
+
43
+
44
def write_dataframe_csv(path: Path, df: pd.DataFrame) -> Path:
    """Write *df* to *path* as CSV without the index column, creating parent dirs."""
    path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(path, index=False)
    return path
48
+
49
+
50
def load_json(path: Path) -> Any:
    """Parse and return the JSON document stored at *path* (UTF-8)."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)
toxra_core/calculation_client.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import subprocess
3
+ import sys
4
+ from typing import Any, Dict, List, Optional
5
+
6
+
7
class MCPClientError(RuntimeError):
    """Raised when the MCP server subprocess cannot be reached or replies with an error."""
    pass
9
+
10
+
11
class MCPCalculationClient:
    """Minimal JSON-RPC 2.0 client for the mcp_tox_calc stdio server.

    Spawns the server as a subprocess (either a given script or
    ``python -m mcp_tox_calc.server``) and exchanges newline-delimited
    JSON messages over its stdin/stdout pipes. Usable as a context
    manager: ``with MCPCalculationClient(run_dir=...) as client: ...``.
    """

    def __init__(
        self,
        run_dir: str,
        python_executable: Optional[str] = None,
        server_script: Optional[str] = None,
    ):
        # Directory passed to the server via --run-dir for its artifacts.
        self.run_dir = str(run_dir)
        self.python_executable = python_executable or sys.executable
        if server_script:
            self.server_script = str(server_script)
        else:
            self.server_script = None
        self._proc: Optional[subprocess.Popen] = None  # live server process, if started
        self._id = 0  # monotonically increasing JSON-RPC request id

    def start(self) -> None:
        """Spawn the server subprocess (idempotent) and send the initialize handshake."""
        if self._proc is not None:
            return

        if self.server_script:
            cmd = [
                self.python_executable,
                self.server_script,
                "--stdio",
                "--run-dir",
                self.run_dir,
            ]
        else:
            # No explicit script: run the packaged server module.
            cmd = [
                self.python_executable,
                "-m",
                "mcp_tox_calc.server",
                "--stdio",
                "--run-dir",
                self.run_dir,
            ]

        self._proc = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            bufsize=1,  # line-buffered: the protocol is one JSON message per line
        )

        self._request("initialize", {"protocolVersion": "2024-11-05", "clientInfo": {"name": "toxra-app", "version": "0.1.0"}})

    def stop(self) -> None:
        """Shut the server down, escalating terminate -> kill; never raises."""
        if self._proc is None:
            return
        try:
            if self._proc.stdin:
                self._proc.stdin.close()  # closing stdin lets the server exit cleanly
            self._proc.terminate()
            self._proc.wait(timeout=3)
        except Exception:
            try:
                self._proc.kill()
            except Exception:
                pass
        finally:
            self._proc = None

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc, tb):
        self.stop()

    def _request(self, method: str, params: Dict[str, Any]) -> Dict[str, Any]:
        """Send one JSON-RPC request and block until the matching response arrives.

        Raises:
            MCPClientError: if the server is not running, the pipe closes
                before a response arrives, or the response carries "error".
        """
        if self._proc is None:
            raise MCPClientError("MCP server not started.")
        if self._proc.stdin is None or self._proc.stdout is None:
            raise MCPClientError("MCP server pipes unavailable.")

        self._id += 1
        req_id = self._id
        request = {
            "jsonrpc": "2.0",
            "id": req_id,
            "method": method,
            "params": params,
        }

        self._proc.stdin.write(json.dumps(request) + "\n")
        self._proc.stdin.flush()

        while True:
            line = self._proc.stdout.readline()
            if line == "":
                # EOF: the server died; surface a tail of stderr for diagnosis.
                err = ""
                if self._proc.stderr is not None:
                    try:
                        err = self._proc.stderr.read()[-1500:]
                    except Exception:
                        err = ""
                raise MCPClientError(f"No response from MCP server. stderr={err}")

            line = line.strip()
            if not line:
                continue

            try:
                resp = json.loads(line)
            except Exception:
                # Ignore non-JSON noise (e.g. stray log lines) on stdout.
                continue

            if resp.get("id") != req_id:
                # Skip messages that do not answer this request.
                continue

            if "error" in resp:
                raise MCPClientError(str(resp["error"]))

            result = resp.get("result", {})
            if not isinstance(result, dict):
                return {"result": result}
            return result

    def list_tools(self) -> List[Dict[str, Any]]:
        """Return the server's tool descriptors (empty list on a malformed reply)."""
        result = self._request("tools/list", {})
        tools = result.get("tools", [])
        return tools if isinstance(tools, list) else []

    def call_tool(self, name: str, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Invoke a named tool; unwrap a leading {"type": "json"} content item if present."""
        result = self._request("tools/call", {"name": name, "arguments": arguments})
        content = result.get("content", []) if isinstance(result, dict) else []
        if isinstance(content, list) and content:
            first = content[0]
            if isinstance(first, dict) and first.get("type") == "json":
                data = first.get("json", {})
                return data if isinstance(data, dict) else {"value": data}
        return result if isinstance(result, dict) else {"value": result}
146
+
147
+
148
def run_batch_cancer_risk(rows: List[Dict[str, Any]], run_dir: str) -> Dict[str, Any]:
    """Convenience wrapper: start a short-lived MCP client, run one batch, tear down."""
    with MCPCalculationClient(run_dir=run_dir) as client:
        return client.call_tool("run_batch_cancer_risk", {"rows": rows})
toxra_core/contracts.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime as _dt
2
+ import uuid
3
+ from dataclasses import dataclass, field
4
+ from typing import Any, Dict, List, Optional
5
+
6
+
7
+ CANCER_RISK_TEMPLATE_COLUMNS: List[str] = [
8
+ "record_id",
9
+ "chemical_name",
10
+ "casrn",
11
+ "route",
12
+ "exposure_value",
13
+ "exposure_unit",
14
+ "body_weight_kg",
15
+ "csf_value",
16
+ "csf_unit",
17
+ "iur_value",
18
+ "air_conc_value",
19
+ "air_conc_unit",
20
+ "source_reference",
21
+ ]
22
+
23
+
24
def default_run_id(prefix: str = "run") -> str:
    """Return a unique run ID of the form <prefix>_<YYYYmmdd_HHMMSS>_<8 hex chars>.

    Uses a timezone-aware UTC timestamp: datetime.utcnow() is deprecated
    since Python 3.12 and returns a naive datetime.
    """
    ts = _dt.datetime.now(_dt.timezone.utc).strftime("%Y%m%d_%H%M%S")
    return f"{prefix}_{ts}_{uuid.uuid4().hex[:8]}"
27
+
28
+
29
@dataclass
class RegulatoryClause:
    """A single requirement clause loaded from a regulatory framework catalog."""

    clause_id: str  # stable identifier within the catalog
    framework: str  # owning framework name (catalog files exist for "FDA CTP" and "EPA")
    title: str
    description: str
    required_fields: List[str] = field(default_factory=list)  # extraction fields the clause needs
    required_evidence_terms: List[str] = field(default_factory=list)  # terms to look for in evidence
    acceptance_rule: str = "all_required_fields"  # catalogs also use "any_required_fields"
    applicability: Dict[str, Any] = field(default_factory=dict)
    source_reference: str = ""  # citation for where the clause comes from
40
+
41
+
42
@dataclass
class ClauseEvaluation:
    """Outcome of checking one RegulatoryClause against extracted paper data."""

    clause_id: str  # clause that was evaluated
    framework: str  # framework the clause belongs to
    status: str  # evaluation outcome label
    fields_present: List[str] = field(default_factory=list)  # required fields found populated
    missing_fields: List[str] = field(default_factory=list)  # required fields absent or empty
    evidence_hits: List[str] = field(default_factory=list)  # evidence terms that matched
    prompt: str = ""
    reason: str = ""  # human-readable justification for the status
52
+
53
+
54
+ RISK_TIER_THRESHOLDS = {
55
+ "low": 1e-6,
56
+ "moderate": 1e-4,
57
+ }
58
+
59
+
60
def classify_risk_tier(value: Optional[float]) -> str:
    """Map an excess-lifetime-cancer-risk value onto a triage tier.

    None -> "unknown"; below the "low" threshold (1e-6) -> "de_minimis";
    up to and including the "moderate" threshold (1e-4) -> "monitor";
    anything larger -> "high_priority".
    """
    if value is None:
        return "unknown"
    if value < RISK_TIER_THRESHOLDS["low"]:
        return "de_minimis"
    return "monitor" if value <= RISK_TIER_THRESHOLDS["moderate"] else "high_priority"
toxra_core/nlp_pipeline.py ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import Any, Dict, List, Optional, Tuple
3
+
4
+ import numpy as np
5
+ try:
6
+ from sklearn.feature_extraction.text import TfidfVectorizer
7
+ except Exception: # pragma: no cover - fallback path for minimal runtime
8
+ TfidfVectorizer = None
9
+
10
+
11
+ ENDPOINT_QUERY_HINTS: Dict[str, List[str]] = {
12
+ "Genotoxicity (OECD TG)": [
13
+ "genotoxicity",
14
+ "mutagenicity",
15
+ "AMES",
16
+ "micronucleus",
17
+ "comet assay",
18
+ "chromosomal aberration",
19
+ "OECD TG 471 473 476 487 490 474 489",
20
+ ],
21
+ "NAMs / In Silico": [
22
+ "in silico",
23
+ "QSAR",
24
+ "read-across",
25
+ "AOP",
26
+ "PBPK",
27
+ "high-throughput",
28
+ "omics",
29
+ "organ-on-chip",
30
+ "microphysiological",
31
+ ],
32
+ "Acute toxicity": ["acute toxicity", "LD50", "LC50", "single dose", "lethality", "mortality"],
33
+ "Repeated dose toxicity": [
34
+ "repeated dose",
35
+ "subchronic",
36
+ "chronic",
37
+ "NOAEL",
38
+ "LOAEL",
39
+ "target organ",
40
+ "90-day",
41
+ "28-day",
42
+ ],
43
+ "Irritation / Sensitization": ["skin irritation", "eye irritation", "sensitization", "LLNA", "Draize"],
44
+ "Repro / Developmental": ["reproductive toxicity", "fertility", "developmental toxicity", "teratogenic", "prenatal", "postnatal"],
45
+ "Carcinogenicity": ["carcinogenicity", "tumor", "neoplasm", "cancer", "two-year bioassay"],
46
+ }
47
+
48
+ FRAMEWORK_QUERY_HINTS: Dict[str, List[str]] = {
49
+ "FDA CTP": [
50
+ "genotoxicity hazard identification",
51
+ "carcinogenicity tiering",
52
+ "excess lifetime cancer risk",
53
+ "constituent comparison",
54
+ "weight of evidence",
55
+ ],
56
+ "EPA": [
57
+ "cancer slope factor",
58
+ "inhalation unit risk",
59
+ "lifetime excess cancer risk",
60
+ "mode of action",
61
+ "weight of evidence descriptors",
62
+ ],
63
+ }
64
+
65
+ EQUATION_INPUT_HINTS: List[str] = [
66
+ "exposure concentration",
67
+ "daily intake",
68
+ "mg/kg-day",
69
+ "ug/m3",
70
+ "cancer slope factor",
71
+ "inhalation unit risk",
72
+ "body weight",
73
+ ]
74
+
75
+
76
def clean_text(t: str) -> str:
    """Strip NUL bytes and collapse every whitespace run into a single space."""
    without_nul = (t or "").replace("\x00", " ")
    collapsed = re.sub(r"\s+", " ", without_nul)
    return collapsed.strip()
79
+
80
+
81
def split_sentences(text: str) -> List[str]:
    """Split cleaned text on sentence-ending punctuation followed by whitespace."""
    normalized = clean_text(text)
    if not normalized:
        return []
    pieces = re.split(r"(?<=[\.!\?])\s+", normalized)
    return [piece.strip() for piece in pieces if piece.strip()]
86
+
87
+
88
+ def _tokenize(s: str) -> List[str]:
89
+ return [w for w in re.findall(r"[a-zA-Z0-9\-]+", (s or "").lower()) if len(w) >= 3]
90
+
91
+
92
def extract_evidence_span(page_text: str, query: str, page: Optional[int] = None, n_sentences: int = 5) -> Dict[str, Any]:
    """Return a small sentence window around the first query-term hit.

    Modes: "hit" (2 sentences before through 2 after the matching
    sentence), "fallback" (the leading *n_sentences* when no query token
    matches), or "empty" when the page yields no sentences at all.
    """
    sentences = split_sentences(page_text)
    if not sentences:
        return {"text": "", "page": page, "start_sentence": 0, "mode": "empty"}

    query_tokens = _tokenize(query)
    match_index = next(
        (i for i, s in enumerate(sentences) if any(tok in s.lower() for tok in query_tokens)),
        None,
    )

    if match_index is None:
        return {
            "text": " ".join(sentences[:n_sentences]),
            "page": page,
            "start_sentence": 0,
            "mode": "fallback",
        }

    window_start = max(0, match_index - 2)
    window_end = min(len(sentences), match_index + 3)
    return {
        "text": " ".join(sentences[window_start:window_end]),
        "page": page,
        "start_sentence": window_start,
        "mode": "hit",
    }
113
+
114
+
115
def build_query_families(
    base_queries: List[str],
    endpoint_modules: Optional[List[str]] = None,
    frameworks: Optional[List[str]] = None,
) -> Dict[str, List[str]]:
    """Group retrieval queries into named families: base, endpoint, framework, equation_inputs."""
    endpoint_terms: List[str] = []
    for module in endpoint_modules or []:
        endpoint_terms.extend(ENDPOINT_QUERY_HINTS.get(module, []))

    framework_terms: List[str] = []
    for framework in frameworks or []:
        framework_terms.extend(FRAMEWORK_QUERY_HINTS.get(framework, []))

    return {
        "base": [q for q in base_queries if (q or "").strip()],
        "endpoint": endpoint_terms,
        "framework": framework_terms,
        "equation_inputs": EQUATION_INPUT_HINTS,
    }
137
+
138
+
139
def expand_regulatory_queries(
    base_queries: List[str],
    endpoint_modules: Optional[List[str]] = None,
    frameworks: Optional[List[str]] = None,
    extra_terms: Optional[List[str]] = None,
) -> Tuple[List[str], Dict[str, List[str]]]:
    """Expand base queries with endpoint/framework hints and dedupe case-insensitively.

    Returns (deduplicated query list preserving first-seen order, the
    query families dict used to build it).
    """
    families = build_query_families(base_queries, endpoint_modules, frameworks)

    candidates: List[str] = []
    for family_terms in families.values():
        candidates.extend(family_terms)
    candidates.extend(extra_terms or [])

    seen: set = set()
    deduped: List[str] = []
    for candidate in candidates:
        stripped = (candidate or "").strip()
        if not stripped:
            continue
        key = stripped.lower()
        if key not in seen:
            seen.add(key)
            deduped.append(stripped)

    return deduped, families
164
+
165
+
166
def _lexical_ranks(texts: List[str], query: str) -> Tuple[List[int], np.ndarray]:
    """Rank *texts* against *query* lexically.

    Returns (order, scores): indices sorted best-first plus the raw
    per-text scores. Uses TF-IDF similarity when scikit-learn imported
    successfully; otherwise falls back to counting how many distinct
    query tokens each text contains as substrings.
    """
    if not texts:
        return [], np.array([], dtype=np.float32)
    if TfidfVectorizer is None:
        # Fallback: score = number of distinct query tokens present in the text.
        q_tokens = set(_tokenize(query))
        sims = []
        for t in texts:
            tl = t.lower()
            sims.append(float(sum(1 for tok in q_tokens if tok in tl)))
        arr = np.array(sims, dtype=np.float32)
        order = list(np.argsort(arr)[::-1])
        return order, arr

    vec = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), max_features=25000)
    x = vec.fit_transform(texts)
    qv = vec.transform([query])
    # Rows are l2-normalized by TfidfVectorizer's default, so the dot
    # product with the query vector acts as a cosine similarity.
    sims = (x @ qv.T).toarray().ravel().astype(np.float32)
    order = list(np.argsort(sims)[::-1])
    return order, sims
185
+
186
+
187
+ def _embedding_ranks(item_embeddings: np.ndarray, query_embedding: np.ndarray) -> Tuple[List[int], np.ndarray]:
188
+ if item_embeddings.size == 0:
189
+ return [], np.array([], dtype=np.float32)
190
+ q = np.asarray(query_embedding, dtype=np.float32)
191
+ qn = np.linalg.norm(q) + 1e-12
192
+ q = q / qn
193
+ mat = np.asarray(item_embeddings, dtype=np.float32)
194
+ norms = np.linalg.norm(mat, axis=1, keepdims=True) + 1e-12
195
+ mat = mat / norms
196
+ sims = (mat @ q).astype(np.float32)
197
+ order = list(np.argsort(sims)[::-1])
198
+ return order, sims
199
+
200
+
201
+ def _rrf_score(rank_lists: List[List[int]], k: int = 60) -> Dict[int, float]:
202
+ out: Dict[int, float] = {}
203
+ for rank_list in rank_lists:
204
+ for rank_pos, idx in enumerate(rank_list):
205
+ out[idx] = out.get(idx, 0.0) + (1.0 / (k + rank_pos + 1.0))
206
+ return out
207
+
208
+
209
def _family_coverage_score(texts: List[str], families: Dict[str, List[str]]) -> Dict[str, float]:
    """Per family: fraction of its queries with at least one token present in *texts*."""
    merged = " ".join(clean_text(t).lower() for t in texts)
    coverage: Dict[str, float] = {}
    for family, queries in families.items():
        if not queries:
            coverage[family] = 0.0
            continue
        hits = sum(
            1 for q in queries if any(tok in merged for tok in _tokenize(q))
        )
        coverage[family] = round(hits / max(1, len(queries)), 4)
    return coverage
223
+
224
+
225
def hybrid_rank_text_items(
    items: List[Dict[str, Any]],
    query: str,
    families: Optional[Dict[str, List[str]]] = None,
    top_k: int = 12,
    item_embeddings: Optional[np.ndarray] = None,
    query_embedding: Optional[np.ndarray] = None,
) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
    """Rank text items for a query, optionally fusing lexical and embedding ranks.

    Each item is a dict with a "text" key. Lexical ranking always runs;
    when both embedding arguments are supplied, embedding ranks are fused
    with the lexical ranks via Reciprocal Rank Fusion. Returns (selected
    items annotated with _nlp_* scores, diagnostics dict).
    """
    if not items:
        return [], {
            "ranking_method": "empty",
            "selected_indices": [],
            "coverage_by_query_family": families or {},
            "coverage_score": 0.0,
            "component_scores": {},
        }

    texts = [clean_text(i.get("text", "")) for i in items]

    lex_order, lex_scores = _lexical_ranks(texts, query)
    rank_lists = [lex_order]
    method = "lexical_only"

    emb_scores = None
    if item_embeddings is not None and query_embedding is not None:
        try:
            emb_order, emb_scores = _embedding_ranks(item_embeddings, query_embedding)
            rank_lists.append(emb_order)
            method = "hybrid_rrf"
        except Exception:
            # Embedding failure degrades gracefully to lexical-only ranking.
            emb_scores = None

    rrf = _rrf_score(rank_lists)
    final_order = sorted(rrf.keys(), key=lambda idx: rrf[idx], reverse=True)
    selected_indices = final_order[: max(1, int(top_k))]  # always keep at least one item

    selected: List[Dict[str, Any]] = []
    for idx in selected_indices:
        row = dict(items[idx])  # shallow copy so callers' items stay unmodified
        row["_nlp_rrf_score"] = float(rrf.get(idx, 0.0))
        row["_nlp_lex_score"] = float(lex_scores[idx]) if len(lex_scores) > idx else 0.0
        if emb_scores is not None and len(emb_scores) > idx:
            row["_nlp_emb_score"] = float(emb_scores[idx])
        selected.append(row)

    # Coverage: how well the selected snippets span each query family.
    fam = families or {"base": [query]}
    cov = _family_coverage_score([x.get("text", "") for x in selected], fam)
    cov_score = round(float(np.mean(list(cov.values()))) if cov else 0.0, 4)

    diagnostics = {
        "ranking_method": method,
        "selected_indices": selected_indices,
        "coverage_by_query_family": cov,
        "coverage_score": cov_score,
        "component_scores": {
            "lexical": [float(lex_scores[i]) for i in selected_indices if len(lex_scores) > i],
            "embedding": [float(emb_scores[i]) for i in selected_indices if emb_scores is not None and len(emb_scores) > i],
        },
    }
    return selected, diagnostics
toxra_core/regulatory_mapper.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from pathlib import Path
3
+ from typing import Any, Dict, List, Optional, Tuple
4
+
5
+ import pandas as pd
6
+
7
+ from .contracts import ClauseEvaluation, RegulatoryClause
8
+
9
+
10
+ FRAMEWORK_TO_FILE = {
11
+ "FDA CTP": "fda_ctp_v2024_06.json",
12
+ "EPA": "epa_cancer_v2005.json",
13
+ }
14
+
15
+ EMPTY_STRINGS = {"", "not_reported", "insufficient_data", "none", "na", "n/a", "null"}
16
+
17
+
18
def _is_non_empty(v: Any) -> bool:
    """Return True when *v* carries a real value rather than a placeholder.

    Scalars are stringified, stripped, and compared case-insensitively
    against the EMPTY_STRINGS sentinels. A list is non-empty when at least
    one element survives stripping and is not a sentinel.
    """
    if v is None:
        return False
    if isinstance(v, list):
        candidates = [s for s in (str(item).strip() for item in v) if s]
        # any(...) over an empty list is False, matching "no usable elements".
        return any(s.lower() not in EMPTY_STRINGS for s in candidates)
    text = str(v).strip()
    return bool(text) and text.lower() not in EMPTY_STRINGS
30
+
31
+
32
+ def _normalize_payload(extraction_payload: Any) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
33
+ if isinstance(extraction_payload, dict):
34
+ papers = extraction_payload.get("papers", [])
35
+ if isinstance(papers, list):
36
+ ext = extraction_payload.get("toxra_extensions", {})
37
+ return papers, (ext if isinstance(ext, dict) else {})
38
+ if isinstance(extraction_payload, list):
39
+ return extraction_payload, {}
40
+ raise ValueError("Unsupported extraction payload format. Expected list or object with papers.")
41
+
42
+
43
def load_framework_catalog(framework: str, catalog_dir: str = "regulatory_catalog") -> List[RegulatoryClause]:
    """Read the JSON clause catalog for *framework* and build RegulatoryClause rows.

    Raises ValueError for an unknown framework and FileNotFoundError when
    the mapped catalog file is absent from *catalog_dir*.
    """
    fname = FRAMEWORK_TO_FILE.get(framework)
    if fname is None:
        raise ValueError(f"Unsupported framework: {framework}")
    path = Path(catalog_dir) / fname
    if not path.exists():
        raise FileNotFoundError(f"Catalog not found: {path}")

    raw = json.loads(path.read_text(encoding="utf-8"))
    clause_dicts = raw.get("clauses", []) if isinstance(raw, dict) else []

    def _build(c: Dict[str, Any]) -> RegulatoryClause:
        # Defensive coercion: the catalog is hand-maintained JSON, so every
        # field is normalized to its expected type before construction.
        return RegulatoryClause(
            clause_id=str(c.get("clause_id", "")).strip(),
            framework=str(c.get("framework", framework)).strip(),
            title=str(c.get("title", "")).strip(),
            description=str(c.get("description", "")).strip(),
            required_fields=list(c.get("required_fields", []) or []),
            required_evidence_terms=list(c.get("required_evidence_terms", []) or []),
            acceptance_rule=str(c.get("acceptance_rule", "all_required_fields")).strip(),
            applicability=dict(c.get("applicability", {}) or {}),
            source_reference=str(c.get("source_reference", "")).strip(),
        )

    return [_build(c) for c in clause_dicts]
70
+
71
+
72
def _clause_applicable(extracted: Dict[str, Any], clause: RegulatoryClause) -> bool:
    """Decide whether a clause's applicability condition matches *extracted*.

    The condition is a {"field": ..., "equals": ...} mapping; an absent
    condition (or one without a field name) means the clause always applies.
    Comparison is case-insensitive, and list-valued fields match when any
    element equals the expected value.
    """
    condition = clause.applicability or {}
    field = str(condition.get("field", "")).strip()
    if not condition or not field:
        return True
    expected = str(condition.get("equals", None)).strip().lower()
    actual = extracted.get(field)
    if isinstance(actual, list):
        return expected in (str(item).strip().lower() for item in actual)
    return str(actual).strip().lower() == expected
85
+
86
+
87
def _evaluate_clause(
    extracted: Dict[str, Any],
    evidence: List[Dict[str, Any]],
    clause: RegulatoryClause,
    override_notes: str = "",
) -> ClauseEvaluation:
    """Score a single clause against one paper's extracted fields and evidence.

    Status semantics:
      - "not_applicable": applicability condition failed;
      - "covered": field rule satisfied and (if terms required) evidence found;
      - "partial": some fields or evidence present but requirements unmet;
      - "missing": nothing relevant found.
    A remediation prompt is attached for partial/missing statuses.
    """
    if not _clause_applicable(extracted, clause):
        return ClauseEvaluation(
            clause_id=clause.clause_id,
            framework=clause.framework,
            status="not_applicable",
            reason="Applicability condition not met.",
        )

    # Partition the required fields into present vs. missing, preserving order.
    present: List[str] = []
    missing: List[str] = []
    for field_name in clause.required_fields:
        bucket = present if _is_non_empty(extracted.get(field_name)) else missing
        bucket.append(field_name)

    # Substring search over the concatenated, lowercased evidence quotes.
    corpus = " ".join(str(item.get("quote", "")) for item in evidence).lower()
    evidence_hits: List[str] = []
    for term in clause.required_evidence_terms:
        needle = str(term).strip().lower()
        if needle and needle in corpus:
            evidence_hits.append(term)

    if not clause.required_fields:
        field_ok = True
    elif clause.acceptance_rule == "any_required_fields":
        field_ok = bool(present)
    else:
        # Default rule: every required field must be populated.
        field_ok = not missing

    evidence_ok = bool(evidence_hits) if clause.required_evidence_terms else True

    if field_ok and evidence_ok:
        status = "covered"
    elif present or evidence_hits:
        status = "partial"
    else:
        status = "missing"

    missing_prompt = ""
    if status != "covered":
        need_fields = ", ".join(missing) if missing else "additional corroborating evidence"
        missing_prompt = (
            f"Provide evidence for clause {clause.clause_id} ({clause.title}). "
            f"Missing: {need_fields}."
        )
        notes = override_notes.strip()
        if notes:
            missing_prompt += f" Notes: {notes}"

    return ClauseEvaluation(
        clause_id=clause.clause_id,
        framework=clause.framework,
        status=status,
        fields_present=present,
        missing_fields=missing,
        evidence_hits=evidence_hits,
        prompt=missing_prompt,
        reason="",
    )
155
+
156
+
157
+ def _paper_record_id(paper: Dict[str, Any]) -> str:
158
+ file_name = str(paper.get("_file", "unknown.pdf"))
159
+ extracted = paper.get("extracted", {}) or {}
160
+ chems = extracted.get("chemicals", [])
161
+ chem = "-"
162
+ if isinstance(chems, list) and chems:
163
+ chem = str(chems[0]).strip() or "-"
164
+ return f"{file_name} | {chem} | Paper"
165
+
166
+
167
def map_extraction_to_framework(
    extraction_payload: Any,
    framework: str,
    catalog_dir: str = "regulatory_catalog",
    override_notes: str = "",
) -> Tuple[pd.DataFrame, Dict[str, Any], str]:
    """Evaluate every paper in the payload against every clause of *framework*.

    Parameters
    ----------
    extraction_payload:
        A list of paper records, or a dict with "papers" (and optionally
        "toxra_extensions") keys.
    framework:
        Key into FRAMEWORK_TO_FILE selecting the clause catalog.
    catalog_dir:
        Directory holding the catalog JSON files.
    override_notes:
        Free-text appended to every generated gap prompt.

    Returns
    -------
    A (dataframe, report, markdown) triple: one DataFrame row per
    (paper, clause) evaluation, a machine-readable summary dict, and a
    human-readable gap-assessment markdown document.
    """
    papers, existing_ext = _normalize_payload(extraction_payload)
    clauses = load_framework_catalog(framework, catalog_dir=catalog_dir)

    status_counts: Dict[str, int] = {"covered": 0, "partial": 0, "missing": 0, "not_applicable": 0}
    prompts: List[str] = []
    rows: List[Dict[str, Any]] = []

    for paper in papers:
        extracted = paper.get("extracted", {}) or {}
        evidence = paper.get("evidence", []) or []
        record_id = _paper_record_id(paper)
        file_name = str(paper.get("_file", ""))
        paper_title = str(paper.get("paper_title", ""))

        for clause in clauses:
            result = _evaluate_clause(extracted, evidence, clause, override_notes=override_notes)
            status_counts[result.status] = status_counts.get(result.status, 0) + 1
            if result.prompt:
                prompts.append(result.prompt)

            rows.append(
                {
                    "framework": framework,
                    "clause_id": clause.clause_id,
                    "clause_title": clause.title,
                    "file": file_name,
                    "paper_title": paper_title,
                    "record_id": record_id,
                    "status": result.status,
                    "fields_present": "; ".join(result.fields_present),
                    "missing_fields": "; ".join(result.missing_fields),
                    "evidence_hits": "; ".join(result.evidence_hits),
                    "prompt": result.prompt,
                    "source_reference": clause.source_reference,
                }
            )

    # Explicit column order keeps the frame stable even when rows is empty.
    column_order = [
        "framework",
        "clause_id",
        "clause_title",
        "file",
        "paper_title",
        "record_id",
        "status",
        "fields_present",
        "missing_fields",
        "evidence_hits",
        "prompt",
        "source_reference",
    ]
    df = pd.DataFrame(rows, columns=column_order)

    report = {
        "framework": framework,
        "summary": status_counts,
        "missing_prompts": prompts,
        "existing_toxra_extensions": existing_ext,
    }

    md_lines = [
        f"# {framework} Regulatory Gap Assessment",
        "",
        "## Status Summary",
        f"- Covered: {status_counts.get('covered', 0)}",
        f"- Partial: {status_counts.get('partial', 0)}",
        f"- Missing: {status_counts.get('missing', 0)}",
        f"- Not applicable: {status_counts.get('not_applicable', 0)}",
        "",
        "## Priority Data Gaps",
    ]
    if prompts:
        # Cap the document at 50 prompts to keep it readable.
        md_lines.extend(f"- {p}" for p in prompts[:50])
    else:
        md_lines.append("- No immediate gaps identified.")

    return df, report, "\n".join(md_lines)