# Source: meta-hack / scripts/data_quality_report.py
# Author: vvinayakkk
# Commit message: Sync full clinical-trial-triage project into Space
# Commit: 404c45f
"""
Generate a compact quality report for the clinical case bank.
Usage:
python scripts/data_quality_report.py
"""
from __future__ import annotations
import json
import sys
from pathlib import Path
from typing import Any, Dict, List
# Project root: the directory two levels above this file (the parent of the
# directory that contains this script).
ROOT = Path(__file__).resolve().parents[1]

# Put the project root at the front of the import search path (once), so the
# project-level `tasks.case_bank` import resolves when the script is run
# directly.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
from tasks.case_bank import AE_CASES, DEVIATION_CASES, NARRATIVE_CASES
# Fields that every case of a given task family must carry with a non-empty
# value. Looked up by short task tag ("ae", "deviation", "narrative").
REQUIRED_KEYS = dict(
    ae=["case_id", "narrative", "ground_truth", "patient_age", "study_drug", "lab_values"],
    deviation=["site_id", "site_name", "findings", "ground_truth"],
    narrative=["case_id", "patient_demographics", "adverse_event", "ground_truth"],
)
def _missing_keys(case: Dict[str, Any], required_keys: List[str]) -> List[str]:
return [key for key in required_keys if key not in case or case[key] in (None, "", [])]
def _analyze_cases(cases: List[Dict[str, Any]], required: List[str], label: str) -> Dict[str, Any]:
    """Summarise which cases of one task lack required fields.

    Args:
        cases: Case records to inspect.
        required: Keys each record must provide, checked via ``_missing_keys``.
        label: Task name stored under ``"task"`` in the result.

    Returns:
        A dict with the task label, the number of cases, the number of cases
        with at least one gap, and up to five example entries. Each example
        holds the case's ``case_id`` (or ``site_id`` when ``case_id`` is
        absent or falsy) under ``"id"`` and the offending keys under
        ``"missing"``.
    """
    flagged: List[Dict[str, Any]] = []
    for record in cases:
        absent = _missing_keys(record, required)
        if not absent:
            continue
        identifier = record.get("case_id") or record.get("site_id")
        flagged.append({"id": identifier, "missing": absent})

    return dict(
        task=label,
        count=len(cases),
        missing_case_count=len(flagged),
        missing_examples=flagged[:5],  # cap the sample so the report stays compact
    )
def main() -> None:
    """Build the case-bank quality report, save it as JSON and print it.

    The report has a ``"summary"`` section with per-task and total case
    counts and a ``"quality"`` list with one ``_analyze_cases`` result per
    task. It is written to ``ROOT / "outputs" / "data_quality_report.json"``
    (the directory is created if needed) and echoed to stdout, followed by
    the path it was saved to.
    """
    # (summary key, case list, REQUIRED_KEYS tag, task label) per task family.
    task_table = (
        ("ae_cases", AE_CASES, "ae", "adverse_event_triage"),
        ("deviation_cases", DEVIATION_CASES, "deviation", "protocol_deviation_audit"),
        ("narrative_cases", NARRATIVE_CASES, "narrative", "safety_narrative_generation"),
    )

    summary = {summary_key: len(bank) for summary_key, bank, _, _ in task_table}
    # The total is computed before the key is added, so it only sums the
    # three per-task counts.
    summary["total_cases"] = sum(summary.values())

    quality = [
        _analyze_cases(bank, REQUIRED_KEYS[tag], task_label)
        for _, bank, tag, task_label in task_table
    ]
    report = {"summary": summary, "quality": quality}

    # Serialize once and reuse the text for both the file and the console.
    rendered = json.dumps(report, indent=2)

    output_path = ROOT / "outputs" / "data_quality_report.json"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(rendered, encoding="utf-8")

    print(rendered)
    print(f"\nSaved report to: {output_path}")


# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()