Spaces:

geeteshcodes
/

clinical-trial-env

Sleeping

Geetesh

fix: add /metadata /schema /mcp endpoints for OpenEnv runtime validation

b0d5924 2 months ago

29.7 kB

	"""
	Task definitions for the Clinical Trial Protocol Review environment.
	Each task has:
	- A scenario (protocol + patient records + adverse events)
	- A deterministic grader returning a score clamped to [0.01, 0.99]
	- A difficulty: easy | medium | hard
	"""

	from __future__ import annotations
	import re
	from typing import Any, Dict, List, Tuple

	# ---------------------------------------------------------------------------
	# Shared helpers
	# ---------------------------------------------------------------------------

	def _normalize(text: str) -> str:
	return text.lower().strip()


	def _mentions_any(text: str, keywords: List[str]) -> bool:
	    """Report whether any keyword occurs as a substring of the normalized text.

	    The text is normalized with ``_normalize``; keywords are used exactly as
	    given, so callers are expected to pass them already lower-cased.
	    """
	    haystack = _normalize(text)
	    for keyword in keywords:
	        if keyword in haystack:
	            return True
	    return False


	def _finding_types_in(findings: List[Dict], ftype: str) -> List[Dict]:
	    """Return the findings whose ``finding_type`` equals ``ftype``.

	    Both sides are compared after ``_normalize``; a finding without a
	    ``finding_type`` key is compared as the empty string. Input order is
	    preserved.
	    """
	    selected = []
	    for finding in findings:
	        if _normalize(finding.get("finding_type", "")) == _normalize(ftype):
	            selected.append(finding)
	    return selected


	# ===========================================================================
	# TASK 1 — Easy: Eligibility Violation Screening
	# ===========================================================================

	# Protocol summary text for Task 1 (trial ONCO-2024-301). Lists the inclusion
	# criteria IC-1..IC-5 and exclusion criteria EC-1..EC-5 that the patient
	# records below are checked against. The string is stored verbatim in
	# TASKS["eligibility_screening"]["protocol_summary"].
	TASK1_PROTOCOL_SUMMARY = """
	TRIAL: ONCO-2024-301 Phase II — Investigational drug XR-7 for metastatic colorectal cancer.
	SPONSOR: NovaPharma Inc.
	PHASE: II (Open-label, single-arm)

	INCLUSION CRITERIA (ALL must be met):
	IC-1: Age ≥ 18 and ≤ 75 years
	IC-2: Histologically confirmed metastatic colorectal adenocarcinoma
	IC-3: ECOG Performance Status 0 or 1
	IC-4: Adequate renal function: eGFR ≥ 60 mL/min/1.73m²
	IC-5: No prior treatment with XR-7 or any KRAS-G12C inhibitor

	EXCLUSION CRITERIA (ANY disqualifies):
	EC-1: Active uncontrolled infection
	EC-2: Prior solid organ transplant
	EC-3: Pregnancy or breastfeeding
	EC-4: QTcF > 480 ms on screening ECG
	EC-5: Concurrent use of strong CYP3A4 inhibitors
	"""

	# Five patient records for Task 1. Each record has one field per criterion
	# of TASK1_PROTOCOL_SUMMARY plus free-text "notes". The inline
	# "# VIOLATION" comments mark the seeded violations: PT-002 (IC-1),
	# PT-003 (IC-3, IC-4), PT-004 (IC-5), PT-005 (EC-4, EC-5). PT-001 meets every
	# criterion and acts as the false-positive control in grade_task1.
	TASK1_PATIENT_RECORDS = [
	{
	"subject_id": "PT-001",
	"age": 68,
	"diagnosis": "Metastatic colorectal adenocarcinoma",
	"ecog_ps": 1,
	"egfr": 72,
	"prior_kras_inhibitor": False,
	"active_infection": False,
	"organ_transplant": False,
	"pregnant": False,
	"qtcf_ms": 455,
	"cyp3a4_inhibitor": False,
	"notes": "Standard patient, enrolled without issues."
	},
	{
	"subject_id": "PT-002",
	"age": 78, # VIOLATION: age > 75 (IC-1)
	"diagnosis": "Metastatic colorectal adenocarcinoma",
	"ecog_ps": 1,
	"egfr": 65,
	"prior_kras_inhibitor": False,
	"active_infection": False,
	"organ_transplant": False,
	"pregnant": False,
	"qtcf_ms": 460,
	"cyp3a4_inhibitor": False,
	"notes": "Enrolled 3 days after protocol amendment discussion (not yet approved)."
	},
	{
	"subject_id": "PT-003",
	"age": 55,
	"diagnosis": "Metastatic colorectal adenocarcinoma",
	"ecog_ps": 2, # VIOLATION: ECOG PS = 2, must be 0 or 1 (IC-3)
	"egfr": 58, # VIOLATION: eGFR < 60 (IC-4)
	"prior_kras_inhibitor": False,
	"active_infection": False,
	"organ_transplant": False,
	"pregnant": False,
	"qtcf_ms": 470,
	"cyp3a4_inhibitor": False,
	"notes": "Patient self-reported feeling fatigued. Site enrolled after verbal PI approval."
	},
	{
	"subject_id": "PT-004",
	"age": 61,
	"diagnosis": "Metastatic colorectal adenocarcinoma",
	"ecog_ps": 0,
	"egfr": 80,
	"prior_kras_inhibitor": True, # VIOLATION: prior KRAS-G12C inhibitor use (IC-5)
	"active_infection": False,
	"organ_transplant": False,
	"pregnant": False,
	"qtcf_ms": 440,
	"cyp3a4_inhibitor": False,
	"notes": "Patient had prior sotorasib therapy 14 months ago."
	},
	{
	"subject_id": "PT-005",
	"age": 45,
	"diagnosis": "Metastatic colorectal adenocarcinoma",
	"ecog_ps": 1,
	"egfr": 90,
	"prior_kras_inhibitor": False,
	"active_infection": False,
	"organ_transplant": False,
	"pregnant": False,
	"qtcf_ms": 495, # VIOLATION: QTcF > 480 ms (EC-4)
	"cyp3a4_inhibitor": True, # VIOLATION: strong CYP3A4 inhibitor (EC-5)
	"notes": "Patient on ketoconazole for fungal infection. ECG borderline."
	},
	]

	# Expected answers for Task 1: subject_id -> breached criteria and severity.
	# Every seeded violation is rated "critical"; PT-001 has no entry because
	# its record meets every criterion.
	TASK1_GROUND_TRUTH = {
	    subject_id: {"criteria": list(criteria), "severity": "critical"}
	    for subject_id, criteria in (
	        ("PT-002", ("IC-1",)),
	        ("PT-003", ("IC-3", "IC-4")),
	        ("PT-004", ("IC-5",)),
	        ("PT-005", ("EC-4", "EC-5")),
	    )
	}


	def grade_task1(findings: List[Dict], rationale: str) -> Tuple[float, str]:
	    """
	    Score Task 1: Eligibility Violation Screening.

	    Only findings whose ``finding_type`` is ``eligibility_violation`` or
	    ``protocol_deviation`` are considered. A subject counts as flagged when
	    at least one such finding carries its ``subject_id`` (compared
	    case-insensitively, ignoring surrounding whitespace). The kind of
	    violation is detected by substring keywords in the finding's
	    description and recommendation.

	    Scoring (max 100 pts):
	    - PT-002 (age) and PT-004 (prior KRAS inhibitor): 20 pts when flagged
	      with a matching keyword, 10 pts when flagged without one
	    - PT-003 (ECOG + eGFR) and PT-005 (QTcF + CYP3A4): 20 pts when both
	      violations are mentioned, 10 pts for one, 8 pts when flagged with
	      neither
	    - PT-001 (no violation): +20 pts when left alone, -10 pts when flagged

	    Args:
	        findings: Finding dicts. The keys read are ``finding_type``,
	            ``subject_id``, ``description`` and ``recommendation``; missing
	            or ``None`` values are treated as empty strings. ``None`` is
	            treated as an empty list.
	        rationale: Accepted so all graders share one signature; not used
	            by this grader.

	    Returns:
	        ``(score, feedback)`` where ``score`` is ``pts / 100`` clamped to
	        [0.01, 0.99] and ``feedback`` is a multi-line scoring breakdown.
	    """
	    max_pts = 100
	    pts = 0
	    feedback_parts = []

	    def field(finding, key):
	        # dict.get's default only covers a missing key; an explicit None
	        # value would otherwise break the string handling below.
	        value = finding.get(key)
	        return "" if value is None else str(value)

	    def subject_of(finding):
	        return field(finding, "subject_id").strip().upper()

	    # Collect all findings that report a violation
	    violation_findings = [
	        f for f in (findings or [])
	        if _normalize(field(f, "finding_type")) in (
	            "eligibility_violation", "protocol_deviation"
	        )
	    ]
	    flagged_subjects = {subject_of(f) for f in violation_findings}

	    def desc_of(sid):
	        # Uses the same subject normalization as flagged_subjects, so a
	        # whitespace-padded subject_id still contributes its text.
	        return " ".join(
	            _normalize(field(f, "description") + " " + field(f, "recommendation"))
	            for f in violation_findings
	            if subject_of(f) == sid
	        )

	    # NOTE(review): _mentions_any does plain substring matching, so short
	    # keywords such as "ps" or "age" also match inside longer words.

	    # PT-002: age violation
	    if "PT-002" in flagged_subjects:
	        d = desc_of("PT-002")
	        if _mentions_any(d, ["age", "78", "ic-1", "inclusion"]):
	            pts += 20
	            feedback_parts.append("✓ PT-002 age violation correctly identified (+20)")
	        else:
	            pts += 10
	            feedback_parts.append("~ PT-002 flagged but violation type unclear (+10)")
	    else:
	        feedback_parts.append("✗ PT-002 age violation missed (0)")

	    # PT-003: ECOG PS + eGFR
	    if "PT-003" in flagged_subjects:
	        d = desc_of("PT-003")
	        found_ecog = _mentions_any(d, ["ecog", "performance", "ps", "ic-3"])
	        found_egfr = _mentions_any(d, ["egfr", "renal", "kidney", "58", "ic-4"])
	        if found_ecog and found_egfr:
	            pts += 20
	            feedback_parts.append("✓ PT-003 both ECOG and eGFR violations found (+20)")
	        elif found_ecog or found_egfr:
	            pts += 10
	            feedback_parts.append("~ PT-003 only one of two violations found (+10)")
	        else:
	            pts += 8
	            feedback_parts.append("~ PT-003 flagged but violations not specified (+8)")
	    else:
	        feedback_parts.append("✗ PT-003 violations missed (0)")

	    # PT-004: prior KRAS inhibitor
	    if "PT-004" in flagged_subjects:
	        d = desc_of("PT-004")
	        if _mentions_any(d, ["kras", "sotorasib", "prior", "ic-5", "inhibitor"]):
	            pts += 20
	            feedback_parts.append("✓ PT-004 prior KRAS inhibitor correctly identified (+20)")
	        else:
	            pts += 10
	            feedback_parts.append("~ PT-004 flagged but violation type unclear (+10)")
	    else:
	        feedback_parts.append("✗ PT-004 prior KRAS inhibitor violation missed (0)")

	    # PT-005: QTcF + CYP3A4
	    if "PT-005" in flagged_subjects:
	        d = desc_of("PT-005")
	        found_qtcf = _mentions_any(d, ["qtcf", "qt", "ecg", "480", "495", "ec-4"])
	        found_cyp = _mentions_any(d, ["cyp", "ketoconazole", "ec-5", "inhibitor"])
	        if found_qtcf and found_cyp:
	            pts += 20
	            feedback_parts.append("✓ PT-005 both QTcF and CYP3A4 violations found (+20)")
	        elif found_qtcf or found_cyp:
	            pts += 10
	            feedback_parts.append("~ PT-005 only one of two violations found (+10)")
	        else:
	            pts += 8
	            feedback_parts.append("~ PT-005 flagged but violations not specified (+8)")
	    else:
	        feedback_parts.append("✗ PT-005 violations missed (0)")

	    # PT-001 is the clean record: penalize a flag, reward leaving it alone
	    if "PT-001" in flagged_subjects:
	        pts -= 10
	        feedback_parts.append("✗ PT-001 incorrectly flagged as violation (-10)")
	    else:
	        pts += 20
	        feedback_parts.append("✓ PT-001 correctly not flagged as violation (+20)")

	    # The reported score never reaches exactly 0 or 1
	    score = max(0.01, min(0.99, pts / max_pts))
	    feedback = f"Task 1 Score: {pts}/{max_pts} ({score:.2f})\n" + "\n".join(feedback_parts)
	    return score, feedback


	# ===========================================================================
	# TASK 2 — Medium: Adverse Event Severity Misclassification
	# ===========================================================================

	# Protocol summary text for Task 2 (trial CARD-2024-112). Defines the
	# Grade 1-5 adverse-event scale, the SAE reporting triggers and the dyspnea
	# AESI rule that the adverse events below are judged against. The string is
	# stored verbatim in TASKS["ae_classification"]["protocol_summary"].
	TASK2_PROTOCOL_SUMMARY = """
	TRIAL: CARD-2024-112 Phase III — Drug BX-9 for heart failure (NYHA Class II–III).
	SPONSOR: CardioVita Therapeutics
	PHASE: III (Randomized, double-blind, placebo-controlled)

	ADVERSE EVENT CLASSIFICATION (per ICH E2A):
	Grade 1 (Mild) — Asymptomatic/mild symptoms; no intervention needed
	Grade 2 (Moderate)— Minimal intervention indicated; limits instrumental ADL
	Grade 3 (Severe) — Medically significant but not immediately life-threatening;
	hospitalization or prolongation of hospitalization
	Grade 4 (Life-threatening) — Urgent intervention indicated
	Grade 5 (Fatal) — Death related to AE

	SERIOUS ADVERSE EVENTS (SAEs) must be reported to sponsor within 24 hours if:
	- Any Grade 3–5 event
	- Any hospitalization (even Grade 2)
	- Any event deemed life-threatening by the investigator
	- Congenital anomaly/birth defect
	- Any other medically important event

	PROTOCOL NOTE: Dyspnea requiring hospitalization is a pre-specified AE of Special Interest (AESI).
	"""

	# Seven adverse-event records for Task 2. "site_reported_grade" is what the
	# site filed; "correct_grade", "is_sae", "misclassified" and (on the four
	# misclassified events) "issue" hold the expected answer. The misclassified
	# events are AE-002, AE-004, AE-005 and AE-007.
	# NOTE(review): this list is registered unchanged as
	# TASKS["ae_classification"]["adverse_events"], so the answer keys (and
	# notes such as "Site over-reported as Grade 3.") travel with the records —
	# confirm the environment strips them before showing records to the agent.
	TASK2_ADVERSE_EVENTS = [
	{
	"ae_id": "AE-001",
	"subject_id": "PT-201",
	"event": "Headache",
	"site_reported_grade": 1,
	"hospitalized": False,
	"duration_days": 2,
	"intervention": "Acetaminophen PRN",
	"notes": "Resolved spontaneously.",
	"correct_grade": 1,
	"is_sae": False,
	"misclassified": False,
	},
	{
	"ae_id": "AE-002",
	"subject_id": "PT-202",
	"event": "Severe dyspnea with hospitalization for 3 days",
	"site_reported_grade": 2, # MISCLASSIFIED — should be Grade 3 SAE + AESI
	"hospitalized": True,
	"duration_days": 3,
	"intervention": "IV diuretics, oxygen supplementation",
	"notes": "Patient hospitalized. Site reported as Grade 2 and did not file SAE report.",
	"correct_grade": 3,
	"is_sae": True,
	"misclassified": True,
	"issue": "Grade downgraded; SAE not filed; AESI not flagged",
	},
	{
	"ae_id": "AE-003",
	"subject_id": "PT-203",
	"event": "Mild peripheral edema",
	"site_reported_grade": 2,
	"hospitalized": False,
	"duration_days": 14,
	"intervention": "Dose reduction",
	"notes": "Managed with dose reduction. Ongoing.",
	"correct_grade": 2,
	"is_sae": False,
	"misclassified": False,
	},
	{
	"ae_id": "AE-004",
	"subject_id": "PT-204",
	"event": "Ventricular tachycardia — 30-second episode, asymptomatic, resolved spontaneously",
	"site_reported_grade": 1, # MISCLASSIFIED — VT is Grade 3 minimum; potentially Grade 4
	"hospitalized": False,
	"duration_days": 0,
	"intervention": "None; monitored",
	"notes": "Recorded on Holter monitor. Patient unaware. Site classified as Grade 1.",
	"correct_grade": 3,
	"is_sae": True,
	"misclassified": True,
	"issue": "VT always ≥ Grade 3; SAE report missing; cardiac safety signal",
	},
	{
	"ae_id": "AE-005",
	"subject_id": "PT-205",
	"event": "Acute kidney injury — creatinine 3.2x ULN",
	"site_reported_grade": 2, # MISCLASSIFIED — creatinine >3x ULN = Grade 3
	"hospitalized": True,
	"duration_days": 5,
	"intervention": "IV fluids, nephrology consult, drug held",
	"notes": "Creatinine peaked at 3.2x ULN. Patient hospitalized. Site reported Grade 2.",
	"correct_grade": 3,
	"is_sae": True,
	"misclassified": True,
	"issue": "Creatinine >3x ULN = Grade 3 per CTCAE; hospitalization = SAE",
	},
	{
	"ae_id": "AE-006",
	"subject_id": "PT-206",
	"event": "Nausea with vomiting, unable to maintain oral intake for 24h",
	"site_reported_grade": 3,
	"hospitalized": True,
	"duration_days": 2,
	"intervention": "IV antiemetics, IV hydration",
	"notes": "SAE filed. Resolved with treatment.",
	"correct_grade": 3,
	"is_sae": True,
	"misclassified": False,
	},
	{
	"ae_id": "AE-007",
	"subject_id": "PT-207",
	"event": "Fatigue — patient reports feeling tired but continues all daily activities",
	"site_reported_grade": 3, # MISCLASSIFIED — should be Grade 1 (continues ADL)
	"hospitalized": False,
	"duration_days": 7,
	"intervention": "None",
	"notes": "Site over-reported as Grade 3. No hospitalization, no intervention.",
	"correct_grade": 1,
	"is_sae": False,
	"misclassified": True,
	"issue": "Fatigue with no ADL limitation = Grade 1; Grade 3 is over-reporting",
	},
	]

	# IDs of the adverse events whose site-reported grade is wrong. These are
	# the records in TASK2_ADVERSE_EVENTS with "misclassified": True.
	TASK2_MISCLASSIFIED_AE_IDS = {
	    "AE-002",
	    "AE-004",
	    "AE-005",
	    "AE-007",
	}


	def grade_task2(findings: List[Dict], rationale: str) -> Tuple[float, str]:
	    """
	    Score Task 2: Adverse Event Severity Misclassification.

	    Only findings whose ``finding_type`` is ``adverse_event``,
	    ``protocol_deviation`` or ``safety_concern`` are considered. A finding
	    is attributed to an AE when its description/recommendation mentions
	    the AE id (case-insensitive) or its ``subject_id`` is that AE's
	    subject. An AE counts as flagged when at least one finding is
	    attributed to it.

	    Scoring (max 100 pts):
	    - AE-002: 10 pts with a grade/SAE keyword (5 without), +5 pts for an
	      AESI/SAE/hospitalization keyword
	    - AE-004: 12 pts with a grade/SAE keyword (6 without), +5 pts for a
	      cardiac/SAE keyword
	    - AE-005: 15 pts with a grade/CTCAE keyword (8 without)
	    - AE-007: 15 pts with a downgrade/over-reporting keyword (8 without)
	    - Rationale mentions a systemic reporting issue: +5 pts
	    - AE-001, AE-003, AE-006 are correctly classified: -8 pts for each one
	      flagged, +33 pts when none of them is flagged

	    Args:
	        findings: Finding dicts. The keys read are ``finding_type``,
	            ``subject_id``, ``description`` and ``recommendation``; missing
	            or ``None`` values are treated as empty strings. ``None`` is
	            treated as an empty list.
	        rationale: Free-text rationale, searched for the systemic-issue
	            keywords. ``None`` is treated as empty.

	    Returns:
	        ``(score, feedback)`` where ``score`` is ``pts / 100`` clamped to
	        [0.01, 0.99] and ``feedback`` is a multi-line scoring breakdown.
	    """
	    max_pts = 100
	    pts = 0
	    feedback_parts = []

	    def field(finding, key):
	        # dict.get's default only covers a missing key; an explicit None
	        # value would otherwise break the string handling below.
	        value = finding.get(key)
	        return "" if value is None else str(value)

	    # Subject that each adverse event belongs to
	    ae_to_subj = {
	        "AE-002": "PT-202", "AE-004": "PT-204",
	        "AE-005": "PT-205", "AE-007": "PT-207",
	        "AE-001": "PT-201", "AE-003": "PT-203", "AE-006": "PT-206",
	    }

	    # (subject_id, searchable text) for every AE-type finding, computed once
	    ae_findings = [
	        (
	            field(f, "subject_id").strip().upper(),
	            _normalize(field(f, "description") + " " + field(f, "recommendation")),
	        )
	        for f in (findings or [])
	        if _normalize(field(f, "finding_type")) in (
	            "adverse_event", "protocol_deviation", "safety_concern"
	        )
	    ]

	    def matching_texts(ae_id):
	        # Match by AE id mentioned in the text or by the AE's subject_id
	        needle = ae_id.lower()
	        sid = ae_to_subj[ae_id]
	        return [text for subj, text in ae_findings if needle in text or subj == sid]

	    def get_ae_descs(ae_id):
	        return " ".join(matching_texts(ae_id))

	    def ae_flagged(ae_id):
	        # Based on whether a finding matched, not on how much text it has,
	        # so a subject-matched finding with an empty description counts.
	        return bool(matching_texts(ae_id))

	    # NOTE(review): _mentions_any does plain substring matching, so the bare
	    # digit keywords "3" and "1" below match any text containing that digit.

	    # AE-002: dyspnea — Grade 2→3, SAE, AESI
	    if ae_flagged("AE-002"):
	        d = get_ae_descs("AE-002")
	        grade_correct = _mentions_any(d, ["grade 3", "grade-3", "grade3", "3", "sae", "serious"])
	        aesi_flag = _mentions_any(d, ["aesi", "special interest", "dyspnea", "sae", "hospitali"])
	        pts += 10 if grade_correct else 5
	        pts += 5 if aesi_flag else 0
	        feedback_parts.append(f"✓ AE-002 flagged {'with grade+SAE correction' if grade_correct and aesi_flag else 'partially'} (+{10 if grade_correct else 5}{'+5 AESI' if aesi_flag else ''})")
	    else:
	        feedback_parts.append("✗ AE-002 dyspnea misclassification missed (0)")

	    # AE-004: VT — Grade 1→3, SAE
	    if ae_flagged("AE-004"):
	        d = get_ae_descs("AE-004")
	        grade_correct = _mentions_any(d, ["grade 3", "grade-3", "3", "sae", "serious", "life"])
	        cardiac_signal = _mentions_any(d, ["cardiac", "ventricular", "vt", "safety", "sae"])
	        pts += 12 if grade_correct else 6
	        pts += 5 if cardiac_signal else 0
	        feedback_parts.append(f"✓ AE-004 VT flagged {'with cardiac SAE recognition' if cardiac_signal else 'partially'} (+{12 if grade_correct else 6}{'+5' if cardiac_signal else ''})")
	    else:
	        feedback_parts.append("✗ AE-004 VT misclassification missed (0)")

	    # AE-005: AKI — Grade 2→3
	    if ae_flagged("AE-005"):
	        d = get_ae_descs("AE-005")
	        grade_correct = _mentions_any(d, ["grade 3", "3", "ctcae", "3x", "uln"])
	        pts += 15 if grade_correct else 8
	        feedback_parts.append(f"✓ AE-005 AKI flagged {'with CTCAE reference' if grade_correct else 'partially'} (+{15 if grade_correct else 8})")
	    else:
	        feedback_parts.append("✗ AE-005 AKI misclassification missed (0)")

	    # AE-007: fatigue — Grade 3→1 (over-reporting)
	    if ae_flagged("AE-007"):
	        d = get_ae_descs("AE-007")
	        grade_correct = _mentions_any(d, ["grade 1", "1", "over", "over-report", "adl", "downgrade"])
	        pts += 15 if grade_correct else 8
	        feedback_parts.append(f"✓ AE-007 fatigue over-reporting flagged {'correctly' if grade_correct else 'partially'} (+{15 if grade_correct else 8})")
	    else:
	        feedback_parts.append("✗ AE-007 fatigue over-reporting missed (0)")

	    # Bonus for a rationale covering the systemic issue
	    if _mentions_any(rationale or "", ["sae", "reporting", "systematic", "site training", "pattern"]):
	        pts += 5
	        feedback_parts.append("✓ Rationale identifies systemic reporting issues (+5)")

	    # Correctly classified AEs must not be flagged
	    correct_ae_ids = ["AE-001", "AE-003", "AE-006"]
	    fp_penalty = 0
	    for ae_id in correct_ae_ids:
	        if ae_flagged(ae_id):
	            pts -= 8
	            fp_penalty += 8
	            feedback_parts.append(f"✗ {ae_id} incorrectly flagged as misclassified (-8)")

	    if fp_penalty == 0:
	        pts += 33 # baseline for correctly leaving good AEs alone
	        feedback_parts.append("✓ No false positives on correctly classified AEs (+33)")

	    # The reported score never reaches exactly 0 or 1
	    score = max(0.01, min(0.99, pts / max_pts))
	    feedback = f"Task 2 Score: {pts}/{max_pts} ({score:.2f})\n" + "\n".join(feedback_parts)
	    return score, feedback


	# ===========================================================================
	# TASK 3 — Hard: Comprehensive Protocol Amendment Review
	# ===========================================================================

	# Full protocol text for Task 3 (NEURO-2024-450, version 1.2), covering
	# Sections 3-7 and the six proposed amendments A-F that the agent must
	# review. The string is stored verbatim in
	# TASKS["protocol_amendment_review"]["protocol_text"].
	TASK3_PROTOCOL_TEXT = """
	PROTOCOL: NEURO-2024-450 Phase II — Drug NX-12 for treatment-resistant depression (TRD)
	VERSION: 1.2 (Under Review for Amendment)
	SPONSOR: MindBridge Pharmaceuticals

	═══════════════════════════════════════════════════
	SECTION 3: STUDY DESIGN
	═══════════════════════════════════════════════════
	Open-label, single-arm study. No control arm. Duration: 12 weeks active treatment.
	Primary endpoint: Change from baseline in MADRS score at Week 8.
	Secondary endpoint: Response rate (≥50% MADRS reduction) at Week 12.

	PROPOSED CHANGE (Amendment A):
	Extend treatment from 12 → 24 weeks with no change to primary endpoint timing.

	PROPOSED CHANGE (Amendment B):
	Add an optional open-label extension (OLE) of up to 52 weeks for responders.
	Consent for OLE to be obtained at Week 12 visit (same day as eligibility assessment).

	═══════════════════════════════════════════════════
	SECTION 4: PATIENT POPULATION
	═══════════════════════════════════════════════════
	INCLUSION:
	- Age 22–65
	- DSM-5 diagnosis of MDD, current episode resistant to ≥2 adequate antidepressant trials
	- MADRS score ≥28 at screening AND baseline (within 3 days of screening)
	- Capable of providing informed consent

	EXCLUSION:
	- Active suicidal ideation with plan or intent (C-SSRS score ≥4)
	- Current or recent (within 6 months) substance use disorder
	- Pregnancy or intent to become pregnant
	- Current use of MAOIs

	PROPOSED CHANGE (Amendment C):
	Relax exclusion EC-CSSRS: Allow enrollment of patients with C-SSRS score 4
	(active suicidal ideation WITH plan, WITHOUT intent) if monitored weekly.

	═══════════════════════════════════════════════════
	SECTION 5: SAFETY MONITORING
	═══════════════════════════════════════════════════
	Data Safety Monitoring Board (DSMB): Annual review only.
	Suicidality monitoring: C-SSRS at baseline and Week 8.
	Blood pressure monitoring: At screening, Week 4, Week 12.

	NX-12 KNOWN RISKS (from Phase I):
	- Dose-dependent blood pressure elevation (mean +18 mmHg systolic at max dose)
	- Transient dissociative symptoms in 23% of patients at Weeks 1-2
	- Two cases of hypertensive urgency in Phase I (n=42)

	PROPOSED CHANGE (Amendment D):
	Increase DSMB review from annual to quarterly.
	Add BP monitoring at Weeks 2 and 8.

	═══════════════════════════════════════════════════
	SECTION 6: STATISTICAL ANALYSIS PLAN
	═══════════════════════════════════════════════════
	Sample size: N=45 (powered for 80% power to detect 7-point MADRS change)
	Analysis population: Per-protocol (PP) only.
	Missing data: Last observation carried forward (LOCF).
	Interim analysis: None planned.

	PROPOSED CHANGE (Amendment E):
	Add a single interim analysis at 50% enrollment for futility only.

	═══════════════════════════════════════════════════
	SECTION 7: INFORMED CONSENT
	═══════════════════════════════════════════════════
	Current consent process: Written informed consent obtained at screening visit.
	Consent document revision required for all amendments.

	PROPOSED CHANGE (Amendment F):
	Allow verbal consent for the OLE (Amendment B) to reduce patient burden,
	with written consent waived if patient signs OLE enrollment form.
	"""

	# Ground truth: what a qualified medical monitor / regulatory reviewer would flag
	# Each entry maps an issue key to:
	#   "description" - human-readable statement of the issue
	#   "severity"    - "critical" | "major" | "minor" (shown in grader feedback)
	#   "keywords"    - lower-case substrings grade_task3 searches for in the
	#                   combined rationale + findings text
	#   "weight"      - points available for the issue; the eight weights sum to 100
	# NOTE(review): keywords are matched as plain substrings, so short ones
	# ("ole", "ich", "bp", "ema") can also match inside unrelated words.
	TASK3_GROUND_TRUTH_ISSUES = {
	"amendment_c_safety": {
	"description": "Amendment C allows suicidal patients (C-SSRS≥4) with plan — unacceptable in TRD trial without intensive monitoring infrastructure beyond 'weekly check'",
	"severity": "critical",
	"keywords": ["suicid", "c-ssrs", "amendment c", "safety", "vulnerable", "risk"],
	"weight": 20,
	},
	"amendment_b_consent_timing": {
	"description": "OLE consent at same visit as eligibility assessment creates undue influence / coercion risk",
	"severity": "major",
	"keywords": ["ole", "consent", "same day", "coercio", "undue", "extension", "amendment b"],
	"weight": 15,
	},
	"amendment_f_verbal_consent": {
	"description": "Verbal consent for OLE is insufficient per ICH E6(R2) GCP — written consent required for all interventional trial participation",
	"severity": "critical",
	"keywords": ["verbal", "consent", "gcp", "ich", "written", "waiv", "amendment f"],
	"weight": 15,
	},
	"suicidality_monitoring_gap": {
	"description": "C-SSRS only at baseline and Week 8 is insufficient for TRD + suicidal risk population; should be monthly minimum",
	"severity": "major",
	"keywords": ["c-ssrs", "suicid", "monitoring", "frequen", "weekly", "monthly", "trd"],
	"weight": 15,
	},
	"bp_monitoring_still_insufficient": {
	"description": "Even with Amendment D, BP monitoring is missing at Weeks 1, 6, 16, 20 — given +18 mmHg known risk, monthly monitoring minimum is needed during OLE",
	"severity": "major",
	"keywords": ["blood pressure", "bp", "hypertens", "monitoring", "amendment d", "ole"],
	"weight": 10,
	},
	"no_control_arm_bias": {
	"description": "Open-label single-arm design in TRD (high placebo response ~30-40%) without randomized comparator limits interpretability; Amendment A extension compounds this",
	"severity": "major",
	"keywords": ["open-label", "control", "placebo", "bias", "single-arm", "interpretab"],
	"weight": 10,
	},
	"locf_missing_data": {
	"description": "LOCF is discouraged by FDA/EMA for psychiatric trials; multiple imputation or mixed-model repeated measures preferred",
	"severity": "minor",
	"keywords": ["locf", "missing data", "imputation", "fda", "ema", "mmrm"],
	"weight": 8,
	},
	"interim_analysis_alpha_spend": {
	"description": "Amendment E adds futility interim but no alpha-spending rule or stopping boundaries specified — protocol gap",
	"severity": "minor",
	"keywords": ["interim", "alpha", "spend", "futility", "boundar", "amendment e"],
	"weight": 7,
	},
	}


	def grade_task3(findings: List[Dict], rationale: str) -> Tuple[float, str]:
	    """
	    Score Task 3: Comprehensive Protocol Amendment Review.

	    The rationale and every finding's description and recommendation are
	    joined into one normalized text. For each entry of
	    ``TASK3_GROUND_TRUTH_ISSUES`` the share of its keywords found in that
	    text (substring match) decides the credit:

	    - share >= 0.5:  ``int(weight * min(1.0, share + 0.2))`` pts
	    - share >= 0.25: ``int(weight * 0.4)`` pts
	    - otherwise:     0 pts

	    The issue weights sum to 100. A bonus of 5 pts is added when at least
	    five findings have a recommendation longer than 30 characters, or
	    3 pts for at least three.

	    Args:
	        findings: Finding dicts. The keys read are ``description`` and
	            ``recommendation``; missing or ``None`` values are treated as
	            empty strings. ``None`` is treated as an empty list.
	        rationale: Free-text rationale; ``None`` is treated as empty.

	    Returns:
	        ``(score, feedback)`` where ``score`` is ``pts / 100`` clamped to
	        [0.01, 0.99] and ``feedback`` is a multi-line scoring breakdown.
	    """
	    max_pts = 100
	    pts = 0
	    feedback_parts = []

	    def field(finding, key):
	        # dict.get's default only covers a missing key; an explicit None
	        # value would otherwise break the string handling below.
	        value = finding.get(key)
	        return "" if value is None else str(value)

	    # Materialize once: the findings are walked twice (text, then bonus)
	    findings = list(findings or [])

	    # Single join instead of repeated string concatenation
	    text_parts = ["" if rationale is None else str(rationale)]
	    for f in findings:
	        text_parts.append(field(f, "description"))
	        text_parts.append(field(f, "recommendation"))
	    all_text = _normalize(" ".join(text_parts))

	    for issue_key, issue in TASK3_GROUND_TRUTH_ISSUES.items():
	        keywords = issue["keywords"]
	        weight = issue["weight"]
	        hits = sum(1 for k in keywords if k in all_text)
	        # An issue without keywords can never be matched
	        hit_rate = hits / len(keywords) if keywords else 0.0

	        if hit_rate >= 0.5:
	            # The +0.2 lets an issue earn full weight without every keyword
	            earned = int(weight * min(1.0, hit_rate + 0.2))
	            pts += earned
	            feedback_parts.append(f"✓ [{issue['severity'].upper()}] {issue_key}: identified ({hits}/{len(keywords)} keywords, +{earned})")
	        elif hit_rate >= 0.25:
	            earned = int(weight * 0.4)
	            pts += earned
	            feedback_parts.append(f"~ [{issue['severity'].upper()}] {issue_key}: partially identified (+{earned})")
	        else:
	            feedback_parts.append(f"✗ [{issue['severity'].upper()}] {issue_key}: missed (0/{weight})")

	    # Bonus for structured, actionable recommendations
	    n_recommendations = sum(
	        1 for f in findings
	        if len(field(f, "recommendation")) > 30
	    )
	    if n_recommendations >= 5:
	        pts += 5
	        feedback_parts.append(f"✓ {n_recommendations} actionable recommendations provided (+5)")
	    elif n_recommendations >= 3:
	        pts += 3
	        feedback_parts.append(f"~ {n_recommendations} recommendations provided (+3)")

	    # The reported score never reaches exactly 0 or 1
	    score = max(0.01, min(0.99, pts / max_pts))
	    feedback = f"Task 3 Score: {pts}/{max_pts} ({score:.2f})\n" + "\n".join(feedback_parts)
	    return score, feedback


	# ===========================================================================
	# Task registry
	# ===========================================================================

	# Registry of the three tasks, keyed by task name. Every entry has the same
	# shape:
	#   "name"             - same string as the registry key
	#   "difficulty"       - "easy" | "medium" | "hard"
	#   "description"      - one-line task statement
	#   "max_steps"        - integer step limit (3 / 4 / 5 here)
	#   "protocol_summary" - summary text for the trial
	#   "patient_records"  - list of patient dicts (only Task 1 has any)
	#   "adverse_events"   - list of AE dicts (only Task 2 has any)
	#   "protocol_text"    - full protocol text (only Task 3 has one)
	#   "grader"           - function (findings, rationale) -> (score, feedback)
	TASKS = {
	"eligibility_screening": {
	"name": "eligibility_screening",
	"difficulty": "easy",
	"description": "Identify protocol eligibility violations across 5 patient records for ONCO-2024-301.",
	"max_steps": 3,
	"protocol_summary": TASK1_PROTOCOL_SUMMARY,
	"patient_records": TASK1_PATIENT_RECORDS,
	"adverse_events": [],
	"protocol_text": "",
	"grader": grade_task1,
	},
	"ae_classification": {
	"name": "ae_classification",
	"difficulty": "medium",
	"description": "Review 7 adverse events in CARD-2024-112 and identify all misclassifications.",
	"max_steps": 4,
	"protocol_summary": TASK2_PROTOCOL_SUMMARY,
	"patient_records": [],
	"adverse_events": TASK2_ADVERSE_EVENTS,
	"protocol_text": "",
	"grader": grade_task2,
	},
	"protocol_amendment_review": {
	"name": "protocol_amendment_review",
	"difficulty": "hard",
	"description": "Comprehensively review 6 proposed protocol amendments for NEURO-2024-450 and produce structured findings.",
	"max_steps": 5,
	"protocol_summary": "NEURO-2024-450 Phase II — Drug NX-12 for treatment-resistant depression (TRD). 6 amendments under review.",
	"patient_records": [],
	"adverse_events": [],
	"protocol_text": TASK3_PROTOCOL_TEXT,
	"grader": grade_task3,
	},
	}