""" Task definitions for the Clinical Trial Protocol Review environment. Each task has: - A scenario (protocol + patient records + adverse events) - A deterministic grader returning a score in [0.0, 1.0] - A difficulty: easy | medium | hard """ from __future__ import annotations import re from typing import Any, Dict, List, Tuple # --------------------------------------------------------------------------- # Shared helpers # --------------------------------------------------------------------------- def _normalize(text: str) -> str: return text.lower().strip() def _mentions_any(text: str, keywords: List[str]) -> bool: t = _normalize(text) return any(k in t for k in keywords) def _finding_types_in(findings: List[Dict], ftype: str) -> List[Dict]: return [f for f in findings if _normalize(f.get("finding_type", "")) == _normalize(ftype)] # =========================================================================== # TASK 1 — Easy: Eligibility Violation Screening # =========================================================================== TASK1_PROTOCOL_SUMMARY = """ TRIAL: ONCO-2024-301 Phase II — Investigational drug XR-7 for metastatic colorectal cancer. SPONSOR: NovaPharma Inc. PHASE: II (Open-label, single-arm) INCLUSION CRITERIA (ALL must be met): IC-1: Age ≥ 18 and ≤ 75 years IC-2: Histologically confirmed metastatic colorectal adenocarcinoma IC-3: ECOG Performance Status 0 or 1 IC-4: Adequate renal function: eGFR ≥ 60 mL/min/1.73m² IC-5: No prior treatment with XR-7 or any KRAS-G12C inhibitor EXCLUSION CRITERIA (ANY disqualifies): EC-1: Active uncontrolled infection EC-2: Prior solid organ transplant EC-3: Pregnancy or breastfeeding EC-4: QTcF > 480 ms on screening ECG EC-5: Concurrent use of strong CYP3A4 inhibitors """ TASK1_PATIENT_RECORDS = [ { "subject_id": "PT-001", "age": 68, "diagnosis": "Metastatic colorectal adenocarcinoma", "ecog_ps": 1, "egfr": 72, "prior_kras_inhibitor": False, "active_infection": False, "organ_transplant": False, "pregnant": False, "qtcf_ms": 455, "cyp3a4_inhibitor": False, "notes": "Standard patient, enrolled without issues." }, { "subject_id": "PT-002", "age": 78, # VIOLATION: age > 75 (IC-1) "diagnosis": "Metastatic colorectal adenocarcinoma", "ecog_ps": 1, "egfr": 65, "prior_kras_inhibitor": False, "active_infection": False, "organ_transplant": False, "pregnant": False, "qtcf_ms": 460, "cyp3a4_inhibitor": False, "notes": "Enrolled 3 days after protocol amendment discussion (not yet approved)." }, { "subject_id": "PT-003", "age": 55, "diagnosis": "Metastatic colorectal adenocarcinoma", "ecog_ps": 2, # VIOLATION: ECOG PS = 2, must be 0 or 1 (IC-3) "egfr": 58, # VIOLATION: eGFR < 60 (IC-4) "prior_kras_inhibitor": False, "active_infection": False, "organ_transplant": False, "pregnant": False, "qtcf_ms": 470, "cyp3a4_inhibitor": False, "notes": "Patient self-reported feeling fatigued. Site enrolled after verbal PI approval." }, { "subject_id": "PT-004", "age": 61, "diagnosis": "Metastatic colorectal adenocarcinoma", "ecog_ps": 0, "egfr": 80, "prior_kras_inhibitor": True, # VIOLATION: prior KRAS-G12C inhibitor use (IC-5) "active_infection": False, "organ_transplant": False, "pregnant": False, "qtcf_ms": 440, "cyp3a4_inhibitor": False, "notes": "Patient had prior sotorasib therapy 14 months ago." }, { "subject_id": "PT-005", "age": 45, "diagnosis": "Metastatic colorectal adenocarcinoma", "ecog_ps": 1, "egfr": 90, "prior_kras_inhibitor": False, "active_infection": False, "organ_transplant": False, "pregnant": False, "qtcf_ms": 495, # VIOLATION: QTcF > 480 ms (EC-4) "cyp3a4_inhibitor": True, # VIOLATION: strong CYP3A4 inhibitor (EC-5) "notes": "Patient on ketoconazole for fungal infection. ECG borderline." }, ] # Ground truth: which subject_ids have violations and what kind TASK1_GROUND_TRUTH = { "PT-002": {"criteria": ["IC-1"], "severity": "critical"}, "PT-003": {"criteria": ["IC-3", "IC-4"], "severity": "critical"}, "PT-004": {"criteria": ["IC-5"], "severity": "critical"}, "PT-005": {"criteria": ["EC-4", "EC-5"], "severity": "critical"}, } def grade_task1(findings: List[Dict], rationale: str) -> Tuple[float, str]: """ Score Task 1: Eligibility Violation Screening Scoring: - 20 pts each for correctly identifying PT-002, PT-003, PT-004, PT-005 as having violations - PT-003 has 2 violations; partial credit (10 pts) if only one is found - PT-005 has 2 violations; partial credit (10 pts) if only one is found - PT-001 false-positive flag: -10 pts - Max: 100 pts → normalized to [0, 1] """ max_pts = 100 pts = 0 feedback_parts = [] # Collect all subject_ids mentioned in findings as violations violation_findings = [ f for f in findings if _normalize(f.get("finding_type", "")) in ( "eligibility_violation", "protocol_deviation" ) ] flagged_subjects = { str(f.get("subject_id", "")).strip().upper() for f in violation_findings } def desc_of(sid): descs = [ _normalize(f.get("description", "") + " " + f.get("recommendation", "")) for f in violation_findings if str(f.get("subject_id", "")).upper() == sid ] return " ".join(descs) # PT-002: age violation if "PT-002" in flagged_subjects: d = desc_of("PT-002") if _mentions_any(d, ["age", "78", "ic-1", "inclusion"]): pts += 20 feedback_parts.append("✓ PT-002 age violation correctly identified (+20)") else: pts += 10 feedback_parts.append("~ PT-002 flagged but violation type unclear (+10)") else: feedback_parts.append("✗ PT-002 age violation missed (0)") # PT-003: ECOG PS + eGFR if "PT-003" in flagged_subjects: d = desc_of("PT-003") found_ecog = _mentions_any(d, ["ecog", "performance", "ps", "ic-3"]) found_egfr = _mentions_any(d, ["egfr", "renal", "kidney", "58", "ic-4"]) if found_ecog and found_egfr: pts += 20 feedback_parts.append("✓ PT-003 both ECOG and eGFR violations found (+20)") elif found_ecog or found_egfr: pts += 10 feedback_parts.append("~ PT-003 only one of two violations found (+10)") else: pts += 8 feedback_parts.append("~ PT-003 flagged but violations not specified (+8)") else: feedback_parts.append("✗ PT-003 violations missed (0)") # PT-004: prior KRAS inhibitor if "PT-004" in flagged_subjects: d = desc_of("PT-004") if _mentions_any(d, ["kras", "sotorasib", "prior", "ic-5", "inhibitor"]): pts += 20 feedback_parts.append("✓ PT-004 prior KRAS inhibitor correctly identified (+20)") else: pts += 10 feedback_parts.append("~ PT-004 flagged but violation type unclear (+10)") else: feedback_parts.append("✗ PT-004 prior KRAS inhibitor violation missed (0)") # PT-005: QTcF + CYP3A4 if "PT-005" in flagged_subjects: d = desc_of("PT-005") found_qtcf = _mentions_any(d, ["qtcf", "qt", "ecg", "480", "495", "ec-4"]) found_cyp = _mentions_any(d, ["cyp", "ketoconazole", "ec-5", "inhibitor"]) if found_qtcf and found_cyp: pts += 20 feedback_parts.append("✓ PT-005 both QTcF and CYP3A4 violations found (+20)") elif found_qtcf or found_cyp: pts += 10 feedback_parts.append("~ PT-005 only one of two violations found (+10)") else: pts += 8 feedback_parts.append("~ PT-005 flagged but violations not specified (+8)") else: feedback_parts.append("✗ PT-005 violations missed (0)") # False positive penalty: PT-001 flagged if "PT-001" in flagged_subjects: pts -= 10 feedback_parts.append("✗ PT-001 incorrectly flagged as violation (-10)") else: pts += 20 feedback_parts.append("✓ PT-001 correctly not flagged as violation (+20)") score = max(0.01, min(0.99, pts / max_pts)) feedback = f"Task 1 Score: {pts}/{max_pts} ({score:.2f})\n" + "\n".join(feedback_parts) return score, feedback # =========================================================================== # TASK 2 — Medium: Adverse Event Severity Misclassification # =========================================================================== TASK2_PROTOCOL_SUMMARY = """ TRIAL: CARD-2024-112 Phase III — Drug BX-9 for heart failure (NYHA Class II–III). SPONSOR: CardioVita Therapeutics PHASE: III (Randomized, double-blind, placebo-controlled) ADVERSE EVENT CLASSIFICATION (per ICH E2A): Grade 1 (Mild) — Asymptomatic/mild symptoms; no intervention needed Grade 2 (Moderate)— Minimal intervention indicated; limits instrumental ADL Grade 3 (Severe) — Medically significant but not immediately life-threatening; hospitalization or prolongation of hospitalization Grade 4 (Life-threatening) — Urgent intervention indicated Grade 5 (Fatal) — Death related to AE SERIOUS ADVERSE EVENTS (SAEs) must be reported to sponsor within 24 hours if: - Any Grade 3–5 event - Any hospitalization (even Grade 2) - Any event deemed life-threatening by the investigator - Congenital anomaly/birth defect - Any other medically important event PROTOCOL NOTE: Dyspnea requiring hospitalization is a pre-specified AE of Special Interest (AESI). """ TASK2_ADVERSE_EVENTS = [ { "ae_id": "AE-001", "subject_id": "PT-201", "event": "Headache", "site_reported_grade": 1, "hospitalized": False, "duration_days": 2, "intervention": "Acetaminophen PRN", "notes": "Resolved spontaneously.", "correct_grade": 1, "is_sae": False, "misclassified": False, }, { "ae_id": "AE-002", "subject_id": "PT-202", "event": "Severe dyspnea with hospitalization for 3 days", "site_reported_grade": 2, # MISCLASSIFIED — should be Grade 3 SAE + AESI "hospitalized": True, "duration_days": 3, "intervention": "IV diuretics, oxygen supplementation", "notes": "Patient hospitalized. Site reported as Grade 2 and did not file SAE report.", "correct_grade": 3, "is_sae": True, "misclassified": True, "issue": "Grade downgraded; SAE not filed; AESI not flagged", }, { "ae_id": "AE-003", "subject_id": "PT-203", "event": "Mild peripheral edema", "site_reported_grade": 2, "hospitalized": False, "duration_days": 14, "intervention": "Dose reduction", "notes": "Managed with dose reduction. Ongoing.", "correct_grade": 2, "is_sae": False, "misclassified": False, }, { "ae_id": "AE-004", "subject_id": "PT-204", "event": "Ventricular tachycardia — 30-second episode, asymptomatic, resolved spontaneously", "site_reported_grade": 1, # MISCLASSIFIED — VT is Grade 3 minimum; potentially Grade 4 "hospitalized": False, "duration_days": 0, "intervention": "None; monitored", "notes": "Recorded on Holter monitor. Patient unaware. Site classified as Grade 1.", "correct_grade": 3, "is_sae": True, "misclassified": True, "issue": "VT always ≥ Grade 3; SAE report missing; cardiac safety signal", }, { "ae_id": "AE-005", "subject_id": "PT-205", "event": "Acute kidney injury — creatinine 3.2x ULN", "site_reported_grade": 2, # MISCLASSIFIED — creatinine >3x ULN = Grade 3 "hospitalized": True, "duration_days": 5, "intervention": "IV fluids, nephrology consult, drug held", "notes": "Creatinine peaked at 3.2x ULN. Patient hospitalized. Site reported Grade 2.", "correct_grade": 3, "is_sae": True, "misclassified": True, "issue": "Creatinine >3x ULN = Grade 3 per CTCAE; hospitalization = SAE", }, { "ae_id": "AE-006", "subject_id": "PT-206", "event": "Nausea with vomiting, unable to maintain oral intake for 24h", "site_reported_grade": 3, "hospitalized": True, "duration_days": 2, "intervention": "IV antiemetics, IV hydration", "notes": "SAE filed. Resolved with treatment.", "correct_grade": 3, "is_sae": True, "misclassified": False, }, { "ae_id": "AE-007", "subject_id": "PT-207", "event": "Fatigue — patient reports feeling tired but continues all daily activities", "site_reported_grade": 3, # MISCLASSIFIED — should be Grade 1 (continues ADL) "hospitalized": False, "duration_days": 7, "intervention": "None", "notes": "Site over-reported as Grade 3. No hospitalization, no intervention.", "correct_grade": 1, "is_sae": False, "misclassified": True, "issue": "Fatigue with no ADL limitation = Grade 1; Grade 3 is over-reporting", }, ] TASK2_MISCLASSIFIED_AE_IDS = {"AE-002", "AE-004", "AE-005", "AE-007"} def grade_task2(findings: List[Dict], rationale: str) -> Tuple[float, str]: """ Score Task 2: Adverse Event Severity Misclassification Max 100 pts: - 20 pts each for correctly identifying AE-002, AE-004, AE-005, AE-007 as misclassified - Must identify BOTH the wrong grade AND suggest correct grade for full credit (10 each) - AE-004 and AE-002: additional 5 pts for identifying SAE reporting failure - False positive (flagging AE-001, AE-003, AE-006): -8 pts each """ max_pts = 100 pts = 0 feedback_parts = [] ae_findings = [ f for f in findings if _normalize(f.get("finding_type", "")) in ( "adverse_event", "protocol_deviation", "safety_concern" ) ] def get_ae_descs(ae_id): descs = [] for f in ae_findings: desc = _normalize(f.get("description", "") + " " + f.get("recommendation", "")) subj = str(f.get("subject_id", "")).strip() # Match by AE ID or by subject ID if ae_id.lower() in desc or ae_id.replace("AE-", "ae-") in desc: descs.append(desc) # Also match by subject_id mapping ae_to_subj = { "AE-002": "PT-202", "AE-004": "PT-204", "AE-005": "PT-205", "AE-007": "PT-207", "AE-001": "PT-201", "AE-003": "PT-203", "AE-006": "PT-206", } sid = ae_to_subj.get(ae_id, "") for f in ae_findings: if str(f.get("subject_id", "")).strip().upper() == sid: descs.append(_normalize(f.get("description", "") + " " + f.get("recommendation", ""))) return " ".join(descs) def ae_flagged(ae_id): d = get_ae_descs(ae_id) return len(d) > 0 # AE-002: dyspnea — Grade 2→3, SAE, AESI if ae_flagged("AE-002"): d = get_ae_descs("AE-002") grade_correct = _mentions_any(d, ["grade 3", "grade-3", "grade3", "3", "sae", "serious"]) aesi_flag = _mentions_any(d, ["aesi", "special interest", "dyspnea", "sae", "hospitali"]) pts += 10 if grade_correct else 5 pts += 5 if aesi_flag else 0 feedback_parts.append(f"✓ AE-002 flagged {'with grade+SAE correction' if grade_correct and aesi_flag else 'partially'} (+{10 if grade_correct else 5}{'+5 AESI' if aesi_flag else ''})") else: feedback_parts.append("✗ AE-002 dyspnea misclassification missed (0)") # AE-004: VT — Grade 1→3, SAE if ae_flagged("AE-004"): d = get_ae_descs("AE-004") grade_correct = _mentions_any(d, ["grade 3", "grade-3", "3", "sae", "serious", "life"]) cardiac_signal = _mentions_any(d, ["cardiac", "ventricular", "vt", "safety", "sae"]) pts += 12 if grade_correct else 6 pts += 5 if cardiac_signal else 0 feedback_parts.append(f"✓ AE-004 VT flagged {'with cardiac SAE recognition' if cardiac_signal else 'partially'} (+{12 if grade_correct else 6}{'+5' if cardiac_signal else ''})") else: feedback_parts.append("✗ AE-004 VT misclassification missed (0)") # AE-005: AKI — Grade 2→3 if ae_flagged("AE-005"): d = get_ae_descs("AE-005") grade_correct = _mentions_any(d, ["grade 3", "3", "ctcae", "3x", "uln"]) pts += 15 if grade_correct else 8 feedback_parts.append(f"✓ AE-005 AKI flagged {'with CTCAE reference' if grade_correct else 'partially'} (+{15 if grade_correct else 8})") else: feedback_parts.append("✗ AE-005 AKI misclassification missed (0)") # AE-007: fatigue — Grade 3→1 (over-reporting) if ae_flagged("AE-007"): d = get_ae_descs("AE-007") grade_correct = _mentions_any(d, ["grade 1", "1", "over", "over-report", "adl", "downgrade"]) pts += 15 if grade_correct else 8 feedback_parts.append(f"✓ AE-007 fatigue over-reporting flagged {'correctly' if grade_correct else 'partially'} (+{15 if grade_correct else 8})") else: feedback_parts.append("✗ AE-007 fatigue over-reporting missed (0)") # Bonus for clean rationale covering systemic issue if _mentions_any(rationale, ["sae", "reporting", "systematic", "site training", "pattern"]): pts += 5 feedback_parts.append("✓ Rationale identifies systemic reporting issues (+5)") # Correct AEs not flagged (true negatives baseline) correct_ae_ids = ["AE-001", "AE-003", "AE-006"] fp_penalty = 0 for ae_id in correct_ae_ids: if ae_flagged(ae_id): pts -= 8 fp_penalty += 8 feedback_parts.append(f"✗ {ae_id} incorrectly flagged as misclassified (-8)") if fp_penalty == 0: pts += 33 # baseline for correctly leaving good AEs alone feedback_parts.append("✓ No false positives on correctly classified AEs (+33)") score = max(0.01, min(0.99, pts / max_pts)) feedback = f"Task 2 Score: {pts}/{max_pts} ({score:.2f})\n" + "\n".join(feedback_parts) return score, feedback # =========================================================================== # TASK 3 — Hard: Comprehensive Protocol Amendment Review # =========================================================================== TASK3_PROTOCOL_TEXT = """ PROTOCOL: NEURO-2024-450 Phase II — Drug NX-12 for treatment-resistant depression (TRD) VERSION: 1.2 (Under Review for Amendment) SPONSOR: MindBridge Pharmaceuticals ═══════════════════════════════════════════════════ SECTION 3: STUDY DESIGN ═══════════════════════════════════════════════════ Open-label, single-arm study. No control arm. Duration: 12 weeks active treatment. Primary endpoint: Change from baseline in MADRS score at Week 8. Secondary endpoint: Response rate (≥50% MADRS reduction) at Week 12. PROPOSED CHANGE (Amendment A): Extend treatment from 12 → 24 weeks with no change to primary endpoint timing. PROPOSED CHANGE (Amendment B): Add an optional open-label extension (OLE) of up to 52 weeks for responders. Consent for OLE to be obtained at Week 12 visit (same day as eligibility assessment). ═══════════════════════════════════════════════════ SECTION 4: PATIENT POPULATION ═══════════════════════════════════════════════════ INCLUSION: - Age 22–65 - DSM-5 diagnosis of MDD, current episode resistant to ≥2 adequate antidepressant trials - MADRS score ≥28 at screening AND baseline (within 3 days of screening) - Capable of providing informed consent EXCLUSION: - Active suicidal ideation with plan or intent (C-SSRS score ≥4) - Current or recent (within 6 months) substance use disorder - Pregnancy or intent to become pregnant - Current use of MAOIs PROPOSED CHANGE (Amendment C): Relax exclusion EC-CSSRS: Allow enrollment of patients with C-SSRS score 4 (active suicidal ideation WITH plan, WITHOUT intent) if monitored weekly. ═══════════════════════════════════════════════════ SECTION 5: SAFETY MONITORING ═══════════════════════════════════════════════════ Data Safety Monitoring Board (DSMB): Annual review only. Suicidality monitoring: C-SSRS at baseline and Week 8. Blood pressure monitoring: At screening, Week 4, Week 12. NX-12 KNOWN RISKS (from Phase I): - Dose-dependent blood pressure elevation (mean +18 mmHg systolic at max dose) - Transient dissociative symptoms in 23% of patients at Weeks 1-2 - Two cases of hypertensive urgency in Phase I (n=42) PROPOSED CHANGE (Amendment D): Increase DSMB review from annual to quarterly. Add BP monitoring at Weeks 2 and 8. ═══════════════════════════════════════════════════ SECTION 6: STATISTICAL ANALYSIS PLAN ═══════════════════════════════════════════════════ Sample size: N=45 (powered for 80% power to detect 7-point MADRS change) Analysis population: Per-protocol (PP) only. Missing data: Last observation carried forward (LOCF). Interim analysis: None planned. PROPOSED CHANGE (Amendment E): Add a single interim analysis at 50% enrollment for futility only. ═══════════════════════════════════════════════════ SECTION 7: INFORMED CONSENT ═══════════════════════════════════════════════════ Current consent process: Written informed consent obtained at screening visit. Consent document revision required for all amendments. PROPOSED CHANGE (Amendment F): Allow verbal consent for the OLE (Amendment B) to reduce patient burden, with written consent waived if patient signs OLE enrollment form. """ # Ground truth: what a qualified medical monitor / regulatory reviewer would flag TASK3_GROUND_TRUTH_ISSUES = { "amendment_c_safety": { "description": "Amendment C allows suicidal patients (C-SSRS≥4) with plan — unacceptable in TRD trial without intensive monitoring infrastructure beyond 'weekly check'", "severity": "critical", "keywords": ["suicid", "c-ssrs", "amendment c", "safety", "vulnerable", "risk"], "weight": 20, }, "amendment_b_consent_timing": { "description": "OLE consent at same visit as eligibility assessment creates undue influence / coercion risk", "severity": "major", "keywords": ["ole", "consent", "same day", "coercio", "undue", "extension", "amendment b"], "weight": 15, }, "amendment_f_verbal_consent": { "description": "Verbal consent for OLE is insufficient per ICH E6(R2) GCP — written consent required for all interventional trial participation", "severity": "critical", "keywords": ["verbal", "consent", "gcp", "ich", "written", "waiv", "amendment f"], "weight": 15, }, "suicidality_monitoring_gap": { "description": "C-SSRS only at baseline and Week 8 is insufficient for TRD + suicidal risk population; should be monthly minimum", "severity": "major", "keywords": ["c-ssrs", "suicid", "monitoring", "frequen", "weekly", "monthly", "trd"], "weight": 15, }, "bp_monitoring_still_insufficient": { "description": "Even with Amendment D, BP monitoring is missing at Weeks 1, 6, 16, 20 — given +18 mmHg known risk, monthly monitoring minimum is needed during OLE", "severity": "major", "keywords": ["blood pressure", "bp", "hypertens", "monitoring", "amendment d", "ole"], "weight": 10, }, "no_control_arm_bias": { "description": "Open-label single-arm design in TRD (high placebo response ~30-40%) without randomized comparator limits interpretability; Amendment A extension compounds this", "severity": "major", "keywords": ["open-label", "control", "placebo", "bias", "single-arm", "interpretab"], "weight": 10, }, "locf_missing_data": { "description": "LOCF is discouraged by FDA/EMA for psychiatric trials; multiple imputation or mixed-model repeated measures preferred", "severity": "minor", "keywords": ["locf", "missing data", "imputation", "fda", "ema", "mmrm"], "weight": 8, }, "interim_analysis_alpha_spend": { "description": "Amendment E adds futility interim but no alpha-spending rule or stopping boundaries specified — protocol gap", "severity": "minor", "keywords": ["interim", "alpha", "spend", "futility", "boundar", "amendment e"], "weight": 7, }, } def grade_task3(findings: List[Dict], rationale: str) -> Tuple[float, str]: """ Score Task 3: Comprehensive Protocol Amendment Review Scores based on how many ground-truth issues are identified, weighted by severity and quality of the finding description. Max ~100 pts. """ max_pts = 100 pts = 0 feedback_parts = [] all_text = rationale for f in findings: all_text += " " + f.get("description", "") + " " + f.get("recommendation", "") all_text = _normalize(all_text) for issue_key, issue in TASK3_GROUND_TRUTH_ISSUES.items(): keywords = issue["keywords"] weight = issue["weight"] hits = sum(1 for k in keywords if k in all_text) hit_rate = hits / len(keywords) if hit_rate >= 0.5: earned = int(weight * min(1.0, hit_rate + 0.2)) pts += earned feedback_parts.append(f"✓ [{issue['severity'].upper()}] {issue_key}: identified ({hits}/{len(keywords)} keywords, +{earned})") elif hit_rate >= 0.25: earned = int(weight * 0.4) pts += earned feedback_parts.append(f"~ [{issue['severity'].upper()}] {issue_key}: partially identified (+{earned})") else: feedback_parts.append(f"✗ [{issue['severity'].upper()}] {issue_key}: missed (0/{weight})") # Bonus for structured, actionable recommendations n_recommendations = sum( 1 for f in findings if len(f.get("recommendation", "")) > 30 ) if n_recommendations >= 5: pts += 5 feedback_parts.append(f"✓ {n_recommendations} actionable recommendations provided (+5)") elif n_recommendations >= 3: pts += 3 feedback_parts.append(f"~ {n_recommendations} recommendations provided (+3)") score = max(0.01, min(0.99, pts / max_pts)) feedback = f"Task 3 Score: {pts}/{max_pts} ({score:.2f})\n" + "\n".join(feedback_parts) return score, feedback # =========================================================================== # Task registry # =========================================================================== TASKS = { "eligibility_screening": { "name": "eligibility_screening", "difficulty": "easy", "description": "Identify protocol eligibility violations across 5 patient records for ONCO-2024-301.", "max_steps": 3, "protocol_summary": TASK1_PROTOCOL_SUMMARY, "patient_records": TASK1_PATIENT_RECORDS, "adverse_events": [], "protocol_text": "", "grader": grade_task1, }, "ae_classification": { "name": "ae_classification", "difficulty": "medium", "description": "Review 7 adverse events in CARD-2024-112 and identify all misclassifications.", "max_steps": 4, "protocol_summary": TASK2_PROTOCOL_SUMMARY, "patient_records": [], "adverse_events": TASK2_ADVERSE_EVENTS, "protocol_text": "", "grader": grade_task2, }, "protocol_amendment_review": { "name": "protocol_amendment_review", "difficulty": "hard", "description": "Comprehensively review 6 proposed protocol amendments for NEURO-2024-450 and produce structured findings.", "max_steps": 5, "protocol_summary": "NEURO-2024-450 Phase II — Drug NX-12 for treatment-resistant depression (TRD). 6 amendments under review.", "patient_records": [], "adverse_events": [], "protocol_text": TASK3_PROTOCOL_TEXT, "grader": grade_task3, }, }