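# NOTE(review): the next lines appear to be commit metadata captured with the file
# (author / message / short hash); kept as a comment so the module stays importable.
#   Geetesh
#   fix: add /metadata /schema /mcp endpoints for OpenEnv runtime validation
#   b0d5924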
"""
Task definitions for the Clinical Trial Protocol Review environment.
Each task has:
- A scenario (protocol + patient records + adverse events)
- A deterministic grader returning a score in [0.0, 1.0]
- A difficulty: easy | medium | hard
"""
from __future__ import annotations
import re
from typing import Any, Dict, List, Tuple
# ---------------------------------------------------------------------------
# Shared helpers
# ---------------------------------------------------------------------------
def _normalize(text: str) -> str:
return text.lower().strip()
def _mentions_any(text: str, keywords: List[str]) -> bool:
t = _normalize(text)
return any(k in t for k in keywords)
def _finding_types_in(findings: List[Dict], ftype: str) -> List[Dict]:
return [f for f in findings if _normalize(f.get("finding_type", "")) == _normalize(ftype)]
# ===========================================================================
# TASK 1 — Easy: Eligibility Violation Screening
# ===========================================================================
# Protocol summary text for Task 1 (trial ONCO-2024-301): lists the inclusion
# criteria IC-1..IC-5 and exclusion criteria EC-1..EC-5 that the patient
# records below are screened against. Registered as the "protocol_summary" of
# the "eligibility_screening" entry in TASKS.
TASK1_PROTOCOL_SUMMARY = """
TRIAL: ONCO-2024-301 Phase II β€” Investigational drug XR-7 for metastatic colorectal cancer.
SPONSOR: NovaPharma Inc.
PHASE: II (Open-label, single-arm)
INCLUSION CRITERIA (ALL must be met):
IC-1: Age β‰₯ 18 and ≀ 75 years
IC-2: Histologically confirmed metastatic colorectal adenocarcinoma
IC-3: ECOG Performance Status 0 or 1
IC-4: Adequate renal function: eGFR β‰₯ 60 mL/min/1.73mΒ²
IC-5: No prior treatment with XR-7 or any KRAS-G12C inhibitor
EXCLUSION CRITERIA (ANY disqualifies):
EC-1: Active uncontrolled infection
EC-2: Prior solid organ transplant
EC-3: Pregnancy or breastfeeding
EC-4: QTcF > 480 ms on screening ECG
EC-5: Concurrent use of strong CYP3A4 inhibitors
"""
# Scenario data for Task 1: five enrolled subjects (PT-001..PT-005).
# PT-001 is the only record without a criterion breach; the inline
# "VIOLATION" comments mark the fields that breach a criterion and name it.
# These markers agree with TASK1_GROUND_TRUTH.
TASK1_PATIENT_RECORDS = [
    {
        "subject_id": "PT-001",
        "age": 68,
        "diagnosis": "Metastatic colorectal adenocarcinoma",
        "ecog_ps": 1,
        "egfr": 72,
        "prior_kras_inhibitor": False,
        "active_infection": False,
        "organ_transplant": False,
        "pregnant": False,
        "qtcf_ms": 455,
        "cyp3a4_inhibitor": False,
        "notes": "Standard patient, enrolled without issues."
    },
    {
        "subject_id": "PT-002",
        "age": 78,  # VIOLATION: age > 75 (IC-1)
        "diagnosis": "Metastatic colorectal adenocarcinoma",
        "ecog_ps": 1,
        "egfr": 65,
        "prior_kras_inhibitor": False,
        "active_infection": False,
        "organ_transplant": False,
        "pregnant": False,
        "qtcf_ms": 460,
        "cyp3a4_inhibitor": False,
        "notes": "Enrolled 3 days after protocol amendment discussion (not yet approved)."
    },
    {
        "subject_id": "PT-003",
        "age": 55,
        "diagnosis": "Metastatic colorectal adenocarcinoma",
        "ecog_ps": 2,  # VIOLATION: ECOG PS = 2, must be 0 or 1 (IC-3)
        "egfr": 58,  # VIOLATION: eGFR < 60 (IC-4)
        "prior_kras_inhibitor": False,
        "active_infection": False,
        "organ_transplant": False,
        "pregnant": False,
        "qtcf_ms": 470,
        "cyp3a4_inhibitor": False,
        "notes": "Patient self-reported feeling fatigued. Site enrolled after verbal PI approval."
    },
    {
        "subject_id": "PT-004",
        "age": 61,
        "diagnosis": "Metastatic colorectal adenocarcinoma",
        "ecog_ps": 0,
        "egfr": 80,
        "prior_kras_inhibitor": True,  # VIOLATION: prior KRAS-G12C inhibitor use (IC-5)
        "active_infection": False,
        "organ_transplant": False,
        "pregnant": False,
        "qtcf_ms": 440,
        "cyp3a4_inhibitor": False,
        "notes": "Patient had prior sotorasib therapy 14 months ago."
    },
    {
        "subject_id": "PT-005",
        "age": 45,
        "diagnosis": "Metastatic colorectal adenocarcinoma",
        "ecog_ps": 1,
        "egfr": 90,
        "prior_kras_inhibitor": False,
        "active_infection": False,
        "organ_transplant": False,
        "pregnant": False,
        "qtcf_ms": 495,  # VIOLATION: QTcF > 480 ms (EC-4)
        "cyp3a4_inhibitor": True,  # VIOLATION: strong CYP3A4 inhibitor (EC-5)
        "notes": "Patient on ketoconazole for fungal infection. ECG borderline."
    },
]
# Ground truth for Task 1: maps each violating subject_id to the protocol
# criteria it breaches and the severity of the violation. PT-001 is absent
# because it has no violation.
# NOTE(review): in the visible source grade_task1 does not read this mapping;
# it hard-codes the same subjects and criteria. Keep the two in sync.
TASK1_GROUND_TRUTH = {
    "PT-002": {"criteria": ["IC-1"], "severity": "critical"},
    "PT-003": {"criteria": ["IC-3", "IC-4"], "severity": "critical"},
    "PT-004": {"criteria": ["IC-5"], "severity": "critical"},
    "PT-005": {"criteria": ["EC-4", "EC-5"], "severity": "critical"},
}
def grade_task1(findings: List[Dict], rationale: str) -> Tuple[float, str]:
    """
    Score Task 1: Eligibility Violation Screening.

    Only findings whose ``finding_type`` is ``eligibility_violation`` or
    ``protocol_deviation`` are considered. A subject counts as flagged when at
    least one such finding carries its ``subject_id`` (compared trimmed and
    case-insensitively). The violation itself is recognized by keyword
    substring matches in the finding's description + recommendation text.

    Scoring (max 100 pts):
    - PT-002 (IC-1 age): 20 if flagged with a matching keyword, 10 if only flagged
    - PT-003 (IC-3 ECOG + IC-4 eGFR): 20 if both found, 10 if one, 8 if only flagged
    - PT-004 (IC-5 prior KRAS inhibitor): 20 with a matching keyword, 10 if only flagged
    - PT-005 (EC-4 QTcF + EC-5 CYP3A4): 20 if both found, 10 if one, 8 if only flagged
    - PT-001 (no violation): +20 when not flagged, -10 when flagged

    Args:
        findings: Finding dicts; the keys read are ``finding_type``,
            ``subject_id``, ``description`` and ``recommendation``. Missing or
            ``None`` values are treated as empty; non-dict items are ignored.
        rationale: Accepted for a uniform grader signature; not used here.

    Returns:
        ``(score, feedback)`` where ``score`` is ``pts / 100`` clamped to
        [0.01, 0.99] and ``feedback`` is a multi-line explanation.
    """
    max_pts = 100
    pts = 0
    feedback_parts = []

    # Normalized description+recommendation text of every relevant finding,
    # grouped by canonical subject id. The same canonical id (trimmed and
    # upper-cased) decides both "is flagged" and "which text belongs to it",
    # so a padded id such as " pt-002 " keeps its description.
    texts_by_subject: Dict[str, List[str]] = {}
    for f in findings or []:
        if not isinstance(f, dict):
            continue
        # `or ""` guards against keys that are present but set to None.
        ftype = _normalize(str(f.get("finding_type") or ""))
        if ftype not in ("eligibility_violation", "protocol_deviation"):
            continue
        sid = str(f.get("subject_id") or "").strip().upper()
        text = _normalize(
            str(f.get("description") or "") + " " + str(f.get("recommendation") or "")
        )
        texts_by_subject.setdefault(sid, []).append(text)

    flagged_subjects = set(texts_by_subject)

    def desc_of(sid):
        # All text written about one subject, joined for keyword matching.
        return " ".join(texts_by_subject.get(sid, []))

    # PT-002: age violation
    if "PT-002" in flagged_subjects:
        d = desc_of("PT-002")
        if _mentions_any(d, ["age", "78", "ic-1", "inclusion"]):
            pts += 20
            feedback_parts.append("βœ“ PT-002 age violation correctly identified (+20)")
        else:
            pts += 10
            feedback_parts.append("~ PT-002 flagged but violation type unclear (+10)")
    else:
        feedback_parts.append("βœ— PT-002 age violation missed (0)")

    # PT-003: ECOG PS + eGFR
    if "PT-003" in flagged_subjects:
        d = desc_of("PT-003")
        found_ecog = _mentions_any(d, ["ecog", "performance", "ps", "ic-3"])
        found_egfr = _mentions_any(d, ["egfr", "renal", "kidney", "58", "ic-4"])
        if found_ecog and found_egfr:
            pts += 20
            feedback_parts.append("βœ“ PT-003 both ECOG and eGFR violations found (+20)")
        elif found_ecog or found_egfr:
            pts += 10
            feedback_parts.append("~ PT-003 only one of two violations found (+10)")
        else:
            pts += 8
            feedback_parts.append("~ PT-003 flagged but violations not specified (+8)")
    else:
        feedback_parts.append("βœ— PT-003 violations missed (0)")

    # PT-004: prior KRAS inhibitor
    if "PT-004" in flagged_subjects:
        d = desc_of("PT-004")
        if _mentions_any(d, ["kras", "sotorasib", "prior", "ic-5", "inhibitor"]):
            pts += 20
            feedback_parts.append("βœ“ PT-004 prior KRAS inhibitor correctly identified (+20)")
        else:
            pts += 10
            feedback_parts.append("~ PT-004 flagged but violation type unclear (+10)")
    else:
        feedback_parts.append("βœ— PT-004 prior KRAS inhibitor violation missed (0)")

    # PT-005: QTcF + CYP3A4
    if "PT-005" in flagged_subjects:
        d = desc_of("PT-005")
        found_qtcf = _mentions_any(d, ["qtcf", "qt", "ecg", "480", "495", "ec-4"])
        found_cyp = _mentions_any(d, ["cyp", "ketoconazole", "ec-5", "inhibitor"])
        if found_qtcf and found_cyp:
            pts += 20
            feedback_parts.append("βœ“ PT-005 both QTcF and CYP3A4 violations found (+20)")
        elif found_qtcf or found_cyp:
            pts += 10
            feedback_parts.append("~ PT-005 only one of two violations found (+10)")
        else:
            pts += 8
            feedback_parts.append("~ PT-005 flagged but violations not specified (+8)")
    else:
        feedback_parts.append("βœ— PT-005 violations missed (0)")

    # PT-001 is the clean record: flagging it is a false positive, leaving it
    # alone earns the remaining 20 points of the 100-point budget.
    if "PT-001" in flagged_subjects:
        pts -= 10
        feedback_parts.append("βœ— PT-001 incorrectly flagged as violation (-10)")
    else:
        pts += 20
        feedback_parts.append("βœ“ PT-001 correctly not flagged as violation (+20)")

    # Clamp strictly inside (0, 1); raw points are still reported in feedback.
    score = max(0.01, min(0.99, pts / max_pts))
    feedback = f"Task 1 Score: {pts}/{max_pts} ({score:.2f})\n" + "\n".join(feedback_parts)
    return score, feedback
# ===========================================================================
# TASK 2 — Medium: Adverse Event Severity Misclassification
# ===========================================================================
# Protocol summary text for Task 2 (trial CARD-2024-112): the adverse-event
# grading scale (Grades 1-5), the SAE 24-hour reporting rules, and the note
# that dyspnea requiring hospitalization is an AE of Special Interest.
# Registered as the "protocol_summary" of the "ae_classification" entry in TASKS.
TASK2_PROTOCOL_SUMMARY = """
TRIAL: CARD-2024-112 Phase III β€” Drug BX-9 for heart failure (NYHA Class II–III).
SPONSOR: CardioVita Therapeutics
PHASE: III (Randomized, double-blind, placebo-controlled)
ADVERSE EVENT CLASSIFICATION (per ICH E2A):
Grade 1 (Mild) β€” Asymptomatic/mild symptoms; no intervention needed
Grade 2 (Moderate)β€” Minimal intervention indicated; limits instrumental ADL
Grade 3 (Severe) β€” Medically significant but not immediately life-threatening;
hospitalization or prolongation of hospitalization
Grade 4 (Life-threatening) β€” Urgent intervention indicated
Grade 5 (Fatal) β€” Death related to AE
SERIOUS ADVERSE EVENTS (SAEs) must be reported to sponsor within 24 hours if:
- Any Grade 3–5 event
- Any hospitalization (even Grade 2)
- Any event deemed life-threatening by the investigator
- Congenital anomaly/birth defect
- Any other medically important event
PROTOCOL NOTE: Dyspnea requiring hospitalization is a pre-specified AE of Special Interest (AESI).
"""
# Scenario data for Task 2: seven adverse-event reports (AE-001..AE-007), one
# per subject PT-201..PT-207. Each record holds the grade the site reported
# ("site_reported_grade") next to the answer-key fields "correct_grade",
# "is_sae" and "misclassified"; misclassified records also carry an "issue"
# string explaining the problem.
# NOTE(review): the answer-key fields live in the same dicts that TASKS
# exposes as "adverse_events" — confirm they are removed before the records
# are shown to the agent being graded.
TASK2_ADVERSE_EVENTS = [
    {
        "ae_id": "AE-001",
        "subject_id": "PT-201",
        "event": "Headache",
        "site_reported_grade": 1,
        "hospitalized": False,
        "duration_days": 2,
        "intervention": "Acetaminophen PRN",
        "notes": "Resolved spontaneously.",
        "correct_grade": 1,
        "is_sae": False,
        "misclassified": False,
    },
    {
        "ae_id": "AE-002",
        "subject_id": "PT-202",
        "event": "Severe dyspnea with hospitalization for 3 days",
        "site_reported_grade": 2,  # MISCLASSIFIED — should be Grade 3 SAE + AESI
        "hospitalized": True,
        "duration_days": 3,
        "intervention": "IV diuretics, oxygen supplementation",
        "notes": "Patient hospitalized. Site reported as Grade 2 and did not file SAE report.",
        "correct_grade": 3,
        "is_sae": True,
        "misclassified": True,
        "issue": "Grade downgraded; SAE not filed; AESI not flagged",
    },
    {
        "ae_id": "AE-003",
        "subject_id": "PT-203",
        "event": "Mild peripheral edema",
        "site_reported_grade": 2,
        "hospitalized": False,
        "duration_days": 14,
        "intervention": "Dose reduction",
        "notes": "Managed with dose reduction. Ongoing.",
        "correct_grade": 2,
        "is_sae": False,
        "misclassified": False,
    },
    {
        "ae_id": "AE-004",
        "subject_id": "PT-204",
        "event": "Ventricular tachycardia β€” 30-second episode, asymptomatic, resolved spontaneously",
        "site_reported_grade": 1,  # MISCLASSIFIED — VT is Grade 3 minimum; potentially Grade 4
        "hospitalized": False,
        "duration_days": 0,
        "intervention": "None; monitored",
        "notes": "Recorded on Holter monitor. Patient unaware. Site classified as Grade 1.",
        "correct_grade": 3,
        "is_sae": True,
        "misclassified": True,
        "issue": "VT always β‰₯ Grade 3; SAE report missing; cardiac safety signal",
    },
    {
        "ae_id": "AE-005",
        "subject_id": "PT-205",
        "event": "Acute kidney injury β€” creatinine 3.2x ULN",
        "site_reported_grade": 2,  # MISCLASSIFIED — creatinine >3x ULN = Grade 3
        "hospitalized": True,
        "duration_days": 5,
        "intervention": "IV fluids, nephrology consult, drug held",
        "notes": "Creatinine peaked at 3.2x ULN. Patient hospitalized. Site reported Grade 2.",
        "correct_grade": 3,
        "is_sae": True,
        "misclassified": True,
        "issue": "Creatinine >3x ULN = Grade 3 per CTCAE; hospitalization = SAE",
    },
    {
        "ae_id": "AE-006",
        "subject_id": "PT-206",
        "event": "Nausea with vomiting, unable to maintain oral intake for 24h",
        "site_reported_grade": 3,
        "hospitalized": True,
        "duration_days": 2,
        "intervention": "IV antiemetics, IV hydration",
        "notes": "SAE filed. Resolved with treatment.",
        "correct_grade": 3,
        "is_sae": True,
        "misclassified": False,
    },
    {
        "ae_id": "AE-007",
        "subject_id": "PT-207",
        "event": "Fatigue β€” patient reports feeling tired but continues all daily activities",
        "site_reported_grade": 3,  # MISCLASSIFIED — should be Grade 1 (continues ADL)
        "hospitalized": False,
        "duration_days": 7,
        "intervention": "None",
        "notes": "Site over-reported as Grade 3. No hospitalization, no intervention.",
        "correct_grade": 1,
        "is_sae": False,
        "misclassified": True,
        "issue": "Fatigue with no ADL limitation = Grade 1; Grade 3 is over-reporting",
    },
]
# IDs of the adverse events whose "misclassified" flag is True in
# TASK2_ADVERSE_EVENTS.
# NOTE(review): grade_task2 in the visible source hard-codes the same four ids
# rather than reading this set; keep them in sync.
TASK2_MISCLASSIFIED_AE_IDS = {"AE-002", "AE-004", "AE-005", "AE-007"}
def grade_task2(findings: List[Dict], rationale: str) -> Tuple[float, str]:
    """
    Score Task 2: Adverse Event Severity Misclassification.

    Only findings whose ``finding_type`` is ``adverse_event``,
    ``protocol_deviation`` or ``safety_concern`` are considered. A finding is
    attributed to an AE when its description/recommendation text contains the
    AE id (case-insensitive) or its ``subject_id`` is the AE's subject. An AE
    counts as flagged when at least one finding is attributed to it.

    Scoring (max 100 pts; keyword checks are substring matches):
    - AE-002 (dyspnea): 10 with a grade/SAE keyword else 5; +5 for an
      AESI/hospitalization keyword
    - AE-004 (VT): 12 with a grade/SAE keyword else 6; +5 for a cardiac/safety keyword
    - AE-005 (AKI): 15 with a grade/CTCAE keyword else 8
    - AE-007 (fatigue over-reporting): 15 with a downgrade keyword else 8
    - Rationale mentions a systemic reporting issue: +5
    - AE-001 / AE-003 / AE-006 flagged: -8 each; none of them flagged: +33

    Args:
        findings: Finding dicts; the keys read are ``finding_type``,
            ``subject_id``, ``description`` and ``recommendation``. Missing or
            ``None`` values are treated as empty; non-dict items are ignored.
        rationale: Free-text rationale; ``None`` is treated as empty.

    Returns:
        ``(score, feedback)`` where ``score`` is ``pts / 100`` clamped to
        [0.01, 0.99] and ``feedback`` is a multi-line explanation.
    """
    max_pts = 100
    pts = 0
    feedback_parts = []

    # AE id -> subject id, taken from the scenario data so the mapping cannot
    # drift from the records themselves.
    ae_to_subj = {ae["ae_id"]: ae["subject_id"] for ae in TASK2_ADVERSE_EVENTS}

    # (canonical subject id, normalized text) for each relevant finding,
    # computed once instead of on every lookup.
    ae_findings = []
    for f in findings or []:
        if not isinstance(f, dict):
            continue
        # `or ""` guards against keys that are present but set to None.
        ftype = _normalize(str(f.get("finding_type") or ""))
        if ftype not in ("adverse_event", "protocol_deviation", "safety_concern"):
            continue
        subj = str(f.get("subject_id") or "").strip().upper()
        text = _normalize(
            str(f.get("description") or "") + " " + str(f.get("recommendation") or "")
        )
        ae_findings.append((subj, text))

    def matched_texts(ae_id):
        # Texts of the findings attributed to `ae_id`, by id mention or subject.
        needle = ae_id.lower()
        sid = ae_to_subj.get(ae_id, "")
        return [
            text for subj, text in ae_findings
            if needle in text or (sid and subj == sid)
        ]

    def get_ae_descs(ae_id):
        return " ".join(matched_texts(ae_id))

    def ae_flagged(ae_id):
        # Decided by whether any finding matched, not by the length of the
        # joined text: a matching finding with empty text still flags the AE.
        return bool(matched_texts(ae_id))

    # AE-002: dyspnea, Grade 2 -> 3, SAE, AESI
    if ae_flagged("AE-002"):
        d = get_ae_descs("AE-002")
        grade_correct = _mentions_any(d, ["grade 3", "grade-3", "grade3", "3", "sae", "serious"])
        aesi_flag = _mentions_any(d, ["aesi", "special interest", "dyspnea", "sae", "hospitali"])
        grade_pts = 10 if grade_correct else 5
        pts += grade_pts
        pts += 5 if aesi_flag else 0
        label = "with grade+SAE correction" if grade_correct and aesi_flag else "partially"
        feedback_parts.append(
            f"βœ“ AE-002 flagged {label} (+{grade_pts}{'+5 AESI' if aesi_flag else ''})"
        )
    else:
        feedback_parts.append("βœ— AE-002 dyspnea misclassification missed (0)")

    # AE-004: VT, Grade 1 -> 3, SAE
    if ae_flagged("AE-004"):
        d = get_ae_descs("AE-004")
        grade_correct = _mentions_any(d, ["grade 3", "grade-3", "3", "sae", "serious", "life"])
        cardiac_signal = _mentions_any(d, ["cardiac", "ventricular", "vt", "safety", "sae"])
        grade_pts = 12 if grade_correct else 6
        pts += grade_pts
        pts += 5 if cardiac_signal else 0
        label = "with cardiac SAE recognition" if cardiac_signal else "partially"
        feedback_parts.append(
            f"βœ“ AE-004 VT flagged {label} (+{grade_pts}{'+5' if cardiac_signal else ''})"
        )
    else:
        feedback_parts.append("βœ— AE-004 VT misclassification missed (0)")

    # AE-005: AKI, Grade 2 -> 3
    if ae_flagged("AE-005"):
        d = get_ae_descs("AE-005")
        grade_correct = _mentions_any(d, ["grade 3", "3", "ctcae", "3x", "uln"])
        grade_pts = 15 if grade_correct else 8
        pts += grade_pts
        label = "with CTCAE reference" if grade_correct else "partially"
        feedback_parts.append(f"βœ“ AE-005 AKI flagged {label} (+{grade_pts})")
    else:
        feedback_parts.append("βœ— AE-005 AKI misclassification missed (0)")

    # AE-007: fatigue, Grade 3 -> 1 (over-reporting)
    if ae_flagged("AE-007"):
        d = get_ae_descs("AE-007")
        grade_correct = _mentions_any(d, ["grade 1", "1", "over", "over-report", "adl", "downgrade"])
        grade_pts = 15 if grade_correct else 8
        pts += grade_pts
        label = "correctly" if grade_correct else "partially"
        feedback_parts.append(f"βœ“ AE-007 fatigue over-reporting flagged {label} (+{grade_pts})")
    else:
        feedback_parts.append("βœ— AE-007 fatigue over-reporting missed (0)")

    # Bonus for a rationale that covers the systemic issue
    if _mentions_any(str(rationale or ""), ["sae", "reporting", "systematic", "site training", "pattern"]):
        pts += 5
        feedback_parts.append("βœ“ Rationale identifies systemic reporting issues (+5)")

    # Correctly classified AEs: each one flagged is a false positive; leaving
    # all of them alone earns the true-negative baseline.
    correct_ae_ids = ["AE-001", "AE-003", "AE-006"]
    fp_penalty = 0
    for ae_id in correct_ae_ids:
        if ae_flagged(ae_id):
            pts -= 8
            fp_penalty += 8
            feedback_parts.append(f"βœ— {ae_id} incorrectly flagged as misclassified (-8)")
    if fp_penalty == 0:
        pts += 33  # baseline for correctly leaving good AEs alone
        feedback_parts.append("βœ“ No false positives on correctly classified AEs (+33)")

    # Clamp strictly inside (0, 1); raw points are still reported in feedback.
    score = max(0.01, min(0.99, pts / max_pts))
    feedback = f"Task 2 Score: {pts}/{max_pts} ({score:.2f})\n" + "\n".join(feedback_parts)
    return score, feedback
# ===========================================================================
# TASK 3 — Hard: Comprehensive Protocol Amendment Review
# ===========================================================================
# Full protocol excerpt for Task 3 (NEURO-2024-450, version 1.2): Sections 3-7
# of the protocol, each followed by the proposed amendment(s) A-F that the
# reviewer must assess. Registered as the "protocol_text" of the
# "protocol_amendment_review" entry in TASKS.
TASK3_PROTOCOL_TEXT = """
PROTOCOL: NEURO-2024-450 Phase II β€” Drug NX-12 for treatment-resistant depression (TRD)
VERSION: 1.2 (Under Review for Amendment)
SPONSOR: MindBridge Pharmaceuticals
═══════════════════════════════════════════════════
SECTION 3: STUDY DESIGN
═══════════════════════════════════════════════════
Open-label, single-arm study. No control arm. Duration: 12 weeks active treatment.
Primary endpoint: Change from baseline in MADRS score at Week 8.
Secondary endpoint: Response rate (β‰₯50% MADRS reduction) at Week 12.
PROPOSED CHANGE (Amendment A):
Extend treatment from 12 β†’ 24 weeks with no change to primary endpoint timing.
PROPOSED CHANGE (Amendment B):
Add an optional open-label extension (OLE) of up to 52 weeks for responders.
Consent for OLE to be obtained at Week 12 visit (same day as eligibility assessment).
═══════════════════════════════════════════════════
SECTION 4: PATIENT POPULATION
═══════════════════════════════════════════════════
INCLUSION:
- Age 22–65
- DSM-5 diagnosis of MDD, current episode resistant to β‰₯2 adequate antidepressant trials
- MADRS score β‰₯28 at screening AND baseline (within 3 days of screening)
- Capable of providing informed consent
EXCLUSION:
- Active suicidal ideation with plan or intent (C-SSRS score β‰₯4)
- Current or recent (within 6 months) substance use disorder
- Pregnancy or intent to become pregnant
- Current use of MAOIs
PROPOSED CHANGE (Amendment C):
Relax exclusion EC-CSSRS: Allow enrollment of patients with C-SSRS score 4
(active suicidal ideation WITH plan, WITHOUT intent) if monitored weekly.
═══════════════════════════════════════════════════
SECTION 5: SAFETY MONITORING
═══════════════════════════════════════════════════
Data Safety Monitoring Board (DSMB): Annual review only.
Suicidality monitoring: C-SSRS at baseline and Week 8.
Blood pressure monitoring: At screening, Week 4, Week 12.
NX-12 KNOWN RISKS (from Phase I):
- Dose-dependent blood pressure elevation (mean +18 mmHg systolic at max dose)
- Transient dissociative symptoms in 23% of patients at Weeks 1-2
- Two cases of hypertensive urgency in Phase I (n=42)
PROPOSED CHANGE (Amendment D):
Increase DSMB review from annual to quarterly.
Add BP monitoring at Weeks 2 and 8.
═══════════════════════════════════════════════════
SECTION 6: STATISTICAL ANALYSIS PLAN
═══════════════════════════════════════════════════
Sample size: N=45 (powered for 80% power to detect 7-point MADRS change)
Analysis population: Per-protocol (PP) only.
Missing data: Last observation carried forward (LOCF).
Interim analysis: None planned.
PROPOSED CHANGE (Amendment E):
Add a single interim analysis at 50% enrollment for futility only.
═══════════════════════════════════════════════════
SECTION 7: INFORMED CONSENT
═══════════════════════════════════════════════════
Current consent process: Written informed consent obtained at screening visit.
Consent document revision required for all amendments.
PROPOSED CHANGE (Amendment F):
Allow verbal consent for the OLE (Amendment B) to reduce patient burden,
with written consent waived if patient signs OLE enrollment form.
"""
# Ground truth for Task 3: the issues a qualified medical monitor / regulatory
# reviewer would flag. Each entry has:
#   description - human-readable statement of the issue (not read by grade_task3)
#   severity    - "critical" | "major" | "minor"; shown upper-cased in feedback
#   keywords    - lower-case fragments that grade_task3 looks for as substrings
#                 of the combined rationale + findings text
#   weight      - points available for the issue; the weights sum to 100
# Several keywords (e.g. "consent", "c-ssrs", "monitoring") appear under more
# than one issue, so a single mention counts toward each of them.
TASK3_GROUND_TRUTH_ISSUES = {
    "amendment_c_safety": {
        "description": "Amendment C allows suicidal patients (C-SSRSβ‰₯4) with plan β€” unacceptable in TRD trial without intensive monitoring infrastructure beyond 'weekly check'",
        "severity": "critical",
        "keywords": ["suicid", "c-ssrs", "amendment c", "safety", "vulnerable", "risk"],
        "weight": 20,
    },
    "amendment_b_consent_timing": {
        "description": "OLE consent at same visit as eligibility assessment creates undue influence / coercion risk",
        "severity": "major",
        "keywords": ["ole", "consent", "same day", "coercio", "undue", "extension", "amendment b"],
        "weight": 15,
    },
    "amendment_f_verbal_consent": {
        "description": "Verbal consent for OLE is insufficient per ICH E6(R2) GCP β€” written consent required for all interventional trial participation",
        "severity": "critical",
        "keywords": ["verbal", "consent", "gcp", "ich", "written", "waiv", "amendment f"],
        "weight": 15,
    },
    "suicidality_monitoring_gap": {
        "description": "C-SSRS only at baseline and Week 8 is insufficient for TRD + suicidal risk population; should be monthly minimum",
        "severity": "major",
        "keywords": ["c-ssrs", "suicid", "monitoring", "frequen", "weekly", "monthly", "trd"],
        "weight": 15,
    },
    "bp_monitoring_still_insufficient": {
        "description": "Even with Amendment D, BP monitoring is missing at Weeks 1, 6, 16, 20 β€” given +18 mmHg known risk, monthly monitoring minimum is needed during OLE",
        "severity": "major",
        "keywords": ["blood pressure", "bp", "hypertens", "monitoring", "amendment d", "ole"],
        "weight": 10,
    },
    "no_control_arm_bias": {
        "description": "Open-label single-arm design in TRD (high placebo response ~30-40%) without randomized comparator limits interpretability; Amendment A extension compounds this",
        "severity": "major",
        "keywords": ["open-label", "control", "placebo", "bias", "single-arm", "interpretab"],
        "weight": 10,
    },
    "locf_missing_data": {
        "description": "LOCF is discouraged by FDA/EMA for psychiatric trials; multiple imputation or mixed-model repeated measures preferred",
        "severity": "minor",
        "keywords": ["locf", "missing data", "imputation", "fda", "ema", "mmrm"],
        "weight": 8,
    },
    "interim_analysis_alpha_spend": {
        "description": "Amendment E adds futility interim but no alpha-spending rule or stopping boundaries specified β€” protocol gap",
        "severity": "minor",
        "keywords": ["interim", "alpha", "spend", "futility", "boundar", "amendment e"],
        "weight": 7,
    },
}
def grade_task3(findings: List[Dict], rationale: str) -> Tuple[float, str]:
    """
    Score Task 3: Comprehensive Protocol Amendment Review.

    The rationale and every finding's description and recommendation are
    joined into one normalized text. For each entry of
    ``TASK3_GROUND_TRUTH_ISSUES`` the fraction of its keywords that occur as
    substrings of that text (the hit rate) decides the credit:

    - hit rate >= 0.5:  ``int(weight * min(1.0, hit_rate + 0.2))``
    - hit rate >= 0.25: ``int(weight * 0.4)``
    - otherwise:        0

    Bonus: +5 when at least 5 findings have a recommendation longer than 30
    characters, +3 when at least 3 do. Issue weights sum to 100, so raw points
    can reach 105; the score is clamped afterwards.

    Args:
        findings: Finding dicts; the keys read are ``description`` and
            ``recommendation``. Missing or ``None`` values are treated as
            empty; non-dict items are ignored.
        rationale: Free-text rationale; ``None`` is treated as empty.

    Returns:
        ``(score, feedback)`` where ``score`` is ``pts / 100`` clamped to
        [0.01, 0.99] and ``feedback`` is a multi-line explanation.
    """
    max_pts = 100
    pts = 0
    feedback_parts = []

    # Collect every piece of text first and join once; `or ""` guards against
    # fields that are present but set to None.
    dict_findings = [f for f in (findings or []) if isinstance(f, dict)]
    text_parts = [str(rationale or "")]
    for f in dict_findings:
        text_parts.append(str(f.get("description") or ""))
        text_parts.append(str(f.get("recommendation") or ""))
    all_text = _normalize(" ".join(text_parts))

    for issue_key, issue in TASK3_GROUND_TRUTH_ISSUES.items():
        keywords = issue["keywords"]
        weight = issue["weight"]
        severity = issue["severity"].upper()
        hits = sum(1 for k in keywords if k in all_text)
        hit_rate = hits / len(keywords)
        if hit_rate >= 0.5:
            # The +0.2 lets an issue earn full weight at 80% keyword coverage.
            earned = int(weight * min(1.0, hit_rate + 0.2))
            pts += earned
            feedback_parts.append(f"βœ“ [{severity}] {issue_key}: identified ({hits}/{len(keywords)} keywords, +{earned})")
        elif hit_rate >= 0.25:
            earned = int(weight * 0.4)
            pts += earned
            feedback_parts.append(f"~ [{severity}] {issue_key}: partially identified (+{earned})")
        else:
            feedback_parts.append(f"βœ— [{severity}] {issue_key}: missed (0/{weight})")

    # Bonus for structured, actionable recommendations
    n_recommendations = sum(
        1 for f in dict_findings
        if len(str(f.get("recommendation") or "")) > 30
    )
    if n_recommendations >= 5:
        pts += 5
        feedback_parts.append(f"βœ“ {n_recommendations} actionable recommendations provided (+5)")
    elif n_recommendations >= 3:
        pts += 3
        feedback_parts.append(f"~ {n_recommendations} recommendations provided (+3)")

    # Clamp strictly inside (0, 1); raw points are still reported in feedback.
    score = max(0.01, min(0.99, pts / max_pts))
    feedback = f"Task 3 Score: {pts}/{max_pts} ({score:.2f})\n" + "\n".join(feedback_parts)
    return score, feedback
# ===========================================================================
# Task registry
# ===========================================================================
# Registry of the available tasks, keyed by task name. Every entry has the
# same shape:
#   name / difficulty / description - metadata ("easy" | "medium" | "hard")
#   max_steps                       - step budget configured for the task
#   protocol_summary, patient_records, adverse_events, protocol_text
#                                   - scenario inputs; fields a task does not
#                                     use are left empty ([] or "")
#   grader                          - callable (findings, rationale) ->
#                                     (score, feedback)
TASKS = {
    "eligibility_screening": {
        "name": "eligibility_screening",
        "difficulty": "easy",
        "description": "Identify protocol eligibility violations across 5 patient records for ONCO-2024-301.",
        "max_steps": 3,
        "protocol_summary": TASK1_PROTOCOL_SUMMARY,
        "patient_records": TASK1_PATIENT_RECORDS,
        "adverse_events": [],
        "protocol_text": "",
        "grader": grade_task1,
    },
    "ae_classification": {
        "name": "ae_classification",
        "difficulty": "medium",
        "description": "Review 7 adverse events in CARD-2024-112 and identify all misclassifications.",
        "max_steps": 4,
        "protocol_summary": TASK2_PROTOCOL_SUMMARY,
        "patient_records": [],
        "adverse_events": TASK2_ADVERSE_EVENTS,
        "protocol_text": "",
        "grader": grade_task2,
    },
    "protocol_amendment_review": {
        "name": "protocol_amendment_review",
        "difficulty": "hard",
        "description": "Comprehensively review 6 proposed protocol amendments for NEURO-2024-450 and produce structured findings.",
        "max_steps": 5,
        "protocol_summary": "NEURO-2024-450 Phase II β€” Drug NX-12 for treatment-resistant depression (TRD). 6 amendments under review.",
        "patient_records": [],
        "adverse_events": [],
        "protocol_text": TASK3_PROTOCOL_TEXT,
        "grader": grade_task3,
    },
}