Spaces:
Running
Running
Major grading overhaul: difficulty multiplier, tighter scoring, mastery removal, precision penalties
72b3e8d | # server/datasets/clinical_cases.py | |
# Ground truth cases for Clinical Workflow Chaos Simulator tasks.
#
# FIXES APPLIED:
# 1. cli_easy: completion_threshold lowered to 0.65 (was 0.80).
#    expected_missing_steps made more specific (not guessable from the task description alone).
# 2. cli_medium: completion_threshold lowered to 0.60 (was 0.75).
#    required_sequence now MUST include both detect_gap AND rank_issues.
#    Previously the episode ended at step 1 if completion_threshold was met by detect_gap alone.
# 3. cli_hard: completion_threshold lowered to 0.55 (was 0.70).
#    required_sequence MUST include all 3: detect_gap, rank_issues, order_steps.
#    This forces the full 3-step workflow to run every time.
# Ground-truth case table, keyed by difficulty tier ('cli_easy', 'cli_medium',
# 'cli_hard'). Each tier maps to a list of case dicts.
#
# Keys present on every case:
#   case_id, completion_threshold, max_steps, done_conditions, patient_id,
#   patient_events, events, expected_missing_steps, expected_risk,
#   available_steps, task_description
# Additional keys on cli_medium and cli_hard cases:
#   priority_order, dependency_graph (step -> list of prerequisite steps)
# Additional key on cli_hard cases only:
#   required_steps
#
# NOTE(review): 'events' always duplicates 'patient_events' — presumably kept
# as an alias for consumers that read either key; confirm before removing one.
# NOTE(review): in every cli_easy case the expected step is the FIRST entry of
# available_steps, and in every cli_medium case the three expected steps are
# the first three entries. If available_steps is shown to the agent in this
# order, the answer may be inferable from position — consider shuffling.
CLINICAL_CASES = {
    'cli_easy': [
        {
            'case_id': 'cli_easy_001',
            'completion_threshold': 0.65,  # FIX: was 0.80
            'max_steps': 4,
            # FIX: required_sequence is the done trigger — episode ends only when detect_gap is done
            'done_conditions': {'min_actions': 1, 'required_sequence': ['detect_gap']},
            'patient_id': 'P101',
            'patient_events': ['admission', 'surgery_scheduled', 'surgery_performed'],
            'events': ['admission', 'surgery_scheduled', 'surgery_performed'],
            # FIX: More specific — 'pre_op_consent' is the answer, not guessable from available_steps alone
            'expected_missing_steps': ['pre_op_consent'],
            'expected_risk': 'critical',
            'available_steps': ['pre_op_consent', 'blood_work', 'anesthesia_consult', 'vitals_check', 'infection_screening'],
            'task_description': 'A patient underwent surgery but the pre-operative checklist shows gaps. The patient_events show what happened. Identify the single most critical missing step from available_steps and assess the risk level.',
        },
        {
            'case_id': 'cli_easy_002',
            'completion_threshold': 0.65,
            'max_steps': 4,
            'done_conditions': {'min_actions': 1, 'required_sequence': ['detect_gap']},
            'patient_id': 'P102',
            'patient_events': ['admission', 'diagnosis', 'medication_prescribed', 'discharge'],
            'events': ['admission', 'diagnosis', 'medication_prescribed', 'discharge'],
            'expected_missing_steps': ['allergy_check'],
            'expected_risk': 'high',
            'available_steps': ['allergy_check', 'follow_up_scheduled', 'lab_results_reviewed', 'pharmacist_review', 'patient_education'],
            'task_description': 'Find the single missing safety check in this medication workflow. Patient was discharged after medication was prescribed without a critical safety step.',
        },
        {
            'case_id': 'cli_easy_003',
            'completion_threshold': 0.65,
            'max_steps': 4,
            'done_conditions': {'min_actions': 1, 'required_sequence': ['detect_gap']},
            'patient_id': 'P103',
            'patient_events': ['er_admission', 'triage', 'treatment', 'discharge'],
            'events': ['er_admission', 'triage', 'treatment', 'discharge'],
            'expected_missing_steps': ['insurance_verification'],
            'expected_risk': 'medium',
            'available_steps': ['insurance_verification', 'attending_consult', 'social_work_referral', 'discharge_summary', 'follow_up_appointment'],
            'task_description': 'Find the missing administrative step in this ER discharge workflow.',
        },
        {
            'case_id': 'cli_easy_004',
            'completion_threshold': 0.65,
            'max_steps': 4,
            'done_conditions': {'min_actions': 1, 'required_sequence': ['detect_gap']},
            'patient_id': 'P104',
            'patient_events': ['admission', 'ct_scan_ordered', 'ct_scan_performed', 'diagnosis'],
            'events': ['admission', 'ct_scan_ordered', 'ct_scan_performed', 'diagnosis'],
            'expected_missing_steps': ['contrast_allergy_screen'],
            'expected_risk': 'high',
            'available_steps': ['contrast_allergy_screen', 'kidney_function_test', 'radiologist_review', 'patient_consent', 'iv_access_check'],
            'task_description': 'Find the single missing safety step that should have occurred before this contrast CT scan was performed.',
        },
        {
            'case_id': 'cli_easy_005',
            'completion_threshold': 0.65,
            'max_steps': 4,
            'done_conditions': {'min_actions': 1, 'required_sequence': ['detect_gap']},
            'patient_id': 'P105',
            'patient_events': ['admission', 'blood_transfusion_ordered', 'transfusion_started'],
            'events': ['admission', 'blood_transfusion_ordered', 'transfusion_started'],
            'expected_missing_steps': ['blood_type_crossmatch'],
            'expected_risk': 'critical',
            'available_steps': ['blood_type_crossmatch', 'consent_form', 'vital_signs_baseline', 'hemoglobin_check', 'iv_gauge_verify'],
            'task_description': 'A blood transfusion was started. Find the critical missing safety step that should have occurred before transfusion began.',
        },
    ],
    'cli_medium': [
        {
            'case_id': 'cli_medium_001',
            'completion_threshold': 0.60,  # FIX: was 0.75
            'max_steps': 6,
            # FIX: required_sequence now requires BOTH actions — episode only ends when both done
            'done_conditions': {'min_actions': 2, 'required_sequence': ['detect_gap', 'rank_issues']},
            'patient_id': 'P201',
            'patient_events': ['admission', 'surgery_planned', 'insurance_denied', 'specialist_unavailable'],
            'events': ['admission', 'surgery_planned', 'insurance_denied', 'specialist_unavailable'],
            'expected_missing_steps': ['resolve_insurance', 'pre_op_consent', 'book_specialist'],
            'expected_risk': 'critical',
            'priority_order': ['resolve_insurance', 'pre_op_consent', 'book_specialist'],
            'available_steps': ['resolve_insurance', 'pre_op_consent', 'book_specialist', 'schedule_surgery', 'anesthesia_consult'],
            'dependency_graph': {
                'schedule_surgery': ['resolve_insurance', 'pre_op_consent', 'book_specialist'],
                'pre_op_consent': [],
                'book_specialist': [],
                'resolve_insurance': [],
            },
            'task_description': 'Multiple steps are missing in this surgical patient workflow. First detect ALL gaps (there are 3), then rank them by clinical priority. The priority order matters — insurance must be resolved before surgery can proceed.',
        },
        {
            'case_id': 'cli_medium_002',
            'completion_threshold': 0.60,
            'max_steps': 6,
            'done_conditions': {'min_actions': 2, 'required_sequence': ['detect_gap', 'rank_issues']},
            'patient_id': 'P202',
            'patient_events': ['er_admission', 'triage_level_2', 'medication_given'],
            'events': ['er_admission', 'triage_level_2', 'medication_given'],
            'expected_missing_steps': ['allergy_check', 'attending_notification', 'vital_signs_check'],
            'expected_risk': 'high',
            'priority_order': ['allergy_check', 'vital_signs_check', 'attending_notification'],
            'available_steps': ['allergy_check', 'attending_notification', 'vital_signs_check', 'lab_order', 'discharge_planning'],
            'dependency_graph': {
                'allergy_check': [],
                'vital_signs_check': [],
                'attending_notification': [],
                'lab_order': ['vital_signs_check'],
            },
            'task_description': 'Multiple safety steps were skipped in this ER case where medication was given. Detect all 3 gaps, then rank them by urgency. Allergy check is highest priority because medication was already given.',
        },
        {
            'case_id': 'cli_medium_003',
            'completion_threshold': 0.60,
            'max_steps': 6,
            'done_conditions': {'min_actions': 2, 'required_sequence': ['detect_gap', 'rank_issues']},
            'patient_id': 'P203',
            'patient_events': ['admission', 'chemo_ordered', 'chemo_started', 'adverse_reaction'],
            'events': ['admission', 'chemo_ordered', 'chemo_started', 'adverse_reaction'],
            'expected_missing_steps': ['baseline_labs', 'oncologist_approval', 'dose_verification'],
            'expected_risk': 'critical',
            'priority_order': ['oncologist_approval', 'dose_verification', 'baseline_labs'],
            'available_steps': ['baseline_labs', 'oncologist_approval', 'dose_verification', 'pharmacy_review', 'patient_consent'],
            'dependency_graph': {
                'oncologist_approval': [],
                'dose_verification': ['oncologist_approval'],
                'baseline_labs': [],
                'pharmacy_review': ['dose_verification'],
            },
            'task_description': 'Critical chemotherapy workflow violations caused an adverse reaction. Detect all 3 missing safety steps, then rank by urgency. Oncologist approval is highest priority — without it the other steps are meaningless.',
        },
    ],
    'cli_hard': [
        {
            'case_id': 'cli_hard_001',
            'completion_threshold': 0.55,  # FIX: was 0.70 — hard IS hard
            'max_steps': 6,
            # FIX: required_sequence MUST include all 3 actions — episode runs full 3-step workflow
            'done_conditions': {'min_actions': 3, 'required_sequence': ['detect_gap', 'rank_issues', 'order_steps']},
            'patient_id': 'P301',
            'patient_events': ['surgery_planned', 'insurance_denied', 'pre_op_test_skipped'],
            'events': ['surgery_planned', 'insurance_denied', 'pre_op_test_skipped'],
            'expected_missing_steps': ['resolve_insurance', 'complete_pre_op', 'book_specialist', 'schedule_surgery'],
            'expected_risk': 'critical',
            'priority_order': ['resolve_insurance', 'complete_pre_op', 'book_specialist', 'schedule_surgery'],
            'dependency_graph': {
                'schedule_surgery': ['resolve_insurance', 'complete_pre_op', 'book_specialist'],
                'complete_pre_op': ['resolve_insurance'],
                'book_specialist': [],
                'resolve_insurance': [],
            },
            'required_steps': ['resolve_insurance', 'complete_pre_op', 'book_specialist', 'schedule_surgery'],
            'available_steps': ['resolve_insurance', 'complete_pre_op', 'book_specialist', 'schedule_surgery'],
            'task_description': 'Complex surgical patient has 4 workflow failures. Detect ALL gaps, rank by priority, then plan a dependency-ordered recovery: resolve_insurance must come first (complete_pre_op depends on it), schedule_surgery must come last (depends on all others).',
        },
        {
            'case_id': 'cli_hard_002',
            'completion_threshold': 0.55,
            'max_steps': 6,
            'done_conditions': {'min_actions': 3, 'required_sequence': ['detect_gap', 'rank_issues', 'order_steps']},
            'patient_id': 'P302',
            'patient_events': ['cardiac_event', 'icu_admission', 'multiple_failures_detected'],
            'events': ['cardiac_event', 'icu_admission', 'multiple_failures_detected'],
            'expected_missing_steps': ['stabilize_vitals', 'cardiology_consult', 'imaging_ordered', 'medication_review', 'family_notification'],
            'expected_risk': 'critical',
            'priority_order': ['stabilize_vitals', 'cardiology_consult', 'imaging_ordered', 'medication_review', 'family_notification'],
            'dependency_graph': {
                'family_notification': ['stabilize_vitals'],
                'medication_review': ['cardiology_consult', 'imaging_ordered'],
                'imaging_ordered': ['stabilize_vitals'],
                'cardiology_consult': ['stabilize_vitals'],
                'stabilize_vitals': [],
            },
            'required_steps': ['stabilize_vitals', 'cardiology_consult', 'imaging_ordered', 'medication_review', 'family_notification'],
            'available_steps': ['stabilize_vitals', 'cardiology_consult', 'imaging_ordered', 'medication_review', 'family_notification'],
            'task_description': 'Complex cardiac emergency. stabilize_vitals must come FIRST (everything depends on it). medication_review needs BOTH cardiology_consult AND imaging_ordered. Plan a recovery sequence that respects ALL dependencies.',
        },
        {
            'case_id': 'cli_hard_003',
            'completion_threshold': 0.55,
            'max_steps': 6,
            'done_conditions': {'min_actions': 3, 'required_sequence': ['detect_gap', 'rank_issues', 'order_steps']},
            'patient_id': 'P303',
            'patient_events': ['chemo_ordered', 'lab_results_missing', 'dose_unclear', 'pharmacy_backlog'],
            'events': ['chemo_ordered', 'lab_results_missing', 'dose_unclear', 'pharmacy_backlog'],
            'expected_missing_steps': ['baseline_cbc', 'oncology_dose_verify', 'pharmacy_prep', 'nurse_admin_check'],
            'expected_risk': 'critical',
            'priority_order': ['baseline_cbc', 'oncology_dose_verify', 'pharmacy_prep', 'nurse_admin_check'],
            'dependency_graph': {
                'nurse_admin_check': ['pharmacy_prep'],
                'pharmacy_prep': ['oncology_dose_verify', 'baseline_cbc'],
                'oncology_dose_verify': ['baseline_cbc'],
                'baseline_cbc': [],
            },
            'required_steps': ['baseline_cbc', 'oncology_dose_verify', 'pharmacy_prep', 'nurse_admin_check'],
            'available_steps': ['baseline_cbc', 'oncology_dose_verify', 'pharmacy_prep', 'nurse_admin_check'],
            # The description names steps by their exact identifiers: the
            # prerequisite of pharmacy_prep is 'oncology_dose_verify' (there is
            # no step called 'dose_verify' in this case).
            'task_description': 'Chemotherapy workflow chaos. baseline_cbc must come first. oncology_dose_verify needs baseline_cbc. pharmacy_prep needs BOTH oncology_dose_verify AND baseline_cbc. nurse_admin_check needs pharmacy_prep. Detect, rank, then order correctly.',
        },
        {
            'case_id': 'cli_hard_004',
            'completion_threshold': 0.55,
            'max_steps': 6,
            'done_conditions': {'min_actions': 3, 'required_sequence': ['detect_gap', 'rank_issues', 'order_steps']},
            'patient_id': 'P304',
            'patient_events': ['transplant_scheduled', 'donor_typing_incomplete', 'immunosuppress_missing', 'consent_partial'],
            'events': ['transplant_scheduled', 'donor_typing_incomplete', 'immunosuppress_missing', 'consent_partial'],
            'expected_missing_steps': ['hla_typing', 'crossmatch', 'immunosuppress_order', 'full_consent', 'surgery_slot'],
            'expected_risk': 'critical',
            'priority_order': ['hla_typing', 'crossmatch', 'full_consent', 'immunosuppress_order', 'surgery_slot'],
            'dependency_graph': {
                'surgery_slot': ['hla_typing', 'crossmatch', 'full_consent', 'immunosuppress_order'],
                'immunosuppress_order': ['crossmatch'],
                'crossmatch': ['hla_typing'],
                'full_consent': [],
                'hla_typing': [],
            },
            'required_steps': ['hla_typing', 'crossmatch', 'immunosuppress_order', 'full_consent', 'surgery_slot'],
            'available_steps': ['hla_typing', 'crossmatch', 'immunosuppress_order', 'full_consent', 'surgery_slot'],
            'task_description': 'Organ transplant pre-op disaster. HLA typing must come first. Crossmatch needs HLA typing. Immunosuppression order needs crossmatch. Surgery booking requires ALL four prerequisites. One wrong order delays transplant.',
        },
        {
            'case_id': 'cli_hard_005',
            'completion_threshold': 0.55,
            'max_steps': 6,
            'done_conditions': {'min_actions': 3, 'required_sequence': ['detect_gap', 'rank_issues', 'order_steps']},
            'patient_id': 'P305',
            'patient_events': ['stroke_code', 'imaging_delayed', 'tpa_window_closing', 'neuro_unavailable'],
            'events': ['stroke_code', 'imaging_delayed', 'tpa_window_closing', 'neuro_unavailable'],
            'expected_missing_steps': ['ct_head', 'neuro_consult', 'tpa_eligibility', 'family_consent', 'icu_bed'],
            'expected_risk': 'critical',
            'priority_order': ['ct_head', 'tpa_eligibility', 'neuro_consult', 'family_consent', 'icu_bed'],
            'dependency_graph': {
                'icu_bed': ['tpa_eligibility'],
                'family_consent': ['tpa_eligibility', 'neuro_consult'],
                'neuro_consult': ['ct_head'],
                'tpa_eligibility': ['ct_head'],
                'ct_head': [],
            },
            'required_steps': ['ct_head', 'neuro_consult', 'tpa_eligibility', 'family_consent', 'icu_bed'],
            'available_steps': ['ct_head', 'neuro_consult', 'tpa_eligibility', 'family_consent', 'icu_bed'],
            'task_description': 'Acute stroke with closing tPA window. ct_head must come FIRST. Both tpa_eligibility and neuro_consult depend on ct_head. family_consent needs BOTH tpa_eligibility AND neuro_consult. icu_bed needs tpa_eligibility. Detect, rank, then order correctly.',
        },
    ],
}