| """Test script to validate both gap fixes. | |
| Tests: | |
| 1. Gap 1: RL training uses EDA-derived parameters | |
| 2. Gap 2: Ripeness feedback loop works | |
| """ | |
| from datetime import date, datetime | |
| from rl.config import RLTrainingConfig | |
| from rl.simple_agent import TabularQAgent | |
| from rl.training import RLTrainingEnvironment, train_agent | |
| from src.core.ripeness import RipenessClassifier, RipenessStatus | |
| from src.data.case_generator import CaseGenerator | |
| from src.data.param_loader import ParameterLoader | |
| from src.monitoring.ripeness_calibrator import RipenessCalibrator | |
| from src.monitoring.ripeness_metrics import RipenessMetrics | |
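
# NOTE: the imports above are project-internal modules, so the interfaces
# exercised below (ParameterLoader.get_adjournment_prob, the RipenessMetrics
# record/accuracy methods, RipenessCalibrator.analyze_metrics, the
# RipenessClassifier threshold getters/setters, and train_agent's stats dict)
# are assumed from their usage in this script rather than documented here.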


def test_gap1_eda_alignment():
    """Test that RL training uses EDA-derived parameters."""
    print("\n" + "=" * 70)
    print("GAP 1: Testing EDA Alignment in RL Training")
    print("=" * 70)

    # Generate test cases
    generator = CaseGenerator(
        start=date(2024, 1, 1),
        end=date(2024, 1, 31),
        seed=42,
    )
    cases = generator.generate(100, stage_mix_auto=True)

    # Create environment with param_loader
    env = RLTrainingEnvironment(
        cases=cases,
        start_date=date(2024, 1, 1),
        horizon_days=30,
    )

    # Verify param_loader exists
    assert hasattr(env, "param_loader"), "Environment should have param_loader"
    assert isinstance(env.param_loader, ParameterLoader), (
        "param_loader should be a ParameterLoader instance"
    )
    print("ParameterLoader successfully integrated into RLTrainingEnvironment")

    # Test that the hearing-outcome simulation uses EDA parameters
    test_case = cases[0]
    test_case.current_stage = "ADMISSION"
    test_case.case_type = "RSA"

    # Get the EDA-derived adjournment probability
    p_adj_eda = env.param_loader.get_adjournment_prob("ADMISSION", "RSA")
    print(f"EDA adjournment probability for ADMISSION/RSA: {p_adj_eda:.2%}")

    # Simulate outcomes multiple times and check alignment
    outcomes = []
    for _ in range(100):
        outcome = env._simulate_hearing_outcome(test_case)
        outcomes.append(outcome)

    adjourn_rate = sum(1 for o in outcomes if o == "ADJOURNED") / len(outcomes)
    print(f"Simulated adjournment rate: {adjourn_rate:.2%}")
    print(f" Difference from EDA: {abs(adjourn_rate - p_adj_eda):.2%}")
    # Should be within 15 percentage points of the EDA value (stochastic sampling)
    assert abs(adjourn_rate - p_adj_eda) < 0.15, (
        f"Adjournment rate {adjourn_rate:.2%} too far from EDA {p_adj_eda:.2%}"
    )
    print("\n✅ GAP 1 FIXED: RL training now uses EDA-derived parameters\n")


def test_gap2_ripeness_feedback():
    """Test that the ripeness feedback loop works."""
    print("\n" + "=" * 70)
    print("GAP 2: Testing Ripeness Feedback Loop")
    print("=" * 70)

    # Create metrics tracker
    metrics = RipenessMetrics()

    # Simulate predictions and outcomes (the calibrator needs 50+)
    test_cases = []
    # Pattern: half of the RIPE predictions are false positives (RIPE but
    # adjourned), and half of the UNRIPE predictions are false negatives
    for i in range(50):
        if i % 4 == 0:
            test_cases.append((f"case{i}", RipenessStatus.RIPE, False))  # Correct RIPE
        elif i % 4 == 1:
            test_cases.append((f"case{i}", RipenessStatus.RIPE, True))  # False positive
        elif i % 4 == 2:
            test_cases.append(
                (f"case{i}", RipenessStatus.UNRIPE_SUMMONS, True)
            )  # Correct UNRIPE
        else:
            test_cases.append(
                (f"case{i}", RipenessStatus.UNRIPE_SUMMONS, False)
            )  # False negative
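
    # For range(50) this yields 26 RIPE predictions (13 correct, 13 false
    # positives) and 24 UNRIPE predictions (12 correct, 12 false negatives),
    # i.e. a 50% error rate on each side.
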
    prediction_date = datetime(2024, 1, 1)
    outcome_date = datetime(2024, 1, 2)
    for case_id, predicted_status, was_adjourned in test_cases:
        metrics.record_prediction(case_id, predicted_status, prediction_date)
        actual_outcome = "ADJOURNED" if was_adjourned else "ARGUMENTS"
        metrics.record_outcome(case_id, actual_outcome, was_adjourned, outcome_date)
    print(f"Recorded {len(test_cases)} predictions and outcomes")

    # Get accuracy metrics
    accuracy = metrics.get_accuracy_metrics()
    print("\n Accuracy Metrics:")
    print(f" False positive rate: {accuracy['false_positive_rate']:.1%}")
    print(f" False negative rate: {accuracy['false_negative_rate']:.1%}")
    print(f" RIPE precision: {accuracy['ripe_precision']:.1%}")
    print(f" UNRIPE recall: {accuracy['unripe_recall']:.1%}")

    # Expected: ~50% false positive rate and ~50% false negative rate, per the
    # pattern above; assert with some slack
    assert accuracy["false_positive_rate"] > 0.4, "Should detect false positives"
    assert accuracy["false_negative_rate"] > 0.4, "Should detect false negatives"
    print("\nRipenessMetrics successfully tracks classification accuracy")

    # Test calibrator
    adjustments = RipenessCalibrator.analyze_metrics(metrics)
    print(f"\nRipenessCalibrator generated {len(adjustments)} adjustment suggestions:")
    for adj in adjustments:
        print(
            f" - {adj.threshold_name}: {adj.current_value} → {adj.suggested_value}"
        )
        print(f" Reason: {adj.reason[:80]}...")
    assert len(adjustments) > 0, "Should suggest at least one adjustment"
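
    # Hypothetical wiring of the feedback loop (not executed here): a
    # suggestion could be applied with the same API exercised below, e.g.
    #   RipenessClassifier.set_thresholds(
    #       {adj.threshold_name: adj.suggested_value for adj in adjustments}
    #   )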

    # Test threshold configuration
    original_thresholds = RipenessClassifier.get_current_thresholds()
    print(f"\nCurrent thresholds: {original_thresholds}")

    # Apply a test adjustment; restore the originals in a finally block so the
    # class-level state cannot leak into later tests if an assertion fails
    try:
        test_thresholds = {"MIN_SERVICE_HEARINGS": 2}
        RipenessClassifier.set_thresholds(test_thresholds)
        new_thresholds = RipenessClassifier.get_current_thresholds()
        assert new_thresholds["MIN_SERVICE_HEARINGS"] == 2, "Threshold should be updated"
        print(f"Thresholds successfully updated: {new_thresholds}")
    finally:
        # Restore original
        RipenessClassifier.set_thresholds(original_thresholds)

    print("\n✅ GAP 2 FIXED: Ripeness feedback loop fully operational\n")


def test_end_to_end():
    """Quick end-to-end test with a small training run."""
    print("\n" + "=" * 70)
    print("END-TO-END: Testing Both Gaps Together")
    print("=" * 70)

    # Create agent
    agent = TabularQAgent(learning_rate=0.15, epsilon=0.4, discount=0.95)

    # Minimal training config
    config = RLTrainingConfig(
        episodes=2,
        episode_length_days=10,
        cases_per_episode=50,
        training_seed=42,
    )
    print("Running mini training (2 episodes, 50 cases, 10 days)...")
    stats = train_agent(agent, rl_config=config, verbose=False)
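
    # "episodes", "disposal_rates" and "states_explored" are assumed (from the
    # usage below) to be per-episode lists in the stats dict that train_agent
    # returns.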
    assert len(stats["episodes"]) == 2, "Should complete 2 episodes"
    assert stats["episodes"][-1] == 1, "Last episode index should be 1 (zero-indexed)"
    print(f"Training completed: {len(stats['episodes'])} episodes")
    print(f" Final disposal rate: {stats['disposal_rates'][-1]:.1%}")
    print(f" States explored: {stats['states_explored'][-1]}")
    print("\n✅ END-TO-END: Both gaps working together successfully\n")


if __name__ == "__main__":
    print("\n" + "=" * 70)
    print("TESTING GAP FIXES")
    print("=" * 70)
    try:
        test_gap1_eda_alignment()
        test_gap2_ripeness_feedback()
        test_end_to_end()

        print("\n" + "=" * 70)
        print("ALL TESTS PASSED")
        print("=" * 70)
        print("\nSummary:")
        print(" ✅ Gap 1: RL training aligned with EDA parameters")
        print(" ✅ Gap 2: Ripeness feedback loop operational")
        print(" ✅ End-to-end: Both gaps working together")
        print("\nBoth confirmed gaps are now FIXED!")
        print("=" * 70 + "\n")
    except Exception as e:
        print(f"\nTEST FAILED: {e}")
        raise