Spaces:
Sleeping
Sleeping
| from datetime import datetime, timezone | |
| from pathlib import Path | |
| import sys | |
| sys.path.insert(0, str(Path(__file__).resolve().parents[1])) | |
| from engine.analytics import AnalyticsEngine | |
| from engine.backtest import build_benchmark_suite, run_backtest | |
def test_backtest_uses_holdout_suite_separate_from_tuning():
    """Check that tuning and holdout scenario names are non-empty and do not overlap."""
    benchmark = build_benchmark_suite(reference_time=datetime.now(timezone.utc))

    # Collect the scenario names of each split into its own set.
    names_for_tuning = set()
    for entry in benchmark.tuning:
        names_for_tuning.add(entry.name)

    names_for_holdout = set()
    for entry in benchmark.holdout:
        names_for_holdout.add(entry.name)

    # Both splits must contribute at least one name.
    assert len(names_for_tuning) > 0
    assert len(names_for_holdout) > 0

    # No scenario name may appear in both splits.
    assert not (names_for_tuning & names_for_holdout)
def test_sentiment_holdout_backtest_has_reasonable_accuracy_without_overcalling():
    """Run the backtest on the holdout scenarios and check its metrics against fixed thresholds."""
    benchmark = build_benchmark_suite(reference_time=datetime.now(timezone.utc))
    outcome = run_backtest(benchmark.holdout, engine=AnalyticsEngine())
    scores = outcome["metrics"]

    # The reported scenario count must equal the number of holdout scenarios.
    assert scores["scenario_count"] == len(benchmark.holdout)

    # Metrics that must stay at or above a floor, checked in this order.
    lower_bounds = (
        ("overall_accuracy", 0.78),
        ("decisive_precision", 0.75),
        ("mixed_accuracy", 0.66),
    )
    for metric_key, floor in lower_bounds:
        assert scores[metric_key] >= floor

    # The overcall rate is capped from above; coverage has its own floor.
    assert scores["overcall_rate"] <= 0.22
    assert scores["coverage"] >= 0.3