Spaces:
Running
Running
| """Tests for the backtest framework. | |
| The mock-LLM path is exercised end-to-end so the test suite stays fast | |
| (<5s) and offline. Real-LLM behaviour is asserted indirectly via the | |
| LLM-factory swap test. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| from pathlib import Path | |
| import pytest | |
| from polyglot_alpha.backtest.outcome_matcher import ( | |
| OutcomeComparison, | |
| compare_questions, | |
| infer_category, | |
| infer_framing, | |
| ) | |
| from polyglot_alpha.backtest.roi_estimator import ( | |
| BUILDER_FEE_BPS, | |
| CAPTURE_RATE_FAIL, | |
| CAPTURE_RATE_PASS, | |
| CAPTURE_RATE_PASS_HIGH, | |
| HIGH_CONFIDENCE_THRESHOLD, | |
| estimate_roi, | |
| ) | |
| from polyglot_alpha.backtest.runner import ( | |
| MarketRecord, | |
| _pick_winner, | |
| load_markets, | |
| run_backtest, | |
| ) | |
| # --------------------------------------------------------------------------- # | |
| # Fixtures # | |
| # --------------------------------------------------------------------------- # | |
@pytest.fixture
def mock_markets() -> list[MarketRecord]:
    """Provide three deterministic market records spanning YES / NO / dispute.

    Registered as a pytest fixture because the end-to-end smoke tests request
    ``mock_markets`` by parameter name; without the decorator pytest cannot
    resolve that argument.

    Returns:
        Three ``MarketRecord`` instances: a YES-resolved crypto market, a
        NO-resolved economics market flagged with ``uma_dispute=True``, and a
        NO-resolved tech market.
    """
    return [
        MarketRecord(
            market_id="bt-1",
            question="Will Bitcoin exceed $100k by 2026-12-31?",
            category="crypto",
            outcome="YES",
            total_volume_usdc=50_000.0,
            uma_dispute=False,
            resolution_source="https://example.com/btc",
        ),
        # The only record carrying a UMA dispute flag.
        MarketRecord(
            market_id="bt-2",
            question="Will the Fed cut rates in March 2026?",
            category="economics",
            outcome="NO",
            total_volume_usdc=20_000.0,
            uma_dispute=True,
            resolution_source="https://example.com/fed",
        ),
        MarketRecord(
            market_id="bt-3",
            question="Will Apple announce a foldable iPhone before 2026-12-31?",
            category="tech",
            outcome="NO",
            total_volume_usdc=8_000.0,
            uma_dispute=False,
            resolution_source="https://example.com/aapl",
        ),
    ]
| # --------------------------------------------------------------------------- # | |
| # ROI estimator # | |
| # --------------------------------------------------------------------------- # | |
class TestRoiEstimator:
    """Checks on ``estimate_roi`` across verdicts, scores and volumes."""

    def test_pass_high_confidence_uses_top_capture_rate(self) -> None:
        """A PASS scored at the high-confidence threshold gets the top rate."""
        volume = 100_000.0
        estimate = estimate_roi(volume, "PASS", HIGH_CONFIDENCE_THRESHOLD)
        assert estimate.capture_rate == pytest.approx(CAPTURE_RATE_PASS_HIGH)

        fee_fraction = BUILDER_FEE_BPS / 10_000.0
        fee_at_top_rate = volume * CAPTURE_RATE_PASS_HIGH * fee_fraction
        assert estimate.builder_fee_usdc == pytest.approx(fee_at_top_rate)

        # Net = builder_fee - agent_cost; expected to stay positive at 100k volume.
        assert estimate.net_roi_usdc > 0

    def test_pass_normal_uses_lower_capture_rate(self) -> None:
        """A PASS scored just under the threshold gets the regular PASS rate."""
        just_below_threshold = HIGH_CONFIDENCE_THRESHOLD - 1
        estimate = estimate_roi(100_000.0, "PASS", just_below_threshold)
        assert estimate.capture_rate == pytest.approx(CAPTURE_RATE_PASS)

    def test_fail_returns_zero_fee(self) -> None:
        """A FAIL verdict earns no builder fee regardless of volume."""
        estimate = estimate_roi(1_000_000.0, "FAIL", 0)
        assert estimate.capture_rate == pytest.approx(CAPTURE_RATE_FAIL)
        assert estimate.builder_fee_usdc == 0.0
        # Net goes negative because of the agent_cost stub.
        assert estimate.net_roi_usdc < 0

    def test_zero_volume_returns_zero(self) -> None:
        """Zero traded volume yields a zero builder fee."""
        estimate = estimate_roi(0.0, "PASS", 95.0)
        assert estimate.builder_fee_usdc == 0.0

    def test_negative_volume_clamped_to_zero(self) -> None:
        """A negative volume yields a zero builder fee rather than a negative one."""
        estimate = estimate_roi(-500.0, "PASS", 95.0)
        assert estimate.builder_fee_usdc == 0.0
| # --------------------------------------------------------------------------- # | |
| # Outcome matcher # | |
| # --------------------------------------------------------------------------- # | |
class TestOutcomeMatcher:
    """Checks on question comparison plus framing and category inference.

    Every ``compare_questions`` call passes ``use_embeddings=False``.
    """

    def test_identical_questions_match_with_jaccard(self) -> None:
        """Identical questions score 1.0 and match a YES resolution."""
        question = "Will Bitcoin exceed $100k by 2026-12-31?"
        comparison: OutcomeComparison = compare_questions(
            question, question, "YES", use_embeddings=False
        )
        assert comparison.semantic_similarity == pytest.approx(1.0)
        assert comparison.semantic_match is True
        assert comparison.framing_predicted == "YES"
        assert comparison.outcome_match is True

    def test_disjoint_questions_low_similarity(self) -> None:
        """Unrelated questions score below 0.3 similarity."""
        agent_question = "Will Apple ship a foldable iPhone?"
        actual_question = "Will the Fed cut interest rates?"
        comparison = compare_questions(
            agent_question, actual_question, "NO", use_embeddings=False
        )
        assert comparison.semantic_similarity < 0.3

    def test_framing_yes_matches_yes_outcome(self) -> None:
        """A YES-framed question counts as a hit when the market resolved YES."""
        question = "Will the policy be announced before December?"
        comparison = compare_questions(
            question, question, "YES", use_embeddings=False
        )
        assert comparison.framing_predicted == "YES"
        assert comparison.outcome_match is True

    def test_framing_yes_misses_on_no_resolution(self) -> None:
        """A YES-framed question counts as a miss when the market resolved NO."""
        question = "Will Apple announce a foldable iPhone?"
        comparison = compare_questions(
            question, question, "NO", use_embeddings=False
        )
        # Question framing is YES, actual is NO -> miss.
        assert comparison.framing_predicted == "YES"
        assert comparison.outcome_match is False

    def test_non_binary_outcome_is_not_matched(self) -> None:
        """A non-YES/NO outcome is never a match and is flagged in the notes."""
        comparison = compare_questions(
            "Will Verstappen win the race?",
            "Race winner?",
            "Verstappen",
            use_embeddings=False,
        )
        assert comparison.outcome_match is False
        assert "non-binary" in comparison.notes

    def test_infer_framing_yes(self) -> None:
        """A "reach ... by" question is inferred as YES-framed."""
        framing = infer_framing("Will X reach 100 by year-end?")
        assert framing == "YES"

    def test_infer_framing_no(self) -> None:
        """A "fail to stay below" question is inferred as NO-framed."""
        # "Below" should mark this as a NO-framing.
        framing = infer_framing("Will X fail to stay below the limit?")
        assert framing == "NO"

    def test_infer_framing_unknown(self) -> None:
        """Text without framing cues is inferred as UNKNOWN."""
        framing = infer_framing("xyz")
        assert framing == "UNKNOWN"

    def test_infer_category_crypto(self) -> None:
        """A Bitcoin question is categorised as crypto."""
        category = infer_category("Will Bitcoin reach $100k?")
        assert category == "crypto"

    def test_infer_category_other(self) -> None:
        """Unrecognised text falls into the "other" category."""
        category = infer_category("Random unrelated string")
        assert category == "other"
| # --------------------------------------------------------------------------- # | |
| # Auction logic # | |
| # --------------------------------------------------------------------------- # | |
class TestAuction:
    """Checks on ``_pick_winner`` bid selection."""

    def test_pick_winner_picks_lowest_bid(self) -> None:
        """The bidder with the smallest bid wins."""
        from random import Random

        bids = {"gemini": 0.30, "deepseek": 0.75, "qwen": 0.40}
        winner = _pick_winner(bids, rng=Random(0))
        assert winner == "gemini"

    def test_pick_winner_handles_ties_deterministically(self) -> None:
        """Two identically seeded RNGs pick the same winner from a three-way tie."""
        from random import Random

        seed = 123
        tied_bids = {"a": 0.5, "b": 0.5, "c": 0.5}
        first_pick = _pick_winner(tied_bids, rng=Random(seed))
        # A fresh generator with the same seed must reproduce the choice.
        second_pick = _pick_winner(tied_bids, rng=Random(seed))
        assert first_pick == second_pick
| # --------------------------------------------------------------------------- # | |
| # Market loader # | |
| # --------------------------------------------------------------------------- # | |
class TestLoadMarkets:
    """Checks on ``load_markets`` with a present and an absent parquet file."""

    def test_loads_from_real_parquet_if_available(self) -> None:
        """Load three records from the corpus parquet, skipping if it is absent."""
        checkout_root = Path(__file__).resolve().parents[1]
        parquet = checkout_root.joinpath("corpus", "polymarket_resolved.parquet")
        if not parquet.exists():
            pytest.skip("resolved markets parquet not present in this checkout")

        loaded = load_markets(n=3, parquet_path=parquet, seed=42)
        assert len(loaded) == 3
        assert all(isinstance(record, MarketRecord) for record in loaded)
        assert all(record.question for record in loaded)

    def test_falls_back_to_sample_json(self, tmp_path: Path) -> None:
        """A missing parquet path still produces at least one record."""
        # Point at a non-existent parquet so the loader uses sample_*.json
        # via the default ``outputs/`` directory.
        absent_parquet = tmp_path / "missing.parquet"
        loaded = load_markets(n=2, parquet_path=absent_parquet, seed=42)
        assert len(loaded) >= 1
        assert all(isinstance(record, MarketRecord) for record in loaded)
| # --------------------------------------------------------------------------- # | |
| # End-to-end smoke test (mock LLM) # | |
| # --------------------------------------------------------------------------- # | |
class TestRunBacktestSmoke:
    """End-to-end smoke tests that run the backtest with the mock LLM."""

    def test_full_pipeline_with_mock_llm(
        self,
        mock_markets: list[MarketRecord],
        tmp_path: Path,
    ) -> None:
        """Run the async pipeline and check its summary and output files."""
        import asyncio

        from polyglot_alpha.backtest.runner import run_backtest_async

        n_expected = len(mock_markets)
        pipeline = run_backtest_async(
            n=n_expected,
            seed=42,
            output_dir=tmp_path,
            mock_llm=True,
            use_embeddings=False,  # avoid the sentence-transformers download
            markets=mock_markets,
        )
        summary = asyncio.run(pipeline)
        assert summary["n_markets"] == n_expected

        # Output files should have landed in tmp_path.
        jsonl = tmp_path / "per_market_results.jsonl"
        summary_path = tmp_path / "summary.json"
        report_path = tmp_path / "backtest_report.md"
        for artifact in (jsonl, summary_path, report_path):
            assert artifact.exists()

        # Re-read JSONL and confirm one row per market (empty lines ignored).
        raw_lines = jsonl.read_text().splitlines()
        rows = [json.loads(raw) for raw in filter(None, raw_lines)]
        assert len(rows) == n_expected

        # Sanity-check required fields per the spec on the first row.
        required_keys = (
            "market_id",
            "actual_question",
            "actual_outcome",
            "actual_volume",
            "agent_winner",
            "agent_question",
            "judge_verdict",
            "judge_score",
            "semantic_similarity",
            "outcome_match",
            "estimated_roi_usdc",
            "uma_dispute",
            "category",
            "notes",
        )
        row0 = rows[0]
        for key in required_keys:
            assert key in row0, f"missing key {key} in row"

        # Markdown report is non-empty and labelled.
        report_text = report_path.read_text()
        for heading in ("PolyglotAlpha v2 Backtest Report", "Executive summary"):
            assert heading in report_text

    def test_run_backtest_sync_wrapper(
        self,
        mock_markets: list[MarketRecord],
        tmp_path: Path,
    ) -> None:
        """Run the synchronous wrapper and check the summary keys it returns."""
        n_expected = len(mock_markets)
        summary = run_backtest(
            n=n_expected,
            seed=99,
            output_dir=tmp_path,
            mock_llm=True,
            use_embeddings=False,
            markets=mock_markets,
        )
        assert summary["n_markets"] == n_expected
        for field in ("outcome_accuracy", "estimated_total_roi_usdc"):
            assert field in summary