File size: 11,274 Bytes
88d2f2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
"""Tests for the backtest framework.

The mock-LLM path is exercised end-to-end so the test suite stays fast
(<5s) and offline. Real-LLM behaviour is asserted indirectly via the
LLM-factory swap test.
"""

from __future__ import annotations

import json
from pathlib import Path

import pytest

from polyglot_alpha.backtest.outcome_matcher import (
    OutcomeComparison,
    compare_questions,
    infer_category,
    infer_framing,
)
from polyglot_alpha.backtest.roi_estimator import (
    BUILDER_FEE_BPS,
    CAPTURE_RATE_FAIL,
    CAPTURE_RATE_PASS,
    CAPTURE_RATE_PASS_HIGH,
    HIGH_CONFIDENCE_THRESHOLD,
    estimate_roi,
)
from polyglot_alpha.backtest.runner import (
    MarketRecord,
    _pick_winner,
    load_markets,
    run_backtest,
)


# --------------------------------------------------------------------------- #
# Fixtures                                                                    #
# --------------------------------------------------------------------------- #


@pytest.fixture()
def mock_markets() -> list[MarketRecord]:
    """Return three fixed ``MarketRecord`` instances for the backtest tests.

    The records cover a YES outcome, a NO outcome flagged with
    ``uma_dispute=True``, and a NO outcome without a dispute.
    """

    # Column order: market_id, question, category, outcome,
    # total_volume_usdc, uma_dispute, resolution_source.
    table = (
        (
            "bt-1",
            "Will Bitcoin exceed $100k by 2026-12-31?",
            "crypto",
            "YES",
            50_000.0,
            False,
            "https://example.com/btc",
        ),
        (
            "bt-2",
            "Will the Fed cut rates in March 2026?",
            "economics",
            "NO",
            20_000.0,
            True,
            "https://example.com/fed",
        ),
        (
            "bt-3",
            "Will Apple announce a foldable iPhone before 2026-12-31?",
            "tech",
            "NO",
            8_000.0,
            False,
            "https://example.com/aapl",
        ),
    )
    return [
        MarketRecord(
            market_id=market_id,
            question=question,
            category=category,
            outcome=outcome,
            total_volume_usdc=volume,
            uma_dispute=disputed,
            resolution_source=source,
        )
        for market_id, question, category, outcome, volume, disputed, source in table
    ]


# --------------------------------------------------------------------------- #
# ROI estimator                                                               #
# --------------------------------------------------------------------------- #


class TestRoiEstimator:
    """Checks ``estimate_roi`` capture rates, builder fees and net-ROI sign."""

    def test_pass_high_confidence_uses_top_capture_rate(self) -> None:
        """A PASS scored at ``HIGH_CONFIDENCE_THRESHOLD`` gets the high capture rate."""
        volume = 100_000.0
        result = estimate_roi(volume, "PASS", HIGH_CONFIDENCE_THRESHOLD)
        fee_should_be = volume * CAPTURE_RATE_PASS_HIGH * (BUILDER_FEE_BPS / 10_000.0)

        assert result.capture_rate == pytest.approx(CAPTURE_RATE_PASS_HIGH)
        assert result.builder_fee_usdc == pytest.approx(fee_should_be)
        # Net is the builder fee less the agent cost; a 100k market is
        # expected to stay in the black.
        assert result.net_roi_usdc > 0

    def test_pass_normal_uses_lower_capture_rate(self) -> None:
        """A PASS one point under the threshold gets the regular PASS rate."""
        just_below = HIGH_CONFIDENCE_THRESHOLD - 1
        result = estimate_roi(100_000.0, "PASS", just_below)
        assert result.capture_rate == pytest.approx(CAPTURE_RATE_PASS)

    def test_fail_returns_zero_fee(self) -> None:
        """A FAIL verdict earns no builder fee, whatever the volume."""
        result = estimate_roi(1_000_000.0, "FAIL", 0)
        assert result.capture_rate == pytest.approx(CAPTURE_RATE_FAIL)
        assert result.builder_fee_usdc == 0.0
        # With no fee the stubbed agent cost pushes the net below zero.
        assert result.net_roi_usdc < 0

    def test_zero_volume_returns_zero(self) -> None:
        """No traded volume means no builder fee."""
        result = estimate_roi(0.0, "PASS", 95.0)
        assert result.builder_fee_usdc == 0.0

    def test_negative_volume_clamped_to_zero(self) -> None:
        """A negative volume yields a zero builder fee, not a negative one."""
        result = estimate_roi(-500.0, "PASS", 95.0)
        assert result.builder_fee_usdc == 0.0


# --------------------------------------------------------------------------- #
# Outcome matcher                                                             #
# --------------------------------------------------------------------------- #


class TestOutcomeMatcher:
    """Checks ``compare_questions``, ``infer_framing`` and ``infer_category``.

    Every ``compare_questions`` call passes ``use_embeddings=False``; the
    first test's name indicates this selects a Jaccard comparison.
    """

    def test_identical_questions_match_with_jaccard(self) -> None:
        """Comparing a question with itself gives similarity 1.0 and a match."""
        question = "Will Bitcoin exceed $100k by 2026-12-31?"
        comparison: OutcomeComparison = compare_questions(
            question, question, "YES", use_embeddings=False
        )
        assert comparison.semantic_similarity == pytest.approx(1.0)
        assert comparison.semantic_match is True
        assert comparison.framing_predicted == "YES"
        assert comparison.outcome_match is True

    def test_disjoint_questions_low_similarity(self) -> None:
        """Questions on unrelated topics score under 0.3."""
        agent_question = "Will Apple ship a foldable iPhone?"
        market_question = "Will the Fed cut interest rates?"
        comparison = compare_questions(
            agent_question, market_question, "NO", use_embeddings=False
        )
        assert comparison.semantic_similarity < 0.3

    def test_framing_yes_matches_yes_outcome(self) -> None:
        """A YES-framed question resolved YES counts as an outcome match."""
        question = "Will the policy be announced before December?"
        comparison = compare_questions(
            question, question, "YES", use_embeddings=False
        )
        assert comparison.framing_predicted == "YES"
        assert comparison.outcome_match is True

    def test_framing_yes_misses_on_no_resolution(self) -> None:
        """A YES-framed question resolved NO counts as an outcome miss."""
        question = "Will Apple announce a foldable iPhone?"
        comparison = compare_questions(
            question, question, "NO", use_embeddings=False
        )
        # Predicted framing is YES while the market resolved NO, so no match.
        assert comparison.framing_predicted == "YES"
        assert comparison.outcome_match is False

    def test_non_binary_outcome_is_not_matched(self) -> None:
        """An outcome other than YES/NO is never a match and is flagged in notes."""
        comparison = compare_questions(
            "Will Verstappen win the race?",
            "Race winner?",
            "Verstappen",
            use_embeddings=False,
        )
        assert comparison.outcome_match is False
        assert "non-binary" in comparison.notes

    def test_infer_framing_yes(self) -> None:
        """A plain "Will X reach ..." question is framed YES."""
        framing = infer_framing("Will X reach 100 by year-end?")
        assert framing == "YES"

    def test_infer_framing_no(self) -> None:
        """A negatively worded question is framed NO."""
        # The original note credits the word "below" for the NO framing;
        # which token actually triggers it is not visible here - confirm
        # against infer_framing.
        framing = infer_framing("Will X fail to stay below the limit?")
        assert framing == "NO"

    def test_infer_framing_unknown(self) -> None:
        """Text with no recognisable cue is framed UNKNOWN."""
        framing = infer_framing("xyz")
        assert framing == "UNKNOWN"

    def test_infer_category_crypto(self) -> None:
        """A Bitcoin question is categorised as crypto."""
        category = infer_category("Will Bitcoin reach $100k?")
        assert category == "crypto"

    def test_infer_category_other(self) -> None:
        """Text matching no known category falls back to "other"."""
        category = infer_category("Random unrelated string")
        assert category == "other"


# --------------------------------------------------------------------------- #
# Auction logic                                                               #
# --------------------------------------------------------------------------- #


class TestAuction:
    """Checks ``_pick_winner``: lowest bid wins and tie-breaking is seed-stable."""

    def test_pick_winner_picks_lowest_bid(self) -> None:
        """The bidder with the strictly lowest bid is returned."""
        import random as _random

        rng = _random.Random(0)
        bids = {"gemini": 0.30, "deepseek": 0.75, "qwen": 0.40}
        assert _pick_winner(bids, rng=rng) == "gemini"

    def test_pick_winner_handles_ties_deterministically(self) -> None:
        """Identically seeded RNGs give the same, valid winner on a three-way tie."""
        import random as _random

        bids = {"a": 0.5, "b": 0.5, "c": 0.5}
        # A fresh RNG per call, both seeded alike, so neither call sees
        # state consumed by the other.
        first = _pick_winner(bids, rng=_random.Random(123))
        second = _pick_winner(bids, rng=_random.Random(123))
        # Reproducibility alone would also hold for a constant bogus return
        # value (e.g. None), so require an actual tied bidder as well.
        assert first in bids
        assert first == second


# --------------------------------------------------------------------------- #
# Market loader                                                               #
# --------------------------------------------------------------------------- #


class TestLoadMarkets:
    """Checks ``load_markets`` with a real parquet file and with a missing one."""

    def test_loads_from_real_parquet_if_available(self) -> None:
        """Load three records from the repo's parquet corpus; skip if it is absent."""
        parquet = (
            Path(__file__).resolve().parents[1]
            / "corpus"
            / "polymarket_resolved.parquet"
        )
        if not parquet.exists():
            pytest.skip("resolved markets parquet not present in this checkout")

        loaded = load_markets(n=3, parquet_path=parquet, seed=42)
        assert len(loaded) == 3
        for record in loaded:
            assert isinstance(record, MarketRecord)
        for record in loaded:
            assert record.question

    def test_falls_back_to_sample_json(self, tmp_path: Path) -> None:
        """A missing parquet path still produces at least one ``MarketRecord``."""
        # The parquet path below never exists; the loader is expected to fall
        # back to the sample_*.json files in its default ``outputs/`` directory.
        absent = tmp_path / "missing.parquet"
        loaded = load_markets(n=2, parquet_path=absent, seed=42)
        assert len(loaded) >= 1
        for record in loaded:
            assert isinstance(record, MarketRecord)


# --------------------------------------------------------------------------- #
# End-to-end smoke test (mock LLM)                                            #
# --------------------------------------------------------------------------- #


class TestRunBacktestSmoke:
    """End-to-end smoke tests for the backtest runner using the mock LLM."""

    def test_full_pipeline_with_mock_llm(
        self,
        mock_markets: list[MarketRecord],
        tmp_path: Path,
    ) -> None:
        """Run the async pipeline and check the summary plus the three output files."""
        import asyncio

        from polyglot_alpha.backtest.runner import run_backtest_async

        expected_count = len(mock_markets)
        pipeline = run_backtest_async(
            n=expected_count,
            seed=42,
            output_dir=tmp_path,
            mock_llm=True,
            use_embeddings=False,  # avoid the sentence-transformers download
            markets=mock_markets,
        )
        summary = asyncio.run(pipeline)
        assert summary["n_markets"] == expected_count

        # All three artefacts must have been written under tmp_path.
        jsonl = tmp_path / "per_market_results.jsonl"
        summary_path = tmp_path / "summary.json"
        report_path = tmp_path / "backtest_report.md"
        for artefact in (jsonl, summary_path, report_path):
            assert artefact.exists()

        # Parse the JSONL back (ignoring empty lines): one row per market.
        rows = []
        for raw_line in jsonl.read_text().splitlines():
            if raw_line:
                rows.append(json.loads(raw_line))
        assert len(rows) == expected_count

        # The first row must carry every field the spec requires.
        required_keys = (
            "market_id",
            "actual_question",
            "actual_outcome",
            "actual_volume",
            "agent_winner",
            "agent_question",
            "judge_verdict",
            "judge_score",
            "semantic_similarity",
            "outcome_match",
            "estimated_roi_usdc",
            "uma_dispute",
            "category",
            "notes",
        )
        row0 = rows[0]
        for key in required_keys:
            assert key in row0, f"missing key {key} in row"

        # The Markdown report must contain its title and summary heading.
        report_text = report_path.read_text()
        for heading in ("PolyglotAlpha v2 Backtest Report", "Executive summary"):
            assert heading in report_text

    def test_run_backtest_sync_wrapper(
        self,
        mock_markets: list[MarketRecord],
        tmp_path: Path,
    ) -> None:
        """The synchronous ``run_backtest`` returns a summary with the headline metrics."""
        expected_count = len(mock_markets)
        summary = run_backtest(
            n=expected_count,
            seed=99,
            output_dir=tmp_path,
            mock_llm=True,
            use_embeddings=False,
            markets=mock_markets,
        )
        assert summary["n_markets"] == expected_count
        for metric in ("outcome_accuracy", "estimated_total_roi_usdc"):
            assert metric in summary