# Spaces: messili / polyglot-alpha (Running)
# File size: 11,274 Bytes
# 88d2f2a

"""Tests for the backtest framework.

The mock-LLM path is exercised end-to-end so the test suite stays fast
(<5s) and offline. Real-LLM behaviour is asserted indirectly via the
LLM-factory swap test.
"""

from __future__ import annotations

import json
from pathlib import Path

import pytest

from polyglot_alpha.backtest.outcome_matcher import (
    OutcomeComparison,
    compare_questions,
    infer_category,
    infer_framing,
)
from polyglot_alpha.backtest.roi_estimator import (
    BUILDER_FEE_BPS,
    CAPTURE_RATE_FAIL,
    CAPTURE_RATE_PASS,
    CAPTURE_RATE_PASS_HIGH,
    HIGH_CONFIDENCE_THRESHOLD,
    estimate_roi,
)
from polyglot_alpha.backtest.runner import (
    MarketRecord,
    _pick_winner,
    load_markets,
    run_backtest,
)


# --------------------------------------------------------------------------- #
# Fixtures                                                                    #
# --------------------------------------------------------------------------- #


@pytest.fixture()
def mock_markets() -> list[MarketRecord]:
    """Provide three fixed ``MarketRecord`` objects for the backtest tests.

    The records cover one YES outcome, two NO outcomes, and one market
    flagged with ``uma_dispute=True``.
    """

    # Column order: market_id, question, category, outcome,
    # total_volume_usdc, uma_dispute, resolution_source.
    rows = (
        (
            "bt-1",
            "Will Bitcoin exceed $100k by 2026-12-31?",
            "crypto",
            "YES",
            50_000.0,
            False,
            "https://example.com/btc",
        ),
        (
            "bt-2",
            "Will the Fed cut rates in March 2026?",
            "economics",
            "NO",
            20_000.0,
            True,
            "https://example.com/fed",
        ),
        (
            "bt-3",
            "Will Apple announce a foldable iPhone before 2026-12-31?",
            "tech",
            "NO",
            8_000.0,
            False,
            "https://example.com/aapl",
        ),
    )
    return [
        MarketRecord(
            market_id=market_id,
            question=question,
            category=category,
            outcome=outcome,
            total_volume_usdc=volume,
            uma_dispute=disputed,
            resolution_source=source,
        )
        for market_id, question, category, outcome, volume, disputed, source in rows
    ]


# --------------------------------------------------------------------------- #
# ROI estimator                                                               #
# --------------------------------------------------------------------------- #


class TestRoiEstimator:
    """Checks on ``estimate_roi``: capture-rate selection and fee arithmetic."""

    def test_pass_high_confidence_uses_top_capture_rate(self) -> None:
        """A PASS scored exactly at the high-confidence threshold gets the top rate."""
        volume = 100_000.0
        fee_fraction = BUILDER_FEE_BPS / 10_000.0

        estimate = estimate_roi(volume, "PASS", HIGH_CONFIDENCE_THRESHOLD)

        assert estimate.capture_rate == pytest.approx(CAPTURE_RATE_PASS_HIGH)
        assert estimate.builder_fee_usdc == pytest.approx(
            volume * CAPTURE_RATE_PASS_HIGH * fee_fraction
        )
        # Net is the builder fee minus the agent cost; on a 100k market it
        # is expected to stay above zero.
        assert estimate.net_roi_usdc > 0

    def test_pass_normal_uses_lower_capture_rate(self) -> None:
        """A PASS scored one point under the threshold gets the regular PASS rate."""
        just_below = HIGH_CONFIDENCE_THRESHOLD - 1
        estimate = estimate_roi(100_000.0, "PASS", just_below)
        assert estimate.capture_rate == pytest.approx(CAPTURE_RATE_PASS)

    def test_fail_returns_zero_fee(self) -> None:
        """A FAIL verdict earns no builder fee, whatever the volume."""
        estimate = estimate_roi(1_000_000.0, "FAIL", 0)
        assert estimate.capture_rate == pytest.approx(CAPTURE_RATE_FAIL)
        assert estimate.builder_fee_usdc == 0.0
        # With no fee, the agent-cost stub pushes the net below zero.
        assert estimate.net_roi_usdc < 0

    def test_zero_volume_returns_zero(self) -> None:
        """A market with no volume produces no builder fee."""
        estimate = estimate_roi(0.0, "PASS", 95.0)
        assert estimate.builder_fee_usdc == 0.0

    def test_negative_volume_clamped_to_zero(self) -> None:
        """A negative volume produces a zero builder fee, not a negative one."""
        estimate = estimate_roi(-500.0, "PASS", 95.0)
        assert estimate.builder_fee_usdc == 0.0


# --------------------------------------------------------------------------- #
# Outcome matcher                                                             #
# --------------------------------------------------------------------------- #


class TestOutcomeMatcher:
    """Checks on ``compare_questions``, ``infer_framing`` and ``infer_category``.

    Every ``compare_questions`` call here passes ``use_embeddings=False``.
    """

    def test_identical_questions_match_with_jaccard(self) -> None:
        """Comparing a question with itself gives similarity 1.0 and a match."""
        question = "Will Bitcoin exceed $100k by 2026-12-31?"
        comparison: OutcomeComparison = compare_questions(
            question,
            question,
            "YES",
            use_embeddings=False,
        )
        assert comparison.semantic_similarity == pytest.approx(1.0)
        assert comparison.semantic_match is True
        assert comparison.framing_predicted == "YES"
        assert comparison.outcome_match is True

    def test_disjoint_questions_low_similarity(self) -> None:
        """Two questions on unrelated topics score below 0.3."""
        agent_question = "Will Apple ship a foldable iPhone?"
        actual_question = "Will the Fed cut interest rates?"
        comparison = compare_questions(
            agent_question,
            actual_question,
            "NO",
            use_embeddings=False,
        )
        assert comparison.semantic_similarity < 0.3

    def test_framing_yes_matches_yes_outcome(self) -> None:
        """A YES-framed question resolved YES counts as an outcome match."""
        question = "Will the policy be announced before December?"
        comparison = compare_questions(
            question,
            question,
            "YES",
            use_embeddings=False,
        )
        assert comparison.framing_predicted == "YES"
        assert comparison.outcome_match is True

    def test_framing_yes_misses_on_no_resolution(self) -> None:
        """A YES-framed question resolved NO counts as an outcome miss."""
        question = "Will Apple announce a foldable iPhone?"
        comparison = compare_questions(
            question,
            question,
            "NO",
            use_embeddings=False,
        )
        # Framing predicts YES while the market resolved NO, hence a miss.
        assert comparison.framing_predicted == "YES"
        assert comparison.outcome_match is False

    def test_non_binary_outcome_is_not_matched(self) -> None:
        """An outcome other than YES/NO is never matched and is noted as such."""
        comparison = compare_questions(
            "Will Verstappen win the race?",
            "Race winner?",
            "Verstappen",
            use_embeddings=False,
        )
        assert comparison.outcome_match is False
        assert "non-binary" in comparison.notes

    def test_infer_framing_yes(self) -> None:
        """A plain "will X reach ..." question is framed as YES."""
        framing = infer_framing("Will X reach 100 by year-end?")
        assert framing == "YES"

    def test_infer_framing_no(self) -> None:
        """A question with negative wording is framed as NO."""
        # The test author expects "below" to trigger NO-framing. The question
        # also contains "fail to"; which cue the matcher keys on is not
        # visible from this file.
        framing = infer_framing("Will X fail to stay below the limit?")
        assert framing == "NO"

    def test_infer_framing_unknown(self) -> None:
        """Text with no recognisable framing cue yields UNKNOWN."""
        framing = infer_framing("xyz")
        assert framing == "UNKNOWN"

    def test_infer_category_crypto(self) -> None:
        """A Bitcoin question is categorised as crypto."""
        category = infer_category("Will Bitcoin reach $100k?")
        assert category == "crypto"

    def test_infer_category_other(self) -> None:
        """Text matching no known category falls back to "other"."""
        category = infer_category("Random unrelated string")
        assert category == "other"


# --------------------------------------------------------------------------- #
# Auction logic                                                               #
# --------------------------------------------------------------------------- #


class TestAuction:
    """Checks on ``_pick_winner``, the auction's winner-selection helper."""

    def test_pick_winner_picks_lowest_bid(self) -> None:
        """With distinct bids, the bidder with the smallest bid wins."""
        from random import Random

        bids = {"gemini": 0.30, "deepseek": 0.75, "qwen": 0.40}
        assert _pick_winner(bids, rng=Random(0)) == "gemini"

    def test_pick_winner_handles_ties_deterministically(self) -> None:
        """With all bids tied, equal seeds give the same, valid winner."""
        from random import Random

        bids = {"a": 0.5, "b": 0.5, "c": 0.5}
        # Two independent generators built from the same seed, so the second
        # call cannot be affected by state the first call consumed.
        first = _pick_winner(bids, rng=Random(123))
        second = _pick_winner(bids, rng=Random(123))

        # Reproducibility alone would also pass for a constant or ``None``
        # return value, so additionally require an actual tied bidder.
        assert first in bids
        assert first == second


# --------------------------------------------------------------------------- #
# Market loader                                                               #
# --------------------------------------------------------------------------- #


class TestLoadMarkets:
    """Checks on ``load_markets`` with and without a parquet file present."""

    def test_loads_from_real_parquet_if_available(self) -> None:
        """Load three records from the checked-in parquet, or skip if absent."""
        project_root = Path(__file__).resolve().parent.parent
        parquet_file = project_root / "corpus" / "polymarket_resolved.parquet"
        if not parquet_file.exists():
            pytest.skip("resolved markets parquet not present in this checkout")

        loaded = load_markets(n=3, parquet_path=parquet_file, seed=42)

        assert len(loaded) == 3
        for record in loaded:
            assert isinstance(record, MarketRecord)
        for record in loaded:
            assert record.question

    def test_falls_back_to_sample_json(self, tmp_path: Path) -> None:
        """A missing parquet path still yields at least one ``MarketRecord``."""
        # The parquet path does not exist, so the loader is expected to fall
        # back to the sample_*.json files in its default ``outputs/`` directory.
        absent_parquet = tmp_path / "missing.parquet"

        loaded = load_markets(n=2, parquet_path=absent_parquet, seed=42)

        assert len(loaded) >= 1
        for record in loaded:
            assert isinstance(record, MarketRecord)


# --------------------------------------------------------------------------- #
# End-to-end smoke test (mock LLM)                                            #
# --------------------------------------------------------------------------- #


class TestRunBacktestSmoke:
    """End-to-end smoke tests for the backtest runner using the mock LLM.

    Both tests pass the ``mock_markets`` fixture directly via ``markets=`` and
    write their artefacts into pytest's per-test ``tmp_path``.
    """

    # Fields every per-market JSONL row must carry, per the spec.
    REQUIRED_ROW_KEYS = (
        "market_id",
        "actual_question",
        "actual_outcome",
        "actual_volume",
        "agent_winner",
        "agent_question",
        "judge_verdict",
        "judge_score",
        "semantic_similarity",
        "outcome_match",
        "estimated_roi_usdc",
        "uma_dispute",
        "category",
        "notes",
    )

    def test_full_pipeline_with_mock_llm(
        self,
        mock_markets: list[MarketRecord],
        tmp_path: Path,
    ) -> None:
        """Run the async pipeline and verify the summary and output files."""
        import asyncio

        from polyglot_alpha.backtest.runner import run_backtest_async

        summary = asyncio.run(
            run_backtest_async(
                n=len(mock_markets),
                seed=42,
                output_dir=tmp_path,
                mock_llm=True,
                use_embeddings=False,  # avoid the sentence-transformers download
                markets=mock_markets,
            )
        )
        assert summary["n_markets"] == len(mock_markets)

        # Output files should have landed in tmp_path.
        jsonl = tmp_path / "per_market_results.jsonl"
        summary_path = tmp_path / "summary.json"
        report_path = tmp_path / "backtest_report.md"
        assert jsonl.exists()
        assert summary_path.exists()
        assert report_path.exists()

        # Re-read JSONL and confirm one row per market. Whitespace-only lines
        # are skipped so they never reach ``json.loads``.
        rows = [
            json.loads(line)
            for line in jsonl.read_text().splitlines()
            if line.strip()
        ]
        assert len(rows) == len(mock_markets)

        # Every row, not just the first, must carry the required fields.
        for index, row in enumerate(rows):
            for key in self.REQUIRED_ROW_KEYS:
                assert key in row, f"missing key {key} in row {index}"

        # Markdown report is non-empty and labelled.
        report_text = report_path.read_text()
        assert "PolyglotAlpha v2 Backtest Report" in report_text
        assert "Executive summary" in report_text

    def test_run_backtest_sync_wrapper(
        self,
        mock_markets: list[MarketRecord],
        tmp_path: Path,
    ) -> None:
        """Run the synchronous entry point and verify the summary's headline keys."""
        summary = run_backtest(
            n=len(mock_markets),
            seed=99,
            output_dir=tmp_path,
            mock_llm=True,
            use_embeddings=False,
            markets=mock_markets,
        )
        assert summary["n_markets"] == len(mock_markets)
        assert "outcome_accuracy" in summary
        assert "estimated_total_roi_usdc" in summary