# polyglot-alpha/tests/test_backtest.py
# licaomeng
# deploy: main@8970ffb → HF Spaces (2026-05-27T05:19Z)
# 88d2f2a
"""Tests for the backtest framework.
The mock-LLM path is exercised end-to-end so the test suite stays fast
(<5s) and offline. Real-LLM behaviour is asserted indirectly via the
LLM-factory swap test.
"""
from __future__ import annotations
import json
from pathlib import Path
import pytest
from polyglot_alpha.backtest.outcome_matcher import (
OutcomeComparison,
compare_questions,
infer_category,
infer_framing,
)
from polyglot_alpha.backtest.roi_estimator import (
BUILDER_FEE_BPS,
CAPTURE_RATE_FAIL,
CAPTURE_RATE_PASS,
CAPTURE_RATE_PASS_HIGH,
HIGH_CONFIDENCE_THRESHOLD,
estimate_roi,
)
from polyglot_alpha.backtest.runner import (
MarketRecord,
_pick_winner,
load_markets,
run_backtest,
)
# --------------------------------------------------------------------------- #
# Fixtures #
# --------------------------------------------------------------------------- #
@pytest.fixture()
def mock_markets() -> list[MarketRecord]:
    """Provide three fixed market records for the backtest tests.

    The records cover a YES resolution, a NO resolution flagged with a UMA
    dispute, and an undisputed NO resolution.
    """
    # Columns: id, question, category, outcome, volume (USDC), disputed, source.
    specs = (
        (
            "bt-1",
            "Will Bitcoin exceed $100k by 2026-12-31?",
            "crypto",
            "YES",
            50_000.0,
            False,
            "https://example.com/btc",
        ),
        (
            "bt-2",
            "Will the Fed cut rates in March 2026?",
            "economics",
            "NO",
            20_000.0,
            True,
            "https://example.com/fed",
        ),
        (
            "bt-3",
            "Will Apple announce a foldable iPhone before 2026-12-31?",
            "tech",
            "NO",
            8_000.0,
            False,
            "https://example.com/aapl",
        ),
    )
    return [
        MarketRecord(
            market_id=market_id,
            question=question,
            category=category,
            outcome=outcome,
            total_volume_usdc=volume,
            uma_dispute=disputed,
            resolution_source=source,
        )
        for market_id, question, category, outcome, volume, disputed, source in specs
    ]
# --------------------------------------------------------------------------- #
# ROI estimator #
# --------------------------------------------------------------------------- #
class TestRoiEstimator:
    """Checks on ``estimate_roi`` capture rates, builder fees and net ROI."""

    def test_pass_high_confidence_uses_top_capture_rate(self) -> None:
        """A PASS scored at the high-confidence threshold gets the top rate."""
        volume = 100_000.0
        estimate = estimate_roi(volume, "PASS", HIGH_CONFIDENCE_THRESHOLD)
        assert estimate.capture_rate == pytest.approx(CAPTURE_RATE_PASS_HIGH)

        captured_volume = volume * CAPTURE_RATE_PASS_HIGH
        fee_fraction = BUILDER_FEE_BPS / 10_000.0  # basis points -> fraction
        assert estimate.builder_fee_usdc == pytest.approx(
            captured_volume * fee_fraction
        )
        # Net = builder_fee - agent_cost; expected to remain positive on a
        # 100k market.
        assert estimate.net_roi_usdc > 0

    def test_pass_normal_uses_lower_capture_rate(self) -> None:
        """A PASS scored just under the threshold gets the regular rate."""
        score_below_threshold = HIGH_CONFIDENCE_THRESHOLD - 1
        estimate = estimate_roi(100_000.0, "PASS", score_below_threshold)
        assert estimate.capture_rate == pytest.approx(CAPTURE_RATE_PASS)

    def test_fail_returns_zero_fee(self) -> None:
        """A FAIL verdict earns no builder fee, so the net ROI is negative."""
        estimate = estimate_roi(1_000_000.0, "FAIL", 0)
        assert estimate.capture_rate == pytest.approx(CAPTURE_RATE_FAIL)
        assert estimate.builder_fee_usdc == 0.0
        # Net negative because of agent_cost stub.
        assert estimate.net_roi_usdc < 0

    def test_zero_volume_returns_zero(self) -> None:
        """A market with no volume yields no builder fee."""
        estimate = estimate_roi(0.0, "PASS", 95.0)
        assert estimate.builder_fee_usdc == 0.0

    def test_negative_volume_clamped_to_zero(self) -> None:
        """A negative volume must not produce a (negative) builder fee."""
        estimate = estimate_roi(-500.0, "PASS", 95.0)
        assert estimate.builder_fee_usdc == 0.0
# --------------------------------------------------------------------------- #
# Outcome matcher #
# --------------------------------------------------------------------------- #
class TestOutcomeMatcher:
    """Checks on question comparison plus framing and category inference.

    Every ``compare_questions`` call passes ``use_embeddings=False``.
    """

    def test_identical_questions_match_with_jaccard(self) -> None:
        """Identical questions score 1.0 and a YES outcome is matched."""
        question = "Will Bitcoin exceed $100k by 2026-12-31?"
        comparison: OutcomeComparison = compare_questions(
            question,
            question,
            "YES",
            use_embeddings=False,
        )
        assert comparison.semantic_similarity == pytest.approx(1.0)
        assert comparison.semantic_match is True
        assert comparison.framing_predicted == "YES"
        assert comparison.outcome_match is True

    def test_disjoint_questions_low_similarity(self) -> None:
        """Unrelated questions stay below a 0.3 similarity score."""
        agent_question = "Will Apple ship a foldable iPhone?"
        actual_question = "Will the Fed cut interest rates?"
        comparison = compare_questions(
            agent_question,
            actual_question,
            "NO",
            use_embeddings=False,
        )
        assert comparison.semantic_similarity < 0.3

    def test_framing_yes_matches_yes_outcome(self) -> None:
        """A YES-framed question matches a market that resolved YES."""
        question = "Will the policy be announced before December?"
        comparison = compare_questions(
            question,
            question,
            "YES",
            use_embeddings=False,
        )
        assert comparison.framing_predicted == "YES"
        assert comparison.outcome_match is True

    def test_framing_yes_misses_on_no_resolution(self) -> None:
        """A YES-framed question misses when the market resolved NO."""
        question = "Will Apple announce a foldable iPhone?"
        comparison = compare_questions(
            question,
            question,
            "NO",
            use_embeddings=False,
        )
        # Framing predicts YES while the actual resolution is NO -> miss.
        assert comparison.framing_predicted == "YES"
        assert comparison.outcome_match is False

    def test_non_binary_outcome_is_not_matched(self) -> None:
        """An outcome other than YES/NO is never matched and is noted."""
        comparison = compare_questions(
            "Will Verstappen win the race?",
            "Race winner?",
            "Verstappen",
            use_embeddings=False,
        )
        assert comparison.outcome_match is False
        assert "non-binary" in comparison.notes

    def test_infer_framing_yes(self) -> None:
        """A plain "will X reach ..." question is framed YES."""
        framing = infer_framing("Will X reach 100 by year-end?")
        assert framing == "YES"

    def test_infer_framing_no(self) -> None:
        """A question containing "below" is expected to be framed NO."""
        framing = infer_framing("Will X fail to stay below the limit?")
        assert framing == "NO"

    def test_infer_framing_unknown(self) -> None:
        """Text with no recognisable framing yields UNKNOWN."""
        framing = infer_framing("xyz")
        assert framing == "UNKNOWN"

    def test_infer_category_crypto(self) -> None:
        """A Bitcoin question is categorised as crypto."""
        category = infer_category("Will Bitcoin reach $100k?")
        assert category == "crypto"

    def test_infer_category_other(self) -> None:
        """Text without a known topic falls into the "other" category."""
        category = infer_category("Random unrelated string")
        assert category == "other"
# --------------------------------------------------------------------------- #
# Auction logic #
# --------------------------------------------------------------------------- #
class TestAuction:
    """Checks on ``_pick_winner``, the auction's bid-selection helper."""

    def test_pick_winner_picks_lowest_bid(self) -> None:
        """The bidder with the strictly lowest bid wins."""
        import random as _random

        bids = {"gemini": 0.30, "deepseek": 0.75, "qwen": 0.40}
        assert _pick_winner(bids, rng=_random.Random(0)) == "gemini"

    def test_pick_winner_handles_ties_deterministically(self) -> None:
        """Tied bids resolve to a real bidder, reproducibly for a given seed."""
        import random as _random

        bids = {"a": 0.5, "b": 0.5, "c": 0.5}
        # A fresh generator with the same seed is built for each call so the
        # second call cannot be affected by state consumed by the first.
        first = _pick_winner(bids, rng=_random.Random(123))
        second = _pick_winner(bids, rng=_random.Random(123))
        # Reproducibility alone would also hold for a constant or ``None``
        # return value, so check that the winner is one of the tied bidders.
        assert first in bids
        # With the same seed we get the same answer.
        assert first == second
# --------------------------------------------------------------------------- #
# Market loader #
# --------------------------------------------------------------------------- #
class TestLoadMarkets:
    """Checks on ``load_markets`` for the parquet and fallback sources."""

    def test_loads_from_real_parquet_if_available(self) -> None:
        """Load three records from the resolved-markets parquet, if present."""
        tests_parent = Path(__file__).resolve().parents[1]
        parquet = tests_parent / "corpus" / "polymarket_resolved.parquet"
        if not parquet.exists():
            pytest.skip("resolved markets parquet not present in this checkout")

        loaded = load_markets(n=3, parquet_path=parquet, seed=42)
        assert len(loaded) == 3
        for record in loaded:
            assert isinstance(record, MarketRecord)
        for record in loaded:
            assert record.question

    def test_falls_back_to_sample_json(self, tmp_path: Path) -> None:
        """A missing parquet still yields at least one market record."""
        # The parquet path deliberately does not exist; per the original
        # author's note the loader then reads sample_*.json from the default
        # ``outputs/`` directory.
        absent_parquet = tmp_path / "missing.parquet"
        loaded = load_markets(n=2, parquet_path=absent_parquet, seed=42)
        assert len(loaded) >= 1
        for record in loaded:
            assert isinstance(record, MarketRecord)
# --------------------------------------------------------------------------- #
# End-to-end smoke test (mock LLM) #
# --------------------------------------------------------------------------- #
class TestRunBacktestSmoke:
    """End-to-end smoke tests of the backtest runner with ``mock_llm=True``."""

    def test_full_pipeline_with_mock_llm(
        self,
        mock_markets: list[MarketRecord],
        tmp_path: Path,
    ) -> None:
        """Run the async pipeline and validate the summary and output files.

        Checks that the three output files are written to ``tmp_path``, that
        the JSONL holds one row per input market, that every row carries the
        required fields, and that the Markdown report has its expected
        headings.
        """
        import asyncio

        from polyglot_alpha.backtest.runner import run_backtest_async

        summary = asyncio.run(
            run_backtest_async(
                n=len(mock_markets),
                seed=42,
                output_dir=tmp_path,
                mock_llm=True,
                use_embeddings=False,  # avoid the sentence-transformers download
                markets=mock_markets,
            )
        )
        assert summary["n_markets"] == len(mock_markets)

        # Output files should have landed in tmp_path.
        jsonl = tmp_path / "per_market_results.jsonl"
        summary_path = tmp_path / "summary.json"
        report_path = tmp_path / "backtest_report.md"
        assert jsonl.exists()
        assert summary_path.exists()
        assert report_path.exists()

        # Re-read JSONL and confirm one row per market.
        rows = [json.loads(line) for line in jsonl.read_text().splitlines() if line]
        assert len(rows) == len(mock_markets)

        # Required fields per the spec. Every row is checked, not just the
        # first: the fixture mixes YES / NO / disputed markets, and a field
        # dropped on only one of those paths would otherwise go unnoticed.
        required_keys = (
            "market_id",
            "actual_question",
            "actual_outcome",
            "actual_volume",
            "agent_winner",
            "agent_question",
            "judge_verdict",
            "judge_score",
            "semantic_similarity",
            "outcome_match",
            "estimated_roi_usdc",
            "uma_dispute",
            "category",
            "notes",
        )
        for index, row in enumerate(rows):
            missing = [key for key in required_keys if key not in row]
            assert not missing, f"row {index} is missing keys: {missing}"

        # Markdown report is non-empty and labelled.
        report_text = report_path.read_text()
        assert "PolyglotAlpha v2 Backtest Report" in report_text
        assert "Executive summary" in report_text

    def test_run_backtest_sync_wrapper(
        self,
        mock_markets: list[MarketRecord],
        tmp_path: Path,
    ) -> None:
        """The synchronous ``run_backtest`` entry point returns a summary."""
        summary = run_backtest(
            n=len(mock_markets),
            seed=99,
            output_dir=tmp_path,
            mock_llm=True,
            use_embeddings=False,
            markets=mock_markets,
        )
        assert summary["n_markets"] == len(mock_markets)
        assert "outcome_accuracy" in summary
        assert "estimated_total_roi_usdc" in summary