Spaces:

messili
/

polyglot-alpha

Running

polyglot-alpha / tests /test_backtest.py

licaomeng

deploy: main@8970ffb → HF Spaces (2026-05-27T05:19Z)

88d2f2a 5 days ago

11.3 kB

	"""Tests for the backtest framework.

	The mock-LLM path is exercised end-to-end so the test suite stays fast
	(<5s) and offline. Real-LLM behaviour is asserted indirectly via the
	LLM-factory swap test.
	"""

	from __future__ import annotations

	import json
	from pathlib import Path

	import pytest

	from polyglot_alpha.backtest.outcome_matcher import (
	OutcomeComparison,
	compare_questions,
	infer_category,
	infer_framing,
	)
	from polyglot_alpha.backtest.roi_estimator import (
	BUILDER_FEE_BPS,
	CAPTURE_RATE_FAIL,
	CAPTURE_RATE_PASS,
	CAPTURE_RATE_PASS_HIGH,
	HIGH_CONFIDENCE_THRESHOLD,
	estimate_roi,
	)
	from polyglot_alpha.backtest.runner import (
	MarketRecord,
	_pick_winner,
	load_markets,
	run_backtest,
	)


	# --------------------------------------------------------------------------- #
	# Fixtures #
	# --------------------------------------------------------------------------- #


	@pytest.fixture()
	def mock_markets() -> list[MarketRecord]:
	    """Return three fixed market records for the backtest tests.

	    In order: an undisputed YES market, a disputed NO market and an
	    undisputed NO market, each with its own category and volume.
	    """

	    bitcoin_market = MarketRecord(
	        market_id="bt-1",
	        question="Will Bitcoin exceed $100k by 2026-12-31?",
	        category="crypto",
	        outcome="YES",
	        total_volume_usdc=50_000.0,
	        uma_dispute=False,
	        resolution_source="https://example.com/btc",
	    )
	    # The only record in the set that carries a UMA dispute flag.
	    fed_market = MarketRecord(
	        market_id="bt-2",
	        question="Will the Fed cut rates in March 2026?",
	        category="economics",
	        outcome="NO",
	        total_volume_usdc=20_000.0,
	        uma_dispute=True,
	        resolution_source="https://example.com/fed",
	    )
	    apple_market = MarketRecord(
	        market_id="bt-3",
	        question="Will Apple announce a foldable iPhone before 2026-12-31?",
	        category="tech",
	        outcome="NO",
	        total_volume_usdc=8_000.0,
	        uma_dispute=False,
	        resolution_source="https://example.com/aapl",
	    )
	    return [bitcoin_market, fed_market, apple_market]


	# --------------------------------------------------------------------------- #
	# ROI estimator #
	# --------------------------------------------------------------------------- #


	class TestRoiEstimator:
	    """Exercise ``estimate_roi`` across verdicts, judge scores and volumes."""

	    def test_pass_high_confidence_uses_top_capture_rate(self) -> None:
	        """A PASS scored at the high-confidence threshold gets the top rate."""
	        volume = 100_000.0
	        fee_fraction = BUILDER_FEE_BPS / 10_000.0  # basis points -> fraction
	        estimate = estimate_roi(volume, "PASS", HIGH_CONFIDENCE_THRESHOLD)

	        assert estimate.capture_rate == pytest.approx(CAPTURE_RATE_PASS_HIGH)
	        assert estimate.builder_fee_usdc == pytest.approx(
	            volume * CAPTURE_RATE_PASS_HIGH * fee_fraction
	        )
	        # On a 100k market the fee is expected to outweigh the agent cost.
	        assert estimate.net_roi_usdc > 0

	    def test_pass_normal_uses_lower_capture_rate(self) -> None:
	        """A PASS scored one point under the threshold gets the regular rate."""
	        just_below = HIGH_CONFIDENCE_THRESHOLD - 1
	        estimate = estimate_roi(100_000.0, "PASS", just_below)
	        assert estimate.capture_rate == pytest.approx(CAPTURE_RATE_PASS)

	    def test_fail_returns_zero_fee(self) -> None:
	        """A FAIL verdict earns no builder fee, whatever the volume."""
	        estimate = estimate_roi(1_000_000.0, "FAIL", 0)
	        assert estimate.capture_rate == pytest.approx(CAPTURE_RATE_FAIL)
	        assert estimate.builder_fee_usdc == 0.0
	        # With no fee, the agent_cost stub should push the net below zero.
	        assert estimate.net_roi_usdc < 0

	    def test_zero_volume_returns_zero(self) -> None:
	        """Zero volume yields a zero builder fee."""
	        estimate = estimate_roi(0.0, "PASS", 95.0)
	        assert estimate.builder_fee_usdc == 0.0

	    def test_negative_volume_clamped_to_zero(self) -> None:
	        """A negative volume yields a zero builder fee as well."""
	        estimate = estimate_roi(-500.0, "PASS", 95.0)
	        assert estimate.builder_fee_usdc == 0.0


	# --------------------------------------------------------------------------- #
	# Outcome matcher #
	# --------------------------------------------------------------------------- #


	class TestOutcomeMatcher:
	    """Exercise ``compare_questions``, ``infer_framing`` and ``infer_category``.

	    Every ``compare_questions`` call passes ``use_embeddings=False``.
	    """

	    def test_identical_questions_match_with_jaccard(self) -> None:
	        """The same question on both sides scores 1.0 and matches a YES outcome."""
	        question = "Will Bitcoin exceed $100k by 2026-12-31?"
	        comparison: OutcomeComparison = compare_questions(
	            question, question, "YES", use_embeddings=False
	        )
	        assert comparison.semantic_similarity == pytest.approx(1.0)
	        assert comparison.semantic_match is True
	        assert comparison.framing_predicted == "YES"
	        assert comparison.outcome_match is True

	    def test_disjoint_questions_low_similarity(self) -> None:
	        """Questions on unrelated topics score below 0.3."""
	        agent_question = "Will Apple ship a foldable iPhone?"
	        market_question = "Will the Fed cut interest rates?"
	        comparison = compare_questions(
	            agent_question, market_question, "NO", use_embeddings=False
	        )
	        assert comparison.semantic_similarity < 0.3

	    def test_framing_yes_matches_yes_outcome(self) -> None:
	        """A YES-framed question resolved YES is an outcome match."""
	        question = "Will the policy be announced before December?"
	        comparison = compare_questions(
	            question, question, "YES", use_embeddings=False
	        )
	        assert comparison.framing_predicted == "YES"
	        assert comparison.outcome_match is True

	    def test_framing_yes_misses_on_no_resolution(self) -> None:
	        """A YES-framed question resolved NO is an outcome miss."""
	        question = "Will Apple announce a foldable iPhone?"
	        comparison = compare_questions(
	            question, question, "NO", use_embeddings=False
	        )
	        # Framing predicts YES while the market resolved NO, hence the miss.
	        assert comparison.framing_predicted == "YES"
	        assert comparison.outcome_match is False

	    def test_non_binary_outcome_is_not_matched(self) -> None:
	        """An outcome other than YES/NO never matches and is noted as non-binary."""
	        comparison = compare_questions(
	            "Will Verstappen win the race?",
	            "Race winner?",
	            "Verstappen",
	            use_embeddings=False,
	        )
	        assert comparison.outcome_match is False
	        assert "non-binary" in comparison.notes

	    def test_infer_framing_yes(self) -> None:
	        """A plain "Will X reach ..." question is framed YES."""
	        framing = infer_framing("Will X reach 100 by year-end?")
	        assert framing == "YES"

	    def test_infer_framing_no(self) -> None:
	        """A question containing "below" is framed NO."""
	        # Per the test's intent, the word "below" should trigger NO framing.
	        framing = infer_framing("Will X fail to stay below the limit?")
	        assert framing == "NO"

	    def test_infer_framing_unknown(self) -> None:
	        """Text with no recognisable cue is framed UNKNOWN."""
	        framing = infer_framing("xyz")
	        assert framing == "UNKNOWN"

	    def test_infer_category_crypto(self) -> None:
	        """A Bitcoin question is categorised as crypto."""
	        category = infer_category("Will Bitcoin reach $100k?")
	        assert category == "crypto"

	    def test_infer_category_other(self) -> None:
	        """Unrecognised text is categorised as other."""
	        category = infer_category("Random unrelated string")
	        assert category == "other"


	# --------------------------------------------------------------------------- #
	# Auction logic #
	# --------------------------------------------------------------------------- #


	class TestAuction:
	    """Exercise ``_pick_winner`` with seeded random generators."""

	    def test_pick_winner_picks_lowest_bid(self) -> None:
	        """With distinct bids, the bidder with the lowest value wins."""
	        import random as _random

	        offers = {"gemini": 0.30, "deepseek": 0.75, "qwen": 0.40}
	        winner = _pick_winner(offers, rng=_random.Random(0))
	        assert winner == "gemini"

	    def test_pick_winner_handles_ties_deterministically(self) -> None:
	        """Two runs over tied bids with equally seeded generators agree."""
	        import random as _random

	        tied_offers = {"a": 0.5, "b": 0.5, "c": 0.5}
	        # A fresh generator with the same seed for each pick; the results
	        # must be identical.
	        picks = [
	            _pick_winner(tied_offers, rng=_random.Random(123)) for _ in range(2)
	        ]
	        assert picks[0] == picks[1]


	# --------------------------------------------------------------------------- #
	# Market loader #
	# --------------------------------------------------------------------------- #


	class TestLoadMarkets:
	    """Exercise ``load_markets`` against a real parquet and a missing one."""

	    def test_loads_from_real_parquet_if_available(self) -> None:
	        """Load three records from the repository corpus, or skip if absent."""
	        # parents[1] is the directory one level above this test file's folder.
	        parquet_file = (
	            Path(__file__).resolve().parents[1]
	            / "corpus"
	            / "polymarket_resolved.parquet"
	        )
	        if not parquet_file.exists():
	            pytest.skip("resolved markets parquet not present in this checkout")

	        loaded = load_markets(n=3, parquet_path=parquet_file, seed=42)
	        assert len(loaded) == 3
	        assert all(isinstance(record, MarketRecord) for record in loaded)
	        assert all(record.question for record in loaded)

	    def test_falls_back_to_sample_json(self, tmp_path: Path) -> None:
	        """A missing parquet path still yields at least one ``MarketRecord``."""
	        # The parquet path does not exist, so the loader is expected to fall
	        # back to the sample_*.json files in its default ``outputs/`` directory.
	        absent_parquet = tmp_path / "missing.parquet"
	        loaded = load_markets(n=2, parquet_path=absent_parquet, seed=42)
	        assert len(loaded) >= 1
	        assert all(isinstance(record, MarketRecord) for record in loaded)


	# --------------------------------------------------------------------------- #
	# End-to-end smoke test (mock LLM) #
	# --------------------------------------------------------------------------- #


	class TestRunBacktestSmoke:
	    """End-to-end smoke tests of the backtest runner with the mock LLM."""

	    def test_full_pipeline_with_mock_llm(
	        self,
	        mock_markets: list[MarketRecord],
	        tmp_path: Path,
	    ) -> None:
	        """Run the async pipeline and check the summary and the written files."""
	        import asyncio

	        from polyglot_alpha.backtest.runner import run_backtest_async

	        market_count = len(mock_markets)
	        run_options = {
	            "n": market_count,
	            "seed": 42,
	            "output_dir": tmp_path,
	            "mock_llm": True,
	            "use_embeddings": False,  # avoid the sentence-transformers download
	            "markets": mock_markets,
	        }
	        summary = asyncio.run(run_backtest_async(**run_options))
	        assert summary["n_markets"] == market_count

	        # All three artefacts are expected inside the temporary directory.
	        jsonl_path = tmp_path / "per_market_results.jsonl"
	        summary_file = tmp_path / "summary.json"
	        report_file = tmp_path / "backtest_report.md"
	        assert jsonl_path.exists()
	        assert summary_file.exists()
	        assert report_file.exists()

	        # Parse the JSONL, ignoring empty lines; there must be one row per market.
	        parsed_rows = []
	        for raw_line in jsonl_path.read_text().splitlines():
	            if raw_line:
	                parsed_rows.append(json.loads(raw_line))
	        assert len(parsed_rows) == market_count

	        # Only the first row is checked for the required fields.
	        first_row = parsed_rows[0]
	        required_fields = (
	            "market_id",
	            "actual_question",
	            "actual_outcome",
	            "actual_volume",
	            "agent_winner",
	            "agent_question",
	            "judge_verdict",
	            "judge_score",
	            "semantic_similarity",
	            "outcome_match",
	            "estimated_roi_usdc",
	            "uma_dispute",
	            "category",
	            "notes",
	        )
	        for key in required_fields:
	            assert key in first_row, f"missing key {key} in row"

	        # The Markdown report must carry its title and summary heading.
	        report_body = report_file.read_text()
	        assert "PolyglotAlpha v2 Backtest Report" in report_body
	        assert "Executive summary" in report_body

	    def test_run_backtest_sync_wrapper(
	        self,
	        mock_markets: list[MarketRecord],
	        tmp_path: Path,
	    ) -> None:
	        """Run the synchronous entry point and check the summary keys."""
	        market_count = len(mock_markets)
	        summary = run_backtest(
	            n=market_count,
	            seed=99,
	            output_dir=tmp_path,
	            mock_llm=True,
	            use_embeddings=False,
	            markets=mock_markets,
	        )
	        assert summary["n_markets"] == market_count
	        assert "outcome_accuracy" in summary
	        assert "estimated_total_roi_usdc" in summary