polyglot-alpha / tests /test_corpus.py
licaomeng
deploy: main@8970ffb → HF Spaces (2026-05-27T05:19Z)
88d2f2a
"""Tests for the corpus subpackage.
Network access is fully mocked via ``unittest.mock.patch`` on
``requests.Session.get`` — the fixture in ``tests/fixtures/`` plays the
role of a Gamma API response.
"""
from __future__ import annotations
import asyncio
import json
from pathlib import Path
from unittest.mock import MagicMock, patch
import numpy as np
import pandas as pd
import pytest
from polyglot_alpha.corpus import (
Lookup,
SimilarHit,
classify_pattern,
summarize_patterns,
)
from polyglot_alpha.corpus import embed as embed_module
from polyglot_alpha.corpus import few_shots as few_shots_module
from polyglot_alpha.corpus import pattern_analysis as pattern_module
from polyglot_alpha.corpus import resolved_analysis as resolved_analysis_module
from polyglot_alpha.corpus import resolved_scraper as resolved_scraper_module
from polyglot_alpha.corpus import scraper as scraper_module
from polyglot_alpha.corpus import style_guide as style_guide_module
FIXTURE_PATH = Path(__file__).parent / "fixtures" / "gamma_events_page.json"
# --------------------------------------------------------------------------- #
# Helpers. #
# --------------------------------------------------------------------------- #
def _load_fixture() -> list[dict]:
    """Load the recorded events page that the scraper test serves as HTTP payload.

    Returns:
        The parsed JSON content of ``FIXTURE_PATH``.
    """
    # Decode explicitly as UTF-8 so the parsed fixture does not depend on the
    # platform's locale default encoding.
    return json.loads(FIXTURE_PATH.read_text(encoding="utf-8"))
class _StubEncoder:
    """Deterministic, dependency-free embedding stub.

    Each text becomes a ``dim``-sized float32 vector drawn from a NumPy
    generator whose seed is derived from a SHA-256 digest of the text.
    The built-in ``hash()`` is salted per process for strings, so it would
    give different vectors on every test run; the digest is stable, which
    makes any failure reproducible. Vectors are unit-normalized by default.
    """

    def __init__(self, dim: int = 384) -> None:
        # Length of every vector returned by ``encode``.
        self.dim = dim

    def encode(
        self,
        texts,
        *,
        normalize_embeddings: bool = True,
        convert_to_numpy: bool = True,
        show_progress_bar: bool = False,
        batch_size: int = 32,
    ):
        """Return one pseudo-random vector per entry of ``texts``.

        Args:
            texts: Iterable of texts to embed.
            normalize_embeddings: When true, scale each vector to unit L2 norm.
            convert_to_numpy: Accepted for call compatibility; ignored (the
                result is always a NumPy array).
            show_progress_bar: Accepted for call compatibility; ignored.
            batch_size: Accepted for call compatibility; ignored.

        Returns:
            A float32 array of shape ``(len(texts), dim)``. An empty input
            yields an empty ``(0, dim)`` array instead of raising.
        """
        # Function-scope import keeps the stub self-contained.
        import hashlib

        vectors = []
        for t in texts:
            # First four digest bytes -> a stable seed in [0, 2**32).
            digest = hashlib.sha256(str(t).encode("utf-8")).digest()
            seed = int.from_bytes(digest[:4], "big")
            rng = np.random.default_rng(seed=seed)
            v = rng.normal(size=self.dim).astype("float32")
            if normalize_embeddings:
                # Guard against a zero norm so the division never yields NaN.
                n = np.linalg.norm(v) or 1.0
                v = v / n
            vectors.append(v)
        if not vectors:
            # np.stack rejects an empty sequence; return a well-shaped empty array.
            return np.empty((0, self.dim), dtype="float32")
        return np.stack(vectors).astype("float32")
# --------------------------------------------------------------------------- #
# Test 1 — scraper normalization with mocked HTTP. #
# --------------------------------------------------------------------------- #
def test_scraper_flattens_events_and_filters_multi_outcome(tmp_path: Path) -> None:
    """Scrape one mocked page, then check filtering, field mapping and parquet I/O."""

    def _ok_response(payload):
        # Minimal stand-in for a successful HTTP response carrying ``payload``.
        resp = MagicMock(status_code=200)
        resp.json.return_value = payload
        resp.raise_for_status.return_value = None
        return resp

    populated = _ok_response(_load_fixture())
    drained = _ok_response([])
    # The fixture page comes first; the empty pages after it end the crawl.
    canned_responses = [populated] + [drained] * 4

    with patch.object(
        scraper_module.requests.Session, "get", side_effect=canned_responses
    ):
        rows = scraper_module.scrape_polymarket(
            target_rows=1000, page_size=100, include_closed=False
        )

    scraped_questions = {row.question for row in rows}
    assert "Will Bitcoin be above $200,000 by December 31, 2026?" in scraped_questions
    assert "Which team will win MVP this season?" not in scraped_questions, (
        "Multi-outcome markets must be filtered out"
    )

    # Category and market id must be carried over onto the flattened row.
    btc_row = next(
        filter(
            lambda row: row.question.startswith("Will Bitcoin be above $200,000"),
            rows,
        )
    )
    assert btc_row.category == "Crypto"
    assert btc_row.market_id == "m-9001-1"

    # Writing to parquet and reading back must keep every row and key column.
    written = scraper_module.save_parquet(rows, tmp_path / "corpus.parquet")
    frame = pd.read_parquet(written)
    assert len(frame) == len(rows)
    assert {"market_id", "question", "category"} <= set(frame.columns)
# --------------------------------------------------------------------------- #
# Test 2 — pattern classification. #
# --------------------------------------------------------------------------- #
@pytest.mark.parametrize(
    ("question", "expected"),
    [
        pytest.param("Will Bitcoin be above $200,000 by December 31, 2026?", "P3"),
        pytest.param("Will the Fed cut rates by July 31?", "P1"),
        pytest.param("Who will be the next US President?", "P4"),
        pytest.param("How many SpaceX launches by Dec 31, 2026?", "P6"),
        pytest.param(
            "Will the Fed cut rates between July 1 and September 30 2026?",
            "P5",
        ),
        pytest.param("2028 GOP Nominee?", "P2"),
        pytest.param("Next President of France?", "P2"),
        pytest.param("This is not a question", "OTHER"),
    ],
)
def test_classify_pattern(question: str, expected: str) -> None:
    """``classify_pattern`` must return the expected label for each sample."""
    label = classify_pattern(question)
    assert label == expected
def test_summarize_patterns_produces_percentages() -> None:
    """Summarize five labels and check counts, percentages and the text report."""
    label_counts = {"P1": 3, "P2": 1, "OTHER": 1}
    labels = [name for name, times in label_counts.items() for _ in range(times)]

    stats = summarize_patterns(labels)
    pcts = stats.percentages()

    assert stats.total == 5
    assert stats.counts["P1"] == 3
    # Three of five labels are P1.
    assert pcts["P1"] == pytest.approx(60.0)

    report = pattern_module.stats_to_report(stats)
    for needle in ("Polymarket Question Framing Patterns", "60.0%"):
        assert needle in report
# --------------------------------------------------------------------------- #
# Test 3 — embed + FAISS round trip. #
# --------------------------------------------------------------------------- #
def test_embed_and_index_round_trip(tmp_path: Path) -> None:
    """Build an index from a three-row parquet file and inspect what is written."""
    corpus = [
        ("m1", "Will BTC reach 200k by end of 2026?", "Crypto"),
        ("m2", "Will Argentina win the 2026 FIFA World Cup?", "Sports"),
        ("m3", "Who will be the next US President?", "Politics"),
    ]
    frame = pd.DataFrame(corpus, columns=["market_id", "question", "category"])

    parquet_path = tmp_path / "questions.parquet"
    frame.to_parquet(parquet_path, index=False)

    index_path = tmp_path / "idx.faiss"
    meta_path = tmp_path / "idx_meta.json"
    embed_module.build_corpus_index(
        parquet_path,
        index_path=index_path,
        meta_path=meta_path,
        model=_StubEncoder(),
    )

    # Both artifacts must have been created on disk.
    for artifact in (index_path, meta_path):
        assert artifact.exists()

    # The metadata keeps one record per input row, in input order.
    records = json.loads(meta_path.read_text())["records"]
    assert len(records) == 3
    assert records[0]["market_id"] == "m1"
# --------------------------------------------------------------------------- #
# Test 4 — Lookup.find_similar returns sensible results. #
# --------------------------------------------------------------------------- #
def test_find_similar_returns_self_as_best_match() -> None:
    """Querying with an indexed question must rank that question first."""
    corpus = [
        ("m1", "Will BTC reach 200k by end of 2026?", "Crypto"),
        ("m2", "Will Argentina win the 2026 FIFA World Cup?", "Sports"),
        ("m3", "Who will be the next US President?", "Politics"),
    ]
    encoder = _StubEncoder()

    texts = [entry[1] for entry in corpus]
    index = embed_module.build_faiss_index(
        embed_module.embed_texts(texts, model=encoder)
    )

    meta_records = []
    for position, (market_id, question, category) in enumerate(corpus):
        meta_records.append(
            {
                "idx": position,
                "market_id": market_id,
                "question": question,
                "category": category,
            }
        )

    lookup = Lookup.from_components(index, meta_records, encoder)
    hits = lookup.find_similar("Will BTC reach 200k by end of 2026?", k=2)

    assert len(hits) == 2
    best, runner_up = hits[0], hits[1]
    assert isinstance(best, SimilarHit)
    assert best.market_id == "m1", (
        "Exact-match query should retrieve itself as the top neighbour"
    )
    # The exact match must not score below the runner-up, and its score
    # against its own unit-normalized vector should be ~1.0.
    assert best.score >= runner_up.score
    assert best.score == pytest.approx(1.0, abs=1e-3)
def test_find_similar_clamps_k_and_handles_empty_query() -> None:
    """An oversized ``k`` and a blank query must both be handled gracefully."""
    sole_question = "only one question"
    encoder = _StubEncoder()

    vectors = embed_module.embed_texts([sole_question], model=encoder)
    single_record = {"market_id": "m1", "question": sole_question, "category": "X"}
    lookup = Lookup.from_components(
        embed_module.build_faiss_index(vectors),
        [single_record],
        encoder,
    )

    # Asking for ten neighbours from a one-row index yields just that row.
    clamped = lookup.find_similar(sole_question, k=10)
    assert len(clamped) == 1

    # A whitespace-only query produces no hits at all.
    blank_hits = lookup.find_similar(" ", k=3)
    assert blank_hits == []
# --------------------------------------------------------------------------- #
# Test 5 — few-shots diversity. #
# --------------------------------------------------------------------------- #
def test_build_few_shots_diversifies_categories(tmp_path: Path) -> None:
    """Pick 12 few-shots from 24 candidates and check category spread and output."""
    cats = ["politics", "sports", "crypto", "geopolitics", "entertainment", "weather"]
    # Six categories with four questions each; within a category, question 0
    # carries the largest volume.
    candidates = [
        {
            "market_id": f"m-{cat_idx}-{j}",
            "question": f"Will {cat} event {j} happen by 2027?",
            "category": cat,
            "resolution_criteria": "Resolves YES if ...",
            "volume_usd": (cat_idx + 1) * 100 + (3 - j),
        }
        for cat_idx, cat in enumerate(cats)
        for j in range(4)
    ]

    few = few_shots_module.build_few_shots(pd.DataFrame(candidates), target_count=12)
    assert len(few) == 12

    seen_cats = {shot.category for shot in few}
    assert len(seen_cats) >= 5, (
        f"Expected at least 5 distinct categories in 12 picks, got {seen_cats}"
    )

    # The first len(cats) picks should be each category's top-volume question.
    leading_titles = {shot.title for shot in few[: len(cats)]}
    top_volume_titles = {f"Will {cat} event 0 happen by 2027?" for cat in cats}
    assert leading_titles == top_volume_titles

    # The saved file must parse as JSON and report all twelve examples.
    saved_path = few_shots_module.save_few_shots(few, tmp_path / "few.json")
    payload = json.loads(saved_path.read_text())
    assert payload["count"] == 12
    assert len(payload["examples"]) == 12
# --------------------------------------------------------------------------- #
# Test 6 — style guide distillation: see Test 8 at the end of this file.      #
# --------------------------------------------------------------------------- #
# --------------------------------------------------------------------------- #
# Test 7 — resolved-scraper outcome classifier + dispute detection. #
# --------------------------------------------------------------------------- #
def _resolved_market(**overrides):
    """Return a market payload dict for the resolved-scraper tests.

    The defaults describe a closed Yes/No market whose prices are ``1``/``0``.
    Any keyword argument replaces the default of the same name or adds a new key.
    """
    defaults = {
        "id": "42",
        "question": "Will BTC be above $200k by EOY?",
        "closed": True,
        "outcomes": '["Yes", "No"]',
        "outcomePrices": '["1", "0"]',
        "umaResolutionStatuses": '["proposed", "resolved"]',
        "category": "",
        "volumeNum": 1500.0,
        "closedTime": "2026-01-01 00:00:00+00",
        "createdAt": "2025-12-01T00:00:00Z",
        "endDate": "2026-01-01T00:00:00Z",
        "resolutionSource": "https://example.com",
        "events": [{"title": "BTC price markets", "category": None}],
    }
    # Later entries win, so overrides replace defaults while keeping key order.
    return {**defaults, **overrides}
def test_resolved_scraper_classifies_yes_no_disputed_refunded() -> None:
    """Check ``market_to_row`` outcome labels and the UMA dispute flag.

    Each row is asserted to be non-``None`` before its attributes are read,
    so a wrongly skipped market fails with a clear assertion instead of an
    ``AttributeError``.
    """
    yes_row = resolved_scraper_module.market_to_row(_resolved_market())
    assert yes_row is not None
    assert yes_row.outcome == "YES"
    assert yes_row.uma_dispute is False
    # Category is keyword-derived from question/event title ("btc" -> Crypto).
    assert yes_row.category == "Crypto"

    no_row = resolved_scraper_module.market_to_row(
        _resolved_market(outcomePrices='["0", "1"]')
    )
    assert no_row is not None
    assert no_row.outcome == "NO"

    disputed_row = resolved_scraper_module.market_to_row(
        _resolved_market(
            umaResolutionStatuses='["proposed", "disputed", "proposed", "resolved"]',
            outcomePrices='["1", "0"]',
        )
    )
    assert disputed_row is not None
    assert disputed_row.outcome == "YES"  # outcome still YES, prices are clean
    assert disputed_row.uma_dispute is True  # but dispute flag set

    refunded_row = resolved_scraper_module.market_to_row(
        _resolved_market(outcomePrices='["0", "0"]')
    )
    assert refunded_row is not None
    assert refunded_row.outcome == "REFUNDED"
def test_resolved_scraper_skips_non_binary_and_open() -> None:
    """``market_to_row`` must return ``None`` for 3-outcome and open markets."""
    to_row = resolved_scraper_module.market_to_row

    three_way_market = _resolved_market(
        outcomes='["A","B","C"]', outcomePrices='["0.4","0.3","0.3"]'
    )
    multi = to_row(three_way_market)
    assert multi is None

    still_open_market = _resolved_market(closed=False)
    open_market = to_row(still_open_market)
    assert open_market is None
def test_resolved_analysis_distribution_and_markdown() -> None:
    """Check distribution stats and the markdown summary for three markets."""
    columns = (
        "market_id",
        "question",
        "category",
        "created_at",
        "end_date",
        "resolved_at",
        "outcome",
        "outcome_prices",
        "total_volume_usdc",
        "uma_dispute",
        "resolution_source",
        "winning_outcome",
    )
    records = [
        (
            "1",
            "Will Trump win the election?",
            "Politics",
            "2024-01-01",
            "2024-11-05",
            "2024-11-06",
            "YES",
            [1.0, 0.0],
            5_000_000.0,
            False,
            "",
            "Yes",
        ),
        (
            "2",
            "Will the Fed cut by June?",
            "Economics",
            "2024-03-01",
            "2024-06-30",
            "2024-07-01",
            "NO",
            [0.0, 1.0],
            50_000.0,
            True,
            "",
            "No",
        ),
        (
            "3",
            "Will it rain?",
            "Weather",
            "2024-05-01",
            "2024-05-02",
            "2024-05-03",
            "DISPUTED",
            [0.0, 0.0],
            100.0,
            True,
            "",
            None,
        ),
    ]
    # Rebuild one dict per market so the frame is created from row mappings.
    df = pd.DataFrame([dict(zip(columns, record)) for record in records])

    dist = resolved_analysis_module.compute_distribution(df)
    assert dist["total_markets"] == 3

    # One YES, one NO and one DISPUTED out of three; two rows carry a UMA dispute.
    expected_rates = (
        ("yes_rate_overall", 1 / 3),
        ("no_rate_overall", 1 / 3),
        ("disputed_rate_overall", 1 / 3),
        ("uma_dispute_rate_overall", 2 / 3),
    )
    for rate_key, rate in expected_rates:
        assert dist[rate_key] == pytest.approx(rate)

    assert dist["by_category"]["Politics"]["yes"] == 1
    assert dist["by_category"]["Economics"]["no"] == 1

    # Each of the three volumes is expected to fall in a different tier.
    for tier in ("high", "mid", "low"):
        assert dist["by_volume_tier"][tier]["total"] == 1

    md = resolved_analysis_module.build_summary_markdown(df, dist)
    assert "# Polymarket Resolved Markets" in md
    assert "Total markets" in md
    # The highest-volume market must be mentioned in the summary.
    assert "Will Trump win the election" in md
# --------------------------------------------------------------------------- #
# Test 8 — style guide distillation with a stub LLM. #
# --------------------------------------------------------------------------- #
def test_distill_style_guide_uses_llm_stub() -> None:
    """Run ``distill_style_guide`` with a stub LLM; check output and the prompt."""
    samples = [
        ("Will A by 2026?", "Crypto"),
        ("Who will be the next mayor?", "Politics"),
        ("How many launches by EOY?", "Tech"),
        ("Will it rain in NYC on July 4?", "Weather"),
    ]
    df = pd.DataFrame(samples, columns=["question", "category"])

    fenced_reply = (
        "```markdown\n"
        "# Polymarket Question Style Guide\n"
        "- Be concise.\n"
        "```\n"
    )
    captured: dict = {}

    async def fake_complete(prompt: str, *, system=None) -> str:
        # Record what the distiller sent, then answer with a fenced document.
        captured.update(prompt=prompt, system=system)
        return fenced_reply

    distill_call = style_guide_module.distill_style_guide(
        df, sample_size=4, llm_complete=fake_complete
    )
    md = asyncio.run(distill_call)

    assert "# Polymarket Question Style Guide" in md
    assert "```" not in md, "Markdown fences must be stripped"
    assert "Be concise." in md

    # The prompt must contain a numbered sample, and a system prompt must be set.
    assert "1." in captured["prompt"]
    assert captured["system"]