Spaces:

messili
/

polyglot-alpha

Running

licaomeng

deploy: main@8970ffb → HF Spaces (2026-05-27T05:19Z)

88d2f2a 5 days ago

17.4 kB

	"""Tests for the corpus subpackage.

	Network access is fully mocked via ``unittest.mock.patch`` on
	``requests.Session.get`` — the fixture in ``tests/fixtures/`` plays the
	role of a Gamma API response.
	"""
	from __future__ import annotations

	import asyncio
	import json
	import zlib
	from pathlib import Path
	from unittest.mock import MagicMock, patch

	import numpy as np
	import pandas as pd
	import pytest

	from polyglot_alpha.corpus import (
	Lookup,
	SimilarHit,
	classify_pattern,
	summarize_patterns,
	)
	from polyglot_alpha.corpus import embed as embed_module
	from polyglot_alpha.corpus import few_shots as few_shots_module
	from polyglot_alpha.corpus import pattern_analysis as pattern_module
	from polyglot_alpha.corpus import resolved_analysis as resolved_analysis_module
	from polyglot_alpha.corpus import resolved_scraper as resolved_scraper_module
	from polyglot_alpha.corpus import scraper as scraper_module
	from polyglot_alpha.corpus import style_guide as style_guide_module

	FIXTURE_PATH = Path(__file__).parent / "fixtures" / "gamma_events_page.json"


	# --------------------------------------------------------------------------- #
	# Helpers. #
	# --------------------------------------------------------------------------- #


	def _load_fixture() -> list[dict]:
	    """Load the canned Gamma API events page stored at ``FIXTURE_PATH``.

	    Returns:
	        The decoded JSON content of the fixture file.
	    """
	    # JSON is UTF-8 by specification; decode it explicitly instead of
	    # depending on the platform's locale default encoding.
	    return json.loads(FIXTURE_PATH.read_text(encoding="utf-8"))


	class _StubEncoder:
	    """Deterministic embedding stub that needs no model download.

	    Each text becomes a ``dim``-dimensional float32 vector drawn from a
	    NumPy generator seeded with the CRC-32 of the text's UTF-8 bytes, so
	    the same text yields the same vector in every process. (The built-in
	    ``hash`` is salted per interpreter run for ``str`` and would make the
	    vectors differ between test sessions.) Vectors are unit-normalized by
	    default, mirroring what the tests expect from the real encoder.
	    """

	    def __init__(self, dim: int = 384) -> None:
	        self.dim = dim

	    def encode(
	        self,
	        texts,
	        *,
	        normalize_embeddings: bool = True,
	        convert_to_numpy: bool = True,
	        show_progress_bar: bool = False,
	        batch_size: int = 32,
	    ):
	        """Return one pseudo-random embedding row per entry of ``texts``.

	        Args:
	            texts: Iterable of texts to embed.
	            normalize_embeddings: When true, scale each row to unit L2 norm.
	            convert_to_numpy: Accepted for call compatibility; unused.
	            show_progress_bar: Accepted for call compatibility; unused.
	            batch_size: Accepted for call compatibility; unused.

	        Returns:
	            A float32 array of shape ``(len(texts), dim)``; an empty input
	            yields an array of shape ``(0, dim)``.
	        """
	        vectors = []
	        for text in texts:
	            # CRC-32 is stable across processes and already fits in 32 bits.
	            seed = zlib.crc32(str(text).encode("utf-8"))
	            rng = np.random.default_rng(seed=seed)
	            vec = rng.normal(size=self.dim).astype("float32")
	            if normalize_embeddings:
	                # Guard against a zero vector so the division stays finite.
	                norm = np.linalg.norm(vec) or 1.0
	                vec = vec / norm
	            vectors.append(vec)
	        if not vectors:
	            # np.stack refuses an empty sequence; return an empty batch instead.
	            return np.empty((0, self.dim), dtype="float32")
	        return np.stack(vectors).astype("float32")


	# --------------------------------------------------------------------------- #
	# Test 1 — scraper normalization with mocked HTTP. #
	# --------------------------------------------------------------------------- #


	def test_scraper_flattens_events_and_filters_multi_outcome(tmp_path: Path) -> None:
	    """Scrape one mocked Gamma page; check flattening, filtering and parquet output."""

	    def _ok_response(payload) -> MagicMock:
	        # Minimal stand-in for a successful HTTP response carrying ``payload``.
	        response = MagicMock(status_code=200)
	        response.json.return_value = payload
	        response.raise_for_status.return_value = None
	        return response

	    populated = _ok_response(_load_fixture())
	    drained = _ok_response([])
	    # The fixture page comes first; the empty pages that follow end the crawl.
	    canned_responses = [populated] + [drained] * 4

	    with patch.object(
	        scraper_module.requests.Session, "get", side_effect=canned_responses
	    ):
	        rows = scraper_module.scrape_polymarket(
	            target_rows=1000, page_size=100, include_closed=False
	        )

	    scraped_questions = {row.question for row in rows}
	    assert "Will Bitcoin be above $200,000 by December 31, 2026?" in scraped_questions
	    assert "Which team will win MVP this season?" not in scraped_questions, (
	        "Multi-outcome markets must be filtered out"
	    )

	    # Categories propagate from event.tags or event.category.
	    btc_candidates = (
	        row
	        for row in rows
	        if row.question.startswith("Will Bitcoin be above $200,000")
	    )
	    btc_row = next(btc_candidates)
	    assert btc_row.category == "Crypto"
	    assert btc_row.market_id == "m-9001-1"

	    # The rows must survive a parquet write/read cycle.
	    parquet_file = scraper_module.save_parquet(rows, tmp_path / "corpus.parquet")
	    frame = pd.read_parquet(parquet_file)
	    assert len(frame) == len(rows)
	    assert {"market_id", "question", "category"} <= set(frame.columns)


	# --------------------------------------------------------------------------- #
	# Test 2 — pattern classification. #
	# --------------------------------------------------------------------------- #


	@pytest.mark.parametrize(
	    ("question", "expected"),
	    [
	        ("Will Bitcoin be above $200,000 by December 31, 2026?", "P3"),
	        ("Will the Fed cut rates by July 31?", "P1"),
	        ("Who will be the next US President?", "P4"),
	        ("How many SpaceX launches by Dec 31, 2026?", "P6"),
	        ("Will the Fed cut rates between July 1 and September 30 2026?", "P5"),
	        ("2028 GOP Nominee?", "P2"),
	        ("Next President of France?", "P2"),
	        ("This is not a question", "OTHER"),
	    ],
	)
	def test_classify_pattern(question: str, expected: str) -> None:
	    """Each sample question must map to its expected framing-pattern label."""
	    label = classify_pattern(question)
	    assert label == expected


	def test_summarize_patterns_produces_percentages() -> None:
	    """Summarize five labels and verify counts, percentages and the text report."""
	    sample_labels = ["P1", "P1", "P1", "P2", "OTHER"]
	    stats = summarize_patterns(sample_labels)
	    shares = stats.percentages()

	    assert stats.total == 5
	    assert stats.counts["P1"] == 3
	    assert shares["P1"] == pytest.approx(60.0)

	    rendered = pattern_module.stats_to_report(stats)
	    for fragment in ("Polymarket Question Framing Patterns", "60.0%"):
	        assert fragment in rendered


	# --------------------------------------------------------------------------- #
	# Test 3 — embed + FAISS round trip. #
	# --------------------------------------------------------------------------- #


	def test_embed_and_index_round_trip(tmp_path: Path) -> None:
	    """Build an index from a three-row parquet file and inspect the artefacts."""
	    corpus_rows = [
	        ("m1", "Will BTC reach 200k by end of 2026?", "Crypto"),
	        ("m2", "Will Argentina win the 2026 FIFA World Cup?", "Sports"),
	        ("m3", "Who will be the next US President?", "Politics"),
	    ]
	    df = pd.DataFrame(corpus_rows, columns=["market_id", "question", "category"])

	    parquet_path = tmp_path / "questions.parquet"
	    index_path = tmp_path / "idx.faiss"
	    meta_path = tmp_path / "idx_meta.json"
	    df.to_parquet(parquet_path, index=False)

	    embed_module.build_corpus_index(
	        parquet_path,
	        index_path=index_path,
	        meta_path=meta_path,
	        model=_StubEncoder(),
	    )

	    # Both the index file and its metadata sidecar must have been written.
	    assert index_path.exists()
	    assert meta_path.exists()

	    stored_records = json.loads(meta_path.read_text())["records"]
	    assert len(stored_records) == 3
	    assert stored_records[0]["market_id"] == "m1"


	# --------------------------------------------------------------------------- #
	# Test 4 — Lookup.find_similar returns sensible results. #
	# --------------------------------------------------------------------------- #


	def test_find_similar_returns_self_as_best_match() -> None:
	    """Querying with a text that is in the index must rank that text first."""
	    corpus = [
	        ("m1", "Will BTC reach 200k by end of 2026?", "Crypto"),
	        ("m2", "Will Argentina win the 2026 FIFA World Cup?", "Sports"),
	        ("m3", "Who will be the next US President?", "Politics"),
	    ]
	    encoder = _StubEncoder()
	    texts = [text for _, text, _ in corpus]
	    embeddings = embed_module.embed_texts(texts, model=encoder)
	    index = embed_module.build_faiss_index(embeddings)

	    meta_records = []
	    for position, (market_id, text, category) in enumerate(corpus):
	        meta_records.append(
	            {
	                "idx": position,
	                "market_id": market_id,
	                "question": text,
	                "category": category,
	            }
	        )
	    lookup = Lookup.from_components(index, meta_records, encoder)

	    hits = lookup.find_similar("Will BTC reach 200k by end of 2026?", k=2)
	    assert len(hits) == 2
	    top_hit, runner_up = hits[0], hits[1]
	    assert isinstance(top_hit, SimilarHit)
	    assert top_hit.market_id == "m1", (
	        "Exact-match query should retrieve itself as the top neighbour"
	    )
	    # An exact match against a unit-normalized vector has cosine
	    # similarity ~1.0, so it must score no lower than the runner-up.
	    assert top_hit.score >= runner_up.score
	    assert top_hit.score == pytest.approx(1.0, abs=1e-3)


	def test_find_similar_clamps_k_and_handles_empty_query() -> None:
	    """An oversized ``k`` is clamped to the index size; a blank query yields nothing."""
	    encoder = _StubEncoder()
	    only_text = "only one question"
	    index = embed_module.build_faiss_index(
	        embed_module.embed_texts([only_text], model=encoder)
	    )
	    single_record = {"market_id": "m1", "question": only_text, "category": "X"}
	    lookup = Lookup.from_components(index, [single_record], encoder)

	    # Asking for ten neighbours from a one-row index must not crash.
	    clamped_hits = lookup.find_similar(only_text, k=10)
	    assert len(clamped_hits) == 1

	    # A whitespace-only query produces no hits.
	    blank_hits = lookup.find_similar(" ", k=3)
	    assert blank_hits == []


	# --------------------------------------------------------------------------- #
	# Test 5 — few-shots diversity. #
	# --------------------------------------------------------------------------- #


	def test_build_few_shots_diversifies_categories(tmp_path: Path) -> None:
	    """Twelve picks from six categories must span categories, led by top-volume rows."""
	    cats = ["politics", "sports", "crypto", "geopolitics", "entertainment", "weather"]
	    # Six categories with four questions each give 24 candidate rows; within
	    # a category, question 0 carries the highest volume.
	    candidates = [
	        {
	            "market_id": f"m-{cat_idx}-{j}",
	            "question": f"Will {cat} event {j} happen by 2027?",
	            "category": cat,
	            "resolution_criteria": "Resolves YES if ...",
	            "volume_usd": (cat_idx + 1) * 100 + (3 - j),
	        }
	        for cat_idx, cat in enumerate(cats)
	        for j in range(4)
	    ]
	    df = pd.DataFrame(candidates)

	    few = few_shots_module.build_few_shots(df, target_count=12)
	    assert len(few) == 12

	    seen_cats = {fs.category for fs in few}
	    assert len(seen_cats) >= 5, (
	        f"Expected at least 5 distinct categories in 12 picks, got {seen_cats}"
	    )

	    # The opening round should take the highest-volume row of every bucket.
	    leading_titles = {fs.title for fs in few[: len(cats)]}
	    assert leading_titles == {f"Will {cat} event 0 happen by 2027?" for cat in cats}

	    # The saved file must parse back as JSON with matching counts.
	    saved_path = few_shots_module.save_few_shots(few, tmp_path / "few.json")
	    payload = json.loads(saved_path.read_text())
	    assert payload["count"] == 12
	    assert len(payload["examples"]) == 12


	# --------------------------------------------------------------------------- #
	# Test 6 — (placeholder) style guide distillation is exercised by Test 8 below. #
	# --------------------------------------------------------------------------- #


	# --------------------------------------------------------------------------- #
	# Test 7 — resolved-scraper outcome classifier + dispute detection. #
	# --------------------------------------------------------------------------- #


	def _resolved_market(**overrides):
	    """Build a closed binary market payload, with ``overrides`` applied on top.

	    The defaults are rebuilt on every call, so callers may mutate the
	    returned dict freely.
	    """
	    defaults = {
	        "id": "42",
	        "question": "Will BTC be above $200k by EOY?",
	        "closed": True,
	        "outcomes": '["Yes", "No"]',
	        "outcomePrices": '["1", "0"]',
	        "umaResolutionStatuses": '["proposed", "resolved"]',
	        "category": "",
	        "volumeNum": 1500.0,
	        "closedTime": "2026-01-01 00:00:00+00",
	        "createdAt": "2025-12-01T00:00:00Z",
	        "endDate": "2026-01-01T00:00:00Z",
	        "resolutionSource": "https://example.com",
	        "events": [{"title": "BTC price markets", "category": None}],
	    }
	    return {**defaults, **overrides}


	def test_resolved_scraper_classifies_yes_no_disputed_refunded() -> None:
	    """Check the outcome label and dispute flag for four price/status variants."""
	    to_row = resolved_scraper_module.market_to_row

	    # Default payload: prices ["1", "0"] and no "disputed" status.
	    yes_row = to_row(_resolved_market())
	    assert yes_row is not None
	    assert yes_row.outcome == "YES"
	    assert yes_row.uma_dispute is False
	    # Category is keyword-derived from question/event title ("btc" -> Crypto).
	    assert yes_row.category == "Crypto"

	    # Swapped prices flip the outcome.
	    no_row = to_row(_resolved_market(outcomePrices='["0", "1"]'))
	    assert no_row.outcome == "NO"

	    # A "disputed" status sets the flag while clean prices keep the outcome YES.
	    disputed_payload = _resolved_market(
	        umaResolutionStatuses='["proposed", "disputed", "proposed", "resolved"]',
	        outcomePrices='["1", "0"]',
	    )
	    disputed_row = to_row(disputed_payload)
	    assert disputed_row.outcome == "YES"
	    assert disputed_row.uma_dispute is True

	    # Both prices at zero are reported as a refund.
	    refunded_row = to_row(_resolved_market(outcomePrices='["0", "0"]'))
	    assert refunded_row.outcome == "REFUNDED"


	def test_resolved_scraper_skips_non_binary_and_open() -> None:
	    """Three-outcome markets and markets that are still open produce no row."""
	    three_way_payload = _resolved_market(
	        outcomes='["A","B","C"]', outcomePrices='["0.4","0.3","0.3"]'
	    )
	    assert resolved_scraper_module.market_to_row(three_way_payload) is None

	    still_open_payload = _resolved_market(closed=False)
	    assert resolved_scraper_module.market_to_row(still_open_payload) is None


	def test_resolved_analysis_distribution_and_markdown() -> None:
	    """Compute the distribution for three resolved markets and render the summary."""
	    election_market = {
	        "market_id": "1",
	        "question": "Will Trump win the election?",
	        "category": "Politics",
	        "created_at": "2024-01-01",
	        "end_date": "2024-11-05",
	        "resolved_at": "2024-11-06",
	        "outcome": "YES",
	        "outcome_prices": [1.0, 0.0],
	        "total_volume_usdc": 5_000_000.0,
	        "uma_dispute": False,
	        "resolution_source": "",
	        "winning_outcome": "Yes",
	    }
	    fed_market = {
	        "market_id": "2",
	        "question": "Will the Fed cut by June?",
	        "category": "Economics",
	        "created_at": "2024-03-01",
	        "end_date": "2024-06-30",
	        "resolved_at": "2024-07-01",
	        "outcome": "NO",
	        "outcome_prices": [0.0, 1.0],
	        "total_volume_usdc": 50_000.0,
	        "uma_dispute": True,
	        "resolution_source": "",
	        "winning_outcome": "No",
	    }
	    rain_market = {
	        "market_id": "3",
	        "question": "Will it rain?",
	        "category": "Weather",
	        "created_at": "2024-05-01",
	        "end_date": "2024-05-02",
	        "resolved_at": "2024-05-03",
	        "outcome": "DISPUTED",
	        "outcome_prices": [0.0, 0.0],
	        "total_volume_usdc": 100.0,
	        "uma_dispute": True,
	        "resolution_source": "",
	        "winning_outcome": None,
	    }
	    df = pd.DataFrame([election_market, fed_market, rain_market])

	    dist = resolved_analysis_module.compute_distribution(df)
	    assert dist["total_markets"] == 3
	    # One YES, one NO and one DISPUTED outcome: a third each.
	    for rate_key in ("yes_rate_overall", "no_rate_overall", "disputed_rate_overall"):
	        assert dist[rate_key] == pytest.approx(1 / 3)
	    assert dist["uma_dispute_rate_overall"] == pytest.approx(2 / 3)

	    per_category = dist["by_category"]
	    assert per_category["Politics"]["yes"] == 1
	    assert per_category["Economics"]["no"] == 1

	    # The three volumes are expected to land in three different tiers.
	    per_tier = dist["by_volume_tier"]
	    for tier in ("high", "mid", "low"):
	        assert per_tier[tier]["total"] == 1

	    md = resolved_analysis_module.build_summary_markdown(df, dist)
	    assert "# Polymarket Resolved Markets" in md
	    assert "Total markets" in md
	    # The top-volume table should list the high-volume market.
	    assert "Will Trump win the election" in md


	# --------------------------------------------------------------------------- #
	# Test 8 — style guide distillation with a stub LLM. #
	# --------------------------------------------------------------------------- #


	def test_distill_style_guide_uses_llm_stub() -> None:
	    """Distil a style guide through a stub LLM and inspect the prompt and output."""
	    samples = [
	        ("Will A by 2026?", "Crypto"),
	        ("Who will be the next mayor?", "Politics"),
	        ("How many launches by EOY?", "Tech"),
	        ("Will it rain in NYC on July 4?", "Weather"),
	    ]
	    df = pd.DataFrame(samples, columns=["question", "category"])
	    captured: dict = {}

	    async def fake_complete(prompt: str, *, system=None) -> str:
	        # Record what the distiller sent, then answer with fenced markdown.
	        captured.update(prompt=prompt, system=system)
	        return (
	            "```markdown\n"
	            "# Polymarket Question Style Guide\n"
	            "- Be concise.\n"
	            "```\n"
	        )

	    distill_call = style_guide_module.distill_style_guide(
	        df, sample_size=4, llm_complete=fake_complete
	    )
	    md = asyncio.run(distill_call)

	    assert "# Polymarket Question Style Guide" in md
	    assert "```" not in md, "Markdown fences must be stripped"
	    assert "Be concise." in md
	    # The prompt must carry numbered question samples and a system message.
	    assert "1." in captured["prompt"]
	    assert captured["system"]