Spaces:
Running
Running
| """Main entry point for the backtest framework. | |
| ``run_backtest`` ties together: | |
| 1. Loading N resolved markets (parquet or mock fallback). | |
| 2. Reverse-engineering a synthetic Chinese-language news event for each | |
| so the existing pipeline can run unchanged. | |
| 3. Running a mock 4-agent auction (the live on-chain auction is replaced | |
| with a deterministic in-process winner pick — the existing bid | |
| strategies still decide the winner). | |
| 4. Running the winning agent's pipeline to produce a candidate Question. | |
| 5. Running the 11-judge panel against the candidate. | |
| 6. Scoring vs. the historical outcome and computing hypothetical ROI. | |
| The framework is async because the agent pipeline + judge panel are | |
| both async; ``run_backtest`` is a synchronous wrapper that drives the | |
| event loop. Use ``run_backtest_async`` directly from async contexts | |
| (notebooks, FastAPI handlers). | |
| """ | |
| from __future__ import annotations | |
| import asyncio | |
| import json | |
| import logging | |
| import os | |
| import random | |
| import time | |
| from dataclasses import asdict, dataclass, field | |
| from pathlib import Path | |
| from typing import Any, Awaitable, Callable, Iterable, Optional | |
| from ..llm import LLMCallable, MockLLM | |
| from ..schemas import ( | |
| AnalystReport, | |
| NewsEvent, | |
| Question, | |
| TranslationCandidate, | |
| ) | |
| from .outcome_matcher import ( | |
| OutcomeComparison, | |
| compare_questions, | |
| infer_category, | |
| ) | |
| from .roi_estimator import RoiEstimate, estimate_roi | |
LOGGER = logging.getLogger(__name__)

# --------------------------------------------------------------------------- #
# Defaults & types                                                            #
# --------------------------------------------------------------------------- #

# Two directory levels above the package that contains this module.
_REPO_ROOT = Path(__file__).resolve().parents[2]

DEFAULT_RESOLVED_PARQUET = _REPO_ROOT.joinpath("corpus", "polymarket_resolved.parquet")
# Directory (not a glob pattern) that is scanned for ``sample_*.json``.
DEFAULT_SAMPLE_GLOB = _REPO_ROOT.joinpath("outputs")
DEFAULT_OUTPUT_DIR = _REPO_ROOT.joinpath("outputs", "backtest")

AGENT_NAMES: tuple[str, ...] = ("gemini", "deepseek", "qwen", "llama")

# Address-shaped placeholders (8-char tag + 34 zeros = 42 chars) so the
# per-market record looks realistic without touching any real chain
# state. The tags contain non-hex letters, so these are labels only and
# must never be parsed as real addresses.
_AGENT_WALLET_STUBS: dict[str, str] = {
    agent: tag + "0" * 34
    for agent, tag in (
        ("gemini", "0xG3M1N1"),
        ("deepseek", "0xD33P53"),
        ("qwen", "0xQW3N25"),
        ("llama", "0xLL4M43"),
    )
}

# A factory takes a model id and hands back an LLM callable.
LLMFactory = Callable[[str], LLMCallable]
@dataclass
class MarketRecord:
    """Minimal view of a resolved Polymarket question.

    Declared as a dataclass so the keyword construction used by
    ``from_row`` and by the sample fallback loader has an ``__init__``
    to call.
    """

    market_id: str
    question: str
    category: str
    outcome: str
    total_volume_usdc: float
    uma_dispute: bool
    resolution_source: str

    @classmethod
    def from_row(cls, row: dict) -> "MarketRecord":
        """Build a record from one row dict (e.g. a parquet row).

        Missing, ``None`` and float-NaN cells are all treated as absent
        and replaced by an empty string / ``0.0`` / ``False``. When the
        row carries no category, ``infer_category`` is asked to derive
        one from the question text.
        """

        def _cell(key: str) -> Any:
            value = row.get(key)
            # NaN is the only float that differs from itself. It is
            # truthy, so without this check it would slip past the
            # ``or <default>`` fallbacks below as "nan" / NaN / True.
            if isinstance(value, float) and value != value:
                return None
            return value

        question = str(_cell("question") or "")
        category = str(_cell("category") or "") or infer_category(question)
        return cls(
            market_id=str(_cell("market_id") or ""),
            question=question,
            category=category,
            outcome=str(_cell("outcome") or ""),
            total_volume_usdc=float(_cell("total_volume_usdc") or 0.0),
            uma_dispute=bool(_cell("uma_dispute") or False),
            resolution_source=str(_cell("resolution_source") or ""),
        )
@dataclass
class BacktestResult:
    """Per-market backtest record (one row of ``per_market_results.jsonl``).

    Declared as a dataclass: the ``field(default_factory=...)`` default,
    the ``asdict`` call in ``as_dict`` and the keyword construction used
    by the driver all depend on it.
    """

    market_id: str
    actual_question: str
    actual_outcome: str
    actual_volume: float
    agent_winner: str
    agent_winner_address: str
    agent_question: str
    judge_verdict: str
    judge_score: float
    semantic_similarity: float
    outcome_match: bool
    estimated_roi_usdc: float
    uma_dispute: bool
    category: str
    notes: str
    # Internal / extra fields for the report builder.
    framing_predicted: str = ""
    capture_rate: float = 0.0
    builder_fee_usdc: float = 0.0
    # ``None`` means the verdict carried no "d5" entry at all.
    d5_passed: Optional[bool] = None
    # Fresh dict per instance so rows never share one bid table.
    bids: dict[str, float] = field(default_factory=dict)

    def as_dict(self) -> dict[str, Any]:
        """Return a plain-dict copy of every field."""
        return asdict(self)
| # --------------------------------------------------------------------------- # | |
| # Mock LLM for deterministic, fast runs. # | |
| # --------------------------------------------------------------------------- # | |
class BacktestMockLLM:
    """Deterministic stand-in LLM with two canned reply shapes.

    The reply depends only on the prompt text: a prompt that mentions
    "analyst" and does not mention "polymarket house style" gets a
    ``SUMMARY: ... JSON: ...`` analyst reply; every other prompt gets a
    JSON translation candidate built from the market's own question.
    """

    # Target width of the padded resolution criteria, per agent name.
    _CRITERIA_WIDTHS = {"gemini": 80, "deepseek": 120, "qwen": 60, "llama": 100}

    def __init__(self, agent_name: str, market: "MarketRecord") -> None:
        self.agent_name = agent_name
        self.market = market

    async def __call__(self, prompt: str) -> str:
        # Yield to the event loop once so the mock behaves like a real
        # awaitable call.
        await asyncio.sleep(0)
        lowered = prompt.lower()
        wants_translation = (
            "analyst" not in lowered or "polymarket house style" in lowered
        )
        if wants_translation:
            return self._translator_response()
        return self._analyst_response()

    def _analyst_response(self) -> str:
        """Return the canned ``SUMMARY: ... JSON: ...`` analyst reply."""
        market = self.market
        pieces = [
            f"SUMMARY: Mock analyst summary for market {market.market_id} ",
            f"({self.agent_name}): the event concerns {market.question[:80]}.\n",
            'JSON: {"entities": ["entity_a"], "risks": ["risk_a"]}',
        ]
        return "".join(pieces)

    def _translator_response(self) -> str:
        """Return a JSON candidate that mirrors the market question.

        The agent name is embedded in the criteria, and the criteria are
        dot-padded to a per-agent width, so different agents produce
        different candidates.
        """
        text = self.market.question.strip()
        question_en = text if text.endswith("?") else f"{text}?"

        width = self._CRITERIA_WIDTHS.get(self.agent_name, 80)
        criteria = (
            "Resolves YES if the underlying event occurs by the cutoff "
            f"(synthetic backtest criteria from {self.agent_name})."
        )
        # ``ljust`` only pads; text already longer than ``width`` is kept whole.
        criteria = criteria.ljust(width, ".")

        return json.dumps(
            {
                "question_en": question_en,
                "resolution_criteria": criteria,
                "end_date_iso": "2026-12-31T23:59:59Z",
                "tags": [self.market.category, "backtest"],
            }
        )
def make_backtest_llm_factory(market: MarketRecord, *, real: bool = False) -> LLMFactory:
    """Return a factory that maps a model id to an LLM callable.

    ``real=True`` defers to ``make_llm`` from the package's ``llm``
    module, imported lazily. ``real=False`` wraps the base ``MockLLM``
    and forwards ``model_id`` to it.

    ``market`` is accepted for interface compatibility; this function
    does not read it.
    """
    if not real:

        def _mock_factory(model_id: str) -> LLMCallable:
            return MockLLM(model_id=model_id)

        return _mock_factory

    from ..llm import make_llm

    def _factory(model_id: str) -> LLMCallable:
        return make_llm(model_id)

    return _factory
| # --------------------------------------------------------------------------- # | |
| # Reverse-engineer a "trigger event" so the agent pipeline has input. # | |
| # --------------------------------------------------------------------------- # | |
def synthesize_trigger_event(market: MarketRecord) -> NewsEvent:
    """Fabricate the ``NewsEvent`` that stands in for ``market``'s trigger.

    No real headline is looked up: the market's own question is used as
    the title and is embedded, with the category and resolution source,
    in a three-line body. The cutoff is set 24 hours after the moment of
    the call.
    """
    source = market.resolution_source
    body_lines = (
        f"事件背景: {market.question}",
        f"类别: {market.category}",
        f"参考来源: {source or 'unknown'}",
    )
    one_day_s = 86400
    return NewsEvent(
        event_id=f"backtest-{market.market_id}",
        # Placeholder URL when the market has no resolution source.
        url=source or "https://backtest.local/",
        title_zh=market.question,
        body_zh="\n".join(body_lines),
        cutoff_ts=int(time.time()) + one_day_s,
        topic=market.category,
        source="backtest",
    )
| # --------------------------------------------------------------------------- # | |
| # Mock auction: pick a winner from the seeder bid strategies. # | |
| # --------------------------------------------------------------------------- # | |
def _build_bid_table(event_dict: dict[str, Any]) -> dict[str, float]:
    """Collect one bid per registered agent for ``event_dict``.

    Every class in ``AGENT_REGISTRY`` is instantiated with a throwaway
    wallet key and its ``bid_strategy`` is called on the event dict; the
    result is coerced to ``float`` and stored under the registry name.
    """
    from ..agents import AGENT_REGISTRY

    # Dummy 32-byte key. NOTE(review): the original comment says
    # ``base.py`` only checks that the key is truthy — confirm there.
    placeholder_pk = "0x" + "11" * 32
    return {
        slot: float(agent_cls(wallet_pk=placeholder_pk).bid_strategy(event_dict))
        for slot, agent_cls in AGENT_REGISTRY.items()
    }
def _pick_winner(bids: dict[str, float], *, rng: random.Random) -> str:
    """Return the name of the lowest bidder.

    Bids within ``1e-9`` of the minimum count as tied. The tied names
    are sorted and one is drawn with ``rng``, so a fixed seed always
    yields the same winner. ``rng`` is consulted exactly once, even when
    there is a single lowest bidder.

    Raises ``ValueError`` when ``bids`` is empty.
    """
    if not bids:
        raise ValueError("no bids provided")

    floor = min(bids.values())
    tied = [name for name, amount in bids.items() if abs(amount - floor) < 1e-9]
    # Sort so the draw does not depend on dict insertion order.
    tied.sort()
    return rng.choice(tied)
| # --------------------------------------------------------------------------- # | |
| # Agent pipeline (decoupled from on-chain plumbing). # | |
| # --------------------------------------------------------------------------- # | |
async def _run_agent_pipeline(
    agent_name: str,
    market: MarketRecord,
    *,
    llm_factory: LLMFactory,
    mock_llm: bool,
) -> Question:
    """Produce one agent's candidate ``Question`` for ``market``.

    Stages, in order: synthesize the trigger event, run the analysts,
    collect translation candidates, synthesize a single question, then
    score it. The quality score is written to both ``confidence`` and
    ``quality_score`` on the returned question.

    With ``mock_llm`` set, a ``BacktestMockLLM`` bound to this agent and
    market is used and ``llm_factory`` is not called.
    """
    from .. import analysts, quality_eval, synthesizer, translators

    event = synthesize_trigger_event(market)
    llm: LLMCallable = (
        BacktestMockLLM(agent_name=agent_name, market=market)
        if mock_llm
        else llm_factory(_model_for(agent_name))
    )

    reports: list[AnalystReport] = await analysts.run_analysts(event, llm)
    candidates: list[TranslationCandidate] = await translators.propose_candidates(
        event, reports, llm
    )

    question = synthesizer.synthesize(event, candidates)
    quality = quality_eval.score_question(question).score
    question.confidence = quality
    question.quality_score = quality
    return question
def _model_for(agent_name: str) -> str:
    """Resolve a seeder slot name to the LLM model id it should use.

    Every listed slot, and any slot that is not listed, resolves to
    ``CLAUDE_HAIKU``. The table is kept so a per-slot override can be
    added in one place.
    """
    from ..llm import CLAUDE_HAIKU

    per_slot = dict.fromkeys(("gemini", "deepseek", "qwen"), CLAUDE_HAIKU)
    return per_slot.get(agent_name, CLAUDE_HAIKU)
| # --------------------------------------------------------------------------- # | |
| # Judge panel — wrapped so a missing FAISS index doesn't crash the run. # | |
| # --------------------------------------------------------------------------- # | |
async def _run_judges(
    question: Question,
    market: MarketRecord,
    *,
    llm_factory: LLMFactory,
    mock_llm: bool,
) -> dict[str, Any]:
    """Run the 11-judge panel and return a JSON-serializable verdict dict.

    The panel payload is assembled from the candidate ``question`` and
    the historical ``market``; the market's own question doubles as the
    source news and the reference translation.

    With ``mock_llm`` set, the panel is called with ``llm_call=None``.
    Otherwise an LLM is requested from ``llm_factory``; if that fails,
    the failure is logged and the panel still runs with
    ``llm_call=None``.

    Any exception raised by the panel itself is caught and reported as
    a synthetic FAIL verdict so the backtest run never aborts
    mid-stream.
    """
    from ..judges.panel import evaluate

    panel_payload = {
        "title": question.question_en,
        "description": question.resolution_criteria,
        "resolution_criteria": question.resolution_criteria,
        "resolution_source": market.resolution_source or "",
        # NOTE(review): an ISO date string is passed under a ``*_ts``
        # key — confirm the panel expects a string here.
        "cutoff_ts": question.end_date_iso,
        "category": market.category,
        "source_news": market.question,
        "source_language": "en",
        "target_language": "en",
        "reference_translation": market.question,
    }

    llm_call: Optional[LLMCallable] = None
    if not mock_llm:
        try:
            llm_call = llm_factory("gemini-2.0-flash")
        except Exception:
            # Best effort: a missing judge LLM must not abort the run,
            # but leave a trace instead of swallowing the error silently.
            LOGGER.warning(
                "could not build judge LLM for market=%s; "
                "calling panel with llm_call=None",
                market.market_id,
                exc_info=True,
            )
            llm_call = None

    try:
        verdict = await evaluate(panel_payload, market.question, llm_call=llm_call)
        return verdict.as_dict()
    except Exception as exc:  # pragma: no cover - defensive
        LOGGER.exception("judge panel crashed for market=%s", market.market_id)
        return {
            "overall_pass": False,
            "verdict": "FAIL",
            "overall_score": 0,
            "translation_scores": {},
            "style_alignment_passes": {},
            "judge_results": [],
            "notes": [f"panel crashed: {exc!r}"],
        }
| # --------------------------------------------------------------------------- # | |
| # Market loader. # | |
| # --------------------------------------------------------------------------- # | |
def load_markets(
    n: int,
    *,
    parquet_path: Optional[Path] = None,
    seed: int = 42,
) -> list[MarketRecord]:
    """Load ``n`` resolved markets, or fall back to ``outputs/sample_*.json``.

    ``parquet_path`` defaults to ``DEFAULT_RESOLVED_PARQUET`` and may be
    given as a ``Path`` or a plain string. When the file holds more
    than ``n`` rows, ``n`` are sampled with ``seed``; otherwise every
    row is returned.

    Falling back is intentional: the operator can run a smoke test
    before the full resolved-markets parquet has been generated
    upstream. The fallback is taken both when the parquet is absent and
    when it exists but cannot be loaded; the two cases are logged
    differently.

    Raises ``FileNotFoundError`` when neither source yields a record.
    """
    target = Path(parquet_path or DEFAULT_RESOLVED_PARQUET)
    if target.exists():
        try:
            import pandas as pd

            df = pd.read_parquet(target)
            # ``sample`` keeps determinism with the seed AND avoids
            # always grabbing the same prefix of the file.
            if len(df) > n:
                df = df.sample(n=n, random_state=seed)
            records = [MarketRecord.from_row(row) for row in df.to_dict(orient="records")]
        except Exception:  # pragma: no cover - defensive
            LOGGER.exception("failed to load parquet %s; falling back to samples", target)
        else:
            LOGGER.info("loaded n=%d markets from %s", len(records), target)
            return records
    else:
        # Only claim the file is missing when it really is; a load
        # failure is already reported by the handler above.
        LOGGER.warning(
            "resolved parquet missing at %s; falling back to outputs/sample_*.json", target
        )

    samples = _load_sample_fallback(DEFAULT_SAMPLE_GLOB, n=n)
    if not samples:
        raise FileNotFoundError(
            f"No resolved markets parquet at {target} and no outputs/sample_*.json fallback."
        )
    return samples
def _load_sample_fallback(samples_dir: Path, *, n: int) -> list[MarketRecord]:
    """Build mock ``MarketRecord``s from the legacy ``outputs/sample_*.json``.

    Files are visited in sorted name order until ``n`` records have been
    built. A file is skipped when it is not valid UTF-8, is not valid
    JSON, or does not hold a JSON object at the top level.

    ``market_id``, the YES/NO outcome and the synthetic volume are all
    derived from the file's position in the sorted listing, so a skipped
    file leaves a gap in the numbering.
    """
    records: list[MarketRecord] = []
    for idx, path in enumerate(sorted(samples_dir.glob("sample_*.json"))):
        if len(records) >= n:
            break
        try:
            # Decode explicitly as UTF-8 (the encoding this module uses
            # when writing) instead of the platform's locale default.
            data = json.loads(path.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, UnicodeDecodeError):  # pragma: no cover - corrupt fixture
            continue
        if not isinstance(data, dict):
            # A list or scalar payload has no ``.get``; treat it like a
            # corrupt fixture rather than crashing the fallback.
            continue
        # Synthesize a plausible outcome from alternating YES/NO so the
        # outcome-matcher branch is exercised.
        outcome = "YES" if idx % 2 == 0 else "NO"
        records.append(
            MarketRecord(
                market_id=f"sample-{idx}",
                question=str(data.get("title") or "Untitled sample"),
                category=str(data.get("category") or "sample"),
                outcome=outcome,
                total_volume_usdc=10_000.0 * (idx + 1),
                uma_dispute=False,
                resolution_source=str(data.get("resolution_source") or ""),
            )
        )
    return records
| # --------------------------------------------------------------------------- # | |
| # Core async driver. # | |
| # --------------------------------------------------------------------------- # | |
async def _run_one_market(
    market: MarketRecord,
    *,
    rng: random.Random,
    llm_factory: LLMFactory,
    mock_llm: bool,
    use_embeddings: bool,
) -> BacktestResult:
    """Backtest one market: auction, pipeline, judges, scoring.

    The bid table is built from a minimal event dict, the winner is
    drawn with ``rng``, the winner's pipeline produces a candidate
    question, the judge panel scores it, and the result is compared with
    the historical question/outcome and priced by the ROI estimator.
    """
    actual_question = market.question

    bids = _build_bid_table(
        {
            "event_id": f"backtest-{market.market_id}",
            "title_zh": actual_question,
            "body_zh": actual_question,
            "topic": market.category,
            "url": market.resolution_source,
        }
    )
    winner = _pick_winner(bids, rng=rng)

    question = await _run_agent_pipeline(
        winner, market, llm_factory=llm_factory, mock_llm=mock_llm
    )
    verdict = await _run_judges(
        question, market, llm_factory=llm_factory, mock_llm=mock_llm
    )

    comparison: OutcomeComparison = compare_questions(
        question.question_en,
        actual_question,
        market.outcome,
        use_embeddings=use_embeddings,
    )

    # A missing verdict label counts as FAIL and a missing score as 0.
    verdict_label = verdict.get("verdict", "FAIL")
    verdict_score = float(verdict.get("overall_score", 0))
    roi: RoiEstimate = estimate_roi(
        market.total_volume_usdc, verdict_label, verdict_score
    )

    # Tri-state: True/False when the panel reported "d5", None otherwise.
    style_passes = verdict.get("style_alignment_passes") or {}
    d5_passed: Optional[bool] = (
        bool(style_passes["d5"]) if "d5" in style_passes else None
    )

    notes_parts: list[str] = [comparison.notes] if comparison.notes else []
    panel_notes = verdict.get("notes") or []
    if panel_notes:
        # Cap the joined panel notes at 240 characters.
        notes_parts.append("; ".join(map(str, panel_notes))[:240])

    return BacktestResult(
        market_id=market.market_id,
        actual_question=actual_question,
        actual_outcome=market.outcome,
        actual_volume=market.total_volume_usdc,
        agent_winner=winner,
        agent_winner_address=_AGENT_WALLET_STUBS.get(winner, "0x" + "0" * 40),
        agent_question=question.question_en,
        judge_verdict=str(verdict_label),
        judge_score=verdict_score,
        semantic_similarity=comparison.semantic_similarity,
        outcome_match=comparison.outcome_match,
        estimated_roi_usdc=roi.net_roi_usdc,
        uma_dispute=market.uma_dispute,
        category=market.category,
        notes=" | ".join(notes_parts),
        framing_predicted=comparison.framing_predicted,
        capture_rate=roi.capture_rate,
        builder_fee_usdc=roi.builder_fee_usdc,
        d5_passed=d5_passed,
        bids=bids,
    )
def _skeleton_result(market: MarketRecord, exc: BaseException) -> BacktestResult:
    """Build the placeholder ERROR row recorded when a market's run raises."""
    return BacktestResult(
        market_id=market.market_id,
        actual_question=market.question,
        actual_outcome=market.outcome,
        actual_volume=market.total_volume_usdc,
        agent_winner="",
        agent_winner_address="",
        agent_question="",
        judge_verdict="ERROR",
        judge_score=0.0,
        semantic_similarity=0.0,
        outcome_match=False,
        estimated_roi_usdc=0.0,
        uma_dispute=market.uma_dispute,
        category=market.category,
        notes=f"pipeline error: {exc!r}",
    )


async def run_backtest_async(
    *,
    n: int = 100,
    seed: int = 42,
    output_dir: Optional[Path] = None,
    mock_llm: bool = True,
    use_embeddings: Optional[bool] = None,
    parquet_path: Optional[Path] = None,
    markets: Optional[Iterable[MarketRecord]] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> dict[str, Any]:
    """Run the full backtest. Returns a summary dict.

    Markets come from ``markets`` when given (``n`` and ``parquet_path``
    are then ignored), otherwise from ``load_markets``. They are
    processed one at a time; a market whose run raises is recorded as an
    ERROR row instead of aborting the whole backtest.

    ``output_dir`` defaults to ``DEFAULT_OUTPUT_DIR`` and may be a
    ``Path`` or a plain string; it is created if needed.

    ``use_embeddings=None`` defaults to ``not mock_llm`` (mock runs skip
    the heavy sentence-transformers download for speed; real runs use
    it for proper semantic similarity).

    ``progress_callback(idx, total)`` is invoked before each market is
    processed, with the zero-based index of that market.

    Raises ``RuntimeError`` when there are no markets to process.
    """
    rng = random.Random(seed)
    # Coerce so a string path (easy to pass via ``run_backtest(**kwargs)``)
    # does not fail on ``.mkdir``.
    output_dir = Path(output_dir or DEFAULT_OUTPUT_DIR)
    output_dir.mkdir(parents=True, exist_ok=True)
    if use_embeddings is None:
        use_embeddings = not mock_llm

    if markets is None:
        markets_list = load_markets(n, parquet_path=parquet_path, seed=seed)
    else:
        markets_list = list(markets)
    if not markets_list:
        raise RuntimeError("No markets to backtest.")

    llm_factory = make_backtest_llm_factory(markets_list[0], real=not mock_llm)

    results: list[BacktestResult] = []
    total = len(markets_list)
    for idx, market in enumerate(markets_list):
        if progress_callback is not None:
            progress_callback(idx, total)
        try:
            result = await _run_one_market(
                market,
                rng=rng,
                llm_factory=llm_factory,
                mock_llm=mock_llm,
                use_embeddings=use_embeddings,
            )
        except Exception as exc:  # pragma: no cover - defensive
            LOGGER.exception("market %s failed; recording skeleton", market.market_id)
            result = _skeleton_result(market, exc)
        results.append(result)

    summary = _summarize(results)
    _write_artifacts(results, summary, output_dir=output_dir)
    return summary
def run_backtest(**kwargs: Any) -> dict[str, Any]:
    """Blocking entry point for :func:`run_backtest_async`.

    All keyword arguments are forwarded unchanged. ``asyncio.run``
    starts its own event loop, so call the async variant instead from
    code that is already running inside one.
    """
    pending = run_backtest_async(**kwargs)
    return asyncio.run(pending)
| # --------------------------------------------------------------------------- # | |
| # Summary + I/O helpers. # | |
| # --------------------------------------------------------------------------- # | |
def _summarize(results: list[BacktestResult]) -> dict[str, Any]:
    """Fold per-market rows into the aggregate dict for ``summary.json``.

    Reports verdict counts, overall outcome accuracy, mean semantic
    similarity, total ROI, a per-category breakdown (rows with an empty
    category are grouped under "other"), and how many UMA-disputed
    markets had a failing vs. passing D5 result. An empty input yields
    just ``{"n_markets": 0}``.
    """
    total = len(results)
    if total == 0:
        return {"n_markets": 0}

    tally = dict.fromkeys(("PASS", "FAIL", "BORDERLINE", "ERROR"), 0)
    by_category: dict[str, dict[str, Any]] = {}
    for row in results:
        # Verdict labels outside the four known ones are still counted,
        # they just are not surfaced in the returned dict.
        tally[row.judge_verdict] = tally.get(row.judge_verdict, 0) + 1

        key = row.category or "other"
        if key not in by_category:
            by_category[key] = {"n": 0, "matches": 0, "roi": 0.0, "passes": 0}
        slot = by_category[key]
        slot["n"] += 1
        slot["matches"] += int(row.outcome_match)
        slot["roi"] += row.estimated_roi_usdc
        slot["passes"] += int(row.judge_verdict == "PASS")

    # Swap the raw counters for rates; the report only needs the rates.
    for slot in by_category.values():
        denom = max(1, slot["n"])
        matches = slot.pop("matches")
        passes = slot.pop("passes")
        slot["accuracy"] = matches / denom
        slot["pass_rate"] = passes / denom

    hits = sum(1 for row in results if row.outcome_match)
    similarity_sum = sum(row.semantic_similarity for row in results)
    roi_sum = sum(row.estimated_roi_usdc for row in results)

    # D5 scorecard: ``d5_passed`` is tri-state, so rows where it is
    # ``None`` are counted in neither "caught" nor "missed".
    disputed = [row for row in results if row.uma_dispute]
    caught = sum(1 for row in disputed if row.d5_passed is False)
    missed = sum(1 for row in disputed if row.d5_passed is True)

    return {
        "n_markets": total,
        "n_PASS": tally["PASS"],
        "n_FAIL": tally["FAIL"],
        "n_BORDERLINE": tally["BORDERLINE"],
        "n_ERROR": tally["ERROR"],
        "outcome_accuracy": hits / total,
        "semantic_similarity_avg": similarity_sum / total,
        "estimated_total_roi_usdc": roi_sum,
        "per_category": by_category,
        "uma_dispute_total": len(disputed),
        "uma_dispute_caught_by_D5": caught,
        "uma_dispute_missed_by_D5": missed,
    }
def _write_artifacts(
    results: list[BacktestResult],
    summary: dict[str, Any],
    *,
    output_dir: Path,
) -> None:
    """Write the three backtest artifacts into ``output_dir``.

    Produces ``per_market_results.jsonl`` (one JSON object per result),
    ``summary.json`` (indented) and ``backtest_report.md`` (rendered by
    ``generate_report``). All files are UTF-8 and non-ASCII characters
    are written as-is rather than escaped. The directory is created if
    it does not exist.
    """
    from .reporter import generate_report

    output_dir.mkdir(parents=True, exist_ok=True)
    jsonl_path = output_dir / "per_market_results.jsonl"
    summary_path = output_dir / "summary.json"
    report_path = output_dir / "backtest_report.md"

    with jsonl_path.open("w", encoding="utf-8") as fh:
        fh.writelines(
            json.dumps(row.as_dict(), ensure_ascii=False) + "\n" for row in results
        )

    summary_text = json.dumps(summary, ensure_ascii=False, indent=2)
    summary_path.write_text(summary_text, encoding="utf-8")

    report_text = generate_report(results, summary)
    report_path.write_text(report_text, encoding="utf-8")

    LOGGER.info(
        "wrote backtest artifacts: %s, %s, %s",
        jsonl_path,
        summary_path,
        report_path,
    )