#!/usr/bin/env python3
"""Convert a ranked alpha CSV into a JSONL pack for the standalone robustness runner."""

from __future__ import annotations

import argparse
import json
from pathlib import Path
from typing import Any

import pandas as pd


def _is_missing(value: Any) -> bool:
    """Return True for None, blank strings and pandas missing markers (NaN/NaT/NA)."""
    if value is None:
        return True
    if isinstance(value, str):
        return not value.strip()
    return bool(pd.isna(value))


def _optional_int(value: Any) -> int | None:
    """Convert *value* with ``int()``, or return None when the cell is missing."""
    if _is_missing(value):
        return None
    return int(value)


def _optional_float(value: Any) -> float | None:
    """Convert *value* with ``float()``, or return None when the cell is missing."""
    if _is_missing(value):
        return None
    return float(value)


def _clean_text(value: Any) -> str:
    """Return *value* as a stripped string, mapping missing cells to ``""``.

    A plain ``str(value or "")`` is not enough for rows coming out of pandas:
    an empty CSV cell is a float NaN, NaN is truthy, and ``str(nan)`` is the
    literal text ``"nan"``.
    """
    if _is_missing(value):
        return ""
    return str(value).strip()


def _require_columns(frame: pd.DataFrame, required: list[str]) -> None:
    """Raise ValueError listing every name in *required* absent from *frame*."""
    missing = [col for col in required if col not in frame.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")


def _build_records(frame: pd.DataFrame) -> list[dict[str, Any]]:
    """Build one JSON-serialisable record per row that has a factor expression.

    The expression is taken from ``factor_expr``, falling back to ``expr``.
    Rows where both are missing or blank are skipped. ``alpha_rank_by_input``
    is the 1-based position of the row in *frame*, so skipped rows still
    consume a rank.

    Args:
        frame: Input table; columns that are absent simply yield ``None`` /
            fallback values for the corresponding record fields.

    Returns:
        Records in input order.
    """
    records: list[dict[str, Any]] = []
    for rank, row in enumerate(frame.to_dict(orient="records"), start=1):
        factor_name = _clean_text(row.get("factor_name"))
        factor_expr = _clean_text(row.get("factor_expr")) or _clean_text(row.get("expr"))
        if not factor_expr:
            continue

        # Name fallbacks: seed_name -> factor_name -> synthetic "alpha_<rank>".
        source_seed_name = (
            _clean_text(row.get("seed_name")) or factor_name or f"alpha_{rank}"
        )
        alpha_name = factor_name or source_seed_name or f"alpha_{rank}"

        records.append(
            {
                "source": "seed_baseline",
                "candidate_scope": "seed_baseline",
                "seed_name": alpha_name,
                "seed_expr": factor_expr,
                "factor_name": alpha_name,
                "factor_expr": factor_expr,
                "origin_seed_name": source_seed_name,
                "origin_factor_name": factor_name or alpha_name,
                "alpha_rank_by_input": rank,
                "origin_turn": _optional_int(row.get("turn")),
                "origin_call_index": _optional_int(row.get("call_index")),
                "origin_proposal_rank": _optional_int(row.get("proposal_rank")),
                "origin_performance_return": _optional_float(row.get("performance_return")),
                "origin_performance_return_pct": _optional_float(
                    row.get("performance_return_pct")
                ),
                "origin_ir": _optional_float(row.get("ir")),
                "origin_sharpe": _optional_float(row.get("sharpe")),
                "origin_winrate": _optional_float(row.get("winrate")),
                "origin_mdd": _optional_float(row.get("mdd")),
            }
        )
    return records


def main(argv: list[str] | None = None) -> None:
    """Read the input CSV, build the records and write them as JSONL.

    Args:
        argv: Command-line arguments excluding the program name. ``None``
            (the default) makes argparse read ``sys.argv[1:]``.

    Raises:
        FileNotFoundError: The input CSV does not exist.
        ValueError: A required column is missing from the CSV.
    """
    parser = argparse.ArgumentParser(description="Prepare JSONL input for alpha robustness runs")
    parser.add_argument(
        "--input-csv",
        required=True,
        help="Ranked alpha CSV with factor_name and factor_expr",
    )
    parser.add_argument(
        "--output-jsonl",
        required=True,
        help="Output JSONL path for the robustness runner",
    )
    parser.add_argument(
        "--top-n",
        type=int,
        default=0,
        help="Optional top-N truncation; 0 keeps all rows",
    )
    args = parser.parse_args(argv)

    input_csv = Path(args.input_csv).expanduser().resolve()
    output_jsonl = Path(args.output_jsonl).expanduser().resolve()
    if not input_csv.exists():
        raise FileNotFoundError(f"Input CSV not found: {input_csv}")

    frame = pd.read_csv(input_csv)
    _require_columns(frame, ["seed_name", "factor_name", "factor_expr"])
    if args.top_n > 0:
        frame = frame.head(args.top_n).copy()

    records = _build_records(frame)

    output_jsonl.parent.mkdir(parents=True, exist_ok=True)
    with output_jsonl.open("w", encoding="utf-8") as fh:
        for record in records:
            fh.write(json.dumps(record, ensure_ascii=False) + "\n")

    print(f"input_csv={input_csv}")
    print(f"output_jsonl={output_jsonl}")
    print(f"record_count={len(records)}")


if __name__ == "__main__":
    main()