"""Convert a ranked alpha CSV into a JSONL pack for the standalone robustness runner."""

from __future__ import annotations

import argparse
import json
import math
from pathlib import Path
from typing import Any

import pandas as pd


| def _optional_int(value: Any) -> int | None: |
| if value is None: |
| return None |
| if isinstance(value, str) and not value.strip(): |
| return None |
| if pd.isna(value): |
| return None |
| return int(value) |
|
|
|
|
| def _optional_float(value: Any) -> float | None: |
| if value is None: |
| return None |
| if isinstance(value, str) and not value.strip(): |
| return None |
| if pd.isna(value): |
| return None |
| return float(value) |
|
|
|
|
| def _require_columns(frame: pd.DataFrame, required: list[str]) -> None: |
| missing = [col for col in required if col not in frame.columns] |
| if missing: |
| raise ValueError(f"Missing required columns: {missing}") |
|
|
|
|
def _clean_text(value: Any) -> str:
    """Return ``value`` as stripped text, treating missing or falsy cells as ``""``."""
    if isinstance(value, str):
        return value.strip()
    # NaN is truthy, so a plain ``value or ""`` would turn an empty CSV cell
    # into the literal text "nan"; test for missing values explicitly.
    if value is None or pd.isna(value) or not value:
        return ""
    return str(value).strip()


def _build_records(frame: pd.DataFrame) -> list[dict[str, Any]]:
    """Build one JSON-serialisable record per row that has a factor expression.

    Args:
        frame: Input rows. ``factor_name``, ``factor_expr`` (or ``expr`` as a
            fallback) and ``seed_name`` are read as text; ``turn``,
            ``call_index``, ``proposal_rank``, ``performance_return``,
            ``performance_return_pct``, ``ir``, ``sharpe``, ``winrate`` and
            ``mdd`` are read when present and become ``None`` when absent.

    Returns:
        A list of dicts in input order. Rows without an expression are
        skipped; ``alpha_rank_by_input`` is the 1-based position of the row
        in ``frame``, so skipped rows leave gaps in the ranks.
    """
    records: list[dict[str, Any]] = []
    for rank, row in enumerate(frame.to_dict(orient="records"), start=1):
        factor_expr = _clean_text(row.get("factor_expr")) or _clean_text(row.get("expr"))
        if not factor_expr:
            continue

        factor_name = _clean_text(row.get("factor_name"))
        # Name fallback chain: seed_name -> factor_name -> synthetic "alpha_<rank>".
        source_seed_name = _clean_text(row.get("seed_name")) or factor_name or f"alpha_{rank}"
        alpha_name = factor_name or source_seed_name

        records.append(
            {
                "source": "seed_baseline",
                "candidate_scope": "seed_baseline",
                "seed_name": alpha_name,
                "seed_expr": factor_expr,
                "factor_name": alpha_name,
                "factor_expr": factor_expr,
                "origin_seed_name": source_seed_name,
                "origin_factor_name": factor_name or alpha_name,
                "alpha_rank_by_input": rank,
                "origin_turn": _optional_int(row.get("turn")),
                "origin_call_index": _optional_int(row.get("call_index")),
                "origin_proposal_rank": _optional_int(row.get("proposal_rank")),
                "origin_performance_return": _optional_float(row.get("performance_return")),
                "origin_performance_return_pct": _optional_float(row.get("performance_return_pct")),
                "origin_ir": _optional_float(row.get("ir")),
                "origin_sharpe": _optional_float(row.get("sharpe")),
                "origin_winrate": _optional_float(row.get("winrate")),
                "origin_mdd": _optional_float(row.get("mdd")),
            }
        )
    return records


def main() -> None:
    """Command-line entry point: convert the ranked CSV into a JSONL file.

    Parses ``--input-csv``, ``--output-jsonl`` and ``--top-n``, validates the
    CSV columns, optionally keeps only the first ``--top-n`` rows (values of
    0 or below keep every row), writes one JSON object per line to the
    output path (creating parent directories as needed) and prints a short
    summary to stdout.

    Raises:
        FileNotFoundError: If the input CSV does not exist.
        ValueError: If a required column is missing from the CSV.
    """
    parser = argparse.ArgumentParser(description="Prepare JSONL input for alpha robustness runs")
    parser.add_argument("--input-csv", required=True, help="Ranked alpha CSV with factor_name and factor_expr")
    parser.add_argument("--output-jsonl", required=True, help="Output JSONL path for the robustness runner")
    parser.add_argument("--top-n", type=int, default=0, help="Optional top-N truncation; 0 keeps all rows")
    args = parser.parse_args()

    input_csv = Path(args.input_csv).expanduser().resolve()
    output_jsonl = Path(args.output_jsonl).expanduser().resolve()

    if not input_csv.exists():
        raise FileNotFoundError(f"Input CSV not found: {input_csv}")

    frame = pd.read_csv(input_csv)
    _require_columns(frame, ["seed_name", "factor_name", "factor_expr"])
    # argparse already converted --top-n to int, so no extra cast is needed.
    if args.top_n > 0:
        frame = frame.head(args.top_n).copy()

    records = _build_records(frame)
    output_jsonl.parent.mkdir(parents=True, exist_ok=True)
    with output_jsonl.open("w", encoding="utf-8") as fh:
        for record in records:
            fh.write(json.dumps(record, ensure_ascii=False) + "\n")

    print(f"input_csv={input_csv}")
    print(f"output_jsonl={output_jsonl}")
    print(f"record_count={len(records)}")


# Run the converter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()