#!/usr/bin/env python3
"""Convert a ranked alpha CSV into a JSONL pack for the standalone robustness runner."""
from __future__ import annotations
import argparse
import json
from pathlib import Path
from typing import Any
import pandas as pd
def _optional_int(value: Any) -> int | None:
if value is None:
return None
if isinstance(value, str) and not value.strip():
return None
if pd.isna(value):
return None
return int(value)
def _optional_float(value: Any) -> float | None:
if value is None:
return None
if isinstance(value, str) and not value.strip():
return None
if pd.isna(value):
return None
return float(value)
def _require_columns(frame: pd.DataFrame, required: list[str]) -> None:
missing = [col for col in required if col not in frame.columns]
if missing:
raise ValueError(f"Missing required columns: {missing}")
def _build_records(frame: pd.DataFrame) -> list[dict[str, Any]]:
    """Translate ranked alpha rows into robustness-runner record dicts.

    Rows without a usable expression are dropped; every input row still
    consumes a rank, so ``alpha_rank_by_input`` reflects input ordering.
    """
    out: list[dict[str, Any]] = []
    for position, raw in enumerate(frame.to_dict(orient="records"), start=1):
        name = str(raw.get("factor_name") or "").strip()
        # Accept either "factor_expr" or the legacy "expr" column.
        expr = str(raw.get("factor_expr") or raw.get("expr") or "").strip()
        if not expr:
            continue
        seed = str(raw.get("seed_name") or name or f"alpha_{position}").strip()
        alpha = name or seed or f"alpha_{position}"
        out.append(
            {
                "source": "seed_baseline",
                "candidate_scope": "seed_baseline",
                "seed_name": alpha,
                "seed_expr": expr,
                "factor_name": alpha,
                "factor_expr": expr,
                "origin_seed_name": seed,
                "origin_factor_name": name or alpha,
                "alpha_rank_by_input": position,
                "origin_turn": _optional_int(raw.get("turn")),
                "origin_call_index": _optional_int(raw.get("call_index")),
                "origin_proposal_rank": _optional_int(raw.get("proposal_rank")),
                "origin_performance_return": _optional_float(raw.get("performance_return")),
                "origin_performance_return_pct": _optional_float(raw.get("performance_return_pct")),
                "origin_ir": _optional_float(raw.get("ir")),
                "origin_sharpe": _optional_float(raw.get("sharpe")),
                "origin_winrate": _optional_float(raw.get("winrate")),
                "origin_mdd": _optional_float(raw.get("mdd")),
            }
        )
    return out
def main() -> None:
    """CLI entry point: read a ranked alpha CSV and write a JSONL pack.

    Raises ``FileNotFoundError`` when the input CSV is absent and
    ``ValueError`` when required columns are missing.
    """
    parser = argparse.ArgumentParser(description="Prepare JSONL input for alpha robustness runs")
    parser.add_argument("--input-csv", required=True, help="Ranked alpha CSV with factor_name and factor_expr")
    parser.add_argument("--output-jsonl", required=True, help="Output JSONL path for the robustness runner")
    parser.add_argument("--top-n", type=int, default=0, help="Optional top-N truncation; 0 keeps all rows")
    opts = parser.parse_args()

    source_path = Path(opts.input_csv).expanduser().resolve()
    target_path = Path(opts.output_jsonl).expanduser().resolve()
    if not source_path.exists():
        raise FileNotFoundError(f"Input CSV not found: {source_path}")

    frame = pd.read_csv(source_path)
    _require_columns(frame, ["seed_name", "factor_name", "factor_expr"])
    limit = int(opts.top_n)
    if limit > 0:
        # .copy() detaches the slice so downstream mutation is safe.
        frame = frame.head(limit).copy()

    records = _build_records(frame)
    target_path.parent.mkdir(parents=True, exist_ok=True)
    with target_path.open("w", encoding="utf-8") as sink:
        for item in records:
            sink.write(json.dumps(item, ensure_ascii=False) + "\n")

    print(f"input_csv={source_path}")
    print(f"output_jsonl={target_path}")
    print(f"record_count={len(records)}")


if __name__ == "__main__":
    main()