File size: 1,993 Bytes
e4b1ed6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
"""Prepare leakage-safe feature metadata for the synthetic v0 modeling run."""

from __future__ import annotations

import argparse
from pathlib import Path

try:
    from .common import DEFAULT_SEED, determine_feature_columns, ensure_dir, load_feature_table, make_episode_split, write_json
except ImportError:  # pragma: no cover - direct script execution
    from common import DEFAULT_SEED, determine_feature_columns, ensure_dir, load_feature_table, make_episode_split, write_json


def prepare_features(features_path: Path, output_dir: Path, seed: int = DEFAULT_SEED) -> dict[str, object]:
    """Write split and feature-column metadata for a feature table.

    The table at ``features_path`` is read with ``load_feature_table`` and
    handed to ``make_episode_split`` together with ``seed``; the resulting
    frame is then passed to ``determine_feature_columns``. Three JSON files
    are written into ``output_dir`` (created via ``ensure_dir``):
    ``split_manifest.json``, ``feature_columns.json`` and
    ``excluded_columns.json``.

    Args:
        features_path: Location of the feature table to load.
        output_dir: Directory that receives the JSON artifacts.
        seed: Seed forwarded to ``make_episode_split``.

    Returns:
        A summary dict with the keys ``rows``, ``episodes``,
        ``feature_count`` and ``split_summary`` (the ``"summary"`` entry of
        the split manifest).
    """
    table = load_feature_table(features_path)
    partitioned, manifest = make_episode_split(table, seed=seed)
    model_columns, dropped_columns = determine_feature_columns(partitioned)

    ensure_dir(output_dir)
    # Written in a fixed order: manifest first, then kept and excluded columns.
    artifacts = (
        ("split_manifest.json", manifest),
        ("feature_columns.json", model_columns),
        ("excluded_columns.json", dropped_columns),
    )
    for filename, payload in artifacts:
        write_json(output_dir / filename, payload)

    row_count = int(len(partitioned))
    episode_count = int(partitioned["episode_id"].nunique())
    feature_count = int(len(model_columns))
    return {
        "rows": row_count,
        "episodes": episode_count,
        "feature_count": feature_count,
        "split_summary": manifest["summary"],
    }


def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: parse arguments, run the preparation, print a summary.

    Args:
        argv: Argument list handed to ``ArgumentParser.parse_args``; ``None``
            lets argparse fall back to ``sys.argv[1:]``.

    Returns:
        ``0`` once the summary has been printed.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    # Both path options are mandatory and share the same configuration.
    for flag in ("--features", "--output"):
        cli.add_argument(flag, type=Path, required=True)
    cli.add_argument("--seed", type=int, default=DEFAULT_SEED)
    options = cli.parse_args(argv)

    report = prepare_features(options.features, options.output, seed=options.seed)

    print(f"rows: {report['rows']}")
    print(f"episodes: {report['episodes']}")
    print(f"model_features: {report['feature_count']}")
    per_split = report["split_summary"]
    for split_name, counts in per_split.items():
        print(f"{split_name}: rows={counts['rows']} episodes={counts['episodes']}")
    return 0


if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)