File size: 3,171 Bytes
8bb21fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# scripts/train_final.py
from __future__ import annotations

import os

# Thread caps for the BLAS/OpenMP backends must be set BEFORE numpy (or any
# BLAS-linked library) is imported: the backends read these variables once at
# load time, so assigning them after `import numpy` has no effect.
os.environ["OMP_NUM_THREADS"] = "64"
os.environ["OPENBLAS_NUM_THREADS"] = "64"
os.environ["MKL_NUM_THREADS"] = "64"
os.environ["NUMEXPR_NUM_THREADS"] = "64"

import sys
from pathlib import Path

import numpy as np
import pandas as pd

# Add project root to PYTHONPATH so `import src...` works when running as a script
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from src.pipeline import run_10fold_cv_paper_like, train_and_save_final_models

def load_data() -> tuple[pd.DataFrame, list[str], list[str]]:
    """Load, align, and encode the morphology and performance datasets.

    Reads ``Data/morph_dataset.csv`` and ``Data/performance_dataset.csv``,
    converts the ``-1`` missing-value sentinel to NaN, row-aligns the two
    frames by position, and one-hot encodes the categorical columns.

    Returns:
        data: merged DataFrame with categoricals one-hot encoded.
        morph_cols: feature columns — numeric morphology columns plus the
            generated one-hot dummy columns.
        targets: the nine performance target column names.

    Raises:
        ValueError: if the two CSVs have different row counts, or if any
            expected target column is missing after the merge.
    """
    morph_path = "Data/morph_dataset.csv"
    perf_path  = "Data/performance_dataset.csv"

    morph_df = pd.read_csv(morph_path)
    perf_df  = pd.read_csv(perf_path)

    # -1 is the datasets' missing-value sentinel; convert it to NaN so
    # pandas treats it as missing everywhere downstream.
    morph_df = morph_df.replace(-1, np.nan)
    perf_df  = perf_df.replace(-1, np.nan)

    # Without a shared ID column, a positional concat is only safe when both
    # frames have the same number of rows — otherwise stop.
    if len(morph_df) != len(perf_df):
        raise ValueError(
            f"Row counts differ: morph={len(morph_df)} perf={len(perf_df)}. "
            "Need a shared ID column to merge."
        )

    # Row-aligned join (positional; assumes both CSVs list specimens in the
    # same order — TODO confirm upstream).
    data = pd.concat([morph_df.reset_index(drop=True),
                      perf_df.reset_index(drop=True)], axis=1)

    # Target columns exactly as they appear in the performance CSV
    # (excluding the ecomorph label, which is treated as a feature).
    targets = [
        "sprint", "endurance", "bite", "distance_capacity",
        "jump_distance", "jump_vel", "jump_accel", "jump_power", "angle"
    ]

    # Categorical columns to one-hot encode (based on the CSV headers).
    cat_cols = ["taxon", "genus", "species", "sex", "ecomorph"]

    # Numeric morphology columns: everything in morph_df except categoricals
    # (headers: taxon, genus, species, sex, mass, svl, hl, hw, ...).
    morph_numeric = [c for c in morph_df.columns if c not in cat_cols]

    # NOTE(review): assumes the categorical columns are integer-coded;
    # this cast raises if they contain string labels — confirm against data.
    data[cat_cols] = data[cat_cols].astype("Int64")  # keep missing-safe ints
    # dummy_na=True gives missing values their own one-hot category.
    data = pd.get_dummies(data, columns=cat_cols, dummy_na=True)

    # Feature set = numeric morphology columns + the generated one-hot
    # columns for taxon/genus/species/sex/ecomorph.
    dummy_cols = [c for c in data.columns if any(c.startswith(f"{cc}_") for cc in cat_cols)]
    morph_cols = morph_numeric + dummy_cols

    # Sanity check: ensure all expected targets survived the merge.
    missing = [t for t in targets if t not in data.columns]
    if missing:
        raise ValueError(f"Targets missing from merged data: {missing}. Check perf CSV header.")

    return data, morph_cols, targets


def main():
    """Fit the final inference models on the merged dataset and print the
    locations of the saved model bundles."""
    data, morph_cols, targets = load_data()

    bundle_paths = train_and_save_final_models(
        data=data,
        morph_cols=morph_cols,
        targets=targets,
        save_dir="artifacts_inference",
        experiment_name="InferenceModels_SM2_KNN",
        experiment_tag="final_fit",
    )

    print("\nSaved bundles:")
    print(bundle_paths)


if __name__ == "__main__":
    main()