| |
| from __future__ import annotations |
|
|
import os

# Thread-pool sizes for the native numeric backends. These variables are
# generally read when the underlying library (OpenMP runtime, OpenBLAS, MKL,
# numexpr) is first loaded, so they have to be in the environment BEFORE
# numpy / pandas are imported; setting them afterwards may be silently ignored.
os.environ["OMP_NUM_THREADS"] = "64"
os.environ["OPENBLAS_NUM_THREADS"] = "64"
os.environ["MKL_NUM_THREADS"] = "64"
os.environ["NUMEXPR_NUM_THREADS"] = "64"

import sys  # noqa: E402  (imports deliberately follow the env setup above)
from pathlib import Path  # noqa: E402

import numpy as np  # noqa: E402
import pandas as pd  # noqa: E402
|
|
|
|
| |
# Directory one level above the folder that contains this script. It is put
# at the front of the import search path (once) -- presumably so that the
# `src` package imported below can be resolved when the script is run directly.
ROOT = Path(__file__).resolve().parents[1]
if sys.path.count(str(ROOT)) == 0:
    sys.path.insert(0, str(ROOT))
|
|
| from src.pipeline import run_10fold_cv_paper_like |
| from src.pipeline import train_and_save_final_models |
|
|
|
|
def load_data(
    morph_path: str | Path = "Data/morph_dataset.csv",
    perf_path: str | Path = "Data/performance_dataset.csv",
) -> tuple[pd.DataFrame, list[str], list[str]]:
    """Load the morphology and performance CSVs and build one modelling table.

    The two files are read, every ``-1`` is replaced by ``NaN``, and the
    frames are joined column-wise by row position (there is no key-based
    merge, so both files must have the same number of rows in the same
    order). The categorical columns are cast to nullable integers and
    one-hot encoded, with an extra indicator column for missing values.

    Args:
        morph_path: Path of the morphology CSV. Relative paths are resolved
            against the current working directory.
        perf_path: Path of the performance CSV. Relative paths are resolved
            against the current working directory.

    Returns:
        A ``(data, morph_cols, targets)`` tuple where ``data`` is the merged
        and encoded frame, ``morph_cols`` lists the feature columns (the
        non-categorical morphology columns followed by the generated dummy
        columns) and ``targets`` lists the performance columns to predict.

    Raises:
        ValueError: If the two files have different row counts, or if any
            target column is absent from the merged table.
    """
    targets = [
        "sprint", "endurance", "bite", "distance_capacity",
        "jump_distance", "jump_vel", "jump_accel", "jump_power", "angle",
    ]
    cat_cols = ["taxon", "genus", "species", "sex", "ecomorph"]

    # -1 is replaced by NaN everywhere (features, categories and targets).
    morph_df = pd.read_csv(morph_path).replace(-1, np.nan)
    perf_df = pd.read_csv(perf_path).replace(-1, np.nan)

    # The join below is purely positional, so the row counts must agree.
    if len(morph_df) != len(perf_df):
        raise ValueError(
            f"Row counts differ: morph={len(morph_df)} perf={len(perf_df)}. "
            "Need a shared ID column to merge."
        )

    data = pd.concat(
        [morph_df.reset_index(drop=True), perf_df.reset_index(drop=True)],
        axis=1,
    )

    # Fail fast: the encoding step below never adds or removes target columns.
    missing = [t for t in targets if t not in data.columns]
    if missing:
        raise ValueError(f"Targets missing from merged data: {missing}. Check perf CSV header.")

    # Every morphology column that is not one of the categorical columns.
    morph_numeric = [c for c in morph_df.columns if c not in cat_cols]

    # Encode the categorical columns on their own so the generated column
    # names are known exactly. Matching on a "<name>_" prefix instead would
    # also catch unrelated columns that merely share the prefix, duplicating
    # morphology features or pulling performance columns into the features.
    encoded = pd.get_dummies(
        data[cat_cols].astype("Int64"),
        columns=cat_cols,
        dummy_na=True,
    )
    # Same layout as get_dummies on the full frame: untouched columns first,
    # dummy columns appended after them.
    data = pd.concat([data.drop(columns=cat_cols), encoded], axis=1)

    morph_cols = morph_numeric + list(encoded.columns)

    return data, morph_cols, targets
|
|
|
|
def main():
    """Fit the final models on the full dataset and report what was saved.

    Loads the merged table via ``load_data``, hands it to
    ``train_and_save_final_models`` with this script's fixed output directory
    and experiment labels, then prints the value that call returns.
    """
    frame, feature_cols, target_cols = load_data()

    fit_options = {
        "save_dir": "artifacts_inference",
        "experiment_name": "InferenceModels_SM2_KNN",
        "experiment_tag": "final_fit",
    }
    bundles = train_and_save_final_models(
        data=frame,
        morph_cols=feature_cols,
        targets=target_cols,
        **fit_options,
    )

    print("\nSaved bundles:")
    print(bundles)


# Run the training only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|