# scripts/train_eval.py from __future__ import annotations import pandas as pd import sys from pathlib import Path import numpy as np import os os.environ["OMP_NUM_THREADS"] = "64" os.environ["OPENBLAS_NUM_THREADS"] = "64" os.environ["MKL_NUM_THREADS"] = "64" os.environ["NUMEXPR_NUM_THREADS"] = "64" # Add project root to PYTHONPATH so `import src...` works when running as a script ROOT = Path(__file__).resolve().parents[1] if str(ROOT) not in sys.path: sys.path.insert(0, str(ROOT)) from src.pipeline import run_10fold_cv_paper_like def load_data() -> tuple[pd.DataFrame, list[str], list[str]]: import pandas as pd morph_path = "Data/morph_dataset.csv" perf_path = "Data/performance_dataset.csv" morph_df = pd.read_csv(morph_path) perf_df = pd.read_csv(perf_path) # Convert -1 to NaN morph_df = morph_df.replace(-1, np.nan) perf_df = perf_df.replace(-1, np.nan) # If row counts differ, stop (can't safely align) if len(morph_df) != len(perf_df): raise ValueError( f"Row counts differ: morph={len(morph_df)} perf={len(perf_df)}. " "Need a shared ID column to merge." ) # Row-aligned join data = pd.concat([morph_df.reset_index(drop=True), perf_df.reset_index(drop=True)], axis=1) # Define targets exactly like your performance CSV columns (excluding ecomorph label) targets = [ "sprint", "endurance", "bite", "distance_capacity", "jump_distance", "jump_vel", "jump_accel", "jump_power", "angle" ] # Categorical columns to one-hot encode (based on your header) cat_cols = ["taxon", "genus", "species", "sex", "ecomorph"] # Morphology numeric columns (everything in morph_df except categorical) # Your morph headers: taxon, genus, species, sex, mass, svl, hl, hw, ... morph_numeric = [c for c in morph_df.columns if c not in cat_cols] # One-hot encode categorical columns (keep NaNs as its own category if any) data[cat_cols] = data[cat_cols].astype("Int64") # keep missing-safe ints data = pd.get_dummies(data, columns=cat_cols, dummy_na=True) # After get_dummies, morph_cols should be: # - numeric morphology columns (mass, svl, hl, ...) # - plus the created one-hot columns for taxon/genus/species/sex/ecomorph dummy_cols = [c for c in data.columns if any(c.startswith(f"{cc}_") for cc in cat_cols)] morph_cols = morph_numeric + dummy_cols # Sanity check: ensure targets exist missing = [t for t in targets if t not in data.columns] if missing: raise ValueError(f"Targets missing from merged data: {missing}. Check perf CSV header.") return data, morph_cols, targets def main(): data, morph_cols, targets = load_data() results = run_10fold_cv_paper_like( data=data, morph_cols=morph_cols, targets=targets, experiment_tag="paper_like_cv", max_folds=1, ) print("\nCV Results:") print(results.sort_values("target")) if __name__ == "__main__": main()