# scripts/train_eval.py
from __future__ import annotations
import pandas as pd
import sys
from pathlib import Path
import numpy as np
import os
os.environ["OMP_NUM_THREADS"] = "64"
os.environ["OPENBLAS_NUM_THREADS"] = "64"
os.environ["MKL_NUM_THREADS"] = "64"
os.environ["NUMEXPR_NUM_THREADS"] = "64"
# Add project root to PYTHONPATH so `import src...` works when running as a script
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
sys.path.insert(0, str(ROOT))
from src.pipeline import run_10fold_cv_paper_like
def load_data() -> tuple[pd.DataFrame, list[str], list[str]]:
    """Load the morphology and performance CSVs and build the modeling table.

    Returns:
        A tuple ``(data, morph_cols, targets)`` where ``data`` is the
        row-aligned join of both CSVs with categoricals one-hot encoded,
        ``morph_cols`` lists the morphology feature columns (numeric +
        one-hot), and ``targets`` lists the performance target columns.

    Raises:
        ValueError: if the two CSVs have different row counts (no shared ID
            column exists to merge on), or if any expected target column is
            missing after the join.
    """
    morph_path = "Data/morph_dataset.csv"
    perf_path = "Data/performance_dataset.csv"
    morph_df = pd.read_csv(morph_path)
    perf_df = pd.read_csv(perf_path)

    # -1 is the datasets' missing-value sentinel; convert to NaN.
    morph_df = morph_df.replace(-1, np.nan)
    perf_df = perf_df.replace(-1, np.nan)

    # The files carry no shared ID, so rows must already be aligned 1:1.
    if len(morph_df) != len(perf_df):
        raise ValueError(
            f"Row counts differ: morph={len(morph_df)} perf={len(perf_df)}. "
            "Need a shared ID column to merge."
        )

    # Row-aligned join.
    data = pd.concat(
        [morph_df.reset_index(drop=True), perf_df.reset_index(drop=True)],
        axis=1,
    )

    # Targets mirror the performance CSV columns (ecomorph label excluded).
    targets = [
        "sprint", "endurance", "bite", "distance_capacity",
        "jump_distance", "jump_vel", "jump_accel", "jump_power", "angle",
    ]

    # Categorical columns to one-hot encode (from the morph header).
    cat_cols = ["taxon", "genus", "species", "sex", "ecomorph"]

    # Numeric morphology features: everything in morph_df except categoricals.
    morph_numeric = [c for c in morph_df.columns if c not in cat_cols]

    # One-hot encode; dummy_na=True keeps missing values as their own category.
    # NOTE(review): the Int64 cast assumes the categorical columns are
    # integer-coded in the CSVs — string-coded labels would make this raise;
    # confirm against the data files.
    before_cols = set(data.columns)
    data[cat_cols] = data[cat_cols].astype("Int64")  # missing-safe nullable ints
    data = pd.get_dummies(data, columns=cat_cols, dummy_na=True)

    # Columns created by get_dummies, found by exact set difference rather
    # than a name-prefix scan, which could accidentally pick up unrelated
    # numeric columns whose names start with e.g. "sex_".
    dummy_cols = [c for c in data.columns if c not in before_cols]
    morph_cols = morph_numeric + dummy_cols

    # Sanity check: every target must survive the join.
    missing = [t for t in targets if t not in data.columns]
    if missing:
        raise ValueError(f"Targets missing from merged data: {missing}. Check perf CSV header.")
    return data, morph_cols, targets
def main():
    """Script entry point: assemble the dataset and run the paper-style CV."""
    data, morph_cols, targets = load_data()
    cv_kwargs = dict(
        data=data,
        morph_cols=morph_cols,
        targets=targets,
        experiment_tag="paper_like_cv",
        # NOTE(review): only 1 of the 10 folds runs — confirm this is intentional.
        max_folds=1,
    )
    cv_results = run_10fold_cv_paper_like(**cv_kwargs)
    print("\nCV Results:")
    print(cv_results.sort_values("target"))


if __name__ == "__main__":
    main()