"""Utility functions for model training and evaluation.""" import os from typing import List LANGS: List[str] = ["java", "python", "pharo"] def load_dataset_splits(base_dir=None, langs=None): """Load dataset splits from CSV files under data/raw. Expects files like data/raw/java_train.csv, data/raw/java_test.csv, etc. Returns a dict mapping split names (e.g. "java_test") to pandas DataFrames. Raises: FileNotFoundError: se la directory base o un file atteso non esiste. ImportError: se pandas non รจ installato. """ if base_dir is None: base_dir = os.path.join("data", "raw") if langs is None: langs = LANGS if not os.path.isdir(base_dir): raise FileNotFoundError( f"CSV datasets not found under {base_dir}; cannot load dataset splits." ) try: import pandas as pd except Exception as e: raise ImportError("pandas is required to load dataset splits") from e datasets = {} for lang in langs: for split in ("train", "test"): fname = f"{lang}_{split}.csv" path = os.path.join(base_dir, fname) if not os.path.isfile(path): raise FileNotFoundError(f"Expected dataset file missing: {path}") df = pd.read_csv(path) datasets[f"{lang}_{split}"] = df return datasets def parse_labels_column(df): """Parse the 'labels' column of a DataFrame into lists of integers.""" def _parse_one(x): if isinstance(x, str): s = x.strip() if s.startswith("[") and s.endswith("]"): s = s[1:-1] return [int(tok) for tok in s.split() if tok] try: import numpy as np if isinstance(x, np.ndarray): return [int(v) for v in x.tolist()] except ImportError: pass if isinstance(x, (list, tuple)): return [int(v) for v in x] raise ValueError(f"Formato labels non gestito: {type(x)} -> {x!r}") df["labels"] = df["labels"].apply(_parse_one) return df