"""Prepare the 5 XLSX configuration files in data/.

- CyberAggAdo: straight copy (gold and predictions already live in the same file).
- TextToKids: join gold + predictions and normalise the column names.
"""

from __future__ import annotations

import shutil
import unicodedata
from collections import Counter
from pathlib import Path

import pandas as pd

BASE_DIR = Path(__file__).resolve().parent
WORKSPACE = BASE_DIR.parent
DATA_DIR = BASE_DIR / "data"

# ── Canonical labels (unaccented, in the model's order) ────────────────────
ALL_LABELS = [
    "Emo",
    "Comportementale",
    "Designee",
    "Montree",
    "Suggeree",
    "Base",
    "Complexe",
    "Admiration",
    "Autre",
    "Colere",
    "Culpabilite",
    "Degout",
    "Embarras",
    "Fierte",
    "Jalousie",
    "Joie",
    "Peur",
    "Surprise",
    "Tristesse",
]

PRED_SUFFIX = "_pred_emotyc"

# Mapping: accented names (as found in the TextToKids prediction files)
# → canonical names.
ACCENT_TO_CANONICAL = {
    "Colère": "Colere",
    "Dégoût": "Degout",
    "Culpabilité": "Culpabilite",
    "Fierté": "Fierte",
    "Désignée": "Designee",
    "Montrée": "Montree",
    "Suggérée": "Suggeree",
    "Émo": "Emo",
}

# Same mapping keyed by the NFC form of each accented name, so the lookup in
# normalize_column() does not depend on how the accents are encoded (composed
# vs. decomposed) either in this source file or in the spreadsheet headers.
_NFC_TO_CANONICAL = {
    unicodedata.normalize("NFC", accented): canonical
    for accented, canonical in ACCENT_TO_CANONICAL.items()
}

# ── Sources ────────────────────────────────────────────────────────────────
RESULTS_DATA = WORKSPACE / "Visualisation1" / "results_data"

CYBER_CONFIGS = [
    {
        "name": "CyberAggAdo200",
        "src": RESULTS_DATA / "CyberAggAdo200" / "all_xlsx_samples_pred_emotyc.xlsx",
    },
    {
        "name": "CyberAggAdoGlobal_Context",
        "src": RESULTS_DATA
        / "CyberAggAdoGlobal"
        / "Context"
        / "CyberAdoAgg_gold_global_total_context_pred_emotyc.xlsx",
    },
    {
        "name": "CyberAggAdoGlobal_SansContexte",
        "src": RESULTS_DATA
        / "CyberAggAdoGlobal"
        / "WithoutContext"
        / "CyberAdoAgg_gold_global_total_pred_emotyc.xlsx",
    },
]

TTK_GOLD = WORKSPACE / "Eval" / "golds" / "emotexttokids_gold_flat.xlsx"

TTK_CONFIGS = [
    {
        "name": "TextToKids_Context",
        "src": RESULTS_DATA
        / "TextToKids"
        / "Context"
        / "Context_emotyc_predictions_output.xlsx",
    },
    {
        "name": "TextToKids_SansContexte",
        "src": RESULTS_DATA
        / "TextToKids"
        / "WithoutContext"
        / "NoContext_emotyc_predictions_output.xlsx",
    },
]


def normalize_column(col: str) -> str:
    """Convert an accented column name to its canonical name.

    The lookup is performed on the NFC-normalised form of ``col`` so that a
    header written with decomposed accents (base letter + combining mark)
    matches the same entry as its precomposed spelling.

    Args:
        col: Column name to convert.

    Returns:
        The canonical name when ``col`` has an entry in
        ``ACCENT_TO_CANONICAL``; otherwise ``col`` itself, unchanged.
    """
    if not isinstance(col, str):
        # Non-string headers cannot be Unicode-normalised; pass them through.
        return col
    return _NFC_TO_CANONICAL.get(unicodedata.normalize("NFC", col), col)


def prepare_texttokids(gold_path: Path, pred_path: Path, output_path: Path) -> None:
    """Join the TextToKids gold and prediction sheets into a single XLSX.

    Every prediction column except ``TEXT`` is renamed to
    ``<canonical name> + PRED_SUFFIX`` and appended to the gold columns.
    Rows are paired by position, not by key.

    Args:
        gold_path: XLSX file holding the gold annotations.
        pred_path: XLSX file holding the predictions.
        output_path: Destination XLSX file; its parent directory is created
            if missing.

    Raises:
        ValueError: If the two sheets have a different number of rows, or if
            two prediction columns end up with the same name after
            normalisation.
    """
    print(f" Lecture du gold : {gold_path.name}")
    gold_df = pd.read_excel(gold_path, engine="openpyxl")

    print(f" Lecture des prédictions : {pred_path.name}")
    pred_df = pd.read_excel(pred_path, engine="openpyxl")

    if len(gold_df) != len(pred_df):
        raise ValueError(
            f"Nombre de lignes différent : gold={len(gold_df)}, pred={len(pred_df)}"
        )

    # Rename prediction columns: accented → canonical + suffix.
    pred_rename = {
        col: f"{normalize_column(col)}{PRED_SUFFIX}"
        for col in pred_df.columns
        if col != "TEXT"
    }

    # Two source columns collapsing onto one target would make the column
    # assignment below ambiguous; fail with an explicit message instead.
    target_counts = Counter(pred_rename.values())
    clashes = sorted(name for name, count in target_counts.items() if count > 1)
    if clashes:
        raise ValueError(
            f"Colonnes de prédiction en double après normalisation : {clashes}"
        )

    pred_df = pred_df.rename(columns=pred_rename)

    # Combine: gold (all columns) + the suffixed prediction columns. Raw
    # arrays are used so rows are matched by position rather than by index.
    pred_cols = list(pred_rename.values())
    result_df = gold_df.assign(**{col: pred_df[col].values for col in pred_cols})

    print(f" → {len(result_df)} lignes, {len(result_df.columns)} colonnes")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    result_df.to_excel(output_path, index=False, engine="openpyxl")
    print(f" Sauvegardé : {output_path.name}")


def main() -> None:
    """Build every configuration file under ``DATA_DIR`` and list the results.

    Raises:
        FileNotFoundError: If any source file is missing. The check runs
            before anything is written, so ``DATA_DIR`` is never left
            half-populated because of a missing input.
    """
    sources = [config["src"] for config in CYBER_CONFIGS + TTK_CONFIGS]
    sources.append(TTK_GOLD)
    missing = [src for src in sources if not src.is_file()]
    if missing:
        listing = "\n".join(f" - {src}" for src in missing)
        raise FileNotFoundError(f"Fichiers source introuvables :\n{listing}")

    DATA_DIR.mkdir(parents=True, exist_ok=True)

    # CyberAggAdo: straight copy.
    for config in CYBER_CONFIGS:
        dst = DATA_DIR / f"{config['name']}.xlsx"
        print(f"Copie : {config['src'].name} → {dst.name}")
        shutil.copy2(config["src"], dst)

    # TextToKids: join gold + predictions.
    print(f"\nChargement du gold TextToKids ({TTK_GOLD.name})...")
    for config in TTK_CONFIGS:
        dst = DATA_DIR / f"{config['name']}.xlsx"
        print(f"\nPréparation : {config['name']}")
        prepare_texttokids(TTK_GOLD, config["src"], dst)

    print("\n✅ Préparation terminée. Fichiers dans data/ :")
    for xlsx_path in sorted(DATA_DIR.glob("*.xlsx")):
        size_mb = xlsx_path.stat().st_size / (1024 * 1024)
        print(f" {xlsx_path.name} ({size_mb:.1f} MB)")


if __name__ == "__main__":
    main()