Spaces:
Running
Running
| """ | |
| Prépare les 5 fichiers XLSX de configuration dans data/. | |
| - CyberAggAdo : copie directe (gold + pred déjà dans le même fichier) | |
| - TextToKids : jointure gold + prédictions, normalisation des colonnes | |
| """ | |
| from __future__ import annotations | |
| import shutil | |
| from pathlib import Path | |
| import pandas as pd | |
# Directory holding this script, its parent (the workspace), and the output
# folder where the prepared XLSX files are written.
BASE_DIR = Path(__file__).resolve().parent
WORKSPACE = BASE_DIR.parent
DATA_DIR = BASE_DIR.joinpath("data")
# ── Canonical labels (unaccented, in the model's order) ────────────────────
ALL_LABELS = [
    "Emo",
    "Comportementale",
    "Designee",
    "Montree",
    "Suggeree",
    "Base",
    "Complexe",
    "Admiration",
    "Autre",
    "Colere",
    "Culpabilite",
    "Degout",
    "Embarras",
    "Fierte",
    "Jalousie",
    "Joie",
    "Peur",
    "Surprise",
    "Tristesse",
]

# Suffix appended to every prediction column name.
PRED_SUFFIX = "_pred_emotyc"

# Mapping: accented names (as found in the TextToKids prediction files)
# → canonical unaccented names.
ACCENT_TO_CANONICAL = {
    "Colère": "Colere",
    "Dégoût": "Degout",
    "Culpabilité": "Culpabilite",
    "Fierté": "Fierte",
    "Désignée": "Designee",
    "Montrée": "Montree",
    "Suggérée": "Suggeree",
    "Émo": "Emo",
}
# ── Sources ────────────────────────────────────────────────────────────────
RESULTS_DATA = WORKSPACE.joinpath("Visualisation1", "results_data")

# CyberAggAdo inputs: each entry pairs an output name with its source file,
# given as path components relative to RESULTS_DATA.
CYBER_CONFIGS = [
    {"name": name, "src": RESULTS_DATA.joinpath(*relative_parts)}
    for name, relative_parts in (
        (
            "CyberAggAdo200",
            ("CyberAggAdo200", "all_xlsx_samples_pred_emotyc.xlsx"),
        ),
        (
            "CyberAggAdoGlobal_Context",
            (
                "CyberAggAdoGlobal",
                "Context",
                "CyberAdoAgg_gold_global_total_context_pred_emotyc.xlsx",
            ),
        ),
        (
            "CyberAggAdoGlobal_SansContexte",
            (
                "CyberAggAdoGlobal",
                "WithoutContext",
                "CyberAdoAgg_gold_global_total_pred_emotyc.xlsx",
            ),
        ),
    )
]

# Gold annotations used for every TextToKids configuration.
TTK_GOLD = WORKSPACE.joinpath("Eval", "golds", "emotexttokids_gold_flat.xlsx")

# TextToKids prediction files, one per configuration.
TTK_CONFIGS = [
    {"name": name, "src": RESULTS_DATA.joinpath("TextToKids", subdir, filename)}
    for name, subdir, filename in (
        (
            "TextToKids_Context",
            "Context",
            "Context_emotyc_predictions_output.xlsx",
        ),
        (
            "TextToKids_SansContexte",
            "WithoutContext",
            "NoContext_emotyc_predictions_output.xlsx",
        ),
    )
]
def normalize_column(col: str) -> str:
    """Return the canonical name for an accented column name.

    Names without an entry in ACCENT_TO_CANONICAL are returned unchanged.
    """
    if col in ACCENT_TO_CANONICAL:
        return ACCENT_TO_CANONICAL[col]
    return col
def prepare_texttokids(gold_path: Path, pred_path: Path, output_path: Path) -> None:
    """Merge TextToKids gold annotations and predictions into a single XLSX.

    Every prediction column except ``TEXT`` is renamed to its canonical name
    plus ``PRED_SUFFIX`` and appended to a copy of the gold table. Rows are
    paired by position, so both files must have the same number of rows.

    Args:
        gold_path: XLSX file holding the gold annotations.
        pred_path: XLSX file holding the model predictions.
        output_path: Destination XLSX file for the merged table.

    Raises:
        ValueError: If the gold and prediction files differ in row count.
    """
    print(f" Lecture du gold : {gold_path.name}")
    gold_df = pd.read_excel(gold_path, engine="openpyxl")
    print(f" Lecture des prédictions : {pred_path.name}")
    pred_df = pd.read_excel(pred_path, engine="openpyxl")

    n_gold, n_pred = len(gold_df), len(pred_df)
    if n_gold != n_pred:
        raise ValueError(
            f"Nombre de lignes différent : gold={n_gold}, pred={n_pred}"
        )

    # Prediction columns: accented → canonical name, plus the suffix.
    # The TEXT column keeps its name and is therefore not copied below.
    renaming = {
        name: f"{normalize_column(name)}{PRED_SUFFIX}"
        for name in pred_df.columns
        if name != "TEXT"
    }
    renamed_df = pred_df.rename(columns=renaming)

    # Result = all gold columns + the suffixed prediction columns.
    merged_df = gold_df.copy()
    for name in renamed_df.columns:
        if name.endswith(PRED_SUFFIX):
            # .values drops the index so rows are aligned by position.
            merged_df[name] = renamed_df[name].values

    print(f" → {len(merged_df)} lignes, {len(merged_df.columns)} colonnes")
    merged_df.to_excel(output_path, index=False, engine="openpyxl")
    print(f" Sauvegardé : {output_path.name}")
def main() -> None:
    """Build every configuration XLSX inside DATA_DIR and list the results.

    CyberAggAdo sources are copied as-is; TextToKids sources are merged with
    the gold file through ``prepare_texttokids``.
    """
    DATA_DIR.mkdir(exist_ok=True)

    # CyberAggAdo: straight copy of each source file.
    for entry in CYBER_CONFIGS:
        source = entry["src"]
        target = DATA_DIR / f"{entry['name']}.xlsx"
        print(f"Copie : {source.name} → {target.name}")
        shutil.copy2(source, target)

    # TextToKids: join gold + predictions for each configuration.
    print(f"\nChargement du gold TextToKids ({TTK_GOLD.name})...")
    for entry in TTK_CONFIGS:
        config_name = entry["name"]
        print(f"\nPréparation : {config_name}")
        prepare_texttokids(TTK_GOLD, entry["src"], DATA_DIR / f"{config_name}.xlsx")

    # Summary: every XLSX now present in the data directory, with its size.
    print("\n✅ Préparation terminée. Fichiers dans data/ :")
    for xlsx_file in sorted(DATA_DIR.glob("*.xlsx")):
        megabytes = xlsx_file.stat().st_size / (1024 * 1024)
        print(f" {xlsx_file.name} ({megabytes:.1f} MB)")


if __name__ == "__main__":
    main()