# Visualisation1 / prepare_data.py
# GwenONERA
# new
# c9b4ca7
"""
Prépare les 5 fichiers XLSX de configuration dans data/.
- CyberAggAdo : copie directe (gold + pred déjà dans le même fichier)
- TextToKids : jointure gold + prédictions, normalisation des colonnes
"""
from __future__ import annotations
import shutil
from pathlib import Path
import pandas as pd
# Directory that contains this script.
BASE_DIR = Path(__file__).resolve().parent
# Parent of the script directory.
WORKSPACE = BASE_DIR.parent
# "data" sub-directory located next to this script.
DATA_DIR = BASE_DIR.joinpath("data")
# ── Canonical labels (unaccented, in the model's order) ────────────────────
ALL_LABELS = [
    "Emo",
    "Comportementale",
    "Designee",
    "Montree",
    "Suggeree",
    "Base",
    "Complexe",
    "Admiration",
    "Autre",
    "Colere",
    "Culpabilite",
    "Degout",
    "Embarras",
    "Fierte",
    "Jalousie",
    "Joie",
    "Peur",
    "Surprise",
    "Tristesse",
]

# Suffix appended to a label to name its prediction column.
PRED_SUFFIX = "_pred_emotyc"

# Mapping: accented names (as found in the TextToKids prediction files)
# → canonical names.
ACCENT_TO_CANONICAL = dict(
    [
        ("Colère", "Colere"),
        ("Dégoût", "Degout"),
        ("Culpabilité", "Culpabilite"),
        ("Fierté", "Fierte"),
        ("Désignée", "Designee"),
        ("Montrée", "Montree"),
        ("Suggérée", "Suggeree"),
        ("Émo", "Emo"),
    ]
)
# ── Sources ────────────────────────────────────────────────────────────────
# NOTE(review): "Visualisation1" is hard-coded here; if that is also the name
# of the script directory, BASE_DIR / "results_data" would be equivalent —
# confirm before changing.
RESULTS_DATA = WORKSPACE.joinpath("Visualisation1", "results_data")

# CyberAggAdo inputs: "name" is the output file stem, "src" the file to copy.
CYBER_CONFIGS = [
    dict(
        name="CyberAggAdo200",
        src=RESULTS_DATA.joinpath(
            "CyberAggAdo200",
            "all_xlsx_samples_pred_emotyc.xlsx",
        ),
    ),
    dict(
        name="CyberAggAdoGlobal_Context",
        src=RESULTS_DATA.joinpath(
            "CyberAggAdoGlobal",
            "Context",
            "CyberAdoAgg_gold_global_total_context_pred_emotyc.xlsx",
        ),
    ),
    dict(
        name="CyberAggAdoGlobal_SansContexte",
        src=RESULTS_DATA.joinpath(
            "CyberAggAdoGlobal",
            "WithoutContext",
            "CyberAdoAgg_gold_global_total_pred_emotyc.xlsx",
        ),
    ),
]

# Gold annotations shared by every TextToKids configuration.
TTK_GOLD = WORKSPACE.joinpath("Eval", "golds", "emotexttokids_gold_flat.xlsx")

# TextToKids inputs: "name" is the output file stem, "src" the prediction file.
TTK_CONFIGS = [
    dict(
        name="TextToKids_Context",
        src=RESULTS_DATA.joinpath(
            "TextToKids",
            "Context",
            "Context_emotyc_predictions_output.xlsx",
        ),
    ),
    dict(
        name="TextToKids_SansContexte",
        src=RESULTS_DATA.joinpath(
            "TextToKids",
            "WithoutContext",
            "NoContext_emotyc_predictions_output.xlsx",
        ),
    ),
]
def normalize_column(col: str) -> str:
    """Return the canonical (unaccented) name for an accented column name.

    An exact lookup in ``ACCENT_TO_CANONICAL`` is tried first. If it misses
    and ``col`` is a string, the lookup is retried on its NFC-composed form,
    so that a header written with combining accents (e.g. ``"e"`` followed by
    U+0301) can still match a precomposed key.

    Args:
        col: Column name as read from the spreadsheet.

    Returns:
        The mapped canonical name, or ``col`` itself (unchanged) when neither
        lookup finds an entry.
    """
    import unicodedata  # local import keeps this block self-contained

    # Exact match first: anything that matched before still matches.
    if col in ACCENT_TO_CANONICAL:
        return ACCENT_TO_CANONICAL[col]

    # Non-string headers (e.g. numeric) cannot be Unicode-normalised.
    if not isinstance(col, str):
        return col

    composed = unicodedata.normalize("NFC", col)
    return ACCENT_TO_CANONICAL.get(composed, col)
def prepare_texttokids(gold_path: Path, pred_path: Path, output_path: Path) -> None:
    """Merge the TextToKids gold file and a prediction file into one XLSX.

    Every prediction column except ``TEXT`` is renamed to
    ``<canonical name><PRED_SUFFIX>`` and attached to a copy of the gold
    table. Rows are paired by position (first with first, and so on), not by
    a key, which is why both files must have the same number of rows.

    Args:
        gold_path: XLSX file holding the gold annotations.
        pred_path: XLSX file holding the predictions.
        output_path: Destination XLSX file; its parent directory is created
            when it does not exist yet.

    Raises:
        ValueError: If the two files have a different number of rows, or if
            several prediction columns end up with the same output name.
    """
    print(f" Lecture du gold : {gold_path.name}")
    gold_df = pd.read_excel(gold_path, engine="openpyxl")
    print(f" Lecture des prédictions : {pred_path.name}")
    pred_df = pd.read_excel(pred_path, engine="openpyxl")

    if len(gold_df) != len(pred_df):
        raise ValueError(
            f"Nombre de lignes différent : gold={len(gold_df)}, pred={len(pred_df)}"
        )

    # Rename prediction columns: accented → canonical + suffix.
    pred_rename = {
        col: f"{normalize_column(col)}{PRED_SUFFIX}"
        for col in pred_df.columns
        if col != "TEXT"
    }

    # Two source columns mapping to one output name (e.g. an accented and an
    # unaccented variant of the same label) would yield a duplicated column
    # and an obscure shape error at assignment time; fail early instead.
    seen = set()
    clashes = set()
    for target in pred_rename.values():
        if target in seen:
            clashes.add(target)
        seen.add(target)
    if clashes:
        raise ValueError(
            "Colonnes de prédiction en double après normalisation : "
            f"{sorted(clashes)}"
        )

    pred_df = pred_df.rename(columns=pred_rename)
    pred_cols = list(pred_rename.values())

    # Combine: gold (all columns) + the suffixed prediction columns.
    # `.values` drops the index so the pairing is purely positional.
    result_df = gold_df.assign(**{col: pred_df[col].values for col in pred_cols})

    print(f" → {len(result_df)} lignes, {len(result_df.columns)} colonnes")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    result_df.to_excel(output_path, index=False, engine="openpyxl")
    print(f" Sauvegardé : {output_path.name}")
def main() -> None:
    """Build every XLSX file under ``DATA_DIR`` and print a summary.

    The CyberAggAdo sources listed in ``CYBER_CONFIGS`` are copied as they
    are; each TextToKids prediction file in ``TTK_CONFIGS`` is merged with
    ``TTK_GOLD`` by ``prepare_texttokids``. The resulting ``*.xlsx`` files are
    then listed with their size.
    """
    DATA_DIR.mkdir(exist_ok=True)

    # CyberAggAdo: straight copy.
    for config in CYBER_CONFIGS:
        src = config["src"]
        dst = DATA_DIR / f"{config['name']}.xlsx"
        # The arrow separates the source name from the destination name so
        # the two file names do not run together in the log line.
        print(f"Copie : {src.name} → {dst.name}")
        shutil.copy2(src, dst)

    # TextToKids: join gold + predictions.
    print(f"\nChargement du gold TextToKids ({TTK_GOLD.name})...")
    for config in TTK_CONFIGS:
        dst = DATA_DIR / f"{config['name']}.xlsx"
        print(f"\nPréparation : {config['name']}")
        prepare_texttokids(TTK_GOLD, config["src"], dst)

    print("\n✅ Préparation terminée. Fichiers dans data/ :")
    for f in sorted(DATA_DIR.glob("*.xlsx")):
        size_mb = f.stat().st_size / (1024 * 1024)
        print(f" {f.name} ({size_mb:.1f} MB)")


if __name__ == "__main__":
    main()