Spaces:

GwendalTsang
/

Visualisation1

Running

File size: 4,625 Bytes

c9b4ca7

"""
Prépare les 5 fichiers XLSX de configuration dans data/.

- CyberAggAdo : copie directe (gold + pred déjà dans le même fichier)
- TextToKids  : jointure gold + prédictions, normalisation des colonnes
"""

from __future__ import annotations

import shutil
import unicodedata
from pathlib import Path

import pandas as pd

# ── Directory layout (anchored on this script's own location) ──────────────
BASE_DIR = Path(__file__).resolve().parents[0]
WORKSPACE = BASE_DIR.parent
DATA_DIR = BASE_DIR.joinpath("data")

# ── Canonical labels (unaccented, in the model's order) ────────────────────
ALL_LABELS = [
    "Emo",
    "Comportementale",
    "Designee",
    "Montree",
    "Suggeree",
    "Base",
    "Complexe",
    "Admiration",
    "Autre",
    "Colere",
    "Culpabilite",
    "Degout",
    "Embarras",
    "Fierte",
    "Jalousie",
    "Joie",
    "Peur",
    "Surprise",
    "Tristesse",
]

# Suffix appended to a label to build the name of its prediction column.
PRED_SUFFIX = "_pred_emotyc"

# Accented spellings (as used in the TextToKids prediction files) mapped to
# their canonical, unaccented label.
ACCENT_TO_CANONICAL = {
    "Colère": "Colere",
    "Dégoût": "Degout",
    "Culpabilité": "Culpabilite",
    "Fierté": "Fierte",
    "Désignée": "Designee",
    "Montrée": "Montree",
    "Suggérée": "Suggeree",
    "Émo": "Emo",
}

# ── Input locations ────────────────────────────────────────────────────────
RESULTS_DATA = WORKSPACE.joinpath("Visualisation1", "results_data")

# CyberAggAdo workbooks: each "src" is copied verbatim to data/<name>.xlsx.
CYBER_CONFIGS = [
    {
        "name": "CyberAggAdo200",
        "src": RESULTS_DATA.joinpath(
            "CyberAggAdo200",
            "all_xlsx_samples_pred_emotyc.xlsx",
        ),
    },
    {
        "name": "CyberAggAdoGlobal_Context",
        "src": RESULTS_DATA.joinpath(
            "CyberAggAdoGlobal",
            "Context",
            "CyberAdoAgg_gold_global_total_context_pred_emotyc.xlsx",
        ),
    },
    {
        "name": "CyberAggAdoGlobal_SansContexte",
        "src": RESULTS_DATA.joinpath(
            "CyberAggAdoGlobal",
            "WithoutContext",
            "CyberAdoAgg_gold_global_total_pred_emotyc.xlsx",
        ),
    },
]

# Gold annotations shared by every TextToKids configuration.
TTK_GOLD = WORKSPACE.joinpath("Eval", "golds", "emotexttokids_gold_flat.xlsx")

# TextToKids prediction workbooks: each "src" is joined with TTK_GOLD.
TTK_CONFIGS = [
    {
        "name": "TextToKids_Context",
        "src": RESULTS_DATA.joinpath(
            "TextToKids",
            "Context",
            "Context_emotyc_predictions_output.xlsx",
        ),
    },
    {
        "name": "TextToKids_SansContexte",
        "src": RESULTS_DATA.joinpath(
            "TextToKids",
            "WithoutContext",
            "NoContext_emotyc_predictions_output.xlsx",
        ),
    },
]


def normalize_column(col: str) -> str:
    """Return the canonical (unaccented) label for a column name.

    The name is looked up in ``ACCENT_TO_CANONICAL``. The exact spelling is
    tried first; if it is not found, the NFC (precomposed) and NFD
    (decomposed) Unicode forms of the name are tried as well, so that a
    header whose accents are encoded differently from the mapping keys
    still resolves to the same canonical label.

    Args:
        col: Column name as read from the spreadsheet.

    Returns:
        The canonical label when a mapping exists, otherwise ``col``
        unchanged.
    """
    if col in ACCENT_TO_CANONICAL:
        return ACCENT_TO_CANONICAL[col]

    # Only strings can be Unicode-normalised; any other header type is
    # returned untouched, exactly as a failed lookup would.
    if isinstance(col, str):
        for form in ("NFC", "NFD"):
            candidate = unicodedata.normalize(form, col)
            if candidate in ACCENT_TO_CANONICAL:
                return ACCENT_TO_CANONICAL[candidate]

    return col


def prepare_texttokids(gold_path: Path, pred_path: Path, output_path: Path) -> None:
    """Merge the TextToKids gold annotations and predictions into one XLSX.

    Every column of the gold workbook is kept. Every prediction column
    except ``TEXT`` is appended under the name
    ``<canonical label><PRED_SUFFIX>``, where the canonical label is given
    by ``normalize_column``. A gold column that already carries one of those
    names is overwritten by the prediction values.

    Args:
        gold_path: XLSX workbook holding the gold annotations.
        pred_path: XLSX workbook holding the predictions.
        output_path: Destination XLSX workbook; its parent directory is
            created when it does not exist yet.

    Raises:
        ValueError: If the two workbooks do not have the same number of
            rows, or if several prediction columns map to the same output
            column name.
    """
    print(f"  Lecture du gold : {gold_path.name}")
    gold_df = pd.read_excel(gold_path, engine="openpyxl")

    print(f"  Lecture des prédictions : {pred_path.name}")
    pred_df = pd.read_excel(pred_path, engine="openpyxl")

    # NOTE(review): rows are paired purely by position; only the row count
    # is checked, not that both workbooks list the same texts in the same
    # order — confirm the upstream files guarantee this.
    if len(gold_df) != len(pred_df):
        raise ValueError(
            f"Nombre de lignes différent : gold={len(gold_df)}, pred={len(pred_df)}"
        )

    # Output name of every prediction column: accented → canonical + suffix.
    pred_rename = {
        col: f"{normalize_column(col)}{PRED_SUFFIX}"
        for col in pred_df.columns
        if col != "TEXT"
    }

    # Two source headers (e.g. an accented and an unaccented spelling of the
    # same label) may collapse onto one output name. Renaming would then
    # yield duplicate columns, so fail early with an explicit message.
    sources_by_target = {}
    for source, target in pred_rename.items():
        sources_by_target.setdefault(target, []).append(source)
    clashes = {
        target: sources
        for target, sources in sources_by_target.items()
        if len(sources) > 1
    }
    if clashes:
        raise ValueError(
            f"Colonnes de prédiction en collision après normalisation : {clashes}"
        )

    pred_df = pred_df.rename(columns=pred_rename)

    # Combine: gold (all columns) + the suffixed prediction columns.
    pred_cols = [c for c in pred_df.columns if c.endswith(PRED_SUFFIX)]
    result_df = gold_df.copy()
    for col in pred_cols:
        # .values drops the index so the assignment is strictly positional.
        result_df[col] = pred_df[col].values

    print(f"  → {len(result_df)} lignes, {len(result_df.columns)} colonnes")
    # Make the function usable on its own, without a caller-created folder.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    result_df.to_excel(output_path, index=False, engine="openpyxl")
    print(f"  Sauvegardé : {output_path.name}")


def main() -> None:
    """Build every configuration workbook inside ``DATA_DIR``.

    The CyberAggAdo workbooks are copied as they are, the TextToKids ones
    are produced by ``prepare_texttokids`` from the shared gold file, and a
    listing of the XLSX files present in ``DATA_DIR`` is printed at the end.
    """
    DATA_DIR.mkdir(exist_ok=True)

    # CyberAggAdo: plain copy of each source workbook.
    for cyber in CYBER_CONFIGS:
        target = DATA_DIR / f"{cyber['name']}.xlsx"
        source = cyber["src"]
        print(f"Copie : {source.name} → {target.name}")
        shutil.copy2(source, target)

    # TextToKids: gold + predictions joined into one workbook per config.
    print(f"\nChargement du gold TextToKids ({TTK_GOLD.name})...")
    for ttk in TTK_CONFIGS:
        target = DATA_DIR / f"{ttk['name']}.xlsx"
        print(f"\nPréparation : {ttk['name']}")
        prepare_texttokids(TTK_GOLD, ttk["src"], target)

    # Summary: name and size of each workbook, sorted by path.
    print("\n✅ Préparation terminée. Fichiers dans data/ :")
    for workbook in sorted(DATA_DIR.glob("*.xlsx")):
        megabytes = workbook.stat().st_size / (1024 * 1024)
        print(f"  {workbook.name} ({megabytes:.1f} MB)")


# Run the preparation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()