File size: 4,625 Bytes
c9b4ca7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
"""
Prépare les 5 fichiers XLSX de configuration dans data/.

- CyberAggAdo : copie directe (gold + pred déjà dans le même fichier)
- TextToKids  : jointure gold + prédictions, normalisation des colonnes
"""

from __future__ import annotations

import shutil
from pathlib import Path

import pandas as pd

# ── Locations ──────────────────────────────────────────────────────────────
# BASE_DIR is the directory holding this script, WORKSPACE is its parent, and
# DATA_DIR is the "data" folder next to the script.
BASE_DIR = Path(__file__).resolve().parent
WORKSPACE = BASE_DIR.parent
DATA_DIR = BASE_DIR.joinpath("data")

# ── Canonical labels (unaccented, listed in the model's order) ─────────────
ALL_LABELS = [
    "Emo",
    "Comportementale",
    "Designee",
    "Montree",
    "Suggeree",
    "Base",
    "Complexe",
    "Admiration",
    "Autre",
    "Colere",
    "Culpabilite",
    "Degout",
    "Embarras",
    "Fierte",
    "Jalousie",
    "Joie",
    "Peur",
    "Surprise",
    "Tristesse",
]

# Suffix appended to a label to name its prediction column.
PRED_SUFFIX = "_pred_emotyc"

# Accented column names (as found in the TextToKids prediction files) mapped
# to their canonical, unaccented spelling.
ACCENT_TO_CANONICAL = {
    "Colère": "Colere",
    "Dégoût": "Degout",
    "Culpabilité": "Culpabilite",
    "Fierté": "Fierte",
    "Désignée": "Designee",
    "Montrée": "Montree",
    "Suggérée": "Suggeree",
    "Émo": "Emo",
}

# ── Sources ────────────────────────────────────────────────────────────────
RESULTS_DATA = WORKSPACE.joinpath("Visualisation1", "results_data")

# CyberAggAdo workbooks: "name" is the output file stem, "src" the input path.
CYBER_CONFIGS = [
    {
        "name": "CyberAggAdo200",
        "src": RESULTS_DATA.joinpath(
            "CyberAggAdo200",
            "all_xlsx_samples_pred_emotyc.xlsx",
        ),
    },
    {
        "name": "CyberAggAdoGlobal_Context",
        "src": RESULTS_DATA.joinpath(
            "CyberAggAdoGlobal",
            "Context",
            "CyberAdoAgg_gold_global_total_context_pred_emotyc.xlsx",
        ),
    },
    {
        "name": "CyberAggAdoGlobal_SansContexte",
        "src": RESULTS_DATA.joinpath(
            "CyberAggAdoGlobal",
            "WithoutContext",
            "CyberAdoAgg_gold_global_total_pred_emotyc.xlsx",
        ),
    },
]

# Gold annotations shared by every TextToKids configuration.
TTK_GOLD = WORKSPACE.joinpath("Eval", "golds", "emotexttokids_gold_flat.xlsx")

# TextToKids prediction workbooks: same "name"/"src" layout as above.
TTK_CONFIGS = [
    {
        "name": "TextToKids_Context",
        "src": RESULTS_DATA.joinpath(
            "TextToKids",
            "Context",
            "Context_emotyc_predictions_output.xlsx",
        ),
    },
    {
        "name": "TextToKids_SansContexte",
        "src": RESULTS_DATA.joinpath(
            "TextToKids",
            "WithoutContext",
            "NoContext_emotyc_predictions_output.xlsx",
        ),
    },
]


def normalize_column(col: str) -> str:
    """Return the canonical (unaccented) name for an accented column name.

    The name is looked up in ``ACCENT_TO_CANONICAL``. If the exact string is
    not a key, its NFC-composed form is tried as well, so a header stored
    with combining accents (for example ``e`` followed by U+0301) is still
    recognised.

    Args:
        col: Column label to normalise.

    Returns:
        The canonical name when the label (or its NFC form) is mapped,
        otherwise ``col`` itself, unchanged.
    """
    # Function-scope import so the block does not depend on a file-level one.
    import unicodedata

    if col in ACCENT_TO_CANONICAL:
        return ACCENT_TO_CANONICAL[col]
    if isinstance(col, str):
        # The mapping keys are precomposed; a decomposed header would
        # otherwise miss the lookup and keep its accents.
        composed = unicodedata.normalize("NFC", col)
        return ACCENT_TO_CANONICAL.get(composed, col)
    # Non-string labels (e.g. integer headers) are passed through untouched.
    return col


def prepare_texttokids(gold_path: Path, pred_path: Path, output_path: Path) -> None:
    """Merge TextToKids gold annotations and predictions into one XLSX file.

    Both workbooks are read with the openpyxl engine. Rows are paired by
    position (no key join), so the two files must have the same number of
    rows. Every prediction column except ``TEXT`` is renamed to its canonical
    label followed by ``PRED_SUFFIX`` and appended to a copy of the gold
    frame, which is then written to ``output_path`` without the index.

    Args:
        gold_path: Workbook holding the gold annotations.
        pred_path: Workbook holding the model predictions.
        output_path: Destination of the combined workbook.

    Raises:
        ValueError: If the two workbooks have different row counts, or if
            two prediction columns normalise to the same output name.
    """
    print(f"  Lecture du gold : {gold_path.name}")
    gold_df = pd.read_excel(gold_path, engine="openpyxl")

    print(f"  Lecture des prédictions : {pred_path.name}")
    pred_df = pd.read_excel(pred_path, engine="openpyxl")

    if len(gold_df) != len(pred_df):
        raise ValueError(
            f"Nombre de lignes différent : gold={len(gold_df)}, pred={len(pred_df)}"
        )

    # Map each prediction column (TEXT excluded) to "<canonical><suffix>".
    pred_rename = {
        col: f"{normalize_column(col)}{PRED_SUFFIX}"
        for col in pred_df.columns
        if col != "TEXT"
    }

    # Two source headers (e.g. an accented and an unaccented spelling) can
    # collapse onto the same target. Renaming would then create duplicate
    # columns, so report the clash explicitly instead of failing later on.
    seen = set()
    clashes = []
    for target in pred_rename.values():
        if target in seen and target not in clashes:
            clashes.append(target)
        seen.add(target)
    if clashes:
        raise ValueError(
            f"Colonnes de prédiction en conflit après normalisation : {clashes}"
        )

    pred_df = pred_df.rename(columns=pred_rename)

    # Gold columns first, then the prediction columns in their file order.
    # Assigning the raw values bypasses index alignment: pairing is positional.
    result_df = gold_df.copy()
    for col in pred_rename.values():
        result_df[col] = pred_df[col].values

    print(f"  → {len(result_df)} lignes, {len(result_df.columns)} colonnes")
    result_df.to_excel(output_path, index=False, engine="openpyxl")
    print(f"  Sauvegardé : {output_path.name}")


def main() -> None:
    """Create every configuration workbook in ``DATA_DIR`` and list the result.

    Steps:
      1. Create ``DATA_DIR`` if it does not exist yet.
      2. Copy each CyberAggAdo source workbook unchanged (per the module
         docstring, those files already hold gold and predictions together).
      3. Build each TextToKids workbook with ``prepare_texttokids``.
      4. Print the name and size of every ``.xlsx`` file found in ``DATA_DIR``.
    """
    DATA_DIR.mkdir(exist_ok=True)

    # CyberAggAdo: plain copy.
    for config in CYBER_CONFIGS:
        dst = DATA_DIR / f"{config['name']}.xlsx"
        # A separator is needed here; without it the source and destination
        # file names are printed glued together.
        print(f"Copie : {config['src'].name} → {dst.name}")
        shutil.copy2(config["src"], dst)

    # TextToKids: gold + prediction merge.
    # The message below is informational only; the gold workbook is actually
    # read inside prepare_texttokids, once per configuration.
    print(f"\nChargement du gold TextToKids ({TTK_GOLD.name})...")
    for config in TTK_CONFIGS:
        dst = DATA_DIR / f"{config['name']}.xlsx"
        print(f"\nPréparation : {config['name']}")
        prepare_texttokids(TTK_GOLD, config["src"], dst)

    print("\n✅ Préparation terminée. Fichiers dans data/ :")
    for f in sorted(DATA_DIR.glob("*.xlsx")):
        size_mb = f.stat().st_size / (1024 * 1024)
        print(f"  {f.name} ({size_mb:.1f} MB)")


# Run the preparation only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()