Spaces:
Running
Running
File size: 4,625 Bytes
c9b4ca7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 | """
Prépare les 5 fichiers XLSX de configuration dans data/.
- CyberAggAdo : copie directe (gold + pred déjà dans le même fichier)
- TextToKids : jointure gold + prédictions, normalisation des colonnes
"""
from __future__ import annotations
import shutil
from pathlib import Path
import pandas as pd
# Directory that contains this script (symlinks resolved).
BASE_DIR = Path(__file__).resolve().parent
# Parent of the script directory; the source paths below are built from it.
WORKSPACE = BASE_DIR.parent
# Output directory that receives the prepared XLSX files (created by main()).
DATA_DIR = BASE_DIR / "data"
# ── Canonical labels (no accents, model order) ─────────────────────────────
# NOTE(review): not referenced by any function visible in this file — confirm
# whether other modules import it before removing.
ALL_LABELS = [
"Emo", "Comportementale", "Designee", "Montree", "Suggeree",
"Base", "Complexe", "Admiration", "Autre", "Colere",
"Culpabilite", "Degout", "Embarras", "Fierte", "Jalousie",
"Joie", "Peur", "Surprise", "Tristesse",
]
# Suffix appended to every renamed prediction column; also used to select
# the prediction columns that are merged into the gold table.
PRED_SUFFIX = "_pred_emotyc"
# Mapping: accented names (in the TextToKids prediction files) → canonical
# names. Looked up by normalize_column(); names absent from this table are
# left unchanged.
ACCENT_TO_CANONICAL = {
"Colère": "Colere",
"Dégoût": "Degout",
"Culpabilité": "Culpabilite",
"Fierté": "Fierte",
"Désignée": "Designee",
"Montrée": "Montree",
"Suggérée": "Suggeree",
"Émo": "Emo",
}
# ── Sources ────────────────────────────────────────────────────────────────
# Root folder of the prediction result files.
RESULTS_DATA = WORKSPACE / "Visualisation1" / "results_data"
# CyberAggAdo configurations. Each entry has:
#   "name": stem of the output file written as DATA_DIR/<name>.xlsx
#   "src":  path of the source workbook, copied unchanged by main()
CYBER_CONFIGS = [
{
"name": "CyberAggAdo200",
"src": RESULTS_DATA / "CyberAggAdo200" / "all_xlsx_samples_pred_emotyc.xlsx",
},
{
"name": "CyberAggAdoGlobal_Context",
"src": RESULTS_DATA / "CyberAggAdoGlobal" / "Context" / "CyberAdoAgg_gold_global_total_context_pred_emotyc.xlsx",
},
{
"name": "CyberAggAdoGlobal_SansContexte",
"src": RESULTS_DATA / "CyberAggAdoGlobal" / "WithoutContext" / "CyberAdoAgg_gold_global_total_pred_emotyc.xlsx",
},
]
# Gold workbook that is joined with each TextToKids prediction file.
TTK_GOLD = WORKSPACE / "Eval" / "golds" / "emotexttokids_gold_flat.xlsx"
# TextToKids configurations. Same keys as above, but "src" is the prediction
# workbook passed to prepare_texttokids() together with TTK_GOLD.
TTK_CONFIGS = [
{
"name": "TextToKids_Context",
"src": RESULTS_DATA / "TextToKids" / "Context" / "Context_emotyc_predictions_output.xlsx",
},
{
"name": "TextToKids_SansContexte",
"src": RESULTS_DATA / "TextToKids" / "WithoutContext" / "NoContext_emotyc_predictions_output.xlsx",
},
]
def normalize_column(col: str) -> str:
    """Return the canonical name for an accented column name.

    Args:
        col: Column name as read from a prediction file.

    Returns:
        The entry of ``ACCENT_TO_CANONICAL`` for *col*, or *col* itself
        when the table has no entry for it.
    """
    try:
        return ACCENT_TO_CANONICAL[col]
    except KeyError:
        # Not an accented label we know about: keep the name as it is.
        return col
def prepare_texttokids(gold_path: Path, pred_path: Path, output_path: Path) -> None:
    """Merge the TextToKids gold and prediction workbooks into one XLSX.

    Every prediction column except ``TEXT`` is renamed to its canonical
    label followed by ``PRED_SUFFIX``, then attached to a copy of the gold
    table. Rows are matched by position, not by index or by text content.

    Args:
        gold_path: Workbook holding the gold annotations.
        pred_path: Workbook holding the model predictions.
        output_path: Destination workbook, written without the index.

    Raises:
        ValueError: If the two workbooks do not have the same number of rows.
    """
    print(f" Lecture du gold : {gold_path.name}")
    gold = pd.read_excel(gold_path, engine="openpyxl")
    print(f" Lecture des prédictions : {pred_path.name}")
    predictions = pd.read_excel(pred_path, engine="openpyxl")

    # The join below is purely positional, so the row counts must agree.
    if len(gold) != len(predictions):
        raise ValueError(
            f"Nombre de lignes différent : gold={len(gold)}, pred={len(predictions)}"
        )

    # Accented prediction headers -> canonical label + suffix ("TEXT" is kept).
    suffixed_names = {
        name: f"{normalize_column(name)}{PRED_SUFFIX}"
        for name in predictions.columns
        if name != "TEXT"
    }
    predictions = predictions.rename(columns=suffixed_names)

    # Gold columns first, then each suffixed prediction column.
    merged = gold.copy()
    for name in predictions.columns:
        if name.endswith(PRED_SUFFIX):
            # .values drops the index so the assignment is positional.
            merged[name] = predictions[name].values

    n_rows, n_cols = merged.shape
    print(f" → {n_rows} lignes, {n_cols} colonnes")
    merged.to_excel(output_path, index=False, engine="openpyxl")
    print(f" Sauvegardé : {output_path.name}")
def main() -> None:
    """Build every configuration workbook inside ``DATA_DIR``.

    Creates the output directory if needed, copies the CyberAggAdo
    workbooks unchanged, builds the TextToKids workbooks by joining the
    gold file with each prediction file, and finally lists the XLSX files
    present in the output directory with their sizes.
    """
    DATA_DIR.mkdir(exist_ok=True)

    # CyberAggAdo: gold and predictions already share one file, so copy it.
    for cyber in CYBER_CONFIGS:
        source = cyber["src"]
        target = DATA_DIR / f"{cyber['name']}.xlsx"
        print(f"Copie : {source.name} → {target.name}")
        shutil.copy2(source, target)

    # TextToKids: join the gold workbook with each prediction workbook.
    print(f"\nChargement du gold TextToKids ({TTK_GOLD.name})...")
    for ttk in TTK_CONFIGS:
        label = ttk["name"]
        target = DATA_DIR / f"{label}.xlsx"
        print(f"\nPréparation : {label}")
        prepare_texttokids(TTK_GOLD, ttk["src"], target)

    # Summary of what is now available in the output directory.
    print("\n✅ Préparation terminée. Fichiers dans data/ :")
    bytes_per_mb = 1024 * 1024
    for workbook in sorted(DATA_DIR.glob("*.xlsx")):
        megabytes = workbook.stat().st_size / bytes_per_mb
        print(f" {workbook.name} ({megabytes:.1f} MB)")


# Run the preparation only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|