Spaces:

GwendalTsang
/

Visualisation1

Running

GwenONERA

new

c9b4ca7 1 day ago

4.63 kB

	"""
	Prepare the 5 XLSX configuration files in data/.

	- CyberAggAdo: direct copy (gold + pred already live in the same file)
	- TextToKids: join gold + predictions, normalise the column names
	"""

	from __future__ import annotations

	import shutil
	from pathlib import Path

	import pandas as pd

	# Directory layout, resolved from the location of this script.
	BASE_DIR = Path(__file__).resolve().parent
	WORKSPACE = BASE_DIR.parent
	DATA_DIR = BASE_DIR.joinpath("data")

	# ── Canonical labels (accent-free, in model order) ─────────────────────────
	ALL_LABELS = [
	    "Emo",
	    "Comportementale", "Designee", "Montree", "Suggeree",
	    "Base", "Complexe",
	    "Admiration", "Autre", "Colere", "Culpabilite", "Degout", "Embarras",
	    "Fierte", "Jalousie", "Joie", "Peur", "Surprise", "Tristesse",
	]

	# Suffix appended to a canonical label to name its prediction column.
	PRED_SUFFIX = "_pred_emotyc"

	# Mapping: accented names (as found in the TextToKids prediction files)
	# → canonical accent-free names.
	ACCENT_TO_CANONICAL = {
	    "Colère": "Colere",
	    "Dégoût": "Degout",
	    "Culpabilité": "Culpabilite",
	    "Fierté": "Fierte",
	    "Désignée": "Designee",
	    "Montrée": "Montree",
	    "Suggérée": "Suggeree",
	    "Émo": "Emo",
	}

	# ── Sources ────────────────────────────────────────────────────────────────
	RESULTS_DATA = WORKSPACE.joinpath("Visualisation1", "results_data")

	# CyberAggAdo workbooks: each entry gives the output name and its source file.
	CYBER_CONFIGS = [
	    dict(
	        name="CyberAggAdo200",
	        src=RESULTS_DATA.joinpath(
	            "CyberAggAdo200", "all_xlsx_samples_pred_emotyc.xlsx"
	        ),
	    ),
	    dict(
	        name="CyberAggAdoGlobal_Context",
	        src=RESULTS_DATA.joinpath(
	            "CyberAggAdoGlobal",
	            "Context",
	            "CyberAdoAgg_gold_global_total_context_pred_emotyc.xlsx",
	        ),
	    ),
	    dict(
	        name="CyberAggAdoGlobal_SansContexte",
	        src=RESULTS_DATA.joinpath(
	            "CyberAggAdoGlobal",
	            "WithoutContext",
	            "CyberAdoAgg_gold_global_total_pred_emotyc.xlsx",
	        ),
	    ),
	]

	# Gold annotations shared by every TextToKids configuration.
	TTK_GOLD = WORKSPACE.joinpath("Eval", "golds", "emotexttokids_gold_flat.xlsx")

	# TextToKids prediction workbooks: output name and prediction source file.
	TTK_CONFIGS = [
	    dict(
	        name="TextToKids_Context",
	        src=RESULTS_DATA.joinpath(
	            "TextToKids", "Context", "Context_emotyc_predictions_output.xlsx"
	        ),
	    ),
	    dict(
	        name="TextToKids_SansContexte",
	        src=RESULTS_DATA.joinpath(
	            "TextToKids",
	            "WithoutContext",
	            "NoContext_emotyc_predictions_output.xlsx",
	        ),
	    ),
	]


	def normalize_column(col: str) -> str:
	    """Convert an accented column name to its canonical (accent-free) name.

	    The name is first looked up as-is in ``ACCENT_TO_CANONICAL``. If that
	    misses and the name is a string, it is compared again after Unicode NFC
	    normalisation, so a header spelled with decomposed accents (base letter
	    followed by a combining mark) still resolves to the same canonical label
	    as its precomposed spelling.

	    Args:
	        col: Column name to normalise.

	    Returns:
	        The canonical name when the column is a known accented label,
	        otherwise ``col`` unchanged (including non-string column names).
	    """
	    # Function-scope import keeps this block self-contained.
	    import unicodedata

	    # Fast path: exact spelling, identical to the historical behaviour.
	    if col in ACCENT_TO_CANONICAL:
	        return ACCENT_TO_CANONICAL[col]

	    # Non-string headers (e.g. integer column names) cannot be normalised.
	    if not isinstance(col, str):
	        return col

	    # "é" can be one code point or "e" + U+0301; both render the same but
	    # compare unequal, so compare both sides in NFC form.
	    composed = unicodedata.normalize("NFC", col)
	    for accented, canonical in ACCENT_TO_CANONICAL.items():
	        if unicodedata.normalize("NFC", accented) == composed:
	            return canonical
	    return col


	def prepare_texttokids(gold_path: Path, pred_path: Path, output_path: Path) -> None:
	    """Merge the TextToKids gold sheet and a prediction sheet into one XLSX.

	    Both workbooks are read with pandas (openpyxl engine) and must contain
	    the same number of rows; predictions are attached to the gold rows by
	    position. Every prediction column except ``TEXT`` is renamed to
	    ``<canonical name><PRED_SUFFIX>`` before being copied next to the gold
	    columns, and the result is written to ``output_path`` without the index.

	    Args:
	        gold_path: Workbook holding the gold annotations.
	        pred_path: Workbook holding the predictions.
	        output_path: Destination of the merged workbook.

	    Raises:
	        ValueError: If the two workbooks do not have the same row count.
	    """
	    print(f" Lecture du gold : {gold_path.name}")
	    gold = pd.read_excel(gold_path, engine="openpyxl")

	    print(f" Lecture des prédictions : {pred_path.name}")
	    preds = pd.read_excel(pred_path, engine="openpyxl")

	    n_gold, n_pred = len(gold), len(preds)
	    if n_gold != n_pred:
	        raise ValueError(
	            f"Nombre de lignes différent : gold={n_gold}, pred={n_pred}"
	        )

	    # Every prediction column but TEXT: accented → canonical name + suffix.
	    renamed = preds.rename(
	        columns={
	            name: f"{normalize_column(name)}{PRED_SUFFIX}"
	            for name in preds.columns
	            if name != "TEXT"
	        }
	    )

	    # Gold columns first, then each suffixed prediction column. Using
	    # ``.values`` ignores the prediction index, so rows pair up by position.
	    merged = gold.copy()
	    for name in renamed.columns:
	        if name.endswith(PRED_SUFFIX):
	            merged[name] = renamed[name].values

	    print(f" → {len(merged)} lignes, {len(merged.columns)} colonnes")
	    merged.to_excel(output_path, index=False, engine="openpyxl")
	    print(f" Sauvegardé : {output_path.name}")


	def main() -> None:
	    """Build every XLSX file under ``DATA_DIR`` and list the result.

	    Creates ``DATA_DIR`` if needed, copies each CyberAggAdo workbook as-is,
	    merges the TextToKids gold workbook with each prediction workbook, then
	    prints the name and size of every ``*.xlsx`` file found in ``DATA_DIR``.
	    """
	    DATA_DIR.mkdir(exist_ok=True)

	    # CyberAggAdo: plain copy (metadata preserved by copy2).
	    for cyber in CYBER_CONFIGS:
	        target = DATA_DIR / f"{cyber['name']}.xlsx"
	        source = cyber["src"]
	        print(f"Copie : {source.name} → {target.name}")
	        shutil.copy2(source, target)

	    # TextToKids: gold + predictions merged into one workbook per config.
	    print(f"\nChargement du gold TextToKids ({TTK_GOLD.name})...")
	    for ttk in TTK_CONFIGS:
	        label = ttk["name"]
	        target = DATA_DIR / f"{label}.xlsx"
	        print(f"\nPréparation : {label}")
	        prepare_texttokids(TTK_GOLD, ttk["src"], target)

	    print("\n✅ Préparation terminée. Fichiers dans data/ :")
	    bytes_per_mb = 1024 * 1024
	    for workbook in sorted(DATA_DIR.glob("*.xlsx")):
	        megabytes = workbook.stat().st_size / bytes_per_mb
	        print(f" {workbook.name} ({megabytes:.1f} MB)")


	# Run the preparation only when this file is executed as a script.
	if __name__ == "__main__":
	    main()