#!/usr/bin/env python3 import argparse import json import os import subprocess import sys from pathlib import Path MODEL_ID = "bench_fr_native1.py" BENCHMARK_NAME = "MTEB(fra, v1)" OUTPUT_ROOT = "results_mteb_french_native" SUMMARY_JSONL = "results_mteb_french_native_summary.jsonl" TASK_NAMES = [ "AlloProfClusteringP2P", "AlloProfClusteringS2S", "HALClusteringS2S", "AlloprofReranking", "SyntecReranking", "AlloprofRetrieval", "BSARDRetrieval", "SyntecRetrieval", "SICKFr", "SummEvalFr", ] def find_task_result_json(task_name): root = Path(OUTPUT_ROOT) files = list(root.rglob(f"{task_name}.json")) if not files: return None return files[0] def read_main_score(task_name): path = find_task_result_json(task_name) if path is None: return None with open(path, "r", encoding="utf-8") as f: data = json.load(f) scores = data.get("scores", {}) test = scores.get("test", []) if not test: return { "task_name": task_name, "main_score": None, "json_path": str(path), } first = test[0] return { "task_name": task_name, "main_score": first.get("main_score"), "cosine_spearman": first.get("cosine_spearman"), "cosine_pearson": first.get("cosine_pearson"), "spearman": first.get("spearman"), "pearson": first.get("pearson"), "ndcg_at_10": first.get("ndcg_at_10"), "map_at_10": first.get("map_at_10"), "recall_at_10": first.get("recall_at_10"), "json_path": str(path), } def run_worker(task_name, batch_size): import torch import mteb from mteb import MTEB from sentence_transformers import SentenceTransformer if not torch.cuda.is_available(): raise RuntimeError("CUDA non disponible. Vérifie avec: nvidia-smi") print("[WORKER] GPU:", torch.cuda.get_device_name(0)) print("[WORKER] Task:", task_name) benchmark = mteb.get_benchmark(BENCHMARK_NAME) tasks = [ task for task in benchmark.tasks if task.metadata.name == task_name ] if len(tasks) != 1: names = [task.metadata.name for task in benchmark.tasks] raise RuntimeError(f"Tâche introuvable: {task_name}. Disponibles: {names}") model = SentenceTransformer( MODEL_ID, device="cuda", trust_remote_code=True, ) evaluation = MTEB(tasks=tasks) results = evaluation.run( model, output_folder=OUTPUT_ROOT, eval_splits=["test"], batch_size=batch_size, ) print("[WORKER DONE]", task_name) print("[RAW RESULT]", results) score = read_main_score(task_name) print("") print("=" * 80) print("[TASK SCORE]") print("=" * 80) if score is None: print("task_name:", task_name) print("main_score: None") else: for k, v in score.items(): print(f"{k}: {v}") def run_parent(batch_size): if Path(SUMMARY_JSONL).exists(): print("[INFO] Existing summary found:", SUMMARY_JSONL) print("[INFO] New rows will be appended.") print("[INFO] French-native tasks only:", len(TASK_NAMES)) print("[INFO] Output:", OUTPUT_ROOT) for i, task_name in enumerate(TASK_NAMES, start=1): print("") print("#" * 100) print(f"[TASK {i}/{len(TASK_NAMES)}] {task_name}") print("#" * 100) env = os.environ.copy() env["CUDA_VISIBLE_DEVICES"] = env.get("CUDA_VISIBLE_DEVICES", "0") env["PYTHONUNBUFFERED"] = "1" cmd = [ sys.executable, __file__, "--worker", "--task", task_name, "--batch-size", str(batch_size), ] proc = subprocess.run(cmd, env=env, text=True) status = "ok" if proc.returncode == 0 else "failed" score = read_main_score(task_name) if status == "ok" else None row = { "task_name": task_name, "status": status, "returncode": proc.returncode, "main_score": None if score is None else score.get("main_score"), "cosine_spearman": None if score is None else score.get("cosine_spearman"), "cosine_pearson": None if score is None else score.get("cosine_pearson"), "spearman": None if score is None else score.get("spearman"), "pearson": None if score is None else score.get("pearson"), "ndcg_at_10": None if score is None else score.get("ndcg_at_10"), "map_at_10": None if score is None else score.get("map_at_10"), "recall_at_10": None if score is None else score.get("recall_at_10"), "json_path": None if score is None else score.get("json_path"), } with open(SUMMARY_JSONL, "a", encoding="utf-8") as f: f.write(json.dumps(row, ensure_ascii=False) + "\n") print("") print("[SUMMARY ROW]") print(json.dumps(row, ensure_ascii=False, indent=2)) print_final_summary() def print_final_summary(): path = Path(SUMMARY_JSONL) if not path.exists(): print("[ERROR] No summary found.") return rows = [] with open(path, "r", encoding="utf-8") as f: for line in f: line = line.strip() if line: rows.append(json.loads(line)) latest = {} for row in rows: latest[row["task_name"]] = row final_rows = list(latest.values()) ok_rows = [ r for r in final_rows if r.get("status") == "ok" and isinstance(r.get("main_score"), (int, float)) ] failed_rows = [ r for r in final_rows if r.get("status") != "ok" ] mean_score = None if ok_rows: mean_score = sum(r["main_score"] for r in ok_rows) / len(ok_rows) print("") print("=" * 100) print("[FINAL SUMMARY]") print("=" * 100) print("tasks_total:", len(final_rows)) print("tasks_ok:", len(ok_rows)) print("tasks_failed:", len(failed_rows)) print("mean_main_score:", mean_score) print("") print("[OK]") for r in ok_rows: print(f'{r["task_name"]}: {r["main_score"]}') print("") print("[FAILED]") for r in failed_rows: print(f'{r["task_name"]}: returncode={r["returncode"]}') def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--worker", action="store_true") parser.add_argument("--task", default=None) parser.add_argument("--batch-size", type=int, default=1) parser.add_argument("--summary-only", action="store_true") return parser.parse_args() def main(): args = parse_args() if args.summary_only: print_final_summary() return if args.worker: if args.task is None: raise RuntimeError("--task requis avec --worker") run_worker(args.task, args.batch_size) else: run_parent(args.batch_size) if __name__ == "__main__": main()