| |
| import argparse |
| import json |
| import os |
| import subprocess |
| import sys |
| from pathlib import Path |
|
|
|
|
# Run configuration shared by the parent driver and the per-task workers.

# Identifier handed to SentenceTransformer() by the worker.
# NOTE(review): this value looks like a script filename rather than a model
# name or checkpoint path — confirm it is the intended model identifier.
MODEL_ID = "bench_fr_native1.py"

# Benchmark name resolved through mteb.get_benchmark() by the worker.
BENCHMARK_NAME = "MTEB(fra, v1)"

# Folder passed to MTEB as output_folder; per-task result JSON files are
# searched for recursively below it.
OUTPUT_ROOT = "results_mteb_french_native"

# Append-only JSON Lines file receiving one row per task run; it lives next to
# the output folder and shares its name as a prefix.
SUMMARY_JSONL = OUTPUT_ROOT + "_summary.jsonl"
|
|
|
|
# Tasks evaluated by the parent driver, one worker subprocess per task, in the
# order they appear in TASK_NAMES. The sub-lists only group the names by the
# task family visible in each name.
_CLUSTERING_TASKS = [
    "AlloProfClusteringP2P",
    "AlloProfClusteringS2S",
    "HALClusteringS2S",
]
_RERANKING_TASKS = [
    "AlloprofReranking",
    "SyntecReranking",
]
_RETRIEVAL_TASKS = [
    "AlloprofRetrieval",
    "BSARDRetrieval",
    "SyntecRetrieval",
]
# Presumably the sentence-similarity and summarization tasks — TODO confirm
# against the benchmark's task metadata.
_REMAINING_TASKS = [
    "SICKFr",
    "SummEvalFr",
]

TASK_NAMES = [
    *_CLUSTERING_TASKS,
    *_RERANKING_TASKS,
    *_RETRIEVAL_TASKS,
    *_REMAINING_TASKS,
]
|
|
|
|
def find_task_result_json(task_name, root=None):
    """Locate the result JSON file written for ``task_name``.

    The directory tree below ``root`` is searched recursively for a file named
    ``<task_name>.json``. When several files match (for example results left
    over from earlier runs in other sub-folders), the most recently modified
    one is returned so the choice does not depend on directory listing order.

    Args:
        task_name: Task name; the file looked for is ``<task_name>.json``.
        root: Directory to search. Defaults to ``OUTPUT_ROOT``.

    Returns:
        A ``pathlib.Path`` to the newest matching file, or ``None`` when the
        directory does not exist or contains no matching file.
    """
    search_root = Path(OUTPUT_ROOT if root is None else root)
    if not search_root.is_dir():
        return None

    wanted = f"{task_name}.json"
    candidates = [p for p in search_root.rglob(wanted) if p.is_file()]
    if not candidates:
        return None

    # Newest file wins; the path string breaks ties between identical
    # timestamps so the result stays deterministic.
    return max(candidates, key=lambda p: (p.stat().st_mtime, str(p)))
|
|
|
|
def read_main_score(task_name):
    """Read the headline metrics of ``task_name`` from its result JSON.

    The result file is located with ``find_task_result_json``. Metrics are
    taken from the first entry of the ``scores -> test`` list of that file.

    Args:
        task_name: Task name whose result file should be read.

    Returns:
        ``None`` when no result file exists. Otherwise a dict with the keys
        ``task_name``, ``main_score``, ``cosine_spearman``, ``cosine_pearson``,
        ``spearman``, ``pearson``, ``ndcg_at_10``, ``map_at_10``,
        ``recall_at_10`` and ``json_path``. Every metric is ``None`` when the
        file cannot be read or parsed, has no test-split entry, or does not
        have the expected structure; the key set is always the same.
    """
    metric_keys = (
        "main_score",
        "cosine_spearman",
        "cosine_pearson",
        "spearman",
        "pearson",
        "ndcg_at_10",
        "map_at_10",
        "recall_at_10",
    )

    path = find_task_result_json(task_name)
    if path is None:
        return None

    # Start from an all-None record so every return path exposes the same keys.
    record = {"task_name": task_name}
    record.update(dict.fromkeys(metric_keys))
    record["json_path"] = str(path)

    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError. A
        # broken result file must not abort the caller; report and move on.
        print(f"[WARN] Could not read {path}: {exc}", file=sys.stderr)
        return record

    scores = data.get("scores") if isinstance(data, dict) else None
    test_entries = scores.get("test") if isinstance(scores, dict) else None
    if not isinstance(test_entries, list) or not test_entries:
        return record

    first = test_entries[0]
    if not isinstance(first, dict):
        return record

    for key in metric_keys:
        record[key] = first.get(key)
    return record
|
|
|
|
def run_worker(task_name, batch_size):
    """Evaluate one benchmark task on the GPU and print its scores.

    The third-party packages are imported inside the function, so the other
    modes of this script run without loading them.

    Args:
        task_name: Name of the task to run; it must match exactly one task of
            the benchmark named by ``BENCHMARK_NAME``.
        batch_size: Batch size forwarded to the MTEB evaluation run.

    Raises:
        RuntimeError: If CUDA is not available, or if ``task_name`` does not
            match exactly one task of the benchmark.
    """
    import torch
    import mteb
    from mteb import MTEB
    from sentence_transformers import SentenceTransformer

    if not torch.cuda.is_available():
        raise RuntimeError("CUDA non disponible. Vérifie avec: nvidia-smi")

    print("[WORKER] GPU:", torch.cuda.get_device_name(0))
    print("[WORKER] Task:", task_name)

    available = mteb.get_benchmark(BENCHMARK_NAME).tasks

    selected = []
    for candidate in available:
        if candidate.metadata.name == task_name:
            selected.append(candidate)

    # Anything other than exactly one match is rejected, listing what exists.
    if len(selected) != 1:
        names = [candidate.metadata.name for candidate in available]
        raise RuntimeError(f"Tâche introuvable: {task_name}. Disponibles: {names}")

    encoder = SentenceTransformer(MODEL_ID, device="cuda", trust_remote_code=True)

    raw_results = MTEB(tasks=selected).run(
        encoder,
        output_folder=OUTPUT_ROOT,
        eval_splits=["test"],
        batch_size=batch_size,
    )

    print("[WORKER DONE]", task_name)
    print("[RAW RESULT]", raw_results)

    # Re-read the scores from the result file rather than from raw_results.
    summary = read_main_score(task_name)
    if summary is None:
        summary = {"task_name": task_name, "main_score": None}

    rule = "=" * 80
    print("", rule, "[TASK SCORE]", rule, sep="\n")
    for key, value in summary.items():
        print(f"{key}: {value}")
|
|
|
|
def run_parent(batch_size):
    """Run every task of ``TASK_NAMES`` in its own worker subprocess.

    For each task this script is re-invoked with ``--worker``; afterwards one
    JSON row (status, return code and metrics) is appended to
    ``SUMMARY_JSONL`` and echoed. The final summary is printed once all tasks
    have been attempted.

    Args:
        batch_size: Batch size forwarded to each worker via ``--batch-size``.
    """
    # Fields copied from the score record into the summary row, in row order.
    score_fields = (
        "main_score",
        "cosine_spearman",
        "cosine_pearson",
        "spearman",
        "pearson",
        "ndcg_at_10",
        "map_at_10",
        "recall_at_10",
        "json_path",
    )
    total = len(TASK_NAMES)
    banner = "#" * 100

    if Path(SUMMARY_JSONL).exists():
        print("[INFO] Existing summary found:", SUMMARY_JSONL)
        print("[INFO] New rows will be appended.")

    print("[INFO] French-native tasks only:", total)
    print("[INFO] Output:", OUTPUT_ROOT)

    for index, task_name in enumerate(TASK_NAMES, start=1):
        print("")
        print(banner)
        print(f"[TASK {index}/{total}] {task_name}")
        print(banner)

        # Keep the caller's GPU selection if set, otherwise use device 0.
        child_env = dict(os.environ)
        child_env.setdefault("CUDA_VISIBLE_DEVICES", "0")
        child_env["PYTHONUNBUFFERED"] = "1"

        completed = subprocess.run(
            [
                sys.executable,
                __file__,
                "--worker",
                "--task",
                task_name,
                "--batch-size",
                str(batch_size),
            ],
            env=child_env,
            text=True,
        )

        succeeded = completed.returncode == 0
        # Scores are only looked up for workers that exited cleanly.
        score = read_main_score(task_name) if succeeded else None
        metrics = {} if score is None else score

        row = {
            "task_name": task_name,
            "status": "ok" if succeeded else "failed",
            "returncode": completed.returncode,
        }
        row.update((field, metrics.get(field)) for field in score_fields)

        serialized = json.dumps(row, ensure_ascii=False)
        with open(SUMMARY_JSONL, "a", encoding="utf-8") as f:
            f.write(serialized + "\n")

        print("")
        print("[SUMMARY ROW]")
        print(json.dumps(row, ensure_ascii=False, indent=2))

    print_final_summary()
|
|
|
|
def print_final_summary(summary_path=None):
    """Print an aggregate report built from the summary JSON Lines file.

    Only the last row recorded for each task name is considered. Tasks are
    split into three groups: "ok" with a usable numeric ``main_score``, "ok"
    without a usable score, and not "ok". The mean is computed over the first
    group only.

    Malformed lines and rows without a string ``task_name`` are skipped with a
    warning on stderr instead of aborting the report.

    Args:
        summary_path: File to read. Defaults to ``SUMMARY_JSONL``.
    """
    import math  # local import: the file-level imports do not include math

    path = Path(SUMMARY_JSONL if summary_path is None else summary_path)
    if not path.exists():
        print("[ERROR] No summary found.")
        return

    def has_score(row):
        """Return True when the row carries a finite, non-boolean number."""
        value = row.get("main_score")
        if isinstance(value, bool):
            return False
        if isinstance(value, int):
            return True
        # json.loads accepts NaN/Infinity; either would poison the mean.
        return isinstance(value, float) and math.isfinite(value)

    latest = {}
    with open(path, "r", encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError as exc:
                print(
                    f"[WARN] {path}:{lineno}: skipping malformed row ({exc})",
                    file=sys.stderr,
                )
                continue
            if not isinstance(row, dict) or not isinstance(row.get("task_name"), str):
                print(
                    f"[WARN] {path}:{lineno}: skipping row without task_name",
                    file=sys.stderr,
                )
                continue
            # Later rows overwrite earlier ones for the same task.
            latest[row["task_name"]] = row

    final_rows = list(latest.values())
    ok_rows = [r for r in final_rows if r.get("status") == "ok" and has_score(r)]
    # "ok" rows without a usable score used to vanish from the report entirely.
    no_score_rows = [
        r for r in final_rows if r.get("status") == "ok" and not has_score(r)
    ]
    failed_rows = [r for r in final_rows if r.get("status") != "ok"]

    mean_score = None
    if ok_rows:
        mean_score = sum(r["main_score"] for r in ok_rows) / len(ok_rows)

    print("")
    print("=" * 100)
    print("[FINAL SUMMARY]")
    print("=" * 100)
    print("tasks_total:", len(final_rows))
    print("tasks_ok:", len(ok_rows))
    print("tasks_failed:", len(failed_rows))
    print("tasks_no_score:", len(no_score_rows))
    print("mean_main_score:", mean_score)

    print("")
    print("[OK]")
    for r in ok_rows:
        print(f'{r["task_name"]}: {r["main_score"]}')

    if no_score_rows:
        print("")
        print("[NO SCORE]")
        for r in no_score_rows:
            print(f'{r["task_name"]}: main_score={r.get("main_score")!r}')

    print("")
    print("[FAILED]")
    for r in failed_rows:
        print(f'{r["task_name"]}: returncode={r.get("returncode")}')
|
|
|
|
def parse_args(argv=None):
    """Parse the command-line options of this script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default),
            argparse reads ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with ``worker`` (bool), ``task`` (str or
        ``None``), ``batch_size`` (int >= 1) and ``summary_only`` (bool).

    Raises:
        SystemExit: Raised by argparse on invalid arguments, including a
            ``--batch-size`` that is not a positive integer.
    """

    def positive_int(text):
        """Convert ``text`` to an int and reject values below 1."""
        try:
            value = int(text)
        except ValueError:
            raise argparse.ArgumentTypeError(
                f"invalid int value: {text!r}"
            ) from None
        if value < 1:
            # Fail at the command line instead of inside every worker.
            raise argparse.ArgumentTypeError(
                f"must be a positive integer, got {value}"
            )
        return value

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--worker",
        action="store_true",
        help="run a single task in this process (requires --task)",
    )
    parser.add_argument(
        "--task",
        default=None,
        help="name of the task to run in worker mode",
    )
    parser.add_argument(
        "--batch-size",
        type=positive_int,
        default=1,
        metavar="N",
        help="batch size passed to the evaluation (default: 1)",
    )
    parser.add_argument(
        "--summary-only",
        action="store_true",
        help="print the final summary from the existing summary file and exit",
    )
    return parser.parse_args(argv)
|
|
|
|
def main():
    """Dispatch to the mode selected on the command line.

    ``--summary-only`` takes precedence and only prints the final summary.
    Without ``--worker`` the parent loop over all tasks is run. With
    ``--worker`` a single task is evaluated.

    Raises:
        RuntimeError: If ``--worker`` is given without ``--task``.
    """
    args = parse_args()

    if args.summary_only:
        print_final_summary()
    elif not args.worker:
        run_parent(args.batch_size)
    elif args.task is None:
        raise RuntimeError("--task requis avec --worker")
    else:
        run_worker(args.task, args.batch_size)
|
|
|
|
# Run the command-line entry point only when this file is executed as a
# script; the parent mode re-invokes this same file (via __file__) with
# --worker once per task.
if __name__ == "__main__":
    main()
|
|