# Hugging Face file-viewer residue, kept as a comment (not part of the script):
# uploaded by PhysiQuanty - "Upload bench.py" - commit 50a4e60 (verified) - 7.06 kB
#!/usr/bin/env python3
import argparse
import json
import os
import subprocess
import sys
from pathlib import Path
# Model identifier handed to SentenceTransformer by the worker.
# NOTE(review): this value looks like a script filename rather than a model
# id or model directory -- confirm it is what the worker should load.
MODEL_ID = "bench_fr_native1.py"

# Benchmark name passed to mteb.get_benchmark().
BENCHMARK_NAME = "MTEB(fra, v1)"

# Folder given to MTEB as output_folder and searched for per-task result JSON.
OUTPUT_ROOT = "results_mteb_french_native"

# JSONL file that receives one appended row per task run.
SUMMARY_JSONL = "results_mteb_french_native_summary.jsonl"

# Tasks executed by the parent process, one worker subprocess each, in order.
TASK_NAMES = [
    "AlloProfClusteringP2P",
    "AlloProfClusteringS2S",
    "HALClusteringS2S",
    "AlloprofReranking",
    "SyntecReranking",
    "AlloprofRetrieval",
    "BSARDRetrieval",
    "SyntecRetrieval",
    "SICKFr",
    "SummEvalFr",
]
def find_task_result_json(task_name, root=None):
    """Locate the result file ``<task_name>.json`` below the output folder.

    The folder is searched recursively. File names are compared literally, so
    glob metacharacters in ``task_name`` cannot widen the match. When several
    files match (for example one per model or revision sub-folder), the most
    recently modified one is returned instead of an arbitrary one; ties are
    broken by path so the choice is deterministic.

    Args:
        task_name: Task name; the file looked for is ``f"{task_name}.json"``.
        root: Directory to search. Defaults to ``OUTPUT_ROOT`` when ``None``.

    Returns:
        The ``Path`` of the selected result file, or ``None`` if nothing
        matches.
    """
    search_root = Path(OUTPUT_ROOT if root is None else root)
    wanted = f"{task_name}.json"

    # Filter on the literal file name rather than putting task_name into the
    # glob pattern, where characters such as "[" or "*" would be interpreted.
    matches = [
        candidate
        for candidate in search_root.rglob("*.json")
        if candidate.name == wanted and candidate.is_file()
    ]
    if not matches:
        return None

    # Directory traversal order is not meaningful, so pick the newest file;
    # the path string is a stable tie-breaker for identical timestamps.
    return max(matches, key=lambda p: (p.stat().st_mtime, str(p)))
def read_main_score(task_name):
    """Load the stored result of a task and extract its headline metrics.

    The result file is looked up with ``find_task_result_json``. Only the
    first entry of ``scores["test"]`` is read.

    Args:
        task_name: Name of the task whose result file should be read.

    Returns:
        ``None`` when no result file is found. Otherwise a dict holding
        ``task_name``, the metric values taken from the first test entry
        (``None`` for metrics that are absent) and ``json_path``. When the
        file has no test entries, the dict only carries ``task_name``,
        ``main_score`` (``None``) and ``json_path``.
    """
    result_path = find_task_result_json(task_name)
    if result_path is None:
        return None

    with open(result_path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)

    test_entries = payload.get("scores", {}).get("test", [])

    summary = {"task_name": task_name}
    if test_entries:
        head = test_entries[0]
        metric_names = (
            "main_score",
            "cosine_spearman",
            "cosine_pearson",
            "spearman",
            "pearson",
            "ndcg_at_10",
            "map_at_10",
            "recall_at_10",
        )
        for metric in metric_names:
            summary[metric] = head.get(metric)
    else:
        summary["main_score"] = None
    summary["json_path"] = str(result_path)
    return summary
def run_worker(task_name, batch_size):
    """Evaluate a single benchmark task on the GPU and print its scores.

    Loads the benchmark named by ``BENCHMARK_NAME``, selects the task whose
    metadata name equals ``task_name``, evaluates ``MODEL_ID`` on the "test"
    split with results written under ``OUTPUT_ROOT``, then prints the raw
    result and the metrics read back through ``read_main_score``.

    Args:
        task_name: Exact task name as listed in the benchmark.
        batch_size: Forwarded to ``MTEB.run`` as ``batch_size``.

    Raises:
        RuntimeError: If CUDA is not available, or if the benchmark does not
            contain exactly one task named ``task_name``.
    """
    # Imported inside the function rather than at module level; presumably so
    # that the parent process and --summary-only never load these packages.
    import torch
    import mteb
    from mteb import MTEB
    from sentence_transformers import SentenceTransformer

    if not torch.cuda.is_available():
        raise RuntimeError("CUDA non disponible. Vérifie avec: nvidia-smi")

    print("[WORKER] GPU:", torch.cuda.get_device_name(0))
    print("[WORKER] Task:", task_name)

    benchmark = mteb.get_benchmark(BENCHMARK_NAME)
    selected = list(
        filter(lambda t: t.metadata.name == task_name, benchmark.tasks)
    )
    if len(selected) != 1:
        names = [t.metadata.name for t in benchmark.tasks]
        raise RuntimeError(f"Tâche introuvable: {task_name}. Disponibles: {names}")

    encoder = SentenceTransformer(MODEL_ID, device="cuda", trust_remote_code=True)
    raw_results = MTEB(tasks=selected).run(
        encoder,
        output_folder=OUTPUT_ROOT,
        eval_splits=["test"],
        batch_size=batch_size,
    )

    print("[WORKER DONE]", task_name)
    print("[RAW RESULT]", raw_results)

    score = read_main_score(task_name)
    banner = "=" * 80
    print("")
    print(banner)
    print("[TASK SCORE]")
    print(banner)

    # Without a result file, still report the task with an empty main score.
    if score is None:
        score = {"task_name": task_name, "main_score": None}
    for key, value in score.items():
        print(f"{key}: {value}")
def run_parent(batch_size):
    """Run every task in ``TASK_NAMES`` in its own worker subprocess.

    Each task re-invokes this file with ``--worker --task <name>``. After each
    worker exits, one row (status, return code and the metrics read back via
    ``read_main_score``) is appended to ``SUMMARY_JSONL`` and echoed. A failed
    worker does not stop the remaining tasks. The final summary is printed
    once all tasks have been attempted.

    Args:
        batch_size: Passed to every worker through ``--batch-size``.
    """
    if Path(SUMMARY_JSONL).exists():
        print("[INFO] Existing summary found:", SUMMARY_JSONL)
        print("[INFO] New rows will be appended.")
    print("[INFO] French-native tasks only:", len(TASK_NAMES))
    print("[INFO] Output:", OUTPUT_ROOT)

    total = len(TASK_NAMES)
    rule = "#" * 100
    # Columns copied from the score dict into each summary row, in row order.
    score_fields = (
        "main_score",
        "cosine_spearman",
        "cosine_pearson",
        "spearman",
        "pearson",
        "ndcg_at_10",
        "map_at_10",
        "recall_at_10",
        "json_path",
    )

    for position, task_name in enumerate(TASK_NAMES, start=1):
        print("")
        print(rule)
        print(f"[TASK {position}/{total}] {task_name}")
        print(rule)

        child_env = dict(os.environ)
        # Keep the caller's GPU selection; fall back to device 0 otherwise.
        child_env.setdefault("CUDA_VISIBLE_DEVICES", "0")
        child_env["PYTHONUNBUFFERED"] = "1"

        completed = subprocess.run(
            [
                sys.executable,
                __file__,
                "--worker",
                "--task",
                task_name,
                "--batch-size",
                str(batch_size),
            ],
            env=child_env,
            text=True,
        )

        succeeded = completed.returncode == 0
        score = read_main_score(task_name) if succeeded else None
        metrics = {} if score is None else score

        row = {
            "task_name": task_name,
            "status": "ok" if succeeded else "failed",
            "returncode": completed.returncode,
        }
        row.update((name, metrics.get(name)) for name in score_fields)

        with open(SUMMARY_JSONL, "a", encoding="utf-8") as sink:
            sink.write(json.dumps(row, ensure_ascii=False) + "\n")

        print("")
        print("[SUMMARY ROW]")
        print(json.dumps(row, ensure_ascii=False, indent=2))

    print_final_summary()
def print_final_summary(summary_path=None):
    """Print an aggregate report built from the JSONL summary file.

    Only the last row recorded for each task name is kept. Tasks are split
    into three groups: status "ok" with a numeric ``main_score`` (these feed
    the mean), status other than "ok" (failed), and status "ok" without a
    numeric score. The last group is reported in its own section, printed only
    when it is non-empty, so such tasks no longer vanish from the report.

    Lines that are not valid JSON, or that are not objects carrying a
    ``task_name``, are skipped with a warning instead of aborting the summary.

    Args:
        summary_path: JSONL file to read. Defaults to ``SUMMARY_JSONL`` when
            ``None``.
    """
    path = Path(SUMMARY_JSONL if summary_path is None else summary_path)
    if not path.exists():
        print("[ERROR] No summary found.")
        return

    # Later rows overwrite earlier ones, so a re-run replaces the old result.
    latest = {}
    with open(path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError:
                # Typically a row truncated by an interrupted run.
                print(f"[WARN] Skipping malformed line {line_no} in {path}")
                continue
            if not isinstance(row, dict) or "task_name" not in row:
                print(f"[WARN] Skipping line {line_no} without task_name in {path}")
                continue
            latest[row["task_name"]] = row

    final_rows = list(latest.values())
    ok_rows = []
    failed_rows = []
    unscored_rows = []
    for r in final_rows:
        if r.get("status") != "ok":
            failed_rows.append(r)
        elif isinstance(r.get("main_score"), (int, float)):
            ok_rows.append(r)
        else:
            unscored_rows.append(r)

    mean_score = None
    if ok_rows:
        mean_score = sum(r["main_score"] for r in ok_rows) / len(ok_rows)

    print("")
    print("=" * 100)
    print("[FINAL SUMMARY]")
    print("=" * 100)
    print("tasks_total:", len(final_rows))
    print("tasks_ok:", len(ok_rows))
    print("tasks_failed:", len(failed_rows))
    if unscored_rows:
        print("tasks_ok_without_score:", len(unscored_rows))
    print("mean_main_score:", mean_score)
    print("")
    print("[OK]")
    for r in ok_rows:
        print(f'{r["task_name"]}: {r["main_score"]}')
    print("")
    print("[FAILED]")
    for r in failed_rows:
        # .get: rows written by other tools may lack a return code.
        print(f'{r["task_name"]}: returncode={r.get("returncode")}')
    if unscored_rows:
        print("")
        print("[OK WITHOUT SCORE]")
        for r in unscored_rows:
            print(r["task_name"])
def parse_args():
    """Parse the command-line options of the script.

    Returns:
        An ``argparse.Namespace`` with ``worker`` (bool), ``task`` (str or
        ``None``), ``batch_size`` (int, default 1) and ``summary_only`` (bool).
    """
    # (flag, add_argument keyword options), registered in this order.
    option_specs = (
        ("--worker", {"action": "store_true"}),
        ("--task", {"default": None}),
        ("--batch-size", {"type": int, "default": 1}),
        ("--summary-only", {"action": "store_true"}),
    )

    cli = argparse.ArgumentParser()
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    return cli.parse_args()
def main():
    """Dispatch to summary, worker or parent mode based on the CLI flags.

    ``--summary-only`` takes precedence and only prints the final summary.
    ``--worker`` runs a single task and requires ``--task``. With neither
    flag, the parent loop over all tasks is executed.

    Raises:
        RuntimeError: If ``--worker`` is given without ``--task``.
    """
    args = parse_args()

    if args.summary_only:
        print_final_summary()
        return

    if not args.worker:
        run_parent(args.batch_size)
        return

    if args.task is None:
        raise RuntimeError("--task requis avec --worker")
    run_worker(args.task, args.batch_size)


# Run the CLI only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()