#!/usr/bin/env python3
# Usage: python Gen_llm_eval_output.py --p1 csv_files/llm_scores_p1.xlsx --p2 csv_files/llm_scores_p2.xlsx --p3 csv_files/llm_scores_p3.xlsx --output-dir csv_files/outputs
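"""Aggregate prompt-level F1 scores from three Excel sheets and write one
plain-text summary table per (model, language, configuration) combination.

Each input sheet must contain the columns listed in REQUIRED_COLS below.
Output files are named <model>__<language>__<configuration>.txt.
"""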
import argparse
import os
import re
import math
import pandas as pd
import numpy as np
REQUIRED_COLS = ["model", "task", "language", "configuration", "prompts", "f1"]
def read_scores(path: str) -> pd.DataFrame:
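    """Load one Excel score sheet, normalize column names, and return only the
    required columns, with 'f1' coerced to numeric and rows missing f1 dropped."""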
    df = pd.read_excel(path)
    # normalize columns
    df.columns = [c.strip().lower() for c in df.columns]
    if "prompts" not in df.columns and "prompt" in df.columns:
        df["prompts"] = df["prompt"]
    missing = [c for c in REQUIRED_COLS if c not in df.columns]
    if missing:
        raise ValueError(f"{path} is missing required columns: {missing}")
    # keep only required, coerce f1 to numeric
    df = df[REQUIRED_COLS].copy()
    df["f1"] = pd.to_numeric(df["f1"], errors="coerce")
    df = df.dropna(subset=["f1"])
    return df
def sanitize_filename(s: str) -> str:
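    """Replace any character outside [0-9A-Za-z._+-] with '_' so the value is safe to use in a filename."""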
    return re.sub(r"[^0-9A-Za-z._\-+]+", "_", str(s).strip())
def format_float(x):
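    """Format a number with four decimal places; None, NaN, and inf become the string 'nan'."""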
    if x is None or (isinstance(x, float) and (math.isnan(x) or math.isinf(x))):
        return "nan"
    return f"{x:.4f}"
def prompt_order_key(label: str):
    """Sort key: the number in "prompt-<n>" if present; fallback to the string itself."""
    m = re.search(r"(\d+)", str(label))
    return (0, int(m.group(1))) if m else (1, str(label))
def render_group_table(g: pd.DataFrame, model: str, language: str, configuration: str) -> str:
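    """Render a Markdown-style results table for one (model, language, configuration)
    group: a header line followed by one row per task (mean F1 over its prompts)
    and one row per individual prompt."""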
    # Collect all prompt-level f1 values (across tasks and prompts)
    prompt_values = g["f1"].to_numpy(dtype=float)
    if prompt_values.size > 0:
        gen_value = float(np.mean(prompt_values))
        gen_stderr = float(np.std(prompt_values, ddof=1) / math.sqrt(len(prompt_values))) if len(prompt_values) > 1 else 0.0
    else:
        gen_value, gen_stderr = float("nan"), 0.0
    # Build table text
    if configuration == "0shot":
        configuration = "0"
    if configuration == "10shot":
        configuration = "10"
    # Model names are stored as "<org>__<name>"; convert back to "<org>/<name>".
    # maxsplit=1 keeps any later "__" in the model name and avoids an IndexError
    # when the name contains no "__" at all.
    model = "/".join(model.split("__", 1))
header = f"hf (pretrained={model} ), num_fewshot: {configuration}, batch_size: 1"
    lines = [
        "|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|",
        "|-------|-------|------|------|------|----|------|---|------|",
        #f"|Gen | | | |f1 | |{format_float(gen_value)} |---| {format_float(gen_stderr)} |",
    ]
    # For each task, add a task row (mean over prompts) then prompt rows
    for task, df_task in g.groupby("task", sort=False):
        f1s = df_task["f1"].to_numpy(dtype=float)
        task_mean = float(np.mean(f1s)) if f1s.size else float("nan")
        lines.append(f"| - {task.upper()} | | | |f1 | | {format_float(task_mean)} | |0 |")
        # Prompt-level rows, sorted by prompt number if available
        df_task = df_task.copy()
        df_task["_order"] = df_task["prompts"].map(prompt_order_key)
        df_task = df_task.sort_values("_order")
        for _, r in df_task.iterrows():
            prompt_label = str(r["prompts"])
            lines.append(f"| - {prompt_label} | | | |f1 | | {format_float(r['f1'])} | | 0 |")
    return header + "\n" + "\n".join(lines) + "\n"
def main():
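    """Parse CLI arguments, merge the three score sheets, and write one summary
    file per (model, language, configuration) group into the output directory."""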
    ap = argparse.ArgumentParser(description="Build per-(model,language,configuration) summaries from three prompt Excel files.")
    ap.add_argument("--p1", required=True, help="Path to llm_scores_p1.xlsx")
    ap.add_argument("--p2", required=True, help="Path to llm_scores_p2.xlsx")
    ap.add_argument("--p3", required=True, help="Path to llm_scores_p3.xlsx")
    ap.add_argument("--output-dir", required=True, help="Directory to write output files")
    args = ap.parse_args()
    os.makedirs(args.output_dir, exist_ok=True)
    df = pd.concat([read_scores(args.p1), read_scores(args.p2), read_scores(args.p3)], ignore_index=True)
    # One file per (model, language, configuration)
    for (model, language, config), g in df.groupby(["model", "language", "configuration"], sort=False):
        content = render_group_table(g, model, language, config)
        fname = f"{sanitize_filename(model)}__{sanitize_filename(language)}__{sanitize_filename(config)}.txt"
        out_path = os.path.join(args.output_dir, fname)
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(content)
if __name__ == "__main__":
    main()