#!/usr/bin/env python3
"""Build one summary file per (model, language, configuration) from three
prompt-level Excel score files.

Usage:
    python Gen_llm_eval_output.py \
        --p1 csv_files/llm_scores_p1.xlsx \
        --p2 csv_files/llm_scores_p2.xlsx \
        --p3 csv_files/llm_scores_p3.xlsx \
        --output-dir csv_files/outputs
"""
import argparse
import os
import re
import math
import pandas as pd
import numpy as np

REQUIRED_COLS = ["model", "task", "language", "configuration", "prompts", "f1"]
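# Illustrative input row (hypothetical values; real ones come from the Excel sheets):
#   model="google__gemma-2-9b-it", task="ner", language="en",
#   configuration="0shot", prompts="prompt-1", f1=0.71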

def read_scores(path: str) -> pd.DataFrame:
    """Read one Excel score file and return only REQUIRED_COLS, with f1 coerced to numeric."""
    df = pd.read_excel(path)
    # Normalize column names so matching is case- and whitespace-insensitive;
    # "prompt" is accepted as an alias for "prompts".
    df.columns = [c.strip().lower() for c in df.columns]
    if "prompts" not in df.columns and "prompt" in df.columns:
        df["prompts"] = df["prompt"]
    missing = [c for c in REQUIRED_COLS if c not in df.columns]
    if missing:
        raise ValueError(f"{path} is missing required columns: {missing}")
    # keep only required, coerce f1 to numeric
    df = df[REQUIRED_COLS].copy()
    df["f1"] = pd.to_numeric(df["f1"], errors="coerce")
    df = df.dropna(subset=["f1"])
    return df
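
# e.g. read_scores("csv_files/llm_scores_p1.xlsx") returns a DataFrame with exactly REQUIRED_COLS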

def sanitize_filename(s: str) -> str:
    return re.sub(r"[^0-9A-Za-z._\-+]+", "_", str(s).strip())
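
# e.g. sanitize_filename("google__gemma 2/9b") -> "google__gemma_2_9b" (hypothetical label;
# any run of characters outside [0-9A-Za-z._-+] collapses to a single "_")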

def format_float(x):
    if x is None or (isinstance(x, float) and (math.isnan(x) or math.isinf(x))):
        return "nan"
    return f"{x:.4f}"
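
# e.g. format_float(0.5) -> "0.5000"; format_float(float("nan")) -> "nan"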

def prompt_order_key(label: str):
    # Sort by the number in "prompt-<n>" if present; fallback to string
    m = re.search(r"(\d+)", str(label))
    return (0, int(m.group(1))) if m else (1, str(label))
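
# e.g. prompt_order_key("prompt-2") -> (0, 2); prompt_order_key("baseline") -> (1, "baseline")
# (hypothetical labels), so numbered prompts sort numerically, ahead of unnumbered ones.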

def render_group_table(g: pd.DataFrame, model: str, language: str, configuration: str) -> str:
    # Aggregate all prompt-level f1 values (across tasks and prompts) for the
    # optional "Gen" summary row below; the stderr is the standard error of the mean.
    prompt_values = g["f1"].to_numpy(dtype=float)
    if prompt_values.size > 0:
        gen_value = float(np.mean(prompt_values))
        gen_stderr = float(np.std(prompt_values, ddof=1) / math.sqrt(len(prompt_values))) if len(prompt_values) > 1 else 0.0
    else:
        gen_value, gen_stderr = float("nan"), 0.0

    # Build the header: strip the "shot" suffix so it reads "num_fewshot: 0" / "num_fewshot: 10",
    # and restore the "/" that was flattened to "__" in the model name ("org__name" -> "org/name").
    configuration = configuration.replace("shot", "")
    model = model.replace("__", "/", 1)
    
    header = f"hf (pretrained={model}), num_fewshot: {configuration}, batch_size: 1"
    lines = [
        "|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|",
        "|-------|-------|------|------|------|----|------|---|------|",
        # Optional aggregate row over all tasks and prompts, disabled to match the current layout:
        # f"|Gen   |       |      |      |f1    |    |{format_float(gen_value)} |---| {format_float(gen_stderr)} |",
    ]

    # For each task, add task row (mean over prompts) then prompt rows
    for task, df_task in g.groupby("task", sort=False):
        f1s = df_task["f1"].to_numpy(dtype=float)
        task_mean = float(np.mean(f1s)) if f1s.size else float("nan")
        lines.append(f"| - {task.upper()}        |       |      |      |f1    |   | {format_float(task_mean)} |   |0 |")

        # Prompt-level rows, sorted by prompt number if available
        df_task = df_task.copy()
        df_task["_order"] = df_task["prompts"].map(prompt_order_key)
        df_task = df_task.sort_values("_order")
        for _, r in df_task.iterrows():
            prompt_label = str(r["prompts"])
            lines.append(f"|   - {prompt_label}  |       |      |      |f1    |   | {format_float(r['f1'])} |   | 0 |")

    return header + "\n" + "\n".join(lines) + "\n"
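
# Illustrative output for one group, assuming a model "google__gemma-2-9b-it" at 0-shot
# with a single hypothetical task "ner" and placeholder scores:
#
#   hf (pretrained=google/gemma-2-9b-it), num_fewshot: 0, batch_size: 1
#   |Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
#   |-------|-------|------|------|------|----|------|---|------|
#   | - NER        |       |      |      |f1    |   | 0.7100 |   |0 |
#   |   - prompt-1  |       |      |      |f1    |   | 0.7100 |   | 0 |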

def main():
    ap = argparse.ArgumentParser(description="Build per-(model,language,configuration) summaries from three prompt Excel files.")
    ap.add_argument("--p1", required=True, help="Path to llm_scores_p1.xlsx")
    ap.add_argument("--p2", required=True, help="Path to llm_scores_p2.xlsx")
    ap.add_argument("--p3", required=True, help="Path to llm_scores_p3.xlsx")
    ap.add_argument("--output-dir", required=True, help="Directory to write output files")
    args = ap.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)

    df = pd.concat([read_scores(args.p1), read_scores(args.p2), read_scores(args.p3)], ignore_index=True)

    # One file per (model, language, configuration)
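    # e.g. "google__gemma-2-9b-it__en__0shot.txt" (hypothetical; parts come from the data)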
    for (model, language, config), g in df.groupby(["model", "language", "configuration"], sort=False):
        content = render_group_table(g, model, language, config)
        fname = f"{sanitize_filename(model)}__{sanitize_filename(language)}__{sanitize_filename(config)}.txt"
        out_path = os.path.join(args.output_dir, fname)
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(content)

if __name__ == "__main__":
    main()