#!/usr/bin/env python3
# Usage: python Gen_llm_eval_output.py --p1 csv_files/llm_scores_p1.xlsx --p2 csv_files/llm_scores_p2.xlsx --p3 csv_files/llm_scores_p3.xlsx --output-dir csv_files/outputs
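"""Aggregate prompt-level F1 scores from three Excel sheets and write one
plain-text summary table per (model, language, configuration) combination.

Each input sheet must contain the columns listed in REQUIRED_COLS below.
Output files are named <model>__<language>__<configuration>.txt.
"""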
import argparse
import os
import re
import math
import pandas as pd
import numpy as np
REQUIRED_COLS = ["model", "task", "language", "configuration", "prompts", "f1"]
def read_scores(path: str) -> pd.DataFrame:
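    """Load one Excel score sheet, normalize column names, and return only the
    required columns, with 'f1' coerced to numeric and rows missing f1 dropped."""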
    df = pd.read_excel(path)
    # normalize columns
    df.columns = [c.strip().lower() for c in df.columns]
    if "prompts" not in df.columns and "prompt" in df.columns:
        df["prompts"] = df["prompt"]
    missing = [c for c in REQUIRED_COLS if c not in df.columns]
    if missing:
        raise ValueError(f"{path} is missing required columns: {missing}")
    # keep only required, coerce f1 to numeric
    df = df[REQUIRED_COLS].copy()
    df["f1"] = pd.to_numeric(df["f1"], errors="coerce")
    df = df.dropna(subset=["f1"])
    return df
def sanitize_filename(s: str) -> str:
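    """Replace any character outside [0-9A-Za-z._+-] with '_' so the value is safe to use in a filename."""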
    return re.sub(r"[^0-9A-Za-z._\-+]+", "_", str(s).strip())
def format_float(x):
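    """Format a number with four decimal places; None, NaN, and inf become the string 'nan'."""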
    if x is None or (isinstance(x, float) and (math.isnan(x) or math.isinf(x))):
        return "nan"
    return f"{x:.4f}"
def prompt_order_key(label: str):
    """Sort key: the number in "prompt-<n>" if present; fallback to the string itself."""
    m = re.search(r"(\d+)", str(label))
    return (0, int(m.group(1))) if m else (1, str(label))
def render_group_table(g: pd.DataFrame, model: str, language: str, configuration: str) -> str:
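    """Render a Markdown-style results table for one (model, language, configuration)
    group: a header line followed by one row per task (mean F1 over its prompts)
    and one row per individual prompt."""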
    # Collect all prompt-level f1 values (across tasks and prompts)
    prompt_values = g["f1"].to_numpy(dtype=float)
    if prompt_values.size > 0:
        gen_value = float(np.mean(prompt_values))
        gen_stderr = float(np.std(prompt_values, ddof=1) / math.sqrt(len(prompt_values))) if len(prompt_values) > 1 else 0.0
    else:
        gen_value, gen_stderr = float("nan"), 0.0
    # Build table text
    if configuration == "0shot":
        configuration = "0"
    if configuration == "10shot":
        configuration = "10"
    # Model names are stored as "<org>__<name>"; convert back to "<org>/<name>".
    # maxsplit=1 keeps any later "__" in the model name and avoids an IndexError
    # when the name contains no "__" at all.
    model = "/".join(model.split("__", 1))
header = f"hf (pretrained={model} ), num_fewshot: {configuration}, batch_size: 1"
    lines = [
        "|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|",
        "|-------|-------|------|------|------|----|------|---|------|",
        #f"|Gen | | | |f1 | |{format_float(gen_value)} |---| {format_float(gen_stderr)} |",
    ]
    # For each task, add a task row (mean over prompts) then prompt rows
    for task, df_task in g.groupby("task", sort=False):
        f1s = df_task["f1"].to_numpy(dtype=float)
        task_mean = float(np.mean(f1s)) if f1s.size else float("nan")
        lines.append(f"| - {task.upper()} | | | |f1 | | {format_float(task_mean)} | |0 |")
        # Prompt-level rows, sorted by prompt number if available
        df_task = df_task.copy()
        df_task["_order"] = df_task["prompts"].map(prompt_order_key)
        df_task = df_task.sort_values("_order")
        for _, r in df_task.iterrows():
            prompt_label = str(r["prompts"])
            lines.append(f"| - {prompt_label} | | | |f1 | | {format_float(r['f1'])} | | 0 |")
    return header + "\n" + "\n".join(lines) + "\n"
def main():
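    """Parse CLI arguments, merge the three score sheets, and write one summary
    file per (model, language, configuration) group into the output directory."""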
    ap = argparse.ArgumentParser(description="Build per-(model,language,configuration) summaries from three prompt Excel files.")
    ap.add_argument("--p1", required=True, help="Path to llm_scores_p1.xlsx")
    ap.add_argument("--p2", required=True, help="Path to llm_scores_p2.xlsx")
    ap.add_argument("--p3", required=True, help="Path to llm_scores_p3.xlsx")
    ap.add_argument("--output-dir", required=True, help="Directory to write output files")
    args = ap.parse_args()
    os.makedirs(args.output_dir, exist_ok=True)
    df = pd.concat([read_scores(args.p1), read_scores(args.p2), read_scores(args.p3)], ignore_index=True)
    # One file per (model, language, configuration)
    for (model, language, config), g in df.groupby(["model", "language", "configuration"], sort=False):
        content = render_group_table(g, model, language, config)
        fname = f"{sanitize_filename(model)}__{sanitize_filename(language)}__{sanitize_filename(config)}.txt"
        out_path = os.path.join(args.output_dir, fname)
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(content)
if __name__ == "__main__":
    main()