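"""Build per-(model, language, configuration) summary files from three
prompt-level Excel score sheets (one file per prompt variant).

Each sheet must provide the columns in REQUIRED_COLS below; one
lm-eval-harness-style text table is written to --output-dir for every
(model, language, configuration) group.

Illustrative invocation (the script file name here is assumed):

    python build_summaries.py --p1 llm_scores_p1.xlsx --p2 llm_scores_p2.xlsx \
        --p3 llm_scores_p3.xlsx --output-dir summaries/
"""
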
import argparse
import math
import os
import re

import numpy as np
import pandas as pd

# Columns every input sheet must provide after header normalisation.
REQUIRED_COLS = ["model", "task", "language", "configuration", "prompts", "f1"]


def read_scores(path: str) -> pd.DataFrame:
    """Load one Excel score sheet and normalise it to REQUIRED_COLS."""
    df = pd.read_excel(path)

    # Normalise headers so column matching is case- and whitespace-insensitive.
    df.columns = [str(c).strip().lower() for c in df.columns]
    # Accept the singular "prompt" header as an alias for "prompts".
    if "prompts" not in df.columns and "prompt" in df.columns:
        df["prompts"] = df["prompt"]
    missing = [c for c in REQUIRED_COLS if c not in df.columns]
    if missing:
        raise ValueError(f"{path} is missing required columns: {missing}")

    df = df[REQUIRED_COLS].copy()
    # Coerce f1 to numeric; rows with unparseable scores are dropped.
    df["f1"] = pd.to_numeric(df["f1"], errors="coerce")
    df = df.dropna(subset=["f1"])
    return df


def sanitize_filename(s: str) -> str:
    """Replace every run of characters outside [0-9A-Za-z._+-] with "_"."""
    return re.sub(r"[^0-9A-Za-z._\-+]+", "_", str(s).strip())
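
# For example (illustrative input, not taken from the data files):
#   sanitize_filename("meta-llama__Llama-3.1 (8B)") -> "meta-llama__Llama-3.1_8B_"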


def format_float(x):
    """Format to four decimals; None, NaN and inf all render as "nan"."""
    if x is None or (isinstance(x, float) and (math.isnan(x) or math.isinf(x))):
        return "nan"
    return f"{x:.4f}"


def prompt_order_key(label: str):
    """Sort key that orders numbered prompt labels numerically, others alphabetically."""
    m = re.search(r"(\d+)", str(label))
    return (0, int(m.group(1))) if m else (1, str(label))
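
# For example (labels illustrative): prompt_order_key("P1") -> (0, 1),
# prompt_order_key("prompt_10") -> (0, 10), prompt_order_key("baseline") -> (1, "baseline");
# numbered prompts therefore sort numerically ("prompt_2" before "prompt_10")
# and ahead of unnumbered labels.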


def render_group_table(g: pd.DataFrame, model: str, language: str, configuration: str) -> str:
    """Render one (model, language, configuration) group as an lm-eval-harness-style table."""
    # Group-level aggregate: mean F1 over all prompt rows, with the standard
    # error of the mean (sample std, ddof=1) when more than one row exists.
    prompt_values = g["f1"].to_numpy(dtype=float)
    if prompt_values.size > 0:
        gen_value = float(np.mean(prompt_values))
        gen_stderr = (
            float(np.std(prompt_values, ddof=1) / math.sqrt(len(prompt_values)))
            if len(prompt_values) > 1
            else 0.0
        )
    else:
        gen_value, gen_stderr = float("nan"), 0.0

    # Map shot-style configuration labels to the bare few-shot count used in the header.
    if configuration == "0shot":
        configuration = "0"
    elif configuration == "10shot":
        configuration = "10"
    # Restore the "org/name" Hugging Face id from the "org__name" spreadsheet form;
    # leave the name unchanged when no separator is present.
    if "__" in model:
        model = model.replace("__", "/", 1)

    header = f"hf (pretrained={model} ), num_fewshot: {configuration}, batch_size: 1"
    lines = [
        "|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|",
        "|-------|-------|------|------|------|----|------|---|------|",
        # Aggregate row over all tasks and prompts in this group.
        f"|{language} | | | |f1 | | {format_float(gen_value)} | |{format_float(gen_stderr)} |",
    ]

    for task, df_task in g.groupby("task", sort=False):
        # Task-level mean across that task's prompt rows.
        f1s = df_task["f1"].to_numpy(dtype=float)
        task_mean = float(np.mean(f1s)) if f1s.size else float("nan")
        lines.append(f"| - {str(task).upper()} | | | |f1 | | {format_float(task_mean)} | |0 |")

        # One row per prompt, numbered prompts ordered numerically.
        df_task = df_task.copy()
        df_task["_order"] = df_task["prompts"].map(prompt_order_key)
        df_task = df_task.sort_values("_order")
        for _, r in df_task.iterrows():
            prompt_label = str(r["prompts"])
            lines.append(f"| - {prompt_label} | | | |f1 | | {format_float(r['f1'])} | | 0 |")

    return header + "\n" + "\n".join(lines) + "\n"
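
# Illustrative output shape for one group (all names and values below are
# placeholders, not taken from the data files):
#
#   hf (pretrained=org/model-name ), num_fewshot: 0, batch_size: 1
#   |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
#   |-------|-------|------|------|------|----|------|---|------|
#   |english | | | |f1 | | 0.7123 | |0.0150 |
#   | - NER | | | |f1 | | 0.6901 | |0 |
#   | - P1 | | | |f1 | | 0.6812 | | 0 |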


def main():
    ap = argparse.ArgumentParser(
        description="Build per-(model,language,configuration) summaries from three prompt Excel files."
    )
    ap.add_argument("--p1", required=True, help="Path to llm_scores_p1.xlsx")
    ap.add_argument("--p2", required=True, help="Path to llm_scores_p2.xlsx")
    ap.add_argument("--p3", required=True, help="Path to llm_scores_p3.xlsx")
    ap.add_argument("--output-dir", required=True, help="Directory to write output files")
    args = ap.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)

    # Stack the three prompt files; each row is one (model, task, language,
    # configuration, prompt) F1 score.
    df = pd.concat(
        [read_scores(args.p1), read_scores(args.p2), read_scores(args.p3)],
        ignore_index=True,
    )

    # Write one summary file per (model, language, configuration) group.
    for (model, language, config), g in df.groupby(["model", "language", "configuration"], sort=False):
        content = render_group_table(g, model, language, config)
        fname = f"{sanitize_filename(model)}__{sanitize_filename(language)}__{sanitize_filename(config)}.txt"
        out_path = os.path.join(args.output_dir, fname)
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(content)


if __name__ == "__main__":
    main()