# code from https://github.com/EleutherAI/lm-evaluation-harness/blob/main/scripts/regression.py
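"""Regression-test lmms-eval across git branches.

Runs each model in `vision_models` on a small suite of single-image, multi-image,
and video tasks, optionally repeats the runs on other branches, and prints
markdown tables comparing per-task scores, diffs, and runtimes.

Example (the branch name is hypothetical):
    python3 tools/regression.py --branches my-feature-branch --limit 8
"""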
import argparse
import glob
import json
import os
import subprocess
import time
from pathlib import Path

from lmms_eval import utils
from lmms_eval.api.registry import ALL_TASKS

model_types = ["llava_onevision"]
vision_models = [
    "lmms-lab/llava-onevision-qwen2-0.5b-ov",
]
single_image_tasks = ["ocrbench", "mmmu_val", "ai2d"]
multi_image_tasks = ["muirbench"]
video_tasks = ["videomme"]
# choice_tasks = []
# perplexity_tasks = []
# generation_tasks = []
task_names = single_image_tasks + multi_image_tasks + video_tasks

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--branches", default=[])
    parser.add_argument("--models", default=vision_models)
    parser.add_argument("--tasks", default=task_names)
    # store_true avoids the argparse type=bool pitfall, where any non-empty string (even "False") parses as True
    parser.add_argument("--acc_norm", action="store_true")
    parser.add_argument("--perplexity", default=None)
    # TODO: implement num_fewshot and limit per task, e.g. task1:5,task2:1:100,task3::1000
    parser.add_argument("--num_fewshot", type=int, default=0)
    parser.add_argument("--limit", type=float, default=8)
    # TODO: implement hf-auto to pick between causal and seq2seq models so we don't need this
    parser.add_argument("--model", default="llava_onevision")
    # Use whatever is faster here
    parser.add_argument("--model_args", default="conv_template=qwen_1_5,model_name=llava_qwen")
    parser.add_argument("--batch_size", default="1")
    return parser.parse_args()
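
# For reference, with the defaults above, eval_models() shells out to roughly:
#   python3 -m accelerate.commands.launch --main_process_port=12580 --num_processes=8 lmms_eval \
#       --model llava_onevision \
#       --model_args pretrained=lmms-lab/llava-onevision-qwen2-0.5b-ov,conv_template=qwen_1_5,model_name=llava_qwen \
#       --tasks ocrbench,mmmu_val,ai2d,muirbench,videomme \
#       --num_fewshot 0 --limit 8.0 --batch_size 1 \
#       --output_path logs/regression_test/<timestamp>-<branch>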

def eval_models(args, branch=None):
    """Run every model on every task, optionally after checking out another git branch."""
    if branch is not None:
        if os.system(f"git checkout {branch}") != 0:
            return {}, 0

    branch = branch or initial_branch
    start_time = time.time()
    results = {}

    for indx, model in enumerate(args.models):
        model_type = model_types[indx]  # model_types is index-aligned with args.models
        model_args = f"pretrained={model},{args.model_args}"
        tasks = args.tasks
        batch_size = args.batch_size
        output_path = f"logs/regression_test/{int(start_time)}-{branch.replace('/', '_')}"

        # Launch from the repo root so the lmms_eval entry point resolves correctly.
        original_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        command = (
            f"python3 -m accelerate.commands.launch --main_process_port=12580 --num_processes=8 lmms_eval --model {model_type} --model_args {model_args} --tasks {','.join(tasks)} "
            f"--num_fewshot {args.num_fewshot}{'' if args.limit is None else f' --limit {args.limit}'} "
            f"--batch_size {batch_size} --output_path {output_path}"
        )
        print(f"{'=' * 80}\nEvaluating {model} on {', '.join(tasks)} at {branch} with:\n\n{command}\n{'=' * 80}")
        ret = os.system(command)
        os.chdir(original_dir)

        # Fall back to empty results if the run failed or produced no results file.
        json_file_path = find_json_file(output_path)
        if json_file_path and ret == 0:
            with open(json_file_path, encoding="utf-8") as f:
                results[model] = json.load(f)
        else:
            results[model] = {"results": {}}

    end_time = time.time()
    return results, end_time - start_time

def extract_value(args, results, model, task, err=False):
    """Pull the headline metric for a task out of a results dict; 0 if missing."""
    if model not in results:
        return 0
    results = results[model]["results"]
    if task not in results:
        return 0
    results = results[task]
    # None of these tasks report a stderr alongside the metric, so there is no
    # error term to return; previously the err flag was silently ignored, which
    # made format_value() print the metric value as its own error bar.
    if err:
        return 0
    if task == "ai2d":
        return results["exact_match,flexible-extract"]
    elif task == "mmmu_val":
        return results["mmmu_acc,none"]
    elif task == "ocrbench":
        return results["ocrbench_accuracy,none"]
    elif task == "videomme":
        return results["videomme_perception_score,none"]
    elif task == "muirbench":
        return results["muirbench_score_overall,flexible-extract"]
    return 0
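
# Illustrative shape of a parsed *_results.json, inferred from the lookups
# above (numbers are made up):
#   {"results": {"mmmu_val": {"mmmu_acc,none": 0.42},
#                "ai2d": {"exact_match,flexible-extract": 0.75}}}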

def format_value(args, results, model, task):
    val = 100 * extract_value(args, results, model, task)
    err = 100 * extract_value(args, results, model, task, err=True)
    return f"{val:.2f}{f' ± {err:.2f}' if err != 0 else ''}"

def format_diff(args, results1, results2, model, task):
    val1 = 100 * extract_value(args, results1, model, task)
    val2 = 100 * extract_value(args, results2, model, task)
    diff = val2 - val1
    # Bold positive deltas so improvements stand out in the markdown table.
    return f"**+{diff:.2f}**" if diff > 0 else f"{diff:.2f}"

def find_json_file(base_path):
    # Results may be nested under subdirectories of output_path, so search recursively.
    pattern = os.path.join(base_path, "**", "*_results.json")
    json_files = glob.glob(pattern, recursive=True)
    return json_files[0] if json_files else None

def main():
    args = parse_args()

    # Accept either comma-separated CLI strings or the list defaults.
    args.branches = args.branches.split(",") if isinstance(args.branches, str) else args.branches
    args.models = args.models.split(",") if isinstance(args.models, str) else args.models
    args.tasks = ALL_TASKS if args.tasks == "all_tasks" else utils.pattern_match(args.tasks.split(","), ALL_TASKS) if isinstance(args.tasks, str) else args.tasks

    global initial_branch
    initial_branch = subprocess.check_output("git branch --show-current", shell=True).decode("ascii").strip()

    # TODO: implement proper timing for each task
    # TODO: reduce IO by sharing tasks between models?

    # Baseline run on the current branch, then one run per requested branch.
    results, runtime = eval_models(args)
    print(results, runtime)
    runs = []
    for branch in args.branches:
        runs.append((branch, *eval_models(args, branch)))

    os.system(f"git checkout {initial_branch}")

    print("")
    print(f"|task|{'|'.join(map(lambda model: Path(model).name, args.models))}|")
    print(f"|--|{'--|' * len(args.models)}")
    for task in args.tasks:
        print(f"|{task} ({initial_branch})|{'|'.join(map(lambda model: format_value(args, results, model, task), args.models))}|")
        for branch, branch_results, _ in runs:
            print(f"|{task} ({branch})|{'|'.join(map(lambda model: format_value(args, branch_results, model, task), args.models))}|")
            print(f"|{task} (diff)|{'|'.join(map(lambda model: format_diff(args, results, branch_results, model, task), args.models))}|")

    print("")
    print("|branch|runtime|%|")
    print("|--|--|--|")
    print(f"|{initial_branch}|{runtime:.1f}s|100%|")
    for branch, _, branch_runtime in runs:
        print(f"|{branch}|{branch_runtime:.1f}s|{100 * branch_runtime / runtime:.2f}%|")


if __name__ == "__main__":
    main()