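"""Regression test for lmms_eval: evaluate vision-language models on a small
task suite, optionally across git branches, and print markdown tables
comparing scores and runtimes."""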
import argparse
import glob
import json
import os
import subprocess
import time
from pathlib import Path

from lmms_eval import utils
from lmms_eval.api.registry import ALL_TASKS

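# model_types[i] is the lmms_eval wrapper class used to launch the i-th entry
# of --models.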
model_types = ["llava_onevision"]
vision_models = [
    "lmms-lab/llava-onevision-qwen2-0.5b-ov",
]

single_image_tasks = ["ocrbench", "mmmu_val", "ai2d"]
multi_image_tasks = ["muirbench"]
video_tasks = ["videomme"]

task_names = single_image_tasks + multi_image_tasks + video_tasks


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--branches", default=[])
    parser.add_argument("--models", default=vision_models)
    parser.add_argument("--tasks", default=task_names)
    # Use a store_true flag: argparse's type=bool parses any non-empty string,
    # including "False", as True.
    parser.add_argument("--acc_norm", action="store_true")
    parser.add_argument("--perplexity", default=None)
    parser.add_argument("--num_fewshot", type=int, default=0)
    parser.add_argument("--limit", type=float, default=8)
    parser.add_argument("--model", default="llava_onevision")
    parser.add_argument("--model_args", default="conv_template=qwen_1_5,model_name=llava_qwen")
    parser.add_argument("--batch_size", default="1")
    return parser.parse_args()


def eval_models(args, branch=None):
    # Check out the requested branch; bail out with empty results on failure.
    if branch is not None:
        if os.system(f"git checkout {branch}") != 0:
            return {}, 0

    branch = branch or initial_branch

    start_time = time.time()

    results = {}

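    # Evaluate each model with its paired wrapper class; model_types and
    # args.models are expected to line up one-to-one.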
    for idx, model in enumerate(args.models):
        model_type = model_types[idx]
        model_args = f"pretrained={model},{args.model_args}"
        tasks = args.tasks
        batch_size = args.batch_size

        # Run from the repository root so lmms_eval resolves its paths
        # consistently; the caller's directory is restored after the run.
        original_dir = os.getcwd()
        repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        os.chdir(repo_root)
        # Anchor the output path at the repo root so find_json_file below
        # still works after chdir'ing back to the original directory.
        output_path = os.path.join(repo_root, f"logs/regression_test/{int(start_time)}-{branch.replace('/', '_')}")

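        # Build the evaluation command: launch lmms_eval via accelerate, and
        # cap examples per task with --limit to keep the regression run cheap.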
        command = (
            f"python3 -m accelerate.commands.launch --main_process_port=12580 --num_processes=8 lmms_eval --model {model_type} --model_args {model_args} --tasks {','.join(tasks)} "
            f"--num_fewshot {args.num_fewshot}{'' if args.limit is None else f' --limit {args.limit}'} "
            f"--batch_size {batch_size} --output_path {output_path}"
        )

        print(f"{'=' * 80}\nEvaluating {model} on {', '.join(tasks)} at {branch} with:\n\n{command}\n{'=' * 80}")

        ret = os.system(command)
        os.chdir(original_dir)

        json_file_path = find_json_file(output_path)

        # Fall back to an empty result set when the run failed or wrote no
        # JSON, so the table-formatting code downstream still works.
        if json_file_path and ret == 0:
            with open(json_file_path, encoding="utf-8") as f:
                results[model] = json.load(f)
        else:
            results[model] = {"results": {}}

    end_time = time.time()

    return results, end_time - start_time


def extract_value(args, results, model, task, err=False):
    """Pull the headline metric for (model, task) from a results dict, or 0
    when the run produced nothing. err=True requests the metric's error bar;
    none of these tasks report a stderr, so it is always 0."""
    if model not in results:
        return 0
    results = results[model]["results"]
    if task not in results:
        return 0
    results = results[task]
    # None of these tasks report a stderr metric, so there is no error bar.
    if err:
        return 0
    if task == "ai2d":
        return results["exact_match,flexible-extract"]
    elif task == "mmmu_val":
        return results["mmmu_acc,none"]
    elif task == "ocrbench":
        return results["ocrbench_accuracy,none"]
    elif task == "videomme":
        return results["videomme_perception_score,none"]
    elif task == "muirbench":
        return results["muirbench_score_overall,flexible-extract"]
    return 0


def format_value(args, results, model, task):
    """Render a metric as a percentage, with an error bar when one exists."""
    val = 100 * extract_value(args, results, model, task)
    err = 100 * extract_value(args, results, model, task, err=True)
    return f"{val:.2f}{f' ± {err:.2f}' if err != 0 else ''}"


def format_diff(args, results1, results2, model, task):
    """Render the score delta (results2 - results1), bolding improvements so
    they stand out in the markdown table."""
    val1 = 100 * extract_value(args, results1, model, task)
    val2 = 100 * extract_value(args, results2, model, task)
    diff = val2 - val1
    return f"**+{diff:.2f}**" if diff > 0 else f"{diff:.2f}"


def find_json_file(base_path):
    """Return the first *_results.json found under base_path, or None."""
    pattern = os.path.join(base_path, "**", "*_results.json")
    json_files = glob.glob(pattern, recursive=True)
    return json_files[0] if json_files else None


def main():
    args = parse_args()

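    # Normalize comma-separated CLI strings into lists; the defaults are
    # already lists.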
    args.branches = args.branches.split(",") if isinstance(args.branches, str) else args.branches
    args.models = args.models.split(",") if isinstance(args.models, str) else args.models
    args.tasks = ALL_TASKS if args.tasks == "all_tasks" else utils.pattern_match(args.tasks.split(","), ALL_TASKS) if isinstance(args.tasks, str) else args.tasks

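    # Remember the starting branch so it can be restored after the branch sweep.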
    global initial_branch
    initial_branch = subprocess.check_output("git branch --show-current", shell=True).decode("ascii").strip()

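    # Baseline run on the current branch.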
    results, runtime = eval_models(args)
    print(results, runtime)

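    # Evaluate each comparison branch, collecting (branch, results, runtime)
    # tuples.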
    runs = []
    for branch in args.branches:
        runs.append((branch, *eval_models(args, branch)))

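    # Return to the branch the user started on.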
    os.system(f"git checkout {initial_branch}")

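    # Markdown table of scores: a baseline row per task, then per-branch rows
    # with their diffs against the baseline.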
print("") |
|
|
print(f"|task|{'|'.join(map(lambda model: Path(model).name, args.models))}|") |
|
|
print(f"|--|{'--|' * len(args.models)}") |
|
|
for task in args.tasks: |
|
|
print(f"|{task} ({initial_branch})|{'|'.join(map(lambda model: format_value(args, results, model, task), args.models))}|") |
|
|
for branch, branch_results, branch_runtime in runs: |
|
|
print(f"|{task} ({branch})|{'|'.join(map(lambda model: format_value(args, branch_results, model, task), args.models))}|") |
|
|
print(f"|{task} (diff)|{'|'.join(map(lambda model: format_diff(args, results, branch_results, model, task), args.models))}|") |
|
|
|
|
|
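    # Runtime table, with each branch shown as a percentage of the baseline.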
print("") |
|
|
print("|branch|runtime|%|") |
|
|
print("|--|--|--|") |
|
|
print(f"|{initial_branch}|{runtime:.1f}s|100%|") |
|
|
for branch, _, branch_runtime in runs: |
|
|
print(f"|{branch}|{branch_runtime:.1f}s|{100 * branch_runtime / runtime:.2f}%|") |
|
|
|
|
|
|
|
|

if __name__ == "__main__":
    main()