jbilcke-hf's picture
Upload core files for paper 2510.18876
46861c5 verified
import argparse
import json
import os
from collections import defaultdict
import numpy as np
def parse_args():
parser = argparse.ArgumentParser(description="ChatGPT-based QA evaluation.")
parser.add_argument("-d", "--dir", default=None)
parser.add_argument("-f", "--files", nargs="*", default=None)
parser.add_argument("-i", "--ignore", nargs="*", default=None)
parser.add_argument("-s", "--save", action="store_true")
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
if args.ignore is not None:
args.ignore = [int(x) for x in args.ignore]
if args.files is not None and len(args.files) > 0:
review_files = args.files
else:
review_files = [
x
for x in os.listdir(args.dir)
if x.endswith(".jsonl")
and (
x.startswith("gpt4_text")
or x.startswith("reviews_")
or x.startswith("review_")
)
]
metrics = []
for review_file in sorted(review_files):
config = (
os.path.basename(review_file)
.replace("gpt4_text_", "")
.replace(".jsonl", "")
)
scores = defaultdict(list)
print(config)
with open(
os.path.join(args.dir, review_file) if args.dir is not None else review_file
) as f:
for review_str in f:
review = json.loads(review_str)
if args.ignore is not None and review["question_id"] in args.ignore:
continue
if "category" in review:
scores[review["category"]].append(review["tuple"])
scores["all"].append(review["tuple"])
else:
if "tuple" in review:
scores["all"].append(review["tuple"])
else:
scores["all"].append(review["score"])
summ_dict = defaultdict(list)
for k, v in sorted(scores.items()):
stats = np.asarray(v).mean(0).tolist()
stats = [round(x, 3) for x in stats]
# print(k, stats, round(stats[1]/stats[0]*100, 1))
print(k, round(stats[1] / stats[0] * 100, 1))
summ_dict[k] = round(stats[1] / stats[0] * 100, 1)
print("=================================")
metrics.append(summ_dict)
if args.save:
with open(os.path.join(args.dir, "metric.json"), "w") as f:
json.dump(metrics, f, indent=2)