"""Summarize GPT-4 evaluation scores for multimodal chat answers, broken down
by question type and imaging domain."""
import argparse
from collections import defaultdict

import pandas as pd

import util
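

# Each record's "domain" dict one-hot encodes the imaging domain; return the
# name of the first flag that is set (None if none are).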
def get_domain(x):
    for domain in ["chest_xray", "mri", "histology", "gross", "ct_scan"]:
        in_domain = x["domain"][domain]
        if in_domain:
            return domain


def main(args):
    scores_data = util.load_file_jsonl(args.scores_file)
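    # The first line of "gpt_eval" carries two space-separated scores: the
    # GPT-4 reference answer's score (a1) and the model prediction's score (a2).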
    predictions = [
        (x["question_id"], x["type"], get_domain(x), x["gpt_eval"].split("\n")[0].split(" "))
        for x in scores_data
    ]
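
    # Bucket each score pair three ways: by question type, into an "overall"
    # bucket, and by imaging domain. Keys 1 and 2 hold the GPT-4 and
    # prediction scores respectively.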
    score_type_dict = defaultdict(lambda: defaultdict(list))
    for q_id, q_type, domain, (a1_score, a2_score) in predictions:
        score_type_dict[q_type][1].append(a1_score)
        score_type_dict[q_type][2].append(a2_score)
        score_type_dict["overall"][1].append(a1_score)
        score_type_dict["overall"][2].append(a2_score)
        score_type_dict[domain][1].append(a1_score)
        score_type_dict[domain][2].append(a2_score)

    result = defaultdict(dict)
    for q_type, score_dict in score_type_dict.items():
        result[q_type]["gpt4_score"] = util.get_avg(score_dict[1])
        result[q_type]["pred_score"] = util.get_avg(score_dict[2])
| result[q_type]["pred_relative_score"] = ( | |
| util.get_avg([float(s2) / float(s1) for s1, s2 in zip(score_dict[1], score_dict[2])]) | |
| * 100 | |
| ) | |
| result[q_type]["data_size"] = len(score_dict[1]) | |
    df = pd.DataFrame.from_dict(result).filter(
        [
            "conversation",
            "detailed_description",
            "chest_xray",
            "mri",
            "histology",
            "gross",
            "ct_scan",
            "overall",
        ]
    )
    print(df)
| if __name__ == "__main__": | |
| parser = argparse.ArgumentParser("GPT-4 Multimodal Chat Eval Postprocessing", add_help=True) | |
| parser.add_argument( | |
| "--scores-file", default="", metavar="FILE", help="input path to gpt-4 score file" | |
| ) | |
| args = parser.parse_args() | |
| main(args) | |
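
# Example invocation (script and file names below are illustrative):
#   python summarize_gpt_scores.py --scores-file results/gpt4_eval_scores.jsonl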