from functools import partial
import os
import json
import ast
from collections import OrderedDict
from dataclasses import dataclass
from multiprocessing import Pool

import tqdm
import torch
import numpy as np
from PIL import Image
from decord import VideoReader, cpu
from openai import OpenAI

from tasks.eval.eval_utils import (
    dump_json,
    load_json,
    EvalDataset,
)

client = OpenAI(
    # This is the default and can be omitted.
    api_key=os.environ.get("OPENAI_API_KEY"),
)

# All four benchmarks use the same judge prompt, so define it once and map
# every task type to it.
_judge_prompt = {
    "system": "You are an intelligent chatbot designed for evaluating the correctness of generative outputs for question-answer pairs. "
              "Your task is to compare the predicted answer with the correct answer and determine if they match meaningfully. Here's how you can accomplish the task:"
              "------"
              "##INSTRUCTIONS: "
              "- Focus on the meaningful match between the predicted answer and the correct answer.\n"
              "- Consider synonyms or paraphrases as valid matches.\n"
              "- Evaluate the correctness of the prediction compared to the answer.",
    "user": """Please evaluate the following video-based question-answer pair:\n\n"""
            """Question: {question}\n"""
            """Correct Answer: {answer}\n"""
            """Predicted Answer: {pred}\n\n"""
            """Provide your evaluation only as a yes/no and score where the score is an integer value between 0 and 5, with 5 indicating the highest meaningful match. """
            """Please generate the response in the form of a Python dictionary string with keys 'pred' and 'score', where the value of 'pred' is a string of 'yes' or 'no' and the value of 'score' is an INTEGER, not a STRING. """
            """DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. """
            """For example, your response should look like this: {{'pred': 'yes', 'score': 4}}.""",
}

task_type2chatgpt_contents = OrderedDict(
    (task_type, _judge_prompt)
    for task_type in ("MSVD_QA", "MSRVTT_QA", "ActivityNet", "TGIF_QA")
)
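
# Illustrative reply in the format the prompt above requests (the judge model
# may still deviate from it, which check_ans_qa below guards against):
#   "{'pred': 'yes', 'score': 4}"
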
def check_ans_qa(question, pred, gt, task_type, model="gpt-3.5-turbo-0125"):
    """Ask the judge model whether `pred` matches `gt`; return (flag, score)."""
    response_message = None  # kept for the error report below
    try:
        # Fill the user prompt for this task type with the current sample.
        user_input = task_type2chatgpt_contents[task_type]['user']
        user_input = user_input.format(question=question, answer=gt, pred=pred)
        completion = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "system",
                    "content": task_type2chatgpt_contents[task_type]['system'],
                },
                {
                    "role": "user",
                    "content": user_input,
                },
            ],
        )
        # The prompt asks for a Python dict literal, so parse it with ast.
        response_message = completion.choices[0].message.content
        response_dict = ast.literal_eval(response_message)
        judge_pred = response_dict['pred']
        score = response_dict['score']
        if judge_pred not in ('yes', 'no') or not isinstance(score, (int, float)):
            raise ValueError(f"{model} doesn't follow the required format")
        flag = judge_pred == 'yes'
    except Exception:
        import traceback
        traceback.print_exc()
        flag, score = False, 0
        print(
            f"GPT cannot deal with:\n"
            f"--pred: {pred}\n"
            f"--gt: {gt}\n"
            f"--gpt responded: {response_message}\n"
            "--will assign flag=False and score=0"
        )
        print(f"Dumb Answer in {task_type}")
    return flag, score
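
# Hypothetical call, to show the return contract (the strings are made up):
#   flag, score = check_ans_qa(
#       question="What is the man doing?",
#       pred="He is riding a bicycle.",
#       gt="riding a bike",
#       task_type="MSVD_QA",
#   )
# `flag` is True when the judge answers 'yes'; `score` is its 0-5 rating.
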
def chatgpt_eval(res, model="gpt-3.5-turbo-0125"):
    pred = res['pred']
    gt = res['gt']
    question = res['question']
    task_type = res['task_type']
    # `correct` is a bool, `score` is the 0-5 rating given by the judge model
    correct, score = check_ans_qa(question=question, pred=pred, gt=gt, task_type=task_type, model=model)
    # update the scores in result_list for this sample
    res['score'] = score
    res['correct'] = correct
    return res
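
# Illustrative input/output for chatgpt_eval (field values are made up; the
# real dicts also carry whatever extra keys inference produced):
#   in : {'pred': 'a dog', 'gt': 'dog', 'question': 'What animal is shown?', 'task_type': 'MSVD_QA'}
#   out: the same dict, extended with {'score': 5, 'correct': True}
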
def save_results(result_list, save_path, model="gpt-3.5-turbo-0125"):
    # Persist the raw inference results first, so they survive even if the
    # evaluation below is interrupted.
    dump_json(result_list, save_path, 'inference_results.json')

    # Judge the samples in parallel; order doesn't matter for the aggregation.
    with Pool(7) as pool:
        func = partial(chatgpt_eval, model=model)
        result_list = [
            res for res in tqdm.tqdm(
                pool.imap_unordered(func, result_list),
                total=len(result_list),
                desc='Language Chat Model Automated Evaluation...',
            )
        ]

    # Aggregate per-task accuracy and average score.
    final_res, acc_dict = {}, {}
    correct, total, total_score = 0, 0, 0
    for res in result_list:
        task_type = res['task_type']
        if task_type not in acc_dict:
            acc_dict[task_type] = {
                'correct': 0,
                'total': 0,
                'score': 0,
            }
        acc_dict[task_type]['total'] += 1
        acc_dict[task_type]['correct'] += res['correct']
        acc_dict[task_type]['score'] += res['score']

    for k, v in acc_dict.items():
        final_res[k] = {
            'acc': v['correct'] / v['total'] * 100,
            'score': v['score'] / v['total'],
        }
        correct += v['correct']
        total += v['total']
        total_score += v['score']
    final_res['Avg_Acc'] = correct / total * 100
    final_res['Avg_Score'] = total_score / total

    all_results = {
        "acc_dict": acc_dict,
        "result_list": result_list,
    }
    dump_json(all_results, save_path, 'all_results.json')
    dump_json(final_res, save_path, 'upload_leaderboard.json')

def load_results(save_path):
    json_data = load_json(save_path, 'inference_results.json')
    return json_data

@dataclass
class OpenendQASample:
    question: str
    answer: str

class VideoQABenchDataset(EvalDataset):
    data_dir = "DATAS/VideoQA"
    data_list_info = OrderedDict({
        "MSVD_QA": OrderedDict(
            q_json_relpath="MSVD_Zero_Shot_QA/test_q.json",
            a_json_relpath="MSVD_Zero_Shot_QA/test_a.json",
            prefix="DATAS/VideoQA/MSVD_Zero_Shot_QA/videos",
            data_type="video",
            bound=False,
            question_key='question',
            answer_key='answer',
            name_key='video_name',
            postfix=('avi',),
        ),
        "MSRVTT_QA": OrderedDict(
            q_json_relpath="MSRVTT_Zero_Shot_QA/test_q.json",
            a_json_relpath="MSRVTT_Zero_Shot_QA/test_a.json",
            prefix="DATAS/VideoQA/MSRVTT_Zero_Shot_QA/videos/all",
            data_type="video",
            bound=False,
            question_key='question',
            answer_key='answer',
            name_key='video_name',
            postfix=('mp4',),
        ),  # has no start & end timestamps
        "ActivityNet": OrderedDict(
            q_json_relpath="ActivityNet/test_q.json",
            a_json_relpath="ActivityNet/test_a.json",
            prefix="DATAS/VideoQA/ActivityNet/all_test",
            data_type="video",
            bound=False,
            question_key='question',
            answer_key='answer',
            name_key='video_name',
            postfix=('mp4', 'mkv', 'webm'),
        ),  # has no start & end timestamps
        "TGIF_QA": OrderedDict(
            q_json_relpath="TGIF_QA/test_q.json",
            a_json_relpath="TGIF_QA/test_a.json",
            prefix="DATAS/VideoQA/TGIF_QA/tgif_videos",
            data_type="gif",
            bound=False,
            question_key='question',
            answer_key='answer',
            name_key='video_name',
            postfix=('gif',),
        ),  # has no start & end timestamps
    })
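
    # Presumed shape of the per-dataset JSON records, inferred from the keys
    # used above (the actual files may carry extra fields):
    #   test_q.json: [{'question': '...', 'video_name': '...'}, ...]
    #   test_a.json: [{'answer': '...'}, ...]
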
    def __init__(self, *args, **kwargs):
        # test_ratio for videoqa applies to each sub dataset separately, so it
        # is held back from the parent class and applied per dataset below.
        test_ratio = kwargs.pop('test_ratio', None)
        kwargs['test_ratio'] = None
        test_datasets = kwargs.pop('test_datasets', None)
        super().__init__(*args, **kwargs)
        self.test_ratio = 1 if test_ratio is None else test_ratio

        if test_datasets is not None:
            data_list_info = {k: v for k, v in self.data_list_info.items() if k in test_datasets}
        else:
            data_list_info = self.data_list_info
        data_dir = self.data_dir

        self.data_list = []
        for k, v in data_list_info.items():
            with open(os.path.join(data_dir, v['q_json_relpath']), 'r') as f:
                questions_json_data = json.load(f)
            with open(os.path.join(data_dir, v['a_json_relpath']), 'r') as f:
                answers_json_data = json.load(f)

            # Subsample each dataset with a fixed seed so runs are comparable.
            indices = list(range(len(questions_json_data)))
            np.random.RandomState(42).shuffle(indices)
            num_samples = int(len(indices) * self.test_ratio) if 0 < self.test_ratio <= 1 else int(self.test_ratio)
            indices = indices[:num_samples]

            for i in indices:
                question_data = questions_json_data[i]
                answer_data = answers_json_data[i]
                data = {}
                # ActivityNet video names in the question json lack the 'v_'
                # prefix that the files on disk carry, so add it here.
                if k == "ActivityNet":
                    question_data['video_name'] = 'v_' + question_data['video_name']
                data.update(**question_data)
                data.update(**answer_data)
                self.data_list.append({
                    'task_type': k,
                    'data': data,
                    **v,  # carry over the per-dataset info
                })
        print(len(self.data_list))
    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, idx):
        decord_method = self.decord_method[self.data_list[idx]['data_type']]
        bound = None
        if self.data_list[idx]['bound']:
            bound = (
                self.data_list[idx]['data']['start'],
                self.data_list[idx]['data']['end'],
            )
        video_name_key = self.data_list[idx]['name_key']
        video_name = self.data_list[idx]['data'][video_name_key]
        video_postfixes = self.data_list[idx]['postfix']

        # A clip may be stored under any of the allowed extensions; use the
        # first one that exists on disk.
        video_paths = []
        for p in video_postfixes:
            video_path = os.path.join(self.data_list[idx]['prefix'], video_name + '.' + p)
            if os.path.exists(video_path):
                video_paths.append(video_path)
        assert len(video_paths) > 0, f'no video named {video_name}'
        video_path = video_paths[0]
        images_group = decord_method(video_path, bound)

        question_key = self.data_list[idx]['question_key']
        answer_key = self.data_list[idx]['answer_key']
        sample = OpenendQASample(
            question=self.data_list[idx]['data'][question_key],
            answer=self.data_list[idx]['data'][answer_key],
        )
        question, answer = self.qa_template(sample)

        return {
            'video_pils': images_group,  # some might use the original pils and do their own transforms
            'question': question,
            'video_path': video_path,
            'answer': answer,
            'task_type': self.data_list[idx]['task_type'],
        }
    def qa_template(self, data: OpenendQASample):
        question = data.question
        answer = data.answer
        # So far this is a pass-through; prompting could be added here.
        return question, answer
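

if __name__ == "__main__":
    # Minimal driver sketch, not part of the original pipeline: it assumes an
    # inference script has already written 'inference_results.json' under
    # SAVE_PATH (a placeholder path), and that load_json returns None when the
    # file is absent.
    SAVE_PATH = "test_results/videoqabench"  # hypothetical output directory
    result_list = load_results(SAVE_PATH)
    if result_list is not None:
        save_results(result_list, SAVE_PATH, model="gpt-3.5-turbo-0125")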