import openai
import os
import argparse
import json
import jsonlines
import ast
from multiprocessing.pool import Pool


def read_jsonl(file):
    """Read a JSON Lines file and return its records as a list of dicts."""
    results = []
    with open(file, encoding='utf-8') as f:
        for item in jsonlines.Reader(f):
            results.append(item)
    return results

def parse_args():
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    parser.add_argument("--pred_path", required=True, help="The path to the file containing the predictions.")
    parser.add_argument("--output_dir", required=True, help="The directory in which to save the per-sample annotation JSON files.")
    parser.add_argument("--output_json", required=True, help="The path to save the final combined annotation JSON file.")
    parser.add_argument("--api_key", required=True, help="OpenAI API key.")
    parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.")
    args = parser.parse_args()
    return args

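# The prediction file passed via --pred_path may be plain JSON or JSON Lines.
# Each record is expected to provide at least 'prompt' (the question), 'answer'
# (the ground-truth answer), and 'text' (the model's prediction); for example
# (illustrative values only):
#   {"prompt": "What is the person doing?", "answer": "Cooking", "text": "The person is cooking."}
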
def annotate(prediction_set, caption_files, output_dir):
    """
    Evaluates question-answer pairs using GPT-3.5-Turbo and writes a
    correctness verdict and score for each pair to its own JSON file.
    """
    for file in caption_files:
        key = file[:-5]  # strip the ".json" extension
        qa_set = prediction_set[key]
        question = qa_set['q']
        answer = qa_set['a']
        pred = qa_set['pred']
        try:
            # Ask the model to judge the predicted answer against the ground truth.
            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {
                        "role": "system",
                        "content":
                            "You are an intelligent chatbot designed for evaluating the correctness of generative outputs for question-answer pairs. "
                            "Your task is to compare the predicted answer with the correct answer and determine if they match meaningfully. Here's how you can accomplish the task:"
                            "------"
                            "##INSTRUCTIONS: "
                            "- Focus on the meaningful match between the predicted answer and the correct answer.\n"
                            "- Consider synonyms or paraphrases as valid matches.\n"
                            "- Evaluate the correctness of the prediction compared to the answer."
                    },
                    {
                        "role": "user",
                        "content":
                            "Please evaluate the following video-based question-answer pair:\n\n"
                            f"Question: {question}\n"
                            f"Correct Answer: {answer}\n"
                            f"Predicted Answer: {pred}\n\n"
                            "Provide your evaluation only as a yes/no and score where the score is an integer value between 0 and 5, with 5 indicating the highest meaningful match. "
                            "Please generate the response in the form of a Python dictionary string with keys 'pred' and 'score', where value of 'pred' is a string of 'yes' or 'no' and value of 'score' is in INTEGER, not STRING. "
                            "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                            "For example, your response should look like this: {'pred': 'yes', 'score': 4}."
                    }
                ]
            )

            # The model is instructed to return a Python dict literal, e.g. {'pred': 'yes', 'score': 4}.
            response_message = completion["choices"][0]["message"]["content"]
            response_dict = ast.literal_eval(response_message)
            result_qa_pair = [response_dict, qa_set]

            # Save the annotation for this question-answer pair.
            with open(f"{output_dir}/{key}.json", "w") as f:
                json.dump(result_qa_pair, f)

        except Exception as e:
            print(f"Error processing file '{key}': {e}")

def main():
    """
    Main function to control the flow of the program.
    """
    # Parse arguments.
    args = parse_args()

    # Read the prediction file; it may be plain JSON or JSON Lines.
    file = args.pred_path
    try:
        with open(file, encoding='utf-8') as f:
            pred_contents = json.load(f)
    except json.JSONDecodeError:
        pred_contents = read_jsonl(file)

    # Every sample is given the same placeholder video name (1); unique keys
    # such as "1_0", "1_1", ... are produced by the running counter below.
    video_id_counts = {}
    new_pred_contents = []

    for sample in pred_contents:
        sample['video_name'] = 1
        video_id = sample['video_name']
        if video_id in video_id_counts:
            video_id_counts[video_id] += 1
        else:
            video_id_counts[video_id] = 0

        # Create a new sample with the modified key.
        new_sample = sample
        new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
        new_pred_contents.append(new_sample)

    # Generate the expected per-sample output file names from the unique keys.
    id_list = [x['video_name'] for x in new_pred_contents]
    caption_files = [f"{id}.json" for id in id_list]

    output_dir = args.output_dir
    # Create the output directory if it does not exist.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Map each unique key to its question, ground-truth answer, and prediction.
    prediction_set = {}
    for sample in new_pred_contents:
        id = sample['video_name']
        question = sample['prompt']
        answer = sample['answer']
        pred = sample['text']
        qa_set = {"q": question, "a": answer, "pred": pred}
        prediction_set[id] = qa_set
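    # Illustrative shape of prediction_set after this loop (values are made up):
    #   {"1_0": {"q": "What is the person doing?", "a": "Cooking", "pred": "The person is cooking."}, ...}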

    # Set the OpenAI API key.
    openai.api_key = args.api_key
    num_tasks = args.num_tasks

    # Keep annotating until every expected output file exists.
    while True:
        try:
            # Files that have already been completed.
            completed_files = os.listdir(output_dir)
            print(f"completed_files: {len(completed_files)}")

            # Files that still need to be processed.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            # Stop once all files have been processed.
            if len(incomplete_files) == 0:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split the remaining files into roughly equal parts and annotate them in parallel.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir) for part in all_parts]

            with Pool() as pool:
                pool.starmap(annotate, task_args)

        except Exception as e:
            print(f"Error: {e}")

    # Combine all per-sample annotation files into a single dictionary.
    combined_contents = {}
    json_path = args.output_json

    for file_name in os.listdir(output_dir):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                content = json.load(json_file)
                combined_contents[file_name[:-5]] = content

    # Write the combined annotations to the final output file.
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Compute the average score and the yes/no accuracy.
    score_sum = 0
    count = 0
    yes_count = 0
    no_count = 0
    for key, result in combined_contents.items():
        # result[0] is the model's verdict dict, e.g. {'pred': 'yes', 'score': 4}.
        count += 1
        score = int(result[0]['score'])
        score_sum += score

        pred = result[0]['pred']
        if "yes" in pred.lower():
            yes_count += 1
        elif "no" in pred.lower():
            no_count += 1

    average_score = score_sum / count
    accuracy = yes_count / (yes_count + no_count)
    print("Yes count:", yes_count)
    print("No count:", no_count)
    print("Accuracy:", accuracy)
    print("Average score:", average_score)


if __name__ == "__main__":
    main()
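
# Example invocation (script name, paths, and task count are illustrative, not
# taken from the source):
#   python evaluate_qa.py --pred_path preds.jsonl --output_dir ./gpt_eval \
#       --output_json ./gpt_eval/combined.json --api_key $OPENAI_API_KEY --num_tasks 8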