| | import os |
| | import argparse |
| | import json |
| | from typing import List, Dict, Optional, Union |
| | import re |
| |
|
# Top-level video domain labels. Each result item carries one of these in its
# "video_category" field; they key the per-category accuracy tables.
CATEGORIES = [
    "Knowledge",
    "Film & Television",
    "Sports Competition",
    "Artistic Performance",
    "Life Record",
    "Multilingual"
]

# Finer-grained domain labels matched against each item's "video_subcategory"
# field. NOTE(review): "Multilingual" appears both here and in CATEGORIES —
# presumably intentional in the benchmark taxonomy; confirm against the dataset.
SUB_CATEGORIES = [
    "Humanity & History",
    "Literature & Art",
    "Biology & Medicine",
    "Finance & Commerce",
    "Astronomy",
    "Geography",
    "Law",
    "Life Tip",
    "Technology",
    "Animation",
    "Movie & TV Show",
    "Documentary",
    "News Report",
    "Esports",
    "Basketball",
    "Football",
    "Athletics",
    "Other Sports",
    "Stage Play",
    "Magic Show",
    "Variety Show",
    "Acrobatics",
    "Handicraft",
    "Food",
    "Fashion",
    "Daily Life",
    "Travel",
    "Pet & Animal",
    "Exercise",
    "Multilingual"
]

# Question task-type labels matched against each question's "task_type" field;
# overall accuracy is aggregated over these.
TASK_CATEGORIES = [
    "Temporal Perception",
    "Spatial Perception",
    "Attribute Perception",
    "Action Recognition",
    "Object Recognition",
    "OCR Problems",
    "Counting Problem",
    "Temporal Reasoning",
    "Spatial Reasoning",
    "Action Reasoning",
    "Object Reasoning",
    "Information Synopsis",
]
| |
|
| |
|
def extract_characters_regex(s):
    """Extract the multiple-choice option letter (A-D) from a model response.

    Strips common natural-language answer prefixes, then returns the first
    capital A/B/C/D found in the remaining text.

    Args:
        s: Raw model response string.

    Returns:
        "A"/"B"/"C"/"D", or "" when no option letter can be extracted (e.g.
        a long free-form refusal with no option letter).
    """
    s = s.strip()
    # Bug fix: the original list was missing commas after "The best option is"
    # and "Best answer:", so Python implicitly concatenated adjacent string
    # literals and four of these prefixes were never stripped — e.g.
    # "Best option: C" would match the 'B' of "Best" and return "B".
    answer_prefixes = [
        "The best answer is",
        "The correct answer is",
        "The answer is",
        "The answer",
        "The best option is",
        "The correct option is",
        "Best answer:",
        "Best option:",
    ]
    for answer_prefix in answer_prefixes:
        s = s.replace(answer_prefix, "")

    # A long sentence with no option letter counts as unanswered.
    if len(s.split()) > 10 and not re.search("[ABCD]", s):
        return ""
    matches = re.search(r'[ABCD]', s)
    if matches is None:
        return ""
    return matches[0]
| |
|
| |
|
def _pct(correct: int, answered: int) -> float:
    """Accuracy as a percentage; 0 when nothing was answered (avoids /0)."""
    return 100 * correct / answered if answered > 0 else 0


def _print_section(title: str, stats: Dict[str, Dict[str, int]]) -> None:
    """Print a dashed section header, then one 'name:  xx.x%' line per entry.

    Each value in *stats* is a {"correct": int, "answered": int} tally; the
    ' .1f' format spec (sign-space flag) reproduces the original output.
    """
    print("-------------------------------------")
    print(title)
    print("-------------------------------------")
    for name, tally in stats.items():
        print(f"{name}: {_pct(tally['correct'], tally['answered']) : .1f}%")


def eval_your_results(
    your_results_path: str,
    video_types: Optional[Union[List[str], str]] = None,
    skip_missing: Optional[bool] = False,
    return_categories_accuracy: Optional[bool] = True,
    return_sub_categories_accuracy: Optional[bool] = False,
    return_task_types_accuracy: Optional[bool] = False,
    gt_answer_key: Optional[str] = "answer",
    your_answer_key: Optional[str] = "response"
):
    """
    Evaluate your results against the ground truth.

    Args:
    - your_results_path (str): Path to your results file (a JSON list of items).
    - video_types (Optional[List[str], str]): List of video duration types to
      evaluate, or a comma-separated string. If None, every duration category
      present in the results file is evaluated (the original crashed on None).
    - skip_missing (Optional[bool]): If True, items flagged "missing" are
      skipped. If False, an error is raised when files are missing.
    - return_categories_accuracy (Optional[bool]): If True, print accuracy per
      video category.
    - return_sub_categories_accuracy (Optional[bool]): If True, print accuracy
      per video sub category.
    - return_task_types_accuracy (Optional[bool]): If True, print accuracy per
      task category.
    - gt_answer_key (Optional[str]): Key of the ground-truth answer in each
      question dict.
    - your_answer_key (Optional[str]): Key of your model's answer in each
      question dict.
    """
    with open(your_results_path, 'r') as f:
        your_results = json.load(f)

    if isinstance(video_types, str):
        video_types = video_types.split(",")
    elif video_types is None:
        # Robustness fix: iterating None raised a TypeError in the original.
        # Fall back to every duration split present in the results file.
        video_types = sorted({item["duration_category"] for item in your_results})

    # Tallies keyed [duration split][label] -> {"correct", "answered"}.
    q_type_dict: Dict[str, Dict[str, Dict[str, int]]] = {}
    v_type_dict: Dict[str, Dict[str, Dict[str, int]]] = {}
    v_sub_type_dict: Dict[str, Dict[str, Dict[str, int]]] = {}

    for video_type in video_types:
        your_results_video_type = [
            item for item in your_results
            if item["duration_category"] == video_type
        ]

        q_type_dict[video_type] = {q: {"correct": 0, "answered": 0} for q in TASK_CATEGORIES}
        v_type_dict[video_type] = {v: {"correct": 0, "answered": 0} for v in CATEGORIES}
        v_sub_type_dict[video_type] = {v: {"correct": 0, "answered": 0} for v in SUB_CATEGORIES}

        if not skip_missing:
            # The benchmark ships 300 videos per duration split — presumably
            # Video-MME; a shortfall means files are missing from the results.
            assert len(your_results_video_type) == 300, f"Number of files in {video_type} is not 300. Check if there are missing files."

        for item in your_results_video_type:
            # Robustness fix: tolerate items without a "missing" key
            # (the original raised KeyError when skip_missing was True).
            if skip_missing and item.get("missing", False):
                continue

            video_category = item["video_category"]
            video_sub_category = item["video_subcategory"]

            for question in item["questions"]:
                q_type = question["task_type"]
                gt_answer = question[gt_answer_key]
                response = question[your_answer_key]

                extraction = extract_characters_regex(response)
                if extraction == "":
                    # Unparseable responses count as neither answered
                    # nor correct (matches the original tallying).
                    continue
                is_correct = extraction == gt_answer
                q_type_dict[video_type][q_type]["answered"] += 1
                q_type_dict[video_type][q_type]["correct"] += is_correct
                v_type_dict[video_type][video_category]["answered"] += 1
                v_type_dict[video_type][video_category]["correct"] += is_correct
                v_sub_type_dict[video_type][video_sub_category]["answered"] += 1
                v_sub_type_dict[video_type][video_sub_category]["correct"] += is_correct

    # Per-duration-split reports.
    for video_type in video_types:
        print("=====================================")
        print(f"Evaluation on video Type: {video_type}")
        print("=====================================")
        if return_categories_accuracy:
            _print_section("Video Categories", v_type_dict[video_type])
        if return_sub_categories_accuracy:
            _print_section("Video Sub Categories", v_sub_type_dict[video_type])
        if return_task_types_accuracy:
            _print_section("Task Categories", q_type_dict[video_type])

        print("-------------------------------------")
        print("Overall Performance")
        print("-------------------------------------")
        total_correct = sum(q_type_dict[video_type][q]["correct"] for q in TASK_CATEGORIES)
        total_answered = sum(q_type_dict[video_type][q]["answered"] for q in TASK_CATEGORIES)
        print(f"Overall: {_pct(total_correct, total_answered) : .1f}%")
        print("\n")

    # Aggregate report across all requested duration splits.
    print("=====================================")
    print("Evaluation on the entire dataset")
    print("=====================================")

    if return_categories_accuracy:
        agg = {
            v: {
                "correct": sum(v_type_dict[t][v]["correct"] for t in video_types),
                "answered": sum(v_type_dict[t][v]["answered"] for t in video_types),
            }
            for v in CATEGORIES
        }
        _print_section("Video Categories", agg)

    if return_sub_categories_accuracy:
        agg = {
            v: {
                "correct": sum(v_sub_type_dict[t][v]["correct"] for t in video_types),
                "answered": sum(v_sub_type_dict[t][v]["answered"] for t in video_types),
            }
            for v in SUB_CATEGORIES
        }
        _print_section("Video Sub Categories", agg)

    if return_task_types_accuracy:
        agg = {
            q: {
                "correct": sum(q_type_dict[t][q]["correct"] for t in video_types),
                "answered": sum(q_type_dict[t][q]["answered"] for t in video_types),
            }
            for q in TASK_CATEGORIES
        }
        _print_section("Task Categories", agg)

    print("-------------------------------------")
    print("Overall Performance")
    print("-------------------------------------")
    total_correct = sum(
        q_type_dict[t][q]["correct"] for t in video_types for q in TASK_CATEGORIES
    )
    total_answered = sum(
        q_type_dict[t][q]["answered"] for t in video_types for q in TASK_CATEGORIES
    )
    print(f"Overall: {_pct(total_correct, total_answered) : .1f}%")
| |
|
| |
|
| |
|
if __name__ == "__main__":
    # Command-line entry point: evaluate a results file for one or more
    # comma-separated video duration types.
    cli = argparse.ArgumentParser()
    cli.add_argument("--results_file", type=str, required=True)
    cli.add_argument("--video_duration_type", type=str, required=True)

    # The three report-toggle flags share a shape, so register them in a loop.
    report_flags = (
        "return_categories_accuracy",
        "return_sub_categories_accuracy",
        "return_task_types_accuracy",
    )
    for flag in report_flags:
        cli.add_argument(f"--{flag}", action="store_true")

    opts = cli.parse_args()

    eval_your_results(
        opts.results_file,
        video_types=opts.video_duration_type,
        **{flag: getattr(opts, flag) for flag in report_flags},
    )
| |
|
| |
|
| |
|
| |
|