MODEL_INFO = ["Model"]
AVGACC = "Overall Acc."
TASK_INFO = [AVGACC, "Dynamic Perception", "State Transitions Perception", "Camera Movement Perception", "Explanatory Reasoning", "Counterfactual Reasoning", "Predictive Reasoning", "Comparison Reasoning", "Reasoning with External Knowledge", "Description"]
DATA_TITLE_TYPE = ["markdown"] + ["number"] * 10
CSV_DIR = "./file/result.csv"
COLUMN_NAMES = MODEL_INFO + TASK_INFO
GT_PATH = "./file/AUTO-EVAL-VIDEO.json"
JSON_DATASET_PATH = "./file/userdata.json"
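
# Note (inferred from how prediction_analyse reads the file below):
# AUTO-EVAL-VIDEO.json is JSON Lines, one instance per line, each object
# carrying at least "ID", "Rule" (the judging rule), and "Dimension"
# (the skill dimension), e.g.
#   {"ID": 1, "Dimension": "Dynamic Perception", "Rule": "...", ...}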

LEADERBOARD_INTRODUCTION = """# AutoEval-Video Leaderboard

Welcome to the leaderboard of AutoEval-Video!

AutoEval-Video comprises 327 complex open-ended video-question instances spanning nine skill dimensions that address video-specific perception, comprehension, and generation skills. Please refer to our [paper]() for more details.
"""

SUBMIT_INTRODUCTION = """# Submit Introduction

For example, if you want to upload GPT-4V's result to the leaderboard, you need to:

1. Fill in 'GPT-4V' in 'Model Name' if this is your first time submitting your result. If you wish to update your model's results, append a version suffix to the model name, e.g. 'GPT-4V_v2'.
2. Upload results.json.
3. Click the 'Evaluate' button.
4. Click 'Refresh' to obtain the updated leaderboard.
5. The evaluation result of your model will be shown in the "Overall Acc." box. For results on each evaluation dimension, please refer back to the leaderboard.
"""

TABLE_INTRODUCTION = """The table below shows the performance of various models on the evaluation dimensions of AutoEval-Video.

We use accuracy (%) as the primary evaluation metric for each dimension.
"""

CITATION_BUTTON_LABEL = "If you find AutoEval-Video useful for your research and applications, please copy the following snippet to cite these results: "
CITATION_BUTTON_TEXT = """"""

style = """<style>
.dataframe-container {
    overflow-x: auto;
}
</style>"""

import gradio as gr
import pandas as pd
import json
import os
from huggingface_hub import CommitScheduler, login
from tool import *

# HF_TOKEN must be a Hugging Face token with write access so that the
# CommitScheduler below can push backups to the dataset repo.
login(token=os.environ.get("HF_TOKEN"), write_permission=True)

def get_result_df():
    """Load the leaderboard CSV and sort it by overall accuracy, descending."""
    df = pd.read_csv(CSV_DIR)[COLUMN_NAMES]
    df = df.sort_values(by=AVGACC, ascending=False)
    return df

def prediction_analyse(prediction_content, questiontype_list):
    """Score a JSON-Lines submission against the ground truth.

    Returns {dimension: {"correct": int, "total": int}} for every dimension
    in questiontype_list.
    """
    predictions = prediction_content.split("\n")

    # Load the ground-truth file (one JSON object per line).
    ground_truth_data = []
    with open(GT_PATH, "r") as f:
        for line in f:
            data = json.loads(line.strip())
            ground_truth_data.append(data)
    id2item = {str(item["ID"]): item for item in ground_truth_data}

    results = {i: {"correct": 0, "total": 0} for i in questiontype_list}
    for prediction in predictions:
        prediction = prediction.strip()
        if not prediction:
            continue
        try:
            prediction = json.loads(prediction)
        except json.JSONDecodeError:
            print(f"Warning: Skipping invalid JSON data in line: {prediction}")
            continue
        question_id = str(prediction["ID"])
        item_gt = id2item[question_id]
        rule = item_gt["Rule"]
        question_type = item_gt["Dimension"]
        pre_output = prediction["prediction"]
        # Use the precomputed judge bit if the submission provides one;
        # otherwise call the rule-based judge from tool.py.
        if "judge" in prediction:
            judge_result_bit = prediction["judge"]
        else:
            _, judge_result_bit = alternate_judge(rule, pre_output, os.environ.get("yuan_api"))
        assert judge_result_bit in ["0", "1"], "Invalid judge result bit!"
        if judge_result_bit == "1":
            results[question_type]["correct"] += 1
        results[question_type]["total"] += 1
    return results

scheduler = CommitScheduler(
    repo_id="AUTOEVAL-Video-Backup",
    private=True,
    repo_type="dataset",
    folder_path="./file",
    path_in_repo="data",
    every=1,
)
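# CommitScheduler commits the contents of ./file (the leaderboard CSV and the
# user-submission log) to the backup dataset repo every minute in a background
# thread; file writes below are wrapped in scheduler.lock so a commit never
# captures a half-written file.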

def save_json(modelname, user_dict_list):
    """Append one JSON line mapping the model name to its raw submission."""
    with open(JSON_DATASET_PATH, "a") as f:
        json.dump({modelname: user_dict_list}, f)
        f.write("\n")

def add_new_eval(
    input_file,
    model_name_textbox: str,
):
    if len(model_name_textbox) == 0:
        return "Error! Empty model name!", get_result_df()
    if input_file is None:
        return "Error! Empty file!", get_result_df()

    # Reject duplicate model names; resubmissions must use a version suffix.
    csv_data = pd.read_csv(CSV_DIR, dtype={"Model": str})
    model_name_list = list(csv_data["Model"])
    if model_name_textbox in model_name_list:
        return "In the leaderboard, there already exists a model with the same name, and duplicate submissions of it are not allowed.", get_result_df()

    questiontype = ["Dynamic Perception", "State Transitions Perception", "Camera Movement Perception", "Explanatory Reasoning", "Counterfactual Reasoning", "Predictive Reasoning", "Comparison Reasoning", "Reasoning with External Knowledge", "Description"]
    id2questiontype = dict(zip(range(1, 10), questiontype))

    content = input_file.decode("utf-8").strip()
    userdata = content.split("\n")
    if len(userdata) != count_lines(GT_PATH):
        return f"Error! The number of lines in the submitted file ({len(userdata)}) does not match the number of lines in the AUTO-EVAL-VIDEO.json file ({count_lines(GT_PATH)}).", get_result_df()

    # Per-dimension and overall accuracy, in percent with one decimal place.
    prediction = prediction_analyse(content, questiontype)
    each_task_accuracy = {i: round(prediction[i]["correct"] / prediction[i]["total"] * 100, 1) for i in questiontype}
    total_correct_video = sum(prediction[i]["correct"] for i in questiontype)
    total_video = sum(prediction[i]["total"] for i in questiontype)
    average_accuracy_video = round(total_correct_video / total_video * 100, 1)

    # Append the new row to the leaderboard CSV.
    new_row = [model_name_textbox, average_accuracy_video] + [each_task_accuracy[id2questiontype[i]] for i in range(1, 10)]
    csv_data.loc[csv_data.shape[0]] = new_row

    # Write under the scheduler lock so the background commit never sees a
    # half-written file.
    with scheduler.lock:
        csv_data.to_csv(CSV_DIR, index=False)
        save_json(model_name_textbox, userdata)
    return str(average_accuracy_video) + "%", get_result_df()

block = gr.Blocks(css=style)
with block:
    gr.Markdown(LEADERBOARD_INTRODUCTION)
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 AutoEval-Video Benchmark", elem_id="AutoEval-Video-tab-table", id=0):
            with gr.Row():
                with gr.Accordion("Citation", open=False):
                    citation_button = gr.Textbox(
                        value=CITATION_BUTTON_TEXT,
                        label=CITATION_BUTTON_LABEL,
                        interactive=False,
                        elem_id="citation-button",
                        show_copy_button=True,
                    )
            gr.Markdown(TABLE_INTRODUCTION)
            data_component = gr.components.Dataframe(
                value=get_result_df,
                headers=COLUMN_NAMES,
                type="pandas",
                datatype=DATA_TITLE_TYPE,
                interactive=False,
                visible=True,
            )
            with gr.Row():
                data_run = gr.Button("Refresh")
                data_run.click(get_result_df, outputs=data_component)
        with gr.TabItem("✨ Submit your model result here!", elem_id="AutoEval-Video-tab-table", id=1):
            with gr.Row():
                gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")
            with gr.Column():
                model_name_textbox = gr.Textbox(label="Model name")
            with gr.Column():
                input_file = gr.File(label="Click to Upload a json File", file_count="single", type="binary")
                submit_button = gr.Button("Evaluate")
                overall_acc = gr.Textbox(label="Overall Acc.")
                submit_button.click(
                    add_new_eval,
                    inputs=[
                        input_file,
                        model_name_textbox,
                    ],
                    outputs=[overall_acc, data_component],
                )

block.launch()