|
|
import ast |
|
|
import argparse |
|
|
import glob |
|
|
import pickle |
|
|
|
|
|
import gradio as gr |
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
# Custom stylesheet handed to ``gr.Blocks(css=...)`` in ``build_demo``.
# Selectors starting with ``#`` match the ``elem_id`` given to Gradio components;
# in this file only ``leaderboard_markdown`` is assigned that way.
# NOTE(review): ``#notice_markdown`` and ``#leaderboard_dataframe`` are not used as
# an ``elem_id`` anywhere in the visible code (the tables use
# ``arena_leaderboard_dataframe``) -- confirm whether these rules are still needed.
block_css = """
#notice_markdown {
    font-size: 104%
}
#notice_markdown th {
    display: none;
}
#notice_markdown td {
    padding-top: 6px;
    padding-bottom: 6px;
}
#leaderboard_markdown {
    font-size: 104%
}
#leaderboard_markdown td {
    padding-top: 6px;
    padding-bottom: 6px;
}
#leaderboard_dataframe td {
    line-height: 0.1em;
}
footer {
    display:none !important
}
.image-container {
    display: flex;
    align-items: center;
    padding: 1px;
}
.image-container img {
    margin: 0 30px;
    height: 20px;
    max-height: 100%;
    width: auto;
    max-width: 20%;
}
"""
|
|
def model_hyperlink(model_name, link):
    """Return an HTML anchor that displays ``model_name`` and opens ``link`` in a new tab.

    Both arguments are interpolated into the markup verbatim; no HTML escaping
    is applied.
    """
    anchor_style = (
        "color: var(--link-text-color); "
        "text-decoration: underline;"
        "text-decoration-style: dotted;"
    )
    opening_tag = f'<a target="_blank" href="{link}" style="{anchor_style}">'
    return f"{opening_tag}{model_name}</a>"
|
|
def load_leaderboard_table_csv(filename, add_hyperlink=True):
    """Parse a comma-separated leaderboard file into a list of row dictionaries.

    The first non-blank line is the header. Every following non-blank line
    becomes one dict keyed by the header names. Cells are whitespace-stripped;
    the "Model", "Link", "Language Model" and "Open Source" columns stay as
    strings and every other column is converted with ``int``.

    Fields are split on every comma, so quoted fields that contain commas are
    not supported.

    Args:
        filename: Path of the CSV file to read.
        add_hyperlink: When true, replace each row's "Model" value with the
            HTML link built by ``model_hyperlink`` from "Model" and "Link".

    Returns:
        A list with one dict per data row; empty when the file has no data rows.

    Raises:
        ValueError: If a non-text cell cannot be parsed as an integer.
        KeyError: If ``add_hyperlink`` is true and a row lacks "Model" or "Link".
    """
    text_columns = {"Model", "Link", "Language Model", "Open Source"}

    # The context manager closes the handle; blank lines (typically a trailing
    # newline at end of file) are dropped so they cannot produce junk rows.
    with open(filename, encoding="utf-8") as csv_file:
        lines = [line for line in csv_file if line.strip()]
    if not lines:
        return []

    heads = [name.strip() for name in lines[0].split(",")]
    rows = []
    for line in lines[1:]:
        cells = [cell.strip() for cell in line.split(",")]
        item = {
            head: (cell if head in text_columns else int(cell))
            for head, cell in zip(heads, cells)
        }
        if add_hyperlink:
            item["Model"] = model_hyperlink(item["Model"], item["Link"])
        rows.append(item)
    return rows
|
|
|
|
|
def get_arena_table(model_table_df):
    """Build the ranked OCRBench table rows from a model score DataFrame.

    Rows are ordered by "Final Score", highest first, and numbered from 1.

    Args:
        model_table_df: DataFrame containing the columns "Model",
            "Language Model", "Open Source", "Text Recognition",
            "Scene Text-Centric VQA", "Doc-Oriented VQA", "KIE", "HMER" and
            "Final Score". It is not modified.

    Returns:
        A list of rows, each ``[rank, Model, Language Model, Open Source,
        Text Recognition, Scene Text-Centric VQA, Doc-Oriented VQA, KIE, HMER,
        Final Score]``.

    Raises:
        KeyError: If any of the required columns is missing.
    """
    columns = [
        "Model",
        "Language Model",
        "Open Source",
        "Text Recognition",
        "Scene Text-Centric VQA",
        "Doc-Oriented VQA",
        "KIE",
        "HMER",
        "Final Score",
    ]
    # A stable sort keeps tied scores in their original order.
    ranked = model_table_df.sort_values(
        by=["Final Score"], ascending=False, kind="stable"
    )
    # Walk the sorted frame positionally. Using the index labels as offsets into
    # ``.values`` (as the previous version did) pairs each rank with the wrong
    # row whenever the input is not already sorted.
    return [
        [rank, *cells]
        for rank, cells in enumerate(
            ranked[columns].itertuples(index=False, name=None), start=1
        )
    ]
|
|
|
|
|
def get_recog_table(model_table_df):
    """Build the text-recognition table rows from a model score DataFrame.

    Rows keep the order of ``model_table_df`` (no sorting is applied) and are
    numbered from 1 in that order.

    Args:
        model_table_df: DataFrame containing the columns "Model",
            "Language Model", "Open Source", "Regular Text", "Irregular Text",
            "Artistic Text", "Handwriting", "Digit string",
            "Non-semantic Text" and "ALL".

    Returns:
        A list of rows, each ``[rank, Model, Language Model, Open Source,
        Regular Text, Irregular Text, Artistic Text, Handwriting, Digit string,
        Non-semantic Text, ALL]``.

    Raises:
        KeyError: If any of the required columns is missing.
    """
    columns = [
        "Model",
        "Language Model",
        "Open Source",
        "Regular Text",
        "Irregular Text",
        "Artistic Text",
        "Handwriting",
        "Digit string",
        "Non-semantic Text",
        "ALL",
    ]
    # Iterate positionally so the result does not depend on the frame having a
    # default 0..n-1 index (index labels were previously used as offsets).
    return [
        [rank, *cells]
        for rank, cells in enumerate(
            model_table_df[columns].itertuples(index=False, name=None), start=1
        )
    ]
|
|
|
|
|
def _render_leaderboard_table(intro_md, rows, headers, column_widths):
    """Render one tab body: an intro paragraph followed by a leaderboard table."""
    gr.Markdown(intro_md, elem_id="leaderboard_markdown")
    # The first four columns (rank, name, language model, open source) are
    # textual -- the name column holds an HTML link, hence "markdown" -- and
    # every remaining column is a numeric score.
    datatype = ["str", "markdown", "str", "str"] + ["number"] * (len(headers) - 4)
    gr.Dataframe(
        headers=list(headers),
        datatype=datatype,
        value=rows,
        elem_id="arena_leaderboard_dataframe",
        height=700,
        column_widths=list(column_widths),
        wrap=True,
    )


def build_leaderboard_tab(leaderboard_table_file, text_recog_file, Inaccessible_model_file, show_plot=False):
    """Create the leaderboard page components in the active Gradio context.

    When ``leaderboard_table_file`` is truthy, the three CSV files are loaded
    and rendered as a page header plus three tabs: "OCRBench",
    "Text Recognition" and "Inaccessible Model". The closing "Notice" section
    is rendered in every case.

    Args:
        leaderboard_table_file: CSV with the overall OCRBench scores. A falsy
            value skips the header and all tabs.
        text_recog_file: CSV with the per-category text recognition scores.
        Inaccessible_model_file: CSV with scores of models that are neither
            open source nor reachable through an API.
        show_plot: Accepted for interface compatibility; not used.

    Returns:
        None. The components are created for their side effect only.
    """
    if leaderboard_table_file:
        model_table_df = pd.DataFrame(load_leaderboard_table_csv(leaderboard_table_file))
        recog_table_df = pd.DataFrame(load_leaderboard_table_csv(text_recog_file))
        model_table_df_Inaccessible = pd.DataFrame(
            load_leaderboard_table_csv(Inaccessible_model_file)
        )

        md_head = """
# π OCRBench Leaderboard
| [GitHub](https://github.com/Yuliang-Liu/MultimodalOCR) | [Paper](https://arxiv.org/abs/2305.07895) |
"""
        gr.Markdown(md_head, elem_id="leaderboard_markdown")

        benchmark_intro = "OCRBench is a comprehensive evaluation benchmark designed to assess the OCR capabilities of Large Multimodal Models. It comprises five components: Text Recognition, SceneText-Centric VQA, Document-Oriented VQA, Key Information Extraction, and Handwritten Mathematical Expression Recognition. The benchmark includes 1000 question-answer pairs, and all the answers undergo manual verification and correction to ensure a more precise evaluation."
        inaccessible_intro = "The models on this list are neither open-source nor have API call interfaces available."

        # Column layout shared by the "OCRBench" and "Inaccessible Model" tabs.
        ocrbench_headers = [
            "Rank",
            "Name",
            "Language Model",
            "Open Source",
            "Text Recognition",
            "Scene Text-Centric VQA",
            "Doc-Oriented VQA",
            "KIE",
            "HMER",
            "Final Score",
        ]
        ocrbench_widths = [60, 120, 150, 100, 150, 200, 180, 80, 80, 160]

        recog_headers = [
            "Rank",
            "Name",
            "Language Model",
            "Open Source",
            "Regular Text",
            "Irregular Text",
            "Artistic Text",
            "Handwriting",
            "Digit string",
            "Non-semantic Text",
            "ALL",
        ]
        recog_widths = [60, 120, 150, 100, 100, 100, 100, 100, 100, 100, 80]

        with gr.Tabs():
            with gr.Tab("OCRBench", id=0):
                _render_leaderboard_table(
                    benchmark_intro,
                    get_arena_table(model_table_df),
                    ocrbench_headers,
                    ocrbench_widths,
                )
            with gr.Tab("Text Recognition", id=1):
                _render_leaderboard_table(
                    benchmark_intro,
                    get_recog_table(recog_table_df),
                    recog_headers,
                    recog_widths,
                )
            with gr.Tab("Inaccessible Model", id=2):
                _render_leaderboard_table(
                    inaccessible_intro,
                    get_arena_table(model_table_df_Inaccessible),
                    ocrbench_headers,
                    ocrbench_widths,
                )

    md_tail = """
# Notice
Sometimes, API calls to closed-source models may not succeed. In such cases, we will repeat the calls for unsuccessful samples until it becomes impossible to obtain a successful response. It is important to note that due to rigorous security reviews by OpenAI, GPT4V refuses to provide results for the 84 samples in OCRBench.
If you would like to include your model in the OCRBench leaderboard, please follow the evaluation instructions provided on [GitHub](https://github.com/Yuliang-Liu/MultimodalOCR), [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) or [lmms-eval](https://github.com/EvolvingLMMs-Lab/lmms-eval) and feel free to contact us via email at zhangli123@hust.edu.cn. We will update the leaderboard in time."""
    gr.Markdown(md_tail, elem_id="leaderboard_markdown")
|
|
|
|
|
def build_demo(leaderboard_table_file, recog_table_file, Inaccessible_model_file):
    """Assemble the Gradio ``Blocks`` application for the OCRBench leaderboard.

    Args:
        leaderboard_table_file: CSV with the overall OCRBench scores.
        recog_table_file: CSV with the text recognition scores.
        Inaccessible_model_file: CSV with scores of inaccessible models.

    Returns:
        The constructed ``gr.Blocks`` instance; the caller launches it.
    """
    page_theme = gr.themes.Base(text_size=gr.themes.sizes.text_lg)
    with gr.Blocks(title="OCRBench Leaderboard", theme=page_theme, css=block_css) as demo:
        # The tab builder only creates components; it returns nothing to keep.
        build_leaderboard_tab(
            leaderboard_table_file,
            recog_table_file,
            Inaccessible_model_file,
            show_plot=True,
        )
    return demo
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: parse the CSV locations, build the UI and serve it.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--share",
        action="store_true",
        help="Ask Gradio to create a public share link.",
    )
    parser.add_argument(
        "--OCRBench_file",
        type=str,
        default="./OCRBench.csv",
        help="CSV with the overall OCRBench scores.",
    )
    parser.add_argument(
        "--TextRecognition_file",
        type=str,
        default="./TextRecognition.csv",
        help="CSV with the text recognition scores.",
    )
    parser.add_argument(
        "--Inaccessible_model_file",
        type=str,
        default="./Inaccessible_model.csv",
        help="CSV with scores of inaccessible models.",
    )
    args = parser.parse_args()

    demo = build_demo(args.OCRBench_file, args.TextRecognition_file, args.Inaccessible_model_file)
    # Forward --share to Gradio; the flag used to be parsed and then ignored.
    demo.launch(share=args.share)