| from dataclasses import dataclass |
| from enum import Enum |
|
|
|
|
@dataclass
class Task:
    """Describe one scored leaderboard entry as three string fields.

    Instances are used as the member values of the ``Tasks`` enum.
    """

    # Identifier of the benchmark slice, e.g. "dimension_layout" or "domain_slides".
    benchmark: str
    # Name of the metric reported for that slice, e.g. "error_score".
    metric: str
    # Human-readable label such as "Layout"; presumably the leaderboard
    # column header -- confirm against the code that renders the table.
    col_name: str
|
|
|
|
| |
| |
class Tasks(Enum):
    """Enumerate every score reported on the leaderboard.

    Each member's value is a ``Task(benchmark, metric, col_name)``. There are
    nine members: four capability dimensions (``dim_*``) and five content
    domains (``dom_*``). All of them use the ``"error_score"`` metric.
    """

    # Capability dimensions.
    dim_layout = Task("dimension_layout", "error_score", "Layout")
    dim_attribute = Task("dimension_attribute", "error_score", "Attribute")
    dim_text = Task("dimension_text", "error_score", "Text")
    dim_knowledge = Task("dimension_knowledge", "error_score", "Knowledge")

    # Content domains.
    dom_slides = Task("domain_slides", "error_score", "Slides")
    dom_webpage = Task("domain_webpage", "error_score", "Webpage")
    dom_poster = Task("domain_poster", "error_score", "Poster")
    dom_chart = Task("domain_chart", "error_score", "Chart")
    dom_scientific_figure = Task("domain_scientific_figure", "error_score", "Scientific Figure")
|
|
|
|
# Number of few-shot examples; fixed at 0 here.
# NOTE(review): no consumer of this constant is visible in this file --
# confirm it is still read elsewhere before changing or removing it.
NUM_FEWSHOT = 0
| |
|
|
|
|
| |
# HTML heading for the leaderboard page (centered <h1> with id "space-title").
TITLE = """<h1 align="center" id="space-title">BizGenEval Leaderboard</h1>"""
|
|
| |
# Markdown introduction: what BizGenEval measures, how scores are broken down,
# the `hard(easy)` / `easy(hard)` display convention, and the project link.
INTRODUCTION_TEXT = """
BizGenEval is a benchmark for commercial visual content generation quality.
This leaderboard reports score breakdowns by:

- Capability dimensions: Layout, Attribute, Text, Knowledge
- Content domains: Slides, Webpage, Poster, Chart, Scientific Figure

All leaderboard scores are displayed as `hard(easy)` when ranking by hard, and `easy(hard)` when ranking by easy, on a
0-100 scale.

GitHub: [microsoft/BizGenEval](https://github.com/microsoft/BizGenEval)
"""
|
|
| |
# Markdown for the "how it works" / score-protocol description.
# This is a plain string on purpose: the text has no placeholders, so an
# f-string prefix would add nothing and would turn any future literal "{" or
# "}" in the prose into a formatting error.
LLM_BENCHMARKS_TEXT = """
## How it works

1. Run BizGenEval evaluation locally.
2. Summarize your scores into the 4 capability dimensions and 5 content domains.
3. Enter the hard/easy scores in the `Submit here!` tab.
4. Click `Submit Result` to add a new row to the public leaderboard.

## Score Protocol

- Scores are shown as `hard(easy)` when ranking by hard, and `easy(hard)` when ranking by easy.
- The leaderboard is sorted by the average score of the currently selected `Rank By` mode. `Hard` is the default.
- If two models have the same average on the selected mode, the other mode is used as a tiebreaker.
- Displayed columns include 4 capability dimensions and 5 content domains.
- User submissions are appended as new leaderboard rows.
"""
|
|
# Markdown submission guide. The "18 scores" figure is the 9 reported items
# (4 capability dimensions + 5 content domains) times 2 modes (hard, easy);
# update it if the set of items ever changes.
EVALUATION_QUEUE_TEXT = """
## Submission Guide

1. Enter the model name exactly as you want it to appear on the leaderboard.
2. Fill in all 18 scores on a `0-100` scale.
3. You can enter integers or decimals. Scores are saved with one decimal place.
4. If you enter more than one decimal place, the score will be rounded to one decimal place before it is saved.
5. Click `Submit Result` to add a new row to the public leaderboard.

### Required scores

- Capability dimensions: `Layout`, `Attribute`, `Text`, `Knowledge`
- Content domains: `Slides`, `Webpage`, `Poster`, `Chart`, `Scientific Figure`
- Each item needs both `hard` and `easy`
"""
|
|
# Label shown next to the copyable citation snippet.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"

# BibTeX entry for the BizGenEval paper.
# The braces are single because this is a plain (raw) string literal: nothing
# here un-escapes "{{" / "}}", so doubled braces would be emitted verbatim and
# produce malformed BibTeX ("@misc{{key," and an author list wrapped as one
# literal name).
# NOTE(review): assumes consumers use this string as-is. If any caller runs
# str.format() on it, the braces must be doubled again -- confirm.
CITATION_BUTTON_TEXT = r"""@misc{li2026bizgeneval,
title={BizGenEval: A Systematic Benchmark for Commercial Visual Content Generation},
author={Yan Li and Zezi Zeng and Ziwei Zhou and Xin Gao and Muzhao Tian and Yifan Yang and Mingxi Cheng and Qi Dai and Yuqing Yang and Lili Qiu and Zhendong Wang and Zhengyuan Yang and Xue Yang and Lijuan Wang and Ji Li and Chong Luo},
year={2026},
eprint={2603.25732},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2603.25732}
}"""
|
|