File size: 3,776 Bytes
8e5aec2
 
 
1596349
8e5aec2
 
 
 
 
 
 
 
 
 
1596349
 
 
 
 
 
 
 
 
 
 
 
 
8e5aec2
 
 
 
1596349
8e5aec2
 
 
1596349
 
 
 
 
 
60f464d
 
f0bd283
 
8e5aec2
 
 
 
 
 
1596349
1134a74
 
3c0af53
1596349
 
 
60f464d
3c0af53
 
1596349
1134a74
8e5aec2
 
 
1596349
8e5aec2
1134a74
 
3c0af53
 
 
8e5aec2
1134a74
8e5aec2
1134a74
 
 
8e5aec2
 
 
1596349
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    """Descriptor for one leaderboard column.

    Each enum member of ``Tasks`` below wraps one ``Task`` instance.
    """

    # Key identifying this task in the parsed results.
    benchmark: str
    # Metric key (kept for lm-eval harness compatibility).
    metric: str
    # Human-readable label shown as the leaderboard column header.
    col_name: str


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    """Closed set of leaderboard tasks: 4 capability dimensions + 5 content domains."""

    # benchmark key in parsed results, metric key (for lm-eval compatibility), and display label
    # -- Capability dimensions --
    dim_layout = Task("dimension_layout", "error_score", "Layout")
    dim_attribute = Task("dimension_attribute", "error_score", "Attribute")
    dim_text = Task("dimension_text", "error_score", "Text")
    dim_knowledge = Task("dimension_knowledge", "error_score", "Knowledge")
    # -- Content domains --
    dom_slides = Task("domain_slides", "error_score", "Slides")
    dom_webpage = Task("domain_webpage", "error_score", "Webpage")
    dom_poster = Task("domain_poster", "error_score", "Poster")
    dom_chart = Task("domain_chart", "error_score", "Chart")
    dom_scientific_figure = Task("domain_scientific_figure", "error_score", "Scientific Figure")


NUM_FEWSHOT = 0  # Change with your few shot
# ---------------------------------------------------


# Your leaderboard name
# Rendered as raw HTML by the Gradio frontend.
TITLE = """<h1 align="center" id="space-title">BizGenEval Leaderboard</h1>"""

# What does your leaderboard evaluate?
# Markdown shown at the top of the leaderboard page.
INTRODUCTION_TEXT = """
BizGenEval is a benchmark for commercial visual content generation quality.
This leaderboard reports score breakdowns by:

- Capability dimensions: Layout, Attribute, Text, Knowledge
- Content domains: Slides, Webpage, Poster, Chart, Scientific Figure

All leaderboard scores are displayed as `hard(easy)` when ranking by hard, and `easy(hard)` when ranking by easy, on a
0-100 scale.

GitHub: [microsoft/BizGenEval](https://github.com/microsoft/BizGenEval)
"""

# Which evaluations are you running? how can people reproduce what you have?
# NOTE: plain string literal — the previous `f"""` prefix had no placeholders
# (ruff F541) and would have broken as soon as a literal `{` or `}` was added.
LLM_BENCHMARKS_TEXT = """
## How it works

1. Run BizGenEval evaluation locally.
2. Summarize your scores into the 4 capability dimensions and 5 content domains.
3. Enter the hard/easy scores in the `Submit here!` tab.
4. Click `Submit Result` to add a new row to the public leaderboard.

## Score Protocol

- Scores are shown as `hard(easy)` when ranking by hard, and `easy(hard)` when ranking by easy.
- The leaderboard is sorted by the average score of the currently selected `Rank By` mode. `Hard` is the default.
- If two models have the same average on the selected mode, the other mode is used as a tiebreaker.
- Displayed columns include 4 capability dimensions and 5 content domains.
- User submissions are appended as new leaderboard rows.
"""

# Markdown shown in the `Submit here!` tab; describes the manual submission flow.
EVALUATION_QUEUE_TEXT = """
## Submission Guide

1. Enter the model name exactly as you want it to appear on the leaderboard.
2. Fill in all 18 scores on a `0-100` scale.
3. You can enter integers or decimals. Scores are saved with one decimal place.
4. If you enter more than one decimal place, the score will be rounded to one decimal place before it is saved.
5. Click `Submit Result` to add a new row to the public leaderboard.

### Required scores

- Capability dimensions: `Layout`, `Attribute`, `Text`, `Knowledge`
- Content domains: `Slides`, `Webpage`, `Poster`, `Chart`, `Scientific Figure`
- Each item needs both `hard` and `easy`
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
# BibTeX citation snippet. NOTE: the doubled braces (`{{ }}`) previously here
# were f-string escapes left over from a template; in this plain raw string
# they rendered literally, producing invalid BibTeX. Single braces are correct.
CITATION_BUTTON_TEXT = r"""@misc{li2026bizgeneval,
  title={BizGenEval: A Systematic Benchmark for Commercial Visual Content Generation},
  author={Yan Li and Zezi Zeng and Ziwei Zhou and Xin Gao and Muzhao Tian and Yifan Yang and Mingxi Cheng and Qi Dai and Yuqing Yang and Lili Qiu and Zhendong Wang and Zhengyuan Yang and Xue Yang and Lijuan Wang and Ji Li and Chong Luo},
  year={2026},
  eprint={2603.25732},
  archivePrefix={arXiv},
  primaryClass={cs.CV},
  url={https://arxiv.org/abs/2603.25732}
}"""