"""Static content for the Critical Apparatus OCR Leaderboard Space.

Defines the scored-task registry (``Tasks``) and the markdown copy rendered
by the UI: page title, introduction, benchmark documentation, submission
help, and the citation widget label/text.
"""

from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    """One scored benchmark column of the leaderboard.

    Attributes:
        benchmark: key of the benchmark in a result JSON file.
        metric: key of the score within that benchmark entry.
        col_name: human-readable column header shown in the leaderboard UI.
    """

    benchmark: str
    metric: str
    col_name: str


class Tasks(Enum):
    """The four scored task variants (the metric key is always ``score``)."""

    easy_levenshtein = Task("easy_levenshtein", "score", "Easy Lev. ↑")
    easy_bleu = Task("easy_bleu", "score", "Easy BLEU ↑")
    hard_levenshtein = Task("hard_levenshtein", "score", "Hard Lev. ↑")
    hard_bleu = Task("hard_bleu", "score", "Hard BLEU ↑")


# Page heading (markdown). Surrounding blank lines add vertical spacing.
TITLE = """

Critical Apparatus OCR Leaderboard

"""

# Intro shown at the top of the leaderboard page. Bullets must each start
# on their own line or markdown will not render them as a list.
INTRODUCTION_TEXT = """
This benchmark measures OCR quality on a Greek critical edition page from
Lloyd-Jones' *Sophocles*.

Systems must emit two JSON files:

- `text.json`: the main text lines keyed by line number
- `apparatus.json`: the critical apparatus keyed by line number or range

There are two task variants:

- `Easy`: input is already split into `text` and `apparatus` crops
- `Hard`: input is the full page image and the system must separate the two
  outputs itself

Each variant is scored with normalized Levenshtein similarity and BLEU.
The leaderboard average is the mean of those four scores.
"""

# Long-form documentation for the "About" tab.
LLM_BENCHMARKS_TEXT = """
## How it works

- Gold data lives in `data/lloyd-jones-soph-170/ocr/`
- Hard input lives in `data/lloyd-jones-soph-170/png/lloyd-jones-fullpage.png`
- Easy inputs live in `data/lloyd-jones-soph-170/png/lloyd-jones-text.png` and
  `data/lloyd-jones-soph-170/png/lloyd-jones-apparatus.png`

The expected output schema is a JSON object mapping line numbers or ranges to
OCR strings, matching the gold files already in the repo.

## Metrics

- `Levenshtein`: character-level similarity after flattening each JSON file
  into a deterministic text representation
- `BLEU`: token-level overlap score on the same flattened representation

## First seeded submission

The queue is pre-seeded with `ibm-granite/granite-vision-3.3-2b`. Once a real
result JSON is added for that model, it will appear in the leaderboard
automatically.
"""

# Help text shown above the submission form.
EVALUATION_QUEUE_TEXT = """
Submit the Hugging Face model repo and revision you want evaluated on this
OCR task.

The evaluator should produce four scores in its result JSON:

- `easy_levenshtein.score`
- `easy_bleu.score`
- `hard_levenshtein.score`
- `hard_bleu.score`

The queue shown below is local-first, so the Space can be previewed before
the dedicated backend datasets are configured.
"""

# Citation widget: label plus (currently empty) BibTeX snippet.
CITATION_BUTTON_LABEL = "Citation snippet"
CITATION_BUTTON_TEXT = r"""
"""