Spaces:
Runtime error
Runtime error
first commit
Browse files- README.md +42 -18
- app.py +50 -45
- data/leaderboard/requests/ibm-granite/granite-vision-3.3-2b_eval_request_False_float16_Original.json +12 -0
- data/lloyd-jones-soph-170/ocr/lloyd-jones-apparatus.json +14 -0
- data/lloyd-jones-soph-170/ocr/lloyd-jones-text.json +27 -0
- data/lloyd-jones-soph-170/png/lloyd-jones-apparatus.png +3 -0
- data/lloyd-jones-soph-170/png/lloyd-jones-fullpage.png +3 -0
- data/lloyd-jones-soph-170/png/lloyd-jones-text.png +3 -0
- src/about.py +37 -40
- src/display/utils.py +6 -11
- src/envs.py +14 -10
- src/evaluation/__init__.py +8 -0
- src/evaluation/build_result.py +97 -0
- src/evaluation/metrics.py +78 -0
- src/leaderboard/read_evals.py +11 -9
- src/populate.py +5 -1
- src/submission/check_validity.py +3 -5
- src/submission/submit.py +36 -51
README.md
CHANGED
|
@@ -13,36 +13,60 @@ tags:
|
|
| 13 |
- leaderboard
|
| 14 |
---
|
| 15 |
|
| 16 |
-
#
|
| 17 |
|
| 18 |
-
|
| 19 |
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
```json
|
| 22 |
{
|
| 23 |
"config": {
|
| 24 |
-
"model_dtype": "torch.float16",
|
| 25 |
-
"model_name": "
|
| 26 |
-
"model_sha": "
|
| 27 |
},
|
| 28 |
"results": {
|
| 29 |
-
"
|
| 30 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
},
|
| 32 |
-
"
|
| 33 |
-
"
|
| 34 |
}
|
| 35 |
}
|
| 36 |
}
|
| 37 |
```
|
| 38 |
|
| 39 |
-
|
|
|
|
|
|
|
| 40 |
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
-
|
| 47 |
-
-
|
| 48 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
- leaderboard
|
| 14 |
---
|
| 15 |
|
| 16 |
+
# OCR leaderboard
|
| 17 |
|
| 18 |
+
This Space is customized for a two-level OCR benchmark on a single critical-edition page.
|
| 19 |
|
| 20 |
+
Inputs and gold outputs live under `data/lloyd-jones-soph-170/`:
|
| 21 |
+
- `png/lloyd-jones-fullpage.png`: hard task input
|
| 22 |
+
- `png/lloyd-jones-text.png`: easy task text crop
|
| 23 |
+
- `png/lloyd-jones-apparatus.png`: easy task apparatus crop
|
| 24 |
+
- `ocr/lloyd-jones-text.json`: gold main-text output
|
| 25 |
+
- `ocr/lloyd-jones-apparatus.json`: gold apparatus output
|
| 26 |
+
|
| 27 |
+
The leaderboard expects result files in the following format:
|
| 28 |
```json
|
| 29 |
{
|
| 30 |
"config": {
|
| 31 |
+
"model_dtype": "torch.float16",
|
| 32 |
+
"model_name": "org/model",
|
| 33 |
+
"model_sha": "main"
|
| 34 |
},
|
| 35 |
"results": {
|
| 36 |
+
"easy_levenshtein": {
|
| 37 |
+
"score": 91.23
|
| 38 |
+
},
|
| 39 |
+
"easy_bleu": {
|
| 40 |
+
"score": 84.56
|
| 41 |
+
},
|
| 42 |
+
"hard_levenshtein": {
|
| 43 |
+
"score": 79.10
|
| 44 |
},
|
| 45 |
+
"hard_bleu": {
|
| 46 |
+
"score": 70.42
|
| 47 |
}
|
| 48 |
}
|
| 49 |
}
|
| 50 |
```
|
| 51 |
|
| 52 |
+
The Space is local-first:
|
| 53 |
+
- If HF backend datasets are configured via env vars, it will sync from them.
|
| 54 |
+
- Otherwise it reads seeded queue/results data from `data/leaderboard/`.
|
| 55 |
|
| 56 |
+
Useful files:
|
| 57 |
+
- `src/about.py`: task definitions and benchmark copy
|
| 58 |
+
- `src/evaluation/metrics.py`: local OCR metric helpers
|
| 59 |
+
- `src/evaluation/build_result.py`: CLI to turn predicted OCR JSON files into a leaderboard result JSON
|
| 60 |
+
- `src/leaderboard/read_evals.py`: result ingestion
|
| 61 |
+
- `src/populate.py`: leaderboard and queue dataframe assembly
|
| 62 |
|
| 63 |
+
Example:
|
| 64 |
+
```bash
|
| 65 |
+
python -m src.evaluation.build_result \
|
| 66 |
+
--model-name ibm-granite/granite-vision-3.3-2b \
|
| 67 |
+
--easy-text path/to/easy-text.json \
|
| 68 |
+
--easy-apparatus path/to/easy-apparatus.json \
|
| 69 |
+
--hard-text path/to/hard-text.json \
|
| 70 |
+
--hard-apparatus path/to/hard-apparatus.json \
|
| 71 |
+
--output data/leaderboard/results/ibm-granite/results_2026-03-28T00-00-00Z.json
|
| 72 |
+
```
|
app.py
CHANGED
|
@@ -19,47 +19,63 @@ from src.display.utils import (
|
|
| 19 |
EVAL_COLS,
|
| 20 |
EVAL_TYPES,
|
| 21 |
AutoEvalColumn,
|
| 22 |
-
ModelType,
|
| 23 |
fields,
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
)
|
| 27 |
-
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
| 28 |
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
| 29 |
from src.submission.submit import add_new_eval
|
| 30 |
|
| 31 |
|
| 32 |
def restart_space():
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
### Space initialisation
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
snapshot_download(
|
| 39 |
-
repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
| 40 |
-
)
|
| 41 |
-
except Exception:
|
| 42 |
-
restart_space()
|
| 43 |
-
try:
|
| 44 |
-
print(EVAL_RESULTS_PATH)
|
| 45 |
-
snapshot_download(
|
| 46 |
-
repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
| 47 |
-
)
|
| 48 |
-
except Exception:
|
| 49 |
-
restart_space()
|
| 50 |
|
| 51 |
|
| 52 |
-
LEADERBOARD_DF = get_leaderboard_df(
|
| 53 |
|
| 54 |
(
|
| 55 |
finished_eval_queue_df,
|
| 56 |
running_eval_queue_df,
|
| 57 |
pending_eval_queue_df,
|
| 58 |
-
) = get_evaluation_queue_df(
|
| 59 |
|
| 60 |
def init_leaderboard(dataframe):
|
| 61 |
-
if dataframe is None
|
| 62 |
-
|
| 63 |
return Leaderboard(
|
| 64 |
value=dataframe,
|
| 65 |
datatype=[c.type for c in fields(AutoEvalColumn)],
|
|
@@ -95,8 +111,15 @@ with demo:
|
|
| 95 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 96 |
|
| 97 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 98 |
-
with gr.TabItem("🏅
|
| 99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
| 102 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
|
@@ -146,15 +169,8 @@ with demo:
|
|
| 146 |
|
| 147 |
with gr.Row():
|
| 148 |
with gr.Column():
|
| 149 |
-
model_name_textbox = gr.Textbox(label="Model name")
|
| 150 |
revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
|
| 151 |
-
model_type = gr.Dropdown(
|
| 152 |
-
choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
|
| 153 |
-
label="Model type",
|
| 154 |
-
multiselect=False,
|
| 155 |
-
value=None,
|
| 156 |
-
interactive=True,
|
| 157 |
-
)
|
| 158 |
|
| 159 |
with gr.Column():
|
| 160 |
precision = gr.Dropdown(
|
|
@@ -164,14 +180,6 @@ with demo:
|
|
| 164 |
value="float16",
|
| 165 |
interactive=True,
|
| 166 |
)
|
| 167 |
-
weight_type = gr.Dropdown(
|
| 168 |
-
choices=[i.value.name for i in WeightType],
|
| 169 |
-
label="Weights type",
|
| 170 |
-
multiselect=False,
|
| 171 |
-
value="Original",
|
| 172 |
-
interactive=True,
|
| 173 |
-
)
|
| 174 |
-
base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
|
| 175 |
|
| 176 |
submit_button = gr.Button("Submit Eval")
|
| 177 |
submission_result = gr.Markdown()
|
|
@@ -179,11 +187,8 @@ with demo:
|
|
| 179 |
add_new_eval,
|
| 180 |
[
|
| 181 |
model_name_textbox,
|
| 182 |
-
base_model_name_textbox,
|
| 183 |
revision_name_textbox,
|
| 184 |
precision,
|
| 185 |
-
weight_type,
|
| 186 |
-
model_type,
|
| 187 |
],
|
| 188 |
submission_result,
|
| 189 |
)
|
|
@@ -201,4 +206,4 @@ with demo:
|
|
| 201 |
scheduler = BackgroundScheduler()
|
| 202 |
scheduler.add_job(restart_space, "interval", seconds=1800)
|
| 203 |
scheduler.start()
|
| 204 |
-
demo.queue(default_concurrency_limit=40).launch()
|
|
|
|
| 19 |
EVAL_COLS,
|
| 20 |
EVAL_TYPES,
|
| 21 |
AutoEvalColumn,
|
|
|
|
| 22 |
fields,
|
| 23 |
+
Precision,
|
| 24 |
+
)
|
| 25 |
+
from src.envs import (
|
| 26 |
+
API,
|
| 27 |
+
EVAL_REQUESTS_PATH,
|
| 28 |
+
EVAL_RESULTS_PATH,
|
| 29 |
+
LOCAL_EVAL_REQUESTS_PATH,
|
| 30 |
+
LOCAL_EVAL_RESULTS_PATH,
|
| 31 |
+
QUEUE_REPO,
|
| 32 |
+
REPO_ID,
|
| 33 |
+
RESULTS_REPO,
|
| 34 |
+
TOKEN,
|
| 35 |
+
has_remote_backend,
|
| 36 |
)
|
|
|
|
| 37 |
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
| 38 |
from src.submission.submit import add_new_eval
|
| 39 |
|
| 40 |
|
| 41 |
def restart_space():
|
| 42 |
+
if REPO_ID and TOKEN:
|
| 43 |
+
API.restart_space(repo_id=REPO_ID)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def sync_or_fallback(repo_id: str, local_dir: str, fallback_dir: str) -> str:
|
| 47 |
+
if not has_remote_backend() or not repo_id:
|
| 48 |
+
return fallback_dir
|
| 49 |
+
|
| 50 |
+
try:
|
| 51 |
+
snapshot_download(
|
| 52 |
+
repo_id=repo_id,
|
| 53 |
+
local_dir=local_dir,
|
| 54 |
+
repo_type="dataset",
|
| 55 |
+
tqdm_class=None,
|
| 56 |
+
etag_timeout=30,
|
| 57 |
+
token=TOKEN,
|
| 58 |
+
)
|
| 59 |
+
return local_dir
|
| 60 |
+
except Exception:
|
| 61 |
+
return fallback_dir
|
| 62 |
|
| 63 |
### Space initialisation
|
| 64 |
+
REQUESTS_PATH = sync_or_fallback(QUEUE_REPO, EVAL_REQUESTS_PATH, LOCAL_EVAL_REQUESTS_PATH)
|
| 65 |
+
RESULTS_PATH = sync_or_fallback(RESULTS_REPO, EVAL_RESULTS_PATH, LOCAL_EVAL_RESULTS_PATH)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
|
| 68 |
+
LEADERBOARD_DF = get_leaderboard_df(RESULTS_PATH, REQUESTS_PATH, COLS, BENCHMARK_COLS)
|
| 69 |
|
| 70 |
(
|
| 71 |
finished_eval_queue_df,
|
| 72 |
running_eval_queue_df,
|
| 73 |
pending_eval_queue_df,
|
| 74 |
+
) = get_evaluation_queue_df(REQUESTS_PATH, EVAL_COLS)
|
| 75 |
|
| 76 |
def init_leaderboard(dataframe):
|
| 77 |
+
if dataframe is None:
|
| 78 |
+
dataframe = pd.DataFrame(columns=COLS)
|
| 79 |
return Leaderboard(
|
| 80 |
value=dataframe,
|
| 81 |
datatype=[c.type for c in fields(AutoEvalColumn)],
|
|
|
|
| 111 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 112 |
|
| 113 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 114 |
+
with gr.TabItem("🏅 OCR Benchmark", elem_id="llm-benchmark-tab-table", id=0):
|
| 115 |
+
if LEADERBOARD_DF.empty:
|
| 116 |
+
gr.Markdown(
|
| 117 |
+
"No finished evaluations are available yet. The queue below is seeded with the first model submission.",
|
| 118 |
+
elem_classes="markdown-text",
|
| 119 |
+
)
|
| 120 |
+
gr.Dataframe(value=LEADERBOARD_DF, headers=COLS, interactive=False)
|
| 121 |
+
else:
|
| 122 |
+
leaderboard = init_leaderboard(LEADERBOARD_DF)
|
| 123 |
|
| 124 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
| 125 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
|
|
|
| 169 |
|
| 170 |
with gr.Row():
|
| 171 |
with gr.Column():
|
| 172 |
+
model_name_textbox = gr.Textbox(label="Model name", value="ibm-granite/granite-vision-3.3-2b")
|
| 173 |
revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
|
| 175 |
with gr.Column():
|
| 176 |
precision = gr.Dropdown(
|
|
|
|
| 180 |
value="float16",
|
| 181 |
interactive=True,
|
| 182 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
|
| 184 |
submit_button = gr.Button("Submit Eval")
|
| 185 |
submission_result = gr.Markdown()
|
|
|
|
| 187 |
add_new_eval,
|
| 188 |
[
|
| 189 |
model_name_textbox,
|
|
|
|
| 190 |
revision_name_textbox,
|
| 191 |
precision,
|
|
|
|
|
|
|
| 192 |
],
|
| 193 |
submission_result,
|
| 194 |
)
|
|
|
|
| 206 |
scheduler = BackgroundScheduler()
|
| 207 |
scheduler.add_job(restart_space, "interval", seconds=1800)
|
| 208 |
scheduler.start()
|
| 209 |
+
demo.queue(default_concurrency_limit=40).launch()
|
data/leaderboard/requests/ibm-granite/granite-vision-3.3-2b_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "ibm-granite/granite-vision-3.3-2b",
|
| 3 |
+
"revision": "main",
|
| 4 |
+
"precision": "float16",
|
| 5 |
+
"status": "PENDING",
|
| 6 |
+
"submitted_time": "2026-03-28T00:00:00Z",
|
| 7 |
+
"model_type": "pretrained",
|
| 8 |
+
"weight_type": "Original",
|
| 9 |
+
"likes": 0,
|
| 10 |
+
"params": 2.0,
|
| 11 |
+
"license": "?"
|
| 12 |
+
}
|
data/lloyd-jones-soph-170/ocr/lloyd-jones-apparatus.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1270": "αὑτοῦ a: αὐ- codd. plerique",
|
| 3 |
+
"1271": "ὄψοιντο Cat: -οιτο Lrpa",
|
| 4 |
+
"1276": "ἐπαίρων] cf. Senecae Oed. 962sq.: πείρων Nauck",
|
| 5 |
+
"1278-9": "del. West",
|
| 6 |
+
"1279": "χαλάζης αἵματος] χάλαζά θ᾽ αἱματοῦσσ᾽ Porson: alii alia (αἵματός 〈θ᾽〉 Zrpct)",
|
| 7 |
+
"1280-1": "del. Dindorf",
|
| 8 |
+
"1280": "ἐκ] ἐς (... κάρα) Pearson οὐ μόνου κακά] οὐ μόνου κάτα C. Otto: οὐχ ἑνὸς μόνου Porson: alii alia",
|
| 9 |
+
"1284": "ἄτη Rpat: ἄται Gp: ἄτε l",
|
| 10 |
+
"1286": "τινι Mudge et Elmsley, teste Hermann: τίνι codd.",
|
| 11 |
+
"1287": "κλῄθρα Lpa: κλεῖθρα rpat",
|
| 12 |
+
"1291": "μενῶν Lat: μένων rp δόμοις ἀραῖος ὡς] δόμοισιν ἔνοχος οἷς Nauck",
|
| 13 |
+
"1294": "δείξει] δόξει Xr, coni. Reiske κλῇθρα L, P s.l., a: κλεῖθρα rpat γὰρ rpat: γε lp"
|
| 14 |
+
}
|
data/lloyd-jones-soph-170/ocr/lloyd-jones-text.json
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1270": "ἄρας ἔπαισεν ἄρθρα τῶν αὑτοῦ κύκλων,",
|
| 3 |
+
"1271": "αὐδῶν τοιαῦθ᾽, ὁθούνεκ᾽ οὐκ ὄψοιντό νιν",
|
| 4 |
+
"1272": "οὔθ᾽ οἷ᾽ ἔπασχεν οὔθ᾽ ὁποῖ᾽ ἔδρα κακά,",
|
| 5 |
+
"1273": "ἀλλ᾽ ἐν σκότῳ τὸ λοιπὸν οὓς μὲν οὐκ ἔδει",
|
| 6 |
+
"1274": "ὀψοίαθ᾽, οὓς δ᾽ ἐχρῃζεν οὐ γνωσοίατο.",
|
| 7 |
+
"1275": "τοιαῦτ᾽ ἐφυμνῶν πολλάκις τε κοὐχ ἅπαξ",
|
| 8 |
+
"1276": "ἤρασσ᾽ ἐπαίρων βλέφαρα. φοίνιαι δ᾽ ὁμοῦ",
|
| 9 |
+
"1277": "γλῆναι γένει᾽ ἔτεγγον, οὐδ᾽ ἀνίεσαν.",
|
| 10 |
+
"1278": "[φόνου μυδώσας σταγόνας, ἀλλ᾽ ὁμοῦ μέλας",
|
| 11 |
+
"1279": "ὄμβρος †χαλάζης αἵματος† ἐτέγγετο.]",
|
| 12 |
+
"1280": "†τάδ᾽ ἐκ δυοῖν ἔρρωγεν οὐ μόνου κακά†",
|
| 13 |
+
"1281": "ἀλλ᾽ ἀνδρὶ καὶ γυναικὶ συμμιγῆ κακά.",
|
| 14 |
+
"1282": "ὁ πρὶν παλαιὸς δ᾽ ὄλβος ἦν πάροιθε μὲν",
|
| 15 |
+
"1283": "ὄλβος δικαίως, νῦν δὲ τῇδε θἠμέρᾳ",
|
| 16 |
+
"1284": "στεναγμός, ἄτη, θάνατος, αἰσχύνη, κακῶν",
|
| 17 |
+
"1285": "ὅσ᾽ ἐστὶ πάντων ὀνόματ᾽, οὐδὲν ἔστ᾽ ἀπόν.",
|
| 18 |
+
"1286": "νῦν δ᾽ ἔσθ᾽ ὁ τλήμων ἔν τινι σχολῇ κακοῦ;",
|
| 19 |
+
"1287": "βοᾷ διοίγειν κλῇθρα καὶ δηλοῦν τινα",
|
| 20 |
+
"1288": "τοῖς πᾶσι Καδμείοισι τὸν πατροκτόνον,",
|
| 21 |
+
"1289": "τὸν μητρός, αὐδῶν ἀνόσι᾽ οὐδὲ ῥητά μοι,",
|
| 22 |
+
"1290": "ὡς ἐκ χθονὸς ῥίψων ἑαυτόν, οὐδ᾽ ἔτι",
|
| 23 |
+
"1291": "μενῶν δόμοις ἀραῖος, ὡς ἠράσατο.",
|
| 24 |
+
"1292": "ρώμης γε μέντοι καὶ προηγητοῦ τινος",
|
| 25 |
+
"1293": "δεῖται· τὸ γὰρ νόσημα μεῖζον ἢ φέρειν.",
|
| 26 |
+
"1294": "δείξει δὲ καὶ σοί· κλῇθρα γὰρ πυλῶν τάδε"
|
| 27 |
+
}
|
data/lloyd-jones-soph-170/png/lloyd-jones-apparatus.png
ADDED
|
Git LFS Details
|
data/lloyd-jones-soph-170/png/lloyd-jones-fullpage.png
ADDED
|
Git LFS Details
|
data/lloyd-jones-soph-170/png/lloyd-jones-text.png
ADDED
|
Git LFS Details
|
src/about.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
from dataclasses import dataclass
|
| 2 |
from enum import Enum
|
| 3 |
|
|
|
|
| 4 |
@dataclass
|
| 5 |
class Task:
|
| 6 |
benchmark: str
|
|
@@ -8,65 +9,61 @@ class Task:
|
|
| 8 |
col_name: str
|
| 9 |
|
| 10 |
|
| 11 |
-
# Select your tasks here
|
| 12 |
-
# ---------------------------------------------------
|
| 13 |
class Tasks(Enum):
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
|
|
|
| 17 |
|
| 18 |
-
NUM_FEWSHOT = 0 # Change with your few shot
|
| 19 |
-
# ---------------------------------------------------
|
| 20 |
|
|
|
|
| 21 |
|
|
|
|
|
|
|
| 22 |
|
| 23 |
-
|
| 24 |
-
|
|
|
|
| 25 |
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
|
|
|
|
|
|
| 29 |
"""
|
| 30 |
|
| 31 |
-
|
| 32 |
-
LLM_BENCHMARKS_TEXT = f"""
|
| 33 |
## How it works
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
|
|
|
| 37 |
|
| 38 |
-
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
|
|
|
|
|
|
| 42 |
|
| 43 |
-
##
|
| 44 |
-
```python
|
| 45 |
-
from transformers import AutoConfig, AutoModel, AutoTokenizer
|
| 46 |
-
config = AutoConfig.from_pretrained("your model name", revision=revision)
|
| 47 |
-
model = AutoModel.from_pretrained("your model name", revision=revision)
|
| 48 |
-
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
|
| 49 |
-
```
|
| 50 |
-
If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
|
| 51 |
|
| 52 |
-
|
| 53 |
-
Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
|
| 54 |
|
| 55 |
-
|
| 56 |
-
It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
|
| 57 |
|
| 58 |
-
|
| 59 |
-
|
| 60 |
|
| 61 |
-
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
-
|
| 65 |
-
If your model is displayed in the `FAILED` category, its execution stopped.
|
| 66 |
-
Make sure you have followed the above steps first.
|
| 67 |
-
If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
|
| 68 |
"""
|
| 69 |
|
| 70 |
-
CITATION_BUTTON_LABEL = "
|
| 71 |
CITATION_BUTTON_TEXT = r"""
|
| 72 |
"""
|
|
|
|
| 1 |
from dataclasses import dataclass
|
| 2 |
from enum import Enum
|
| 3 |
|
| 4 |
+
|
| 5 |
@dataclass
|
| 6 |
class Task:
|
| 7 |
benchmark: str
|
|
|
|
| 9 |
col_name: str
|
| 10 |
|
| 11 |
|
|
|
|
|
|
|
| 12 |
class Tasks(Enum):
|
| 13 |
+
easy_levenshtein = Task("easy_levenshtein", "score", "Easy Lev. ↑")
|
| 14 |
+
easy_bleu = Task("easy_bleu", "score", "Easy BLEU ↑")
|
| 15 |
+
hard_levenshtein = Task("hard_levenshtein", "score", "Hard Lev. ↑")
|
| 16 |
+
hard_bleu = Task("hard_bleu", "score", "Hard BLEU ↑")
|
| 17 |
|
|
|
|
|
|
|
| 18 |
|
| 19 |
+
TITLE = """<h1 align="center" id="space-title">Critical Apparatus OCR Leaderboard</h1>"""
|
| 20 |
|
| 21 |
+
INTRODUCTION_TEXT = """
|
| 22 |
+
This benchmark measures OCR quality on a Greek critical edition page from Lloyd-Jones' *Sophocles*.
|
| 23 |
|
| 24 |
+
Systems must emit two JSON files:
|
| 25 |
+
- `text.json`: the main text lines keyed by line number
|
| 26 |
+
- `apparatus.json`: the critical apparatus keyed by line number or range
|
| 27 |
|
| 28 |
+
There are two task variants:
|
| 29 |
+
- `Easy`: input is already split into `text` and `apparatus` crops
|
| 30 |
+
- `Hard`: input is the full page image and the system must separate the two outputs itself
|
| 31 |
+
|
| 32 |
+
Each variant is scored with normalized Levenshtein similarity and BLEU. The leaderboard average is the mean of those four scores.
|
| 33 |
"""
|
| 34 |
|
| 35 |
+
LLM_BENCHMARKS_TEXT = """
|
|
|
|
| 36 |
## How it works
|
| 37 |
|
| 38 |
+
- Gold data lives in `data/lloyd-jones-soph-170/ocr/`
|
| 39 |
+
- Hard input lives in `data/lloyd-jones-soph-170/png/lloyd-jones-fullpage.png`
|
| 40 |
+
- Easy inputs live in `data/lloyd-jones-soph-170/png/lloyd-jones-text.png` and `data/lloyd-jones-soph-170/png/lloyd-jones-apparatus.png`
|
| 41 |
|
| 42 |
+
The expected output schema is a JSON object mapping line numbers or ranges to OCR strings, matching the gold files already in the repo.
|
| 43 |
|
| 44 |
+
## Metrics
|
| 45 |
+
|
| 46 |
+
- `Levenshtein`: character-level similarity after flattening each JSON file into a deterministic text representation
|
| 47 |
+
- `BLEU`: token-level overlap score on the same flattened representation
|
| 48 |
|
| 49 |
+
## First seeded submission
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
+
The queue is pre-seeded with `ibm-granite/granite-vision-3.3-2b`. Once a real result JSON is added for that model, it will appear in the leaderboard automatically.
|
|
|
|
| 52 |
|
| 53 |
+
"""
|
|
|
|
| 54 |
|
| 55 |
+
EVALUATION_QUEUE_TEXT = """
|
| 56 |
+
Submit the Hugging Face model repo and revision you want evaluated on this OCR task.
|
| 57 |
|
| 58 |
+
The evaluator should produce four scores in its result JSON:
|
| 59 |
+
- `easy_levenshtein.score`
|
| 60 |
+
- `easy_bleu.score`
|
| 61 |
+
- `hard_levenshtein.score`
|
| 62 |
+
- `hard_bleu.score`
|
| 63 |
|
| 64 |
+
The queue shown below is local-first, so the Space can be previewed before the dedicated backend datasets are configured.
|
|
|
|
|
|
|
|
|
|
| 65 |
"""
|
| 66 |
|
| 67 |
+
CITATION_BUTTON_LABEL = "Citation snippet"
|
| 68 |
CITATION_BUTTON_TEXT = r"""
|
| 69 |
"""
|
src/display/utils.py
CHANGED
|
@@ -1,10 +1,9 @@
|
|
| 1 |
from dataclasses import dataclass, make_dataclass
|
| 2 |
from enum import Enum
|
| 3 |
|
| 4 |
-
import pandas as pd
|
| 5 |
-
|
| 6 |
from src.about import Tasks
|
| 7 |
|
|
|
|
| 8 |
def fields(raw_class):
|
| 9 |
return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
|
| 10 |
|
|
@@ -25,11 +24,9 @@ auto_eval_column_dict = []
|
|
| 25 |
# Init
|
| 26 |
auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
|
| 27 |
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
| 28 |
-
#Scores
|
| 29 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
| 30 |
for task in Tasks:
|
| 31 |
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
|
| 32 |
-
# Model information
|
| 33 |
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
| 34 |
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
| 35 |
auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
|
|
@@ -48,9 +45,8 @@ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=
|
|
| 48 |
class EvalQueueColumn: # Queue column
|
| 49 |
model = ColumnContent("model", "markdown", True)
|
| 50 |
revision = ColumnContent("revision", "str", True)
|
| 51 |
-
private = ColumnContent("private", "bool", True)
|
| 52 |
precision = ColumnContent("precision", "str", True)
|
| 53 |
-
|
| 54 |
status = ColumnContent("status", "str", True)
|
| 55 |
|
| 56 |
## All the model information that we might need
|
|
@@ -62,10 +58,10 @@ class ModelDetails:
|
|
| 62 |
|
| 63 |
|
| 64 |
class ModelType(Enum):
|
| 65 |
-
PT = ModelDetails(name="pretrained", symbol="
|
| 66 |
-
FT = ModelDetails(name="fine-tuned", symbol="
|
| 67 |
-
IFT = ModelDetails(name="instruction-tuned", symbol="
|
| 68 |
-
RL = ModelDetails(name="RL-tuned", symbol="
|
| 69 |
Unknown = ModelDetails(name="", symbol="?")
|
| 70 |
|
| 71 |
def to_str(self, separator=" "):
|
|
@@ -107,4 +103,3 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
|
|
| 107 |
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
| 108 |
|
| 109 |
BENCHMARK_COLS = [t.value.col_name for t in Tasks]
|
| 110 |
-
|
|
|
|
| 1 |
from dataclasses import dataclass, make_dataclass
|
| 2 |
from enum import Enum
|
| 3 |
|
|
|
|
|
|
|
| 4 |
from src.about import Tasks
|
| 5 |
|
| 6 |
+
|
| 7 |
def fields(raw_class):
|
| 8 |
return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
|
| 9 |
|
|
|
|
| 24 |
# Init
|
| 25 |
auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
|
| 26 |
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
|
|
|
| 27 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
| 28 |
for task in Tasks:
|
| 29 |
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
|
|
|
|
| 30 |
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
| 31 |
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
| 32 |
auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
|
|
|
|
| 45 |
class EvalQueueColumn: # Queue column
|
| 46 |
model = ColumnContent("model", "markdown", True)
|
| 47 |
revision = ColumnContent("revision", "str", True)
|
|
|
|
| 48 |
precision = ColumnContent("precision", "str", True)
|
| 49 |
+
submitted_time = ColumnContent("submitted_time", "str", True)
|
| 50 |
status = ColumnContent("status", "str", True)
|
| 51 |
|
| 52 |
## All the model information that we might need
|
|
|
|
| 58 |
|
| 59 |
|
| 60 |
class ModelType(Enum):
|
| 61 |
+
PT = ModelDetails(name="pretrained", symbol="P")
|
| 62 |
+
FT = ModelDetails(name="fine-tuned", symbol="F")
|
| 63 |
+
IFT = ModelDetails(name="instruction-tuned", symbol="I")
|
| 64 |
+
RL = ModelDetails(name="RL-tuned", symbol="R")
|
| 65 |
Unknown = ModelDetails(name="", symbol="?")
|
| 66 |
|
| 67 |
def to_str(self, separator=" "):
|
|
|
|
| 103 |
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
| 104 |
|
| 105 |
BENCHMARK_COLS = [t.value.col_name for t in Tasks]
|
|
|
src/envs.py
CHANGED
|
@@ -2,24 +2,28 @@ import os
|
|
| 2 |
|
| 3 |
from huggingface_hub import HfApi
|
| 4 |
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
REPO_ID = f"{OWNER}/leaderboard"
|
| 13 |
-
QUEUE_REPO = f"{OWNER}/requests"
|
| 14 |
-
RESULTS_REPO = f"{OWNER}/results"
|
| 15 |
|
| 16 |
# If you setup a cache later, just change HF_HOME
|
| 17 |
-
CACHE_PATH=os.getenv("HF_HOME", ".")
|
|
|
|
|
|
|
| 18 |
|
| 19 |
# Local caches
|
| 20 |
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
|
| 21 |
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
|
| 22 |
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
|
| 23 |
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
|
|
|
|
|
|
|
| 24 |
|
| 25 |
API = HfApi(token=TOKEN)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
from huggingface_hub import HfApi
|
| 4 |
|
| 5 |
+
TOKEN = os.environ.get("HF_TOKEN")
|
| 6 |
+
OWNER = os.environ.get("HF_LEADERBOARD_OWNER", "")
|
|
|
|
| 7 |
|
| 8 |
+
REPO_ID = os.environ.get("SPACE_ID") or (f"{OWNER}/apparatus-ocr" if OWNER else "")
|
| 9 |
+
QUEUE_REPO = os.environ.get("HF_QUEUE_REPO") or (f"{OWNER}/requests" if OWNER else "")
|
| 10 |
+
RESULTS_REPO = os.environ.get("HF_RESULTS_REPO") or (f"{OWNER}/results" if OWNER else "")
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
# If you setup a cache later, just change HF_HOME
|
| 13 |
+
CACHE_PATH = os.getenv("HF_HOME", ".")
|
| 14 |
+
REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
| 15 |
+
LOCAL_DATA_PATH = os.path.join(REPO_ROOT, "data", "leaderboard")
|
| 16 |
|
| 17 |
# Local caches
|
| 18 |
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
|
| 19 |
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
|
| 20 |
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
|
| 21 |
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
|
| 22 |
+
LOCAL_EVAL_REQUESTS_PATH = os.path.join(LOCAL_DATA_PATH, "requests")
|
| 23 |
+
LOCAL_EVAL_RESULTS_PATH = os.path.join(LOCAL_DATA_PATH, "results")
|
| 24 |
|
| 25 |
API = HfApi(token=TOKEN)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def has_remote_backend() -> bool:
|
| 29 |
+
return bool(TOKEN and QUEUE_REPO and RESULTS_REPO)
|
src/evaluation/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.evaluation.metrics import bleu_score, flatten_ocr_json, levenshtein_similarity, paired_ocr_metrics
|
| 2 |
+
|
| 3 |
+
__all__ = [
|
| 4 |
+
"bleu_score",
|
| 5 |
+
"flatten_ocr_json",
|
| 6 |
+
"levenshtein_similarity",
|
| 7 |
+
"paired_ocr_metrics",
|
| 8 |
+
]
|
src/evaluation/build_result.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
from src.evaluation.metrics import bleu_score, flatten_ocr_json, levenshtein_similarity
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
REPO_ROOT = Path(__file__).resolve().parents[2]
|
| 10 |
+
GOLD_ROOT = REPO_ROOT / "data" / "lloyd-jones-soph-170" / "ocr"
|
| 11 |
+
TEXT_GOLD_PATH = GOLD_ROOT / "lloyd-jones-text.json"
|
| 12 |
+
APPARATUS_GOLD_PATH = GOLD_ROOT / "lloyd-jones-apparatus.json"
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def build_result_payload(
|
| 16 |
+
model_name: str,
|
| 17 |
+
revision: str,
|
| 18 |
+
precision: str,
|
| 19 |
+
easy_text_path: str,
|
| 20 |
+
easy_apparatus_path: str,
|
| 21 |
+
hard_text_path: str,
|
| 22 |
+
hard_apparatus_path: str,
|
| 23 |
+
) -> dict:
|
| 24 |
+
text_gold = _load_json(TEXT_GOLD_PATH)
|
| 25 |
+
apparatus_gold = _load_json(APPARATUS_GOLD_PATH)
|
| 26 |
+
|
| 27 |
+
easy_reference = _join_sections(text_gold, apparatus_gold)
|
| 28 |
+
easy_prediction = _join_sections(_load_json(easy_text_path), _load_json(easy_apparatus_path))
|
| 29 |
+
hard_reference = _join_sections(text_gold, apparatus_gold)
|
| 30 |
+
hard_prediction = _join_sections(_load_json(hard_text_path), _load_json(hard_apparatus_path))
|
| 31 |
+
|
| 32 |
+
return {
|
| 33 |
+
"config": {
|
| 34 |
+
"model_dtype": _normalize_precision(precision),
|
| 35 |
+
"model_name": model_name,
|
| 36 |
+
"model_sha": revision,
|
| 37 |
+
},
|
| 38 |
+
"results": {
|
| 39 |
+
"easy_levenshtein": {"score": levenshtein_similarity(easy_reference, easy_prediction) / 100.0},
|
| 40 |
+
"easy_bleu": {"score": bleu_score(easy_reference, easy_prediction) / 100.0},
|
| 41 |
+
"hard_levenshtein": {"score": levenshtein_similarity(hard_reference, hard_prediction) / 100.0},
|
| 42 |
+
"hard_bleu": {"score": bleu_score(hard_reference, hard_prediction) / 100.0},
|
| 43 |
+
},
|
| 44 |
+
}
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def main():
|
| 48 |
+
parser = argparse.ArgumentParser(description="Build a leaderboard-compatible result JSON for the OCR benchmark.")
|
| 49 |
+
parser.add_argument("--model-name", required=True)
|
| 50 |
+
parser.add_argument("--revision", default="main")
|
| 51 |
+
parser.add_argument("--precision", default="float16")
|
| 52 |
+
parser.add_argument("--easy-text", required=True)
|
| 53 |
+
parser.add_argument("--easy-apparatus", required=True)
|
| 54 |
+
parser.add_argument("--hard-text", required=True)
|
| 55 |
+
parser.add_argument("--hard-apparatus", required=True)
|
| 56 |
+
parser.add_argument("--output", required=True)
|
| 57 |
+
args = parser.parse_args()
|
| 58 |
+
|
| 59 |
+
payload = build_result_payload(
|
| 60 |
+
model_name=args.model_name,
|
| 61 |
+
revision=args.revision,
|
| 62 |
+
precision=args.precision,
|
| 63 |
+
easy_text_path=args.easy_text,
|
| 64 |
+
easy_apparatus_path=args.easy_apparatus,
|
| 65 |
+
hard_text_path=args.hard_text,
|
| 66 |
+
hard_apparatus_path=args.hard_apparatus,
|
| 67 |
+
)
|
| 68 |
+
|
| 69 |
+
os.makedirs(os.path.dirname(args.output), exist_ok=True)
|
| 70 |
+
with open(args.output, "w") as handle:
|
| 71 |
+
json.dump(payload, handle, ensure_ascii=False, indent=2)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def _load_json(path: str | Path) -> dict[str, str]:
|
| 75 |
+
with open(path) as handle:
|
| 76 |
+
return json.load(handle)
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def _join_sections(text_json: dict[str, str], apparatus_json: dict[str, str]) -> str:
    """Concatenate the flattened text and apparatus payloads under section headers."""
    sections = (
        "[TEXT]",
        flatten_ocr_json(text_json),
        "[APPARATUS]",
        flatten_ocr_json(apparatus_json),
    )
    return "\n".join(sections)
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def _normalize_precision(precision: str) -> str:
|
| 91 |
+
if precision.startswith("torch."):
|
| 92 |
+
return precision
|
| 93 |
+
return f"torch.{precision}"
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
# Run the CLI when this module is executed directly (not when imported).
if __name__ == "__main__":
    main()
|
src/evaluation/metrics.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import math
|
| 2 |
+
from collections import Counter
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def flatten_ocr_json(payload: dict[str, str]) -> str:
    """Serialize an OCR payload as one tab-separated ``key\\tvalue`` line per entry.

    Entries are emitted in ``_sort_key`` order so two payloads with the same
    content always flatten to identical strings.
    """
    ordered_keys = sorted(payload, key=_sort_key)
    return "\n".join(f"{key}\t{payload[key]}" for key in ordered_keys)
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def _sort_key(value: str):
|
| 11 |
+
head, _, tail = value.partition("-")
|
| 12 |
+
try:
|
| 13 |
+
return (int(head), int(tail) if tail else -1, value)
|
| 14 |
+
except ValueError:
|
| 15 |
+
return (math.inf, math.inf, value)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def levenshtein_similarity(reference: str, prediction: str) -> float:
    """Character-level similarity in [0, 100] based on Levenshtein distance.

    100 means identical strings; 0 means one side is empty (and the other is
    not) or the edit distance equals the longer string's length.
    """
    if reference == prediction:
        # Covers the identical case, including both strings empty.
        return 100.0
    if not reference or not prediction:
        return 0.0

    # Two-row dynamic program over edit distance.
    previous_row = list(range(len(prediction) + 1))
    for row_index, ref_char in enumerate(reference, start=1):
        current_row = [row_index]
        for col_index, pred_char in enumerate(prediction, start=1):
            cost = 0 if ref_char == pred_char else 1
            deletion = previous_row[col_index] + 1
            insertion = current_row[col_index - 1] + 1
            substitution = previous_row[col_index - 1] + cost
            current_row.append(min(deletion, insertion, substitution))
        previous_row = current_row

    distance = previous_row[-1]
    longest = max(len(reference), len(prediction))
    return max(0.0, (1 - (distance / longest)) * 100.0)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def bleu_score(reference: str, prediction: str, max_order: int = 4) -> float:
    """Smoothed sentence-level BLEU (as a percentage) over whitespace tokens.

    Uses add-one smoothing on each n-gram precision and the standard brevity
    penalty; returns 100 when both sides are empty, 0 when exactly one is.
    """
    ref_tokens = reference.split()
    pred_tokens = prediction.split()

    if not ref_tokens and not pred_tokens:
        return 100.0
    if not ref_tokens or not pred_tokens:
        return 0.0

    precisions = []
    for order in range(1, max_order + 1):
        ref_counts = _ngram_counts(ref_tokens, order)
        pred_counts = _ngram_counts(pred_tokens, order)
        # Clipped n-gram matches, as in standard BLEU.
        matched = sum(min(freq, ref_counts[gram]) for gram, freq in pred_counts.items())
        candidate_total = max(sum(pred_counts.values()), 1)
        # Add-one smoothing keeps the log defined when there is no overlap.
        precisions.append((matched + 1) / (candidate_total + 1))

    geometric_mean = math.exp(sum(math.log(p) for p in precisions) / max_order)
    if len(pred_tokens) > len(ref_tokens):
        brevity_penalty = 1.0
    else:
        brevity_penalty = math.exp(1 - (len(ref_tokens) / len(pred_tokens)))
    return geometric_mean * brevity_penalty * 100.0
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def paired_ocr_metrics(reference: dict[str, str], prediction: dict[str, str]) -> dict[str, float]:
    """Score a prediction payload against a reference payload with both metrics."""
    flat_reference = flatten_ocr_json(reference)
    flat_prediction = flatten_ocr_json(prediction)
    return {
        "levenshtein": levenshtein_similarity(flat_reference, flat_prediction),
        "bleu": bleu_score(flat_reference, flat_prediction),
    }
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def _ngram_counts(tokens: list[str], order: int) -> Counter:
|
| 76 |
+
if len(tokens) < order:
|
| 77 |
+
return Counter()
|
| 78 |
+
return Counter(tuple(tokens[i : i + order]) for i in range(len(tokens) - order + 1))
|
src/leaderboard/read_evals.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
import glob
|
| 2 |
import json
|
| 3 |
-
import math
|
| 4 |
import os
|
| 5 |
from dataclasses import dataclass
|
| 6 |
|
|
@@ -57,10 +56,16 @@ class EvalResult:
|
|
| 57 |
result_key = f"{org}_{model}_{precision.value.name}"
|
| 58 |
full_model = "/".join(org_and_model)
|
| 59 |
|
| 60 |
-
still_on_hub, _, model_config = is_model_on_hub(
|
| 61 |
-
full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
|
| 62 |
-
)
|
| 63 |
architecture = "?"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
if model_config is not None:
|
| 65 |
architectures = getattr(model_config, "architectures", None)
|
| 66 |
if architectures:
|
|
@@ -127,7 +132,7 @@ class EvalResult:
|
|
| 127 |
}
|
| 128 |
|
| 129 |
for task in Tasks:
|
| 130 |
-
data_dict[task.value.col_name] = self.results
|
| 131 |
|
| 132 |
return data_dict
|
| 133 |
|
|
@@ -146,10 +151,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
|
|
| 146 |
for tmp_request_file in request_files:
|
| 147 |
with open(tmp_request_file, "r") as f:
|
| 148 |
req_content = json.load(f)
|
| 149 |
-
if (
|
| 150 |
-
req_content["status"] in ["FINISHED"]
|
| 151 |
-
and req_content["precision"] == precision.split(".")[-1]
|
| 152 |
-
):
|
| 153 |
request_file = tmp_request_file
|
| 154 |
return request_file
|
| 155 |
|
|
|
|
| 1 |
import glob
|
| 2 |
import json
|
|
|
|
| 3 |
import os
|
| 4 |
from dataclasses import dataclass
|
| 5 |
|
|
|
|
| 56 |
result_key = f"{org}_{model}_{precision.value.name}"
|
| 57 |
full_model = "/".join(org_and_model)
|
| 58 |
|
|
|
|
|
|
|
|
|
|
| 59 |
architecture = "?"
|
| 60 |
+
still_on_hub = False
|
| 61 |
+
model_config = None
|
| 62 |
+
try:
|
| 63 |
+
still_on_hub, _, model_config = is_model_on_hub(
|
| 64 |
+
full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
|
| 65 |
+
)
|
| 66 |
+
except Exception:
|
| 67 |
+
still_on_hub = False
|
| 68 |
+
|
| 69 |
if model_config is not None:
|
| 70 |
architectures = getattr(model_config, "architectures", None)
|
| 71 |
if architectures:
|
|
|
|
| 132 |
}
|
| 133 |
|
| 134 |
for task in Tasks:
|
| 135 |
+
data_dict[task.value.col_name] = self.results.get(task.value.benchmark)
|
| 136 |
|
| 137 |
return data_dict
|
| 138 |
|
|
|
|
| 151 |
for tmp_request_file in request_files:
|
| 152 |
with open(tmp_request_file, "r") as f:
|
| 153 |
req_content = json.load(f)
|
| 154 |
+
if req_content["status"] in ["FINISHED", "FINISHED_MANUAL"] and req_content["precision"] == precision.split(".")[-1]:
|
|
|
|
|
|
|
|
|
|
| 155 |
request_file = tmp_request_file
|
| 156 |
return request_file
|
| 157 |
|
src/populate.py
CHANGED
|
@@ -39,7 +39,11 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
|
|
| 39 |
all_evals.append(data)
|
| 40 |
elif ".md" not in entry:
|
| 41 |
# this is a folder
|
| 42 |
-
sub_entries = [
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
for sub_entry in sub_entries:
|
| 44 |
file_path = os.path.join(save_path, entry, sub_entry)
|
| 45 |
with open(file_path) as fp:
|
|
|
|
| 39 |
all_evals.append(data)
|
| 40 |
elif ".md" not in entry:
|
| 41 |
# this is a folder
|
| 42 |
+
sub_entries = [
|
| 43 |
+
e
|
| 44 |
+
for e in os.listdir(f"{save_path}/{entry}")
|
| 45 |
+
if os.path.isfile(os.path.join(save_path, entry, e)) and not e.startswith(".")
|
| 46 |
+
]
|
| 47 |
for sub_entry in sub_entries:
|
| 48 |
file_path = os.path.join(save_path, entry, sub_entry)
|
| 49 |
with open(file_path) as fp:
|
src/submission/check_validity.py
CHANGED
|
@@ -1,8 +1,6 @@
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
-
import re
|
| 4 |
from collections import defaultdict
|
| 5 |
-
from datetime import datetime, timedelta, timezone
|
| 6 |
|
| 7 |
import huggingface_hub
|
| 8 |
from huggingface_hub import ModelCard
|
|
@@ -37,14 +35,14 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
|
|
| 37 |
config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
|
| 38 |
if test_tokenizer:
|
| 39 |
try:
|
| 40 |
-
|
| 41 |
except ValueError as e:
|
| 42 |
return (
|
| 43 |
False,
|
| 44 |
f"uses a tokenizer which is not in a transformers release: {e}",
|
| 45 |
None
|
| 46 |
)
|
| 47 |
-
except Exception
|
| 48 |
return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
|
| 49 |
return True, None, config
|
| 50 |
|
|
@@ -55,7 +53,7 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
|
|
| 55 |
None
|
| 56 |
)
|
| 57 |
|
| 58 |
-
except Exception
|
| 59 |
return False, "was not found on hub!", None
|
| 60 |
|
| 61 |
|
|
|
|
| 1 |
import json
|
| 2 |
import os
|
|
|
|
| 3 |
from collections import defaultdict
|
|
|
|
| 4 |
|
| 5 |
import huggingface_hub
|
| 6 |
from huggingface_hub import ModelCard
|
|
|
|
| 35 |
config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
|
| 36 |
if test_tokenizer:
|
| 37 |
try:
|
| 38 |
+
AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
|
| 39 |
except ValueError as e:
|
| 40 |
return (
|
| 41 |
False,
|
| 42 |
f"uses a tokenizer which is not in a transformers release: {e}",
|
| 43 |
None
|
| 44 |
)
|
| 45 |
+
except Exception:
|
| 46 |
return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
|
| 47 |
return True, None, config
|
| 48 |
|
|
|
|
| 53 |
None
|
| 54 |
)
|
| 55 |
|
| 56 |
+
except Exception:
|
| 57 |
return False, "was not found on hub!", None
|
| 58 |
|
| 59 |
|
src/submission/submit.py
CHANGED
|
@@ -3,7 +3,7 @@ import os
|
|
| 3 |
from datetime import datetime, timezone
|
| 4 |
|
| 5 |
from src.display.formatting import styled_error, styled_message, styled_warning
|
| 6 |
-
from src.envs import API, EVAL_REQUESTS_PATH,
|
| 7 |
from src.submission.check_validity import (
|
| 8 |
already_submitted_models,
|
| 9 |
check_model_card,
|
|
@@ -16,16 +16,15 @@ USERS_TO_SUBMISSION_DATES = None
|
|
| 16 |
|
| 17 |
def add_new_eval(
|
| 18 |
model: str,
|
| 19 |
-
base_model: str,
|
| 20 |
revision: str,
|
| 21 |
precision: str,
|
| 22 |
-
weight_type: str,
|
| 23 |
-
model_type: str,
|
| 24 |
):
|
| 25 |
global REQUESTED_MODELS
|
| 26 |
global USERS_TO_SUBMISSION_DATES
|
|
|
|
|
|
|
| 27 |
if not REQUESTED_MODELS:
|
| 28 |
-
REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(
|
| 29 |
|
| 30 |
user_name = ""
|
| 31 |
model_path = model
|
|
@@ -36,58 +35,42 @@ def add_new_eval(
|
|
| 36 |
precision = precision.split(" ")[0]
|
| 37 |
current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
| 38 |
|
| 39 |
-
if model_type is None or model_type == "":
|
| 40 |
-
return styled_error("Please select a model type.")
|
| 41 |
-
|
| 42 |
-
# Does the model actually exist?
|
| 43 |
if revision == "":
|
| 44 |
revision = "main"
|
| 45 |
|
| 46 |
-
#
|
| 47 |
-
|
| 48 |
-
base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
|
| 49 |
-
if not base_model_on_hub:
|
| 50 |
-
return styled_error(f'Base model "{base_model}" {error}')
|
| 51 |
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
| 53 |
model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
|
| 54 |
if not model_on_hub:
|
| 55 |
return styled_error(f'Model "{model}" {error}')
|
| 56 |
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
license = model_info.cardData["license"]
|
| 68 |
-
except Exception:
|
| 69 |
-
return styled_error("Please select a license for your model")
|
| 70 |
-
|
| 71 |
-
modelcard_OK, error_msg = check_model_card(model)
|
| 72 |
-
if not modelcard_OK:
|
| 73 |
-
return styled_error(error_msg)
|
| 74 |
-
|
| 75 |
-
# Seems good, creating the eval
|
| 76 |
-
print("Adding new eval")
|
| 77 |
|
| 78 |
eval_entry = {
|
| 79 |
"model": model,
|
| 80 |
-
"base_model": base_model,
|
| 81 |
"revision": revision,
|
| 82 |
"precision": precision,
|
| 83 |
-
"weight_type": weight_type,
|
| 84 |
"status": "PENDING",
|
| 85 |
"submitted_time": current_time,
|
| 86 |
-
"model_type":
|
| 87 |
-
"
|
|
|
|
| 88 |
"params": model_size,
|
| 89 |
"license": license,
|
| 90 |
-
"private": False,
|
| 91 |
}
|
| 92 |
|
| 93 |
# Check for duplicate submission
|
|
@@ -95,25 +78,27 @@ def add_new_eval(
|
|
| 95 |
return styled_warning("This model has been already submitted.")
|
| 96 |
|
| 97 |
print("Creating eval file")
|
| 98 |
-
OUT_DIR = f"{
|
| 99 |
os.makedirs(OUT_DIR, exist_ok=True)
|
| 100 |
-
out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}
|
| 101 |
|
| 102 |
with open(out_path, "w") as f:
|
| 103 |
f.write(json.dumps(eval_entry))
|
| 104 |
|
| 105 |
print("Uploading eval file")
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
|
|
|
| 113 |
|
| 114 |
# Remove the local file
|
| 115 |
-
|
|
|
|
| 116 |
|
| 117 |
return styled_message(
|
| 118 |
-
"Your request has been submitted to the evaluation queue
|
| 119 |
)
|
|
|
|
| 3 |
from datetime import datetime, timezone
|
| 4 |
|
| 5 |
from src.display.formatting import styled_error, styled_message, styled_warning
|
| 6 |
+
from src.envs import API, EVAL_REQUESTS_PATH, LOCAL_EVAL_REQUESTS_PATH, QUEUE_REPO, TOKEN, has_remote_backend
|
| 7 |
from src.submission.check_validity import (
|
| 8 |
already_submitted_models,
|
| 9 |
check_model_card,
|
|
|
|
| 16 |
|
| 17 |
def add_new_eval(
|
| 18 |
model: str,
|
|
|
|
| 19 |
revision: str,
|
| 20 |
precision: str,
|
|
|
|
|
|
|
| 21 |
):
|
| 22 |
global REQUESTED_MODELS
|
| 23 |
global USERS_TO_SUBMISSION_DATES
|
| 24 |
+
requests_path = EVAL_REQUESTS_PATH if has_remote_backend() else LOCAL_EVAL_REQUESTS_PATH
|
| 25 |
+
|
| 26 |
if not REQUESTED_MODELS:
|
| 27 |
+
REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(requests_path)
|
| 28 |
|
| 29 |
user_name = ""
|
| 30 |
model_path = model
|
|
|
|
| 35 |
precision = precision.split(" ")[0]
|
| 36 |
current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
if revision == "":
|
| 39 |
revision = "main"
|
| 40 |
|
| 41 |
+
# Seems good, creating the eval
|
| 42 |
+
print("Adding new eval")
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
+
license = "?"
|
| 45 |
+
model_size = 0
|
| 46 |
+
likes = 0
|
| 47 |
+
if has_remote_backend():
|
| 48 |
model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
|
| 49 |
if not model_on_hub:
|
| 50 |
return styled_error(f'Model "{model}" {error}')
|
| 51 |
|
| 52 |
+
try:
|
| 53 |
+
model_info = API.model_info(repo_id=model, revision=revision)
|
| 54 |
+
model_size = get_model_size(model_info=model_info, precision=precision)
|
| 55 |
+
likes = model_info.likes
|
| 56 |
+
license = model_info.cardData.get("license", "?")
|
| 57 |
+
modelcard_OK, error_msg = check_model_card(model)
|
| 58 |
+
if not modelcard_OK:
|
| 59 |
+
return styled_error(error_msg)
|
| 60 |
+
except Exception:
|
| 61 |
+
return styled_error("Could not get your model information from the Hub.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
eval_entry = {
|
| 64 |
"model": model,
|
|
|
|
| 65 |
"revision": revision,
|
| 66 |
"precision": precision,
|
|
|
|
| 67 |
"status": "PENDING",
|
| 68 |
"submitted_time": current_time,
|
| 69 |
+
"model_type": "pretrained",
|
| 70 |
+
"weight_type": "Original",
|
| 71 |
+
"likes": likes,
|
| 72 |
"params": model_size,
|
| 73 |
"license": license,
|
|
|
|
| 74 |
}
|
| 75 |
|
| 76 |
# Check for duplicate submission
|
|
|
|
| 78 |
return styled_warning("This model has been already submitted.")
|
| 79 |
|
| 80 |
print("Creating eval file")
|
| 81 |
+
OUT_DIR = f"{requests_path}/{user_name}"
|
| 82 |
os.makedirs(OUT_DIR, exist_ok=True)
|
| 83 |
+
out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_Original.json"
|
| 84 |
|
| 85 |
with open(out_path, "w") as f:
|
| 86 |
f.write(json.dumps(eval_entry))
|
| 87 |
|
| 88 |
print("Uploading eval file")
|
| 89 |
+
if has_remote_backend():
|
| 90 |
+
API.upload_file(
|
| 91 |
+
path_or_fileobj=out_path,
|
| 92 |
+
path_in_repo=out_path.split("eval-queue/")[1],
|
| 93 |
+
repo_id=QUEUE_REPO,
|
| 94 |
+
repo_type="dataset",
|
| 95 |
+
commit_message=f"Add {model} to eval queue",
|
| 96 |
+
)
|
| 97 |
|
| 98 |
# Remove the local file
|
| 99 |
+
if has_remote_backend():
|
| 100 |
+
os.remove(out_path)
|
| 101 |
|
| 102 |
return styled_message(
|
| 103 |
+
"Your request has been submitted to the evaluation queue."
|
| 104 |
)
|