Spaces:
Runtime error
Runtime error
first commit
Browse files- README.md +42 -18
- app.py +50 -45
- data/leaderboard/requests/ibm-granite/granite-vision-3.3-2b_eval_request_False_float16_Original.json +12 -0
- data/lloyd-jones-soph-170/ocr/lloyd-jones-apparatus.json +14 -0
- data/lloyd-jones-soph-170/ocr/lloyd-jones-text.json +27 -0
- data/lloyd-jones-soph-170/png/lloyd-jones-apparatus.png +3 -0
- data/lloyd-jones-soph-170/png/lloyd-jones-fullpage.png +3 -0
- data/lloyd-jones-soph-170/png/lloyd-jones-text.png +3 -0
- src/about.py +37 -40
- src/display/utils.py +6 -11
- src/envs.py +14 -10
- src/evaluation/__init__.py +8 -0
- src/evaluation/build_result.py +97 -0
- src/evaluation/metrics.py +78 -0
- src/leaderboard/read_evals.py +11 -9
- src/populate.py +5 -1
- src/submission/check_validity.py +3 -5
- src/submission/submit.py +36 -51
README.md
CHANGED
|
@@ -13,36 +13,60 @@ tags:
|
|
| 13 |
- leaderboard
|
| 14 |
---
|
| 15 |
|
| 16 |
-
#
|
| 17 |
|
| 18 |
-
|
| 19 |
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
```json
|
| 22 |
{
|
| 23 |
"config": {
|
| 24 |
-
"model_dtype": "torch.float16",
|
| 25 |
-
"model_name": "
|
| 26 |
-
"model_sha": "
|
| 27 |
},
|
| 28 |
"results": {
|
| 29 |
-
"
|
| 30 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
},
|
| 32 |
-
"
|
| 33 |
-
"
|
| 34 |
}
|
| 35 |
}
|
| 36 |
}
|
| 37 |
```
|
| 38 |
|
| 39 |
-
|
|
|
|
|
|
|
| 40 |
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
-
|
| 47 |
-
-
|
| 48 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
- leaderboard
|
| 14 |
---
|
| 15 |
|
| 16 |
+
# OCR leaderboard
|
| 17 |
|
| 18 |
+
This Space is customized for a two-level OCR benchmark on a single critical-edition page.
|
| 19 |
|
| 20 |
+
Inputs and gold outputs live under `data/lloyd-jones-soph-170/`:
|
| 21 |
+
- `png/lloyd-jones-fullpage.png`: hard task input
|
| 22 |
+
- `png/lloyd-jones-text.png`: easy task text crop
|
| 23 |
+
- `png/lloyd-jones-apparatus.png`: easy task apparatus crop
|
| 24 |
+
- `ocr/lloyd-jones-text.json`: gold main-text output
|
| 25 |
+
- `ocr/lloyd-jones-apparatus.json`: gold apparatus output
|
| 26 |
+
|
| 27 |
+
The leaderboard expects result files in the following format:
|
| 28 |
```json
|
| 29 |
{
|
| 30 |
"config": {
|
| 31 |
+
"model_dtype": "torch.float16",
|
| 32 |
+
"model_name": "org/model",
|
| 33 |
+
"model_sha": "main"
|
| 34 |
},
|
| 35 |
"results": {
|
| 36 |
+
"easy_levenshtein": {
|
| 37 |
+
"score": 91.23
|
| 38 |
+
},
|
| 39 |
+
"easy_bleu": {
|
| 40 |
+
"score": 84.56
|
| 41 |
+
},
|
| 42 |
+
"hard_levenshtein": {
|
| 43 |
+
"score": 79.10
|
| 44 |
},
|
| 45 |
+
"hard_bleu": {
|
| 46 |
+
"score": 70.42
|
| 47 |
}
|
| 48 |
}
|
| 49 |
}
|
| 50 |
```
|
| 51 |
|
| 52 |
+
The Space is local-first:
|
| 53 |
+
- If HF backend datasets are configured via env vars, it will sync from them.
|
| 54 |
+
- Otherwise it reads seeded queue/results data from `data/leaderboard/`.
|
| 55 |
|
| 56 |
+
Useful files:
|
| 57 |
+
- `src/about.py`: task definitions and benchmark copy
|
| 58 |
+
- `src/evaluation/metrics.py`: local OCR metric helpers
|
| 59 |
+
- `src/evaluation/build_result.py`: CLI to turn predicted OCR JSON files into a leaderboard result JSON
|
| 60 |
+
- `src/leaderboard/read_evals.py`: result ingestion
|
| 61 |
+
- `src/populate.py`: leaderboard and queue dataframe assembly
|
| 62 |
|
| 63 |
+
Example:
|
| 64 |
+
```bash
|
| 65 |
+
python -m src.evaluation.build_result \
|
| 66 |
+
--model-name ibm-granite/granite-vision-3.3-2b \
|
| 67 |
+
--easy-text path/to/easy-text.json \
|
| 68 |
+
--easy-apparatus path/to/easy-apparatus.json \
|
| 69 |
+
--hard-text path/to/hard-text.json \
|
| 70 |
+
--hard-apparatus path/to/hard-apparatus.json \
|
| 71 |
+
--output data/leaderboard/results/ibm-granite/results_2026-03-28T00-00-00Z.json
|
| 72 |
+
```
|
app.py
CHANGED
|
@@ -19,47 +19,63 @@ from src.display.utils import (
|
|
| 19 |
EVAL_COLS,
|
| 20 |
EVAL_TYPES,
|
| 21 |
AutoEvalColumn,
|
| 22 |
-
ModelType,
|
| 23 |
fields,
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
)
|
| 27 |
-
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
| 28 |
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
| 29 |
from src.submission.submit import add_new_eval
|
| 30 |
|
| 31 |
|
| 32 |
def restart_space():
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
### Space initialisation
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
snapshot_download(
|
| 39 |
-
repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
| 40 |
-
)
|
| 41 |
-
except Exception:
|
| 42 |
-
restart_space()
|
| 43 |
-
try:
|
| 44 |
-
print(EVAL_RESULTS_PATH)
|
| 45 |
-
snapshot_download(
|
| 46 |
-
repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
| 47 |
-
)
|
| 48 |
-
except Exception:
|
| 49 |
-
restart_space()
|
| 50 |
|
| 51 |
|
| 52 |
-
LEADERBOARD_DF = get_leaderboard_df(
|
| 53 |
|
| 54 |
(
|
| 55 |
finished_eval_queue_df,
|
| 56 |
running_eval_queue_df,
|
| 57 |
pending_eval_queue_df,
|
| 58 |
-
) = get_evaluation_queue_df(
|
| 59 |
|
| 60 |
def init_leaderboard(dataframe):
|
| 61 |
-
if dataframe is None
|
| 62 |
-
|
| 63 |
return Leaderboard(
|
| 64 |
value=dataframe,
|
| 65 |
datatype=[c.type for c in fields(AutoEvalColumn)],
|
|
@@ -95,8 +111,15 @@ with demo:
|
|
| 95 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 96 |
|
| 97 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 98 |
-
with gr.TabItem("🏅
|
| 99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
| 102 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
|
@@ -146,15 +169,8 @@ with demo:
|
|
| 146 |
|
| 147 |
with gr.Row():
|
| 148 |
with gr.Column():
|
| 149 |
-
model_name_textbox = gr.Textbox(label="Model name")
|
| 150 |
revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
|
| 151 |
-
model_type = gr.Dropdown(
|
| 152 |
-
choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
|
| 153 |
-
label="Model type",
|
| 154 |
-
multiselect=False,
|
| 155 |
-
value=None,
|
| 156 |
-
interactive=True,
|
| 157 |
-
)
|
| 158 |
|
| 159 |
with gr.Column():
|
| 160 |
precision = gr.Dropdown(
|
|
@@ -164,14 +180,6 @@ with demo:
|
|
| 164 |
value="float16",
|
| 165 |
interactive=True,
|
| 166 |
)
|
| 167 |
-
weight_type = gr.Dropdown(
|
| 168 |
-
choices=[i.value.name for i in WeightType],
|
| 169 |
-
label="Weights type",
|
| 170 |
-
multiselect=False,
|
| 171 |
-
value="Original",
|
| 172 |
-
interactive=True,
|
| 173 |
-
)
|
| 174 |
-
base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
|
| 175 |
|
| 176 |
submit_button = gr.Button("Submit Eval")
|
| 177 |
submission_result = gr.Markdown()
|
|
@@ -179,11 +187,8 @@ with demo:
|
|
| 179 |
add_new_eval,
|
| 180 |
[
|
| 181 |
model_name_textbox,
|
| 182 |
-
base_model_name_textbox,
|
| 183 |
revision_name_textbox,
|
| 184 |
precision,
|
| 185 |
-
weight_type,
|
| 186 |
-
model_type,
|
| 187 |
],
|
| 188 |
submission_result,
|
| 189 |
)
|
|
@@ -201,4 +206,4 @@ with demo:
|
|
| 201 |
scheduler = BackgroundScheduler()
|
| 202 |
scheduler.add_job(restart_space, "interval", seconds=1800)
|
| 203 |
scheduler.start()
|
| 204 |
-
demo.queue(default_concurrency_limit=40).launch()
|
|
|
|
| 19 |
EVAL_COLS,
|
| 20 |
EVAL_TYPES,
|
| 21 |
AutoEvalColumn,
|
|
|
|
| 22 |
fields,
|
| 23 |
+
Precision,
|
| 24 |
+
)
|
| 25 |
+
from src.envs import (
|
| 26 |
+
API,
|
| 27 |
+
EVAL_REQUESTS_PATH,
|
| 28 |
+
EVAL_RESULTS_PATH,
|
| 29 |
+
LOCAL_EVAL_REQUESTS_PATH,
|
| 30 |
+
LOCAL_EVAL_RESULTS_PATH,
|
| 31 |
+
QUEUE_REPO,
|
| 32 |
+
REPO_ID,
|
| 33 |
+
RESULTS_REPO,
|
| 34 |
+
TOKEN,
|
| 35 |
+
has_remote_backend,
|
| 36 |
)
|
|
|
|
| 37 |
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
| 38 |
from src.submission.submit import add_new_eval
|
| 39 |
|
| 40 |
|
| 41 |
def restart_space():
|
| 42 |
+
if REPO_ID and TOKEN:
|
| 43 |
+
API.restart_space(repo_id=REPO_ID)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def sync_or_fallback(repo_id: str, local_dir: str, fallback_dir: str) -> str:
|
| 47 |
+
if not has_remote_backend() or not repo_id:
|
| 48 |
+
return fallback_dir
|
| 49 |
+
|
| 50 |
+
try:
|
| 51 |
+
snapshot_download(
|
| 52 |
+
repo_id=repo_id,
|
| 53 |
+
local_dir=local_dir,
|
| 54 |
+
repo_type="dataset",
|
| 55 |
+
tqdm_class=None,
|
| 56 |
+
etag_timeout=30,
|
| 57 |
+
token=TOKEN,
|
| 58 |
+
)
|
| 59 |
+
return local_dir
|
| 60 |
+
except Exception:
|
| 61 |
+
return fallback_dir
|
| 62 |
|
| 63 |
### Space initialisation
|
| 64 |
+
REQUESTS_PATH = sync_or_fallback(QUEUE_REPO, EVAL_REQUESTS_PATH, LOCAL_EVAL_REQUESTS_PATH)
|
| 65 |
+
RESULTS_PATH = sync_or_fallback(RESULTS_REPO, EVAL_RESULTS_PATH, LOCAL_EVAL_RESULTS_PATH)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
|
| 68 |
+
LEADERBOARD_DF = get_leaderboard_df(RESULTS_PATH, REQUESTS_PATH, COLS, BENCHMARK_COLS)
|
| 69 |
|
| 70 |
(
|
| 71 |
finished_eval_queue_df,
|
| 72 |
running_eval_queue_df,
|
| 73 |
pending_eval_queue_df,
|
| 74 |
+
) = get_evaluation_queue_df(REQUESTS_PATH, EVAL_COLS)
|
| 75 |
|
| 76 |
def init_leaderboard(dataframe):
|
| 77 |
+
if dataframe is None:
|
| 78 |
+
dataframe = pd.DataFrame(columns=COLS)
|
| 79 |
return Leaderboard(
|
| 80 |
value=dataframe,
|
| 81 |
datatype=[c.type for c in fields(AutoEvalColumn)],
|
|
|
|
| 111 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 112 |
|
| 113 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 114 |
+
with gr.TabItem("🏅 OCR Benchmark", elem_id="llm-benchmark-tab-table", id=0):
|
| 115 |
+
if LEADERBOARD_DF.empty:
|
| 116 |
+
gr.Markdown(
|
| 117 |
+
"No finished evaluations are available yet. The queue below is seeded with the first model submission.",
|
| 118 |
+
elem_classes="markdown-text",
|
| 119 |
+
)
|
| 120 |
+
gr.Dataframe(value=LEADERBOARD_DF, headers=COLS, interactive=False)
|
| 121 |
+
else:
|
| 122 |
+
leaderboard = init_leaderboard(LEADERBOARD_DF)
|
| 123 |
|
| 124 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
| 125 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
|
|
|
| 169 |
|
| 170 |
with gr.Row():
|
| 171 |
with gr.Column():
|
| 172 |
+
model_name_textbox = gr.Textbox(label="Model name", value="ibm-granite/granite-vision-3.3-2b")
|
| 173 |
revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
|
| 175 |
with gr.Column():
|
| 176 |
precision = gr.Dropdown(
|
|
|
|
| 180 |
value="float16",
|
| 181 |
interactive=True,
|
| 182 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
|
| 184 |
submit_button = gr.Button("Submit Eval")
|
| 185 |
submission_result = gr.Markdown()
|
|
|
|
| 187 |
add_new_eval,
|
| 188 |
[
|
| 189 |
model_name_textbox,
|
|
|
|
| 190 |
revision_name_textbox,
|
| 191 |
precision,
|
|
|
|
|
|
|
| 192 |
],
|
| 193 |
submission_result,
|
| 194 |
)
|
|
|
|
| 206 |
scheduler = BackgroundScheduler()
|
| 207 |
scheduler.add_job(restart_space, "interval", seconds=1800)
|
| 208 |
scheduler.start()
|
| 209 |
+
demo.queue(default_concurrency_limit=40).launch()
|
data/leaderboard/requests/ibm-granite/granite-vision-3.3-2b_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "ibm-granite/granite-vision-3.3-2b",
|
| 3 |
+
"revision": "main",
|
| 4 |
+
"precision": "float16",
|
| 5 |
+
"status": "PENDING",
|
| 6 |
+
"submitted_time": "2026-03-28T00:00:00Z",
|
| 7 |
+
"model_type": "pretrained",
|
| 8 |
+
"weight_type": "Original",
|
| 9 |
+
"likes": 0,
|
| 10 |
+
"params": 2.0,
|
| 11 |
+
"license": "?"
|
| 12 |
+
}
|
data/lloyd-jones-soph-170/ocr/lloyd-jones-apparatus.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1270": "αὑτοῦ a: αὐ- codd. plerique",
|
| 3 |
+
"1271": "ὄψοιντο Cat: -οιτο Lrpa",
|
| 4 |
+
"1276": "ἐπαίρων] cf. Senecae Oed. 962sq.: πείρων Nauck",
|
| 5 |
+
"1278-9": "del. West",
|
| 6 |
+
"1279": "χαλάζης αἵματος] χάλαζά θ᾽ αἱματοῦσσ᾽ Porson: alii alia (αἵματός 〈θ᾽〉 Zrpct)",
|
| 7 |
+
"1280-1": "del. Dindorf",
|
| 8 |
+
"1280": "ἐκ] ἐς (... κάρα) Pearson οὐ μόνου κακά] οὐ μόνου κάτα C. Otto: οὐχ ἑνὸς μόνου Porson: alii alia",
|
| 9 |
+
"1284": "ἄτη Rpat: ἄται Gp: ἄτε l",
|
| 10 |
+
"1286": "τινι Mudge et Elmsley, teste Hermann: τίνι codd.",
|
| 11 |
+
"1287": "κλῄθρα Lpa: κλεῖθρα rpat",
|
| 12 |
+
"1291": "μενῶν Lat: μένων rp δόμοις ἀραῖος ὡς] δόμοισιν ἔνοχος οἷς Nauck",
|
| 13 |
+
"1294": "δείξει] δόξει Xr, coni. Reiske κλῇθρα L, P s.l., a: κλεῖθρα rpat γὰρ rpat: γε lp"
|
| 14 |
+
}
|
data/lloyd-jones-soph-170/ocr/lloyd-jones-text.json
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1270": "ἄρας ἔπαισεν ἄρθρα τῶν αὑτοῦ κύκλων,",
|
| 3 |
+
"1271": "αὐδῶν τοιαῦθ᾽, ὁθούνεκ᾽ οὐκ ὄψοιντό νιν",
|
| 4 |
+
"1272": "οὔθ᾽ οἷ᾽ ἔπασχεν οὔθ᾽ ὁποῖ᾽ ἔδρα κακά,",
|
| 5 |
+
"1273": "ἀλλ᾽ ἐν σκότῳ τὸ λοιπὸν οὓς μὲν οὐκ ἔδει",
|
| 6 |
+
"1274": "ὀψοίαθ᾽, οὓς δ᾽ ἐχρῃζεν οὐ γνωσοίατο.",
|
| 7 |
+
"1275": "τοιαῦτ᾽ ἐφυμνῶν πολλάκις τε κοὐχ ἅπαξ",
|
| 8 |
+
"1276": "ἤρασσ᾽ ἐπαίρων βλέφαρα. φοίνιαι δ᾽ ὁμοῦ",
|
| 9 |
+
"1277": "γλῆναι γένει᾽ ἔτεγγον, οὐδ᾽ ἀνίεσαν.",
|
| 10 |
+
"1278": "[φόνου μυδώσας σταγόνας, ἀλλ᾽ ὁμοῦ μέλας",
|
| 11 |
+
"1279": "ὄμβρος †χαλάζης αἵματος† ἐτέγγετο.]",
|
| 12 |
+
"1280": "†τάδ᾽ ἐκ δυοῖν ἔρρωγεν οὐ μόνου κακά†",
|
| 13 |
+
"1281": "ἀλλ᾽ ἀνδρὶ καὶ γυναικὶ συμμιγῆ κακά.",
|
| 14 |
+
"1282": "ὁ πρὶν παλαιὸς δ᾽ ὄλβος ἦν πάροιθε μὲν",
|
| 15 |
+
"1283": "ὄλβος δικαίως, νῦν δὲ τῇδε θἠμέρᾳ",
|
| 16 |
+
"1284": "στεναγμός, ἄτη, θάνατος, αἰσχύνη, κακῶν",
|
| 17 |
+
"1285": "ὅσ᾽ ἐστὶ πάντων ὀνόματ᾽, οὐδὲν ἔστ᾽ ἀπόν.",
|
| 18 |
+
"1286": "νῦν δ᾽ ἔσθ᾽ ὁ τλήμων ἔν τινι σχολῇ κακοῦ;",
|
| 19 |
+
"1287": "βοᾷ διοίγειν κλῇθρα καὶ δηλοῦν τινα",
|
| 20 |
+
"1288": "τοῖς πᾶσι Καδμείοισι τὸν πατροκτόνον,",
|
| 21 |
+
"1289": "τὸν μητρός, αὐδῶν ἀνόσι᾽ οὐδὲ ῥητά μοι,",
|
| 22 |
+
"1290": "ὡς ἐκ χθονὸς ῥίψων ἑαυτόν, οὐδ᾽ ἔτι",
|
| 23 |
+
"1291": "μενῶν δόμοις ἀραῖος, ὡς ἠράσατο.",
|
| 24 |
+
"1292": "ρώμης γε μέντοι καὶ προηγητοῦ τινος",
|
| 25 |
+
"1293": "δεῖται· τὸ γὰρ νόσημα μεῖζον ἢ φέρειν.",
|
| 26 |
+
"1294": "δείξει δὲ καὶ σοί· κλῇθρα γὰρ πυλῶν τάδε"
|
| 27 |
+
}
|
data/lloyd-jones-soph-170/png/lloyd-jones-apparatus.png
ADDED
|
Git LFS Details
|
data/lloyd-jones-soph-170/png/lloyd-jones-fullpage.png
ADDED
|
Git LFS Details
|
data/lloyd-jones-soph-170/png/lloyd-jones-text.png
ADDED
|
Git LFS Details
|
src/about.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
from dataclasses import dataclass
|
| 2 |
from enum import Enum
|
| 3 |
|
|
|
|
| 4 |
@dataclass
|
| 5 |
class Task:
|
| 6 |
benchmark: str
|
|
@@ -8,65 +9,61 @@ class Task:
|
|
| 8 |
col_name: str
|
| 9 |
|
| 10 |
|
| 11 |
-
# Select your tasks here
|
| 12 |
-
# ---------------------------------------------------
|
| 13 |
class Tasks(Enum):
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
|
|
|
| 17 |
|
| 18 |
-
NUM_FEWSHOT = 0 # Change with your few shot
|
| 19 |
-
# ---------------------------------------------------
|
| 20 |
|
|
|
|
| 21 |
|
|
|
|
|
|
|
| 22 |
|
| 23 |
-
|
| 24 |
-
|
|
|
|
| 25 |
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
|
|
|
|
|
|
| 29 |
"""
|
| 30 |
|
| 31 |
-
|
| 32 |
-
LLM_BENCHMARKS_TEXT = f"""
|
| 33 |
## How it works
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
|
|
|
| 37 |
|
| 38 |
-
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
|
|
|
|
|
|
| 42 |
|
| 43 |
-
##
|
| 44 |
-
```python
|
| 45 |
-
from transformers import AutoConfig, AutoModel, AutoTokenizer
|
| 46 |
-
config = AutoConfig.from_pretrained("your model name", revision=revision)
|
| 47 |
-
model = AutoModel.from_pretrained("your model name", revision=revision)
|
| 48 |
-
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
|
| 49 |
-
```
|
| 50 |
-
If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
|
| 51 |
|
| 52 |
-
|
| 53 |
-
Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
|
| 54 |
|
| 55 |
-
|
| 56 |
-
It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
|
| 57 |
|
| 58 |
-
|
| 59 |
-
|
| 60 |
|
| 61 |
-
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
-
|
| 65 |
-
If your model is displayed in the `FAILED` category, its execution stopped.
|
| 66 |
-
Make sure you have followed the above steps first.
|
| 67 |
-
If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
|
| 68 |
"""
|
| 69 |
|
| 70 |
-
CITATION_BUTTON_LABEL = "
|
| 71 |
CITATION_BUTTON_TEXT = r"""
|
| 72 |
"""
|
|
|
|
| 1 |
from dataclasses import dataclass
|
| 2 |
from enum import Enum
|
| 3 |
|
| 4 |
+
|
| 5 |
@dataclass
|
| 6 |
class Task:
|
| 7 |
benchmark: str
|
|
|
|
| 9 |
col_name: str
|
| 10 |
|
| 11 |
|
|
|
|
|
|
|
| 12 |
class Tasks(Enum):
|
| 13 |
+
easy_levenshtein = Task("easy_levenshtein", "score", "Easy Lev. ↑")
|
| 14 |
+
easy_bleu = Task("easy_bleu", "score", "Easy BLEU ↑")
|
| 15 |
+
hard_levenshtein = Task("hard_levenshtein", "score", "Hard Lev. ↑")
|
| 16 |
+
hard_bleu = Task("hard_bleu", "score", "Hard BLEU ↑")
|
| 17 |
|
|
|
|
|
|
|
| 18 |
|
| 19 |
+
TITLE = """<h1 align="center" id="space-title">Critical Apparatus OCR Leaderboard</h1>"""
|
| 20 |
|
| 21 |
+
INTRODUCTION_TEXT = """
|
| 22 |
+
This benchmark measures OCR quality on a Greek critical edition page from Lloyd-Jones' *Sophocles*.
|
| 23 |
|
| 24 |
+
Systems must emit two JSON files:
|
| 25 |
+
- `text.json`: the main text lines keyed by line number
|
| 26 |
+
- `apparatus.json`: the critical apparatus keyed by line number or range
|
| 27 |
|
| 28 |
+
There are two task variants:
|
| 29 |
+
- `Easy`: input is already split into `text` and `apparatus` crops
|
| 30 |
+
- `Hard`: input is the full page image and the system must separate the two outputs itself
|
| 31 |
+
|
| 32 |
+
Each variant is scored with normalized Levenshtein similarity and BLEU. The leaderboard average is the mean of those four scores.
|
| 33 |
"""
|
| 34 |
|
| 35 |
+
LLM_BENCHMARKS_TEXT = """
|
|
|
|
| 36 |
## How it works
|
| 37 |
|
| 38 |
+
- Gold data lives in `data/lloyd-jones-soph-170/ocr/`
|
| 39 |
+
- Hard input lives in `data/lloyd-jones-soph-170/png/lloyd-jones-fullpage.png`
|
| 40 |
+
- Easy inputs live in `data/lloyd-jones-soph-170/png/lloyd-jones-text.png` and `data/lloyd-jones-soph-170/png/lloyd-jones-apparatus.png`
|
| 41 |
|
| 42 |
+
The expected output schema is a JSON object mapping line numbers or ranges to OCR strings, matching the gold files already in the repo.
|
| 43 |
|
| 44 |
+
## Metrics
|
| 45 |
+
|
| 46 |
+
- `Levenshtein`: character-level similarity after flattening each JSON file into a deterministic text representation
|
| 47 |
+
- `BLEU`: token-level overlap score on the same flattened representation
|
| 48 |
|
| 49 |
+
## First seeded submission
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
+
The queue is pre-seeded with `ibm-granite/granite-vision-3.3-2b`. Once a real result JSON is added for that model, it will appear in the leaderboard automatically.
|
|
|
|
| 52 |
|
| 53 |
+
"""
|
|
|
|
| 54 |
|
| 55 |
+
EVALUATION_QUEUE_TEXT = """
|
| 56 |
+
Submit the Hugging Face model repo and revision you want evaluated on this OCR task.
|
| 57 |
|
| 58 |
+
The evaluator should produce four scores in its result JSON:
|
| 59 |
+
- `easy_levenshtein.score`
|
| 60 |
+
- `easy_bleu.score`
|
| 61 |
+
- `hard_levenshtein.score`
|
| 62 |
+
- `hard_bleu.score`
|
| 63 |
|
| 64 |
+
The queue shown below is local-first, so the Space can be previewed before the dedicated backend datasets are configured.
|
|
|
|
|
|
|
|
|
|
| 65 |
"""
|
| 66 |
|
| 67 |
+
CITATION_BUTTON_LABEL = "Citation snippet"
|
| 68 |
CITATION_BUTTON_TEXT = r"""
|
| 69 |
"""
|
src/display/utils.py
CHANGED
|
@@ -1,10 +1,9 @@
|
|
| 1 |
from dataclasses import dataclass, make_dataclass
|
| 2 |
from enum import Enum
|
| 3 |
|
| 4 |
-
import pandas as pd
|
| 5 |
-
|
| 6 |
from src.about import Tasks
|
| 7 |
|
|
|
|
| 8 |
def fields(raw_class):
|
| 9 |
return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
|
| 10 |
|
|
@@ -25,11 +24,9 @@ auto_eval_column_dict = []
|
|
| 25 |
# Init
|
| 26 |
auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
|
| 27 |
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
| 28 |
-
#Scores
|
| 29 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
| 30 |
for task in Tasks:
|
| 31 |
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
|
| 32 |
-
# Model information
|
| 33 |
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
| 34 |
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
| 35 |
auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
|
|
@@ -48,9 +45,8 @@ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=
|
|
| 48 |
class EvalQueueColumn: # Queue column
|
| 49 |
model = ColumnContent("model", "markdown", True)
|
| 50 |
revision = ColumnContent("revision", "str", True)
|
| 51 |
-
private = ColumnContent("private", "bool", True)
|
| 52 |
precision = ColumnContent("precision", "str", True)
|
| 53 |
-
|
| 54 |
status = ColumnContent("status", "str", True)
|
| 55 |
|
| 56 |
## All the model information that we might need
|
|
@@ -62,10 +58,10 @@ class ModelDetails:
|
|
| 62 |
|
| 63 |
|
| 64 |
class ModelType(Enum):
|
| 65 |
-
PT = ModelDetails(name="pretrained", symbol="
|
| 66 |
-
FT = ModelDetails(name="fine-tuned", symbol="
|
| 67 |
-
IFT = ModelDetails(name="instruction-tuned", symbol="
|
| 68 |
-
RL = ModelDetails(name="RL-tuned", symbol="
|
| 69 |
Unknown = ModelDetails(name="", symbol="?")
|
| 70 |
|
| 71 |
def to_str(self, separator=" "):
|
|
@@ -107,4 +103,3 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
|
|
| 107 |
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
| 108 |
|
| 109 |
BENCHMARK_COLS = [t.value.col_name for t in Tasks]
|
| 110 |
-
|
|
|
|
| 1 |
from dataclasses import dataclass, make_dataclass
|
| 2 |
from enum import Enum
|
| 3 |
|
|
|
|
|
|
|
| 4 |
from src.about import Tasks
|
| 5 |
|
| 6 |
+
|
| 7 |
def fields(raw_class):
|
| 8 |
return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
|
| 9 |
|
|
|
|
| 24 |
# Init
|
| 25 |
auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
|
| 26 |
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
|
|
|
| 27 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
| 28 |
for task in Tasks:
|
| 29 |
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
|
|
|
|
| 30 |
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
| 31 |
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
| 32 |
auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
|
|
|
|
| 45 |
class EvalQueueColumn: # Queue column
|
| 46 |
model = ColumnContent("model", "markdown", True)
|
| 47 |
revision = ColumnContent("revision", "str", True)
|
|
|
|
| 48 |
precision = ColumnContent("precision", "str", True)
|
| 49 |
+
submitted_time = ColumnContent("submitted_time", "str", True)
|
| 50 |
status = ColumnContent("status", "str", True)
|
| 51 |
|
| 52 |
## All the model information that we might need
|
|
|
|
| 58 |
|
| 59 |
|
| 60 |
class ModelType(Enum):
|
| 61 |
+
PT = ModelDetails(name="pretrained", symbol="P")
|
| 62 |
+
FT = ModelDetails(name="fine-tuned", symbol="F")
|
| 63 |
+
IFT = ModelDetails(name="instruction-tuned", symbol="I")
|
| 64 |
+
RL = ModelDetails(name="RL-tuned", symbol="R")
|
| 65 |
Unknown = ModelDetails(name="", symbol="?")
|
| 66 |
|
| 67 |
def to_str(self, separator=" "):
|
|
|
|
| 103 |
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
| 104 |
|
| 105 |
BENCHMARK_COLS = [t.value.col_name for t in Tasks]
|
|
|
src/envs.py
CHANGED
|
@@ -2,24 +2,28 @@ import os
|
|
| 2 |
|
| 3 |
from huggingface_hub import HfApi
|
| 4 |
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
REPO_ID = f"{OWNER}/leaderboard"
|
| 13 |
-
QUEUE_REPO = f"{OWNER}/requests"
|
| 14 |
-
RESULTS_REPO = f"{OWNER}/results"
|
| 15 |
|
| 16 |
# If you setup a cache later, just change HF_HOME
|
| 17 |
-
CACHE_PATH=os.getenv("HF_HOME", ".")
|
|
|
|
|
|
|
| 18 |
|
| 19 |
# Local caches
|
| 20 |
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
|
| 21 |
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
|
| 22 |
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
|
| 23 |
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
|
|
|
|
|
|
|
| 24 |
|
| 25 |
API = HfApi(token=TOKEN)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
from huggingface_hub import HfApi
|
| 4 |
|
| 5 |
+
TOKEN = os.environ.get("HF_TOKEN")
|
| 6 |
+
OWNER = os.environ.get("HF_LEADERBOARD_OWNER", "")
|
|
|
|
| 7 |
|
| 8 |
+
REPO_ID = os.environ.get("SPACE_ID") or (f"{OWNER}/apparatus-ocr" if OWNER else "")
|
| 9 |
+
QUEUE_REPO = os.environ.get("HF_QUEUE_REPO") or (f"{OWNER}/requests" if OWNER else "")
|
| 10 |
+
RESULTS_REPO = os.environ.get("HF_RESULTS_REPO") or (f"{OWNER}/results" if OWNER else "")
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
# If you setup a cache later, just change HF_HOME
|
| 13 |
+
CACHE_PATH = os.getenv("HF_HOME", ".")
|
| 14 |
+
REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
| 15 |
+
LOCAL_DATA_PATH = os.path.join(REPO_ROOT, "data", "leaderboard")
|
| 16 |
|
| 17 |
# Local caches
|
| 18 |
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
|
| 19 |
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
|
| 20 |
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
|
| 21 |
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
|
| 22 |
+
LOCAL_EVAL_REQUESTS_PATH = os.path.join(LOCAL_DATA_PATH, "requests")
|
| 23 |
+
LOCAL_EVAL_RESULTS_PATH = os.path.join(LOCAL_DATA_PATH, "results")
|
| 24 |
|
| 25 |
API = HfApi(token=TOKEN)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def has_remote_backend() -> bool:
|
| 29 |
+
return bool(TOKEN and QUEUE_REPO and RESULTS_REPO)
|
src/evaluation/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.evaluation.metrics import bleu_score, flatten_ocr_json, levenshtein_similarity, paired_ocr_metrics
|
| 2 |
+
|
| 3 |
+
__all__ = [
|
| 4 |
+
"bleu_score",
|
| 5 |
+
"flatten_ocr_json",
|
| 6 |
+
"levenshtein_similarity",
|
| 7 |
+
"paired_ocr_metrics",
|
| 8 |
+
]
|
src/evaluation/build_result.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
from src.evaluation.metrics import bleu_score, flatten_ocr_json, levenshtein_similarity
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
REPO_ROOT = Path(__file__).resolve().parents[2]
|
| 10 |
+
GOLD_ROOT = REPO_ROOT / "data" / "lloyd-jones-soph-170" / "ocr"
|
| 11 |
+
TEXT_GOLD_PATH = GOLD_ROOT / "lloyd-jones-text.json"
|
| 12 |
+
APPARATUS_GOLD_PATH = GOLD_ROOT / "lloyd-jones-apparatus.json"
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def build_result_payload(
|
| 16 |
+
model_name: str,
|
| 17 |
+
revision: str,
|
| 18 |
+
precision: str,
|
| 19 |
+
easy_text_path: str,
|
| 20 |
+
easy_apparatus_path: str,
|
| 21 |
+
hard_text_path: str,
|
| 22 |
+
hard_apparatus_path: str,
|
| 23 |
+
) -> dict:
|
| 24 |
+
text_gold = _load_json(TEXT_GOLD_PATH)
|
| 25 |
+
apparatus_gold = _load_json(APPARATUS_GOLD_PATH)
|
| 26 |
+
|
| 27 |
+
easy_reference = _join_sections(text_gold, apparatus_gold)
|
| 28 |
+
easy_prediction = _join_sections(_load_json(easy_text_path), _load_json(easy_apparatus_path))
|
| 29 |
+
hard_reference = _join_sections(text_gold, apparatus_gold)
|
| 30 |
+
hard_prediction = _join_sections(_load_json(hard_text_path), _load_json(hard_apparatus_path))
|
| 31 |
+
|
| 32 |
+
return {
|
| 33 |
+
"config": {
|
| 34 |
+
"model_dtype": _normalize_precision(precision),
|
| 35 |
+
"model_name": model_name,
|
| 36 |
+
"model_sha": revision,
|
| 37 |
+
},
|
| 38 |
+
"results": {
|
| 39 |
+
"easy_levenshtein": {"score": levenshtein_similarity(easy_reference, easy_prediction) / 100.0},
|
| 40 |
+
"easy_bleu": {"score": bleu_score(easy_reference, easy_prediction) / 100.0},
|
| 41 |
+
"hard_levenshtein": {"score": levenshtein_similarity(hard_reference, hard_prediction) / 100.0},
|
| 42 |
+
"hard_bleu": {"score": bleu_score(hard_reference, hard_prediction) / 100.0},
|
| 43 |
+
},
|
| 44 |
+
}
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def main():
|
| 48 |
+
parser = argparse.ArgumentParser(description="Build a leaderboard-compatible result JSON for the OCR benchmark.")
|
| 49 |
+
parser.add_argument("--model-name", required=True)
|
| 50 |
+
parser.add_argument("--revision", default="main")
|
| 51 |
+
parser.add_argument("--precision", default="float16")
|
| 52 |
+
parser.add_argument("--easy-text", required=True)
|
| 53 |
+
parser.add_argument("--easy-apparatus", required=True)
|
| 54 |
+
parser.add_argument("--hard-text", required=True)
|
| 55 |
+
parser.add_argument("--hard-apparatus", required=True)
|
| 56 |
+
parser.add_argument("--output", required=True)
|
| 57 |
+
args = parser.parse_args()
|
| 58 |
+
|
| 59 |
+
payload = build_result_payload(
|
| 60 |
+
model_name=args.model_name,
|
| 61 |
+
revision=args.revision,
|
| 62 |
+
precision=args.precision,
|
| 63 |
+
easy_text_path=args.easy_text,
|
| 64 |
+
easy_apparatus_path=args.easy_apparatus,
|
| 65 |
+
hard_text_path=args.hard_text,
|
| 66 |
+
hard_apparatus_path=args.hard_apparatus,
|
| 67 |
+
)
|
| 68 |
+
|
| 69 |
+
os.makedirs(os.path.dirname(args.output), exist_ok=True)
|
| 70 |
+
with open(args.output, "w") as handle:
|
| 71 |
+
json.dump(payload, handle, ensure_ascii=False, indent=2)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def _load_json(path: str | Path) -> dict[str, str]:
|
| 75 |
+
with open(path) as handle:
|
| 76 |
+
return json.load(handle)
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def _join_sections(text_json: dict[str, str], apparatus_json: dict[str, str]) -> str:
    """Concatenate the flattened text and apparatus payloads under section headers."""
    sections = (
        "[TEXT]",
        flatten_ocr_json(text_json),
        "[APPARATUS]",
        flatten_ocr_json(apparatus_json),
    )
    return "\n".join(sections)
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def _normalize_precision(precision: str) -> str:
|
| 91 |
+
if precision.startswith("torch."):
|
| 92 |
+
return precision
|
| 93 |
+
return f"torch.{precision}"
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
# Run the CLI when this module is executed directly (not when imported).
if __name__ == "__main__":
    main()
|
src/evaluation/metrics.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import math
|
| 2 |
+
from collections import Counter
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def flatten_ocr_json(payload: dict[str, str]) -> str:
    """Serialize an OCR payload as one tab-separated ``key\\tvalue`` line per entry.

    Entries are emitted in ``_sort_key`` order so two payloads with the same
    content always flatten to identical strings.
    """
    ordered_keys = sorted(payload, key=_sort_key)
    return "\n".join(f"{key}\t{payload[key]}" for key in ordered_keys)
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def _sort_key(value: str):
|
| 11 |
+
head, _, tail = value.partition("-")
|
| 12 |
+
try:
|
| 13 |
+
return (int(head), int(tail) if tail else -1, value)
|
| 14 |
+
except ValueError:
|
| 15 |
+
return (math.inf, math.inf, value)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def levenshtein_similarity(reference: str, prediction: str) -> float:
    """Character-level similarity in [0, 100] based on Levenshtein distance.

    100 means identical strings; 0 means one side is empty (and the other is
    not) or the edit distance equals the longer string's length.
    """
    if reference == prediction:
        # Covers the identical case, including both strings empty.
        return 100.0
    if not reference or not prediction:
        return 0.0

    # Two-row dynamic program over edit distance.
    previous_row = list(range(len(prediction) + 1))
    for row_index, ref_char in enumerate(reference, start=1):
        current_row = [row_index]
        for col_index, pred_char in enumerate(prediction, start=1):
            cost = 0 if ref_char == pred_char else 1
            deletion = previous_row[col_index] + 1
            insertion = current_row[col_index - 1] + 1
            substitution = previous_row[col_index - 1] + cost
            current_row.append(min(deletion, insertion, substitution))
        previous_row = current_row

    distance = previous_row[-1]
    longest = max(len(reference), len(prediction))
    return max(0.0, (1 - (distance / longest)) * 100.0)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def bleu_score(reference: str, prediction: str, max_order: int = 4) -> float:
    """Smoothed sentence-level BLEU (as a percentage) over whitespace tokens.

    Uses add-one smoothing on each n-gram precision and the standard brevity
    penalty; returns 100 when both sides are empty, 0 when exactly one is.
    """
    ref_tokens = reference.split()
    pred_tokens = prediction.split()

    if not ref_tokens and not pred_tokens:
        return 100.0
    if not ref_tokens or not pred_tokens:
        return 0.0

    precisions = []
    for order in range(1, max_order + 1):
        ref_counts = _ngram_counts(ref_tokens, order)
        pred_counts = _ngram_counts(pred_tokens, order)
        # Clipped n-gram matches, as in standard BLEU.
        matched = sum(min(freq, ref_counts[gram]) for gram, freq in pred_counts.items())
        candidate_total = max(sum(pred_counts.values()), 1)
        # Add-one smoothing keeps the log defined when there is no overlap.
        precisions.append((matched + 1) / (candidate_total + 1))

    geometric_mean = math.exp(sum(math.log(p) for p in precisions) / max_order)
    if len(pred_tokens) > len(ref_tokens):
        brevity_penalty = 1.0
    else:
        brevity_penalty = math.exp(1 - (len(ref_tokens) / len(pred_tokens)))
    return geometric_mean * brevity_penalty * 100.0
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def paired_ocr_metrics(reference: dict[str, str], prediction: dict[str, str]) -> dict[str, float]:
    """Score a prediction payload against a reference payload with both metrics."""
    flat_reference = flatten_ocr_json(reference)
    flat_prediction = flatten_ocr_json(prediction)
    return {
        "levenshtein": levenshtein_similarity(flat_reference, flat_prediction),
        "bleu": bleu_score(flat_reference, flat_prediction),
    }
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def _ngram_counts(tokens: list[str], order: int) -> Counter:
|
| 76 |
+
if len(tokens) < order:
|
| 77 |
+
return Counter()
|
| 78 |
+
return Counter(tuple(tokens[i : i + order]) for i in range(len(tokens) - order + 1))
|
src/leaderboard/read_evals.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
import glob
|
| 2 |
import json
|
| 3 |
-
import math
|
| 4 |
import os
|
| 5 |
from dataclasses import dataclass
|
| 6 |
|
|
@@ -57,10 +56,16 @@ class EvalResult:
|
|
| 57 |
result_key = f"{org}_{model}_{precision.value.name}"
|
| 58 |
full_model = "/".join(org_and_model)
|
| 59 |
|
| 60 |
-
still_on_hub, _, model_config = is_model_on_hub(
|
| 61 |
-
full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
|
| 62 |
-
)
|
| 63 |
architecture = "?"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
if model_config is not None:
|
| 65 |
architectures = getattr(model_config, "architectures", None)
|
| 66 |
if architectures:
|
|
@@ -127,7 +132,7 @@ class EvalResult:
|
|
| 127 |
}
|
| 128 |
|
| 129 |
for task in Tasks:
|
| 130 |
-
data_dict[task.value.col_name] = self.results
|
| 131 |
|
| 132 |
return data_dict
|
| 133 |
|
|
@@ -146,10 +151,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
|
|
| 146 |
for tmp_request_file in request_files:
|
| 147 |
with open(tmp_request_file, "r") as f:
|
| 148 |
req_content = json.load(f)
|
| 149 |
-
if (
|
| 150 |
-
req_content["status"] in ["FINISHED"]
|
| 151 |
-
and req_content["precision"] == precision.split(".")[-1]
|
| 152 |
-
):
|
| 153 |
request_file = tmp_request_file
|
| 154 |
return request_file
|
| 155 |
|
|
|
|
| 1 |
import glob
|
| 2 |
import json
|
|
|
|
| 3 |
import os
|
| 4 |
from dataclasses import dataclass
|
| 5 |
|
|
|
|
| 56 |
result_key = f"{org}_{model}_{precision.value.name}"
|
| 57 |
full_model = "/".join(org_and_model)
|
| 58 |
|
|
|
|
|
|
|
|
|
|
| 59 |
architecture = "?"
|
| 60 |
+
still_on_hub = False
|
| 61 |
+
model_config = None
|
| 62 |
+
try:
|
| 63 |
+
still_on_hub, _, model_config = is_model_on_hub(
|
| 64 |
+
full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
|
| 65 |
+
)
|
| 66 |
+
except Exception:
|
| 67 |
+
still_on_hub = False
|
| 68 |
+
|
| 69 |
if model_config is not None:
|
| 70 |
architectures = getattr(model_config, "architectures", None)
|
| 71 |
if architectures:
|
|
|
|
| 132 |
}
|
| 133 |
|
| 134 |
for task in Tasks:
|
| 135 |
+
data_dict[task.value.col_name] = self.results.get(task.value.benchmark)
|
| 136 |
|
| 137 |
return data_dict
|
| 138 |
|
|
|
|
| 151 |
for tmp_request_file in request_files:
|
| 152 |
with open(tmp_request_file, "r") as f:
|
| 153 |
req_content = json.load(f)
|
| 154 |
+
if req_content["status"] in ["FINISHED", "FINISHED_MANUAL"] and req_content["precision"] == precision.split(".")[-1]:
|
|
|
|
|
|
|
|
|
|
| 155 |
request_file = tmp_request_file
|
| 156 |
return request_file
|
| 157 |
|
src/populate.py
CHANGED
|
@@ -39,7 +39,11 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
|
|
| 39 |
all_evals.append(data)
|
| 40 |
elif ".md" not in entry:
|
| 41 |
# this is a folder
|
| 42 |
-
sub_entries = [
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
for sub_entry in sub_entries:
|
| 44 |
file_path = os.path.join(save_path, entry, sub_entry)
|
| 45 |
with open(file_path) as fp:
|
|
|
|
| 39 |
all_evals.append(data)
|
| 40 |
elif ".md" not in entry:
|
| 41 |
# this is a folder
|
| 42 |
+
sub_entries = [
|
| 43 |
+
e
|
| 44 |
+
for e in os.listdir(f"{save_path}/{entry}")
|
| 45 |
+
if os.path.isfile(os.path.join(save_path, entry, e)) and not e.startswith(".")
|
| 46 |
+
]
|
| 47 |
for sub_entry in sub_entries:
|
| 48 |
file_path = os.path.join(save_path, entry, sub_entry)
|
| 49 |
with open(file_path) as fp:
|
src/submission/check_validity.py
CHANGED
|
@@ -1,8 +1,6 @@
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
-
import re
|
| 4 |
from collections import defaultdict
|
| 5 |
-
from datetime import datetime, timedelta, timezone
|
| 6 |
|
| 7 |
import huggingface_hub
|
| 8 |
from huggingface_hub import ModelCard
|
|
@@ -37,14 +35,14 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
|
|
| 37 |
config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
|
| 38 |
if test_tokenizer:
|
| 39 |
try:
|
| 40 |
-
|
| 41 |
except ValueError as e:
|
| 42 |
return (
|
| 43 |
False,
|
| 44 |
f"uses a tokenizer which is not in a transformers release: {e}",
|
| 45 |
None
|
| 46 |
)
|
| 47 |
-
except Exception
|
| 48 |
return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
|
| 49 |
return True, None, config
|
| 50 |
|
|
@@ -55,7 +53,7 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
|
|
| 55 |
None
|
| 56 |
)
|
| 57 |
|
| 58 |
-
except Exception
|
| 59 |
return False, "was not found on hub!", None
|
| 60 |
|
| 61 |
|
|
|
|
| 1 |
import json
|
| 2 |
import os
|
|
|
|
| 3 |
from collections import defaultdict
|
|
|
|
| 4 |
|
| 5 |
import huggingface_hub
|
| 6 |
from huggingface_hub import ModelCard
|
|
|
|
| 35 |
config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
|
| 36 |
if test_tokenizer:
|
| 37 |
try:
|
| 38 |
+
AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
|
| 39 |
except ValueError as e:
|
| 40 |
return (
|
| 41 |
False,
|
| 42 |
f"uses a tokenizer which is not in a transformers release: {e}",
|
| 43 |
None
|
| 44 |
)
|
| 45 |
+
except Exception:
|
| 46 |
return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
|
| 47 |
return True, None, config
|
| 48 |
|
|
|
|
| 53 |
None
|
| 54 |
)
|
| 55 |
|
| 56 |
+
except Exception:
|
| 57 |
return False, "was not found on hub!", None
|
| 58 |
|
| 59 |
|
src/submission/submit.py
CHANGED
|
@@ -3,7 +3,7 @@ import os
|
|
| 3 |
from datetime import datetime, timezone
|
| 4 |
|
| 5 |
from src.display.formatting import styled_error, styled_message, styled_warning
|
| 6 |
-
from src.envs import API, EVAL_REQUESTS_PATH,
|
| 7 |
from src.submission.check_validity import (
|
| 8 |
already_submitted_models,
|
| 9 |
check_model_card,
|
|
@@ -16,16 +16,15 @@ USERS_TO_SUBMISSION_DATES = None
|
|
| 16 |
|
| 17 |
def add_new_eval(
|
| 18 |
model: str,
|
| 19 |
-
base_model: str,
|
| 20 |
revision: str,
|
| 21 |
precision: str,
|
| 22 |
-
weight_type: str,
|
| 23 |
-
model_type: str,
|
| 24 |
):
|
| 25 |
global REQUESTED_MODELS
|
| 26 |
global USERS_TO_SUBMISSION_DATES
|
|
|
|
|
|
|
| 27 |
if not REQUESTED_MODELS:
|
| 28 |
-
REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(
|
| 29 |
|
| 30 |
user_name = ""
|
| 31 |
model_path = model
|
|
@@ -36,58 +35,42 @@ def add_new_eval(
|
|
| 36 |
precision = precision.split(" ")[0]
|
| 37 |
current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
| 38 |
|
| 39 |
-
if model_type is None or model_type == "":
|
| 40 |
-
return styled_error("Please select a model type.")
|
| 41 |
-
|
| 42 |
-
# Does the model actually exist?
|
| 43 |
if revision == "":
|
| 44 |
revision = "main"
|
| 45 |
|
| 46 |
-
#
|
| 47 |
-
|
| 48 |
-
base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
|
| 49 |
-
if not base_model_on_hub:
|
| 50 |
-
return styled_error(f'Base model "{base_model}" {error}')
|
| 51 |
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
| 53 |
model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
|
| 54 |
if not model_on_hub:
|
| 55 |
return styled_error(f'Model "{model}" {error}')
|
| 56 |
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
license = model_info.cardData["license"]
|
| 68 |
-
except Exception:
|
| 69 |
-
return styled_error("Please select a license for your model")
|
| 70 |
-
|
| 71 |
-
modelcard_OK, error_msg = check_model_card(model)
|
| 72 |
-
if not modelcard_OK:
|
| 73 |
-
return styled_error(error_msg)
|
| 74 |
-
|
| 75 |
-
# Seems good, creating the eval
|
| 76 |
-
print("Adding new eval")
|
| 77 |
|
| 78 |
eval_entry = {
|
| 79 |
"model": model,
|
| 80 |
-
"base_model": base_model,
|
| 81 |
"revision": revision,
|
| 82 |
"precision": precision,
|
| 83 |
-
"weight_type": weight_type,
|
| 84 |
"status": "PENDING",
|
| 85 |
"submitted_time": current_time,
|
| 86 |
-
"model_type":
|
| 87 |
-
"
|
|
|
|
| 88 |
"params": model_size,
|
| 89 |
"license": license,
|
| 90 |
-
"private": False,
|
| 91 |
}
|
| 92 |
|
| 93 |
# Check for duplicate submission
|
|
@@ -95,25 +78,27 @@ def add_new_eval(
|
|
| 95 |
return styled_warning("This model has been already submitted.")
|
| 96 |
|
| 97 |
print("Creating eval file")
|
| 98 |
-
OUT_DIR = f"{
|
| 99 |
os.makedirs(OUT_DIR, exist_ok=True)
|
| 100 |
-
out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}
|
| 101 |
|
| 102 |
with open(out_path, "w") as f:
|
| 103 |
f.write(json.dumps(eval_entry))
|
| 104 |
|
| 105 |
print("Uploading eval file")
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
|
|
|
| 113 |
|
| 114 |
# Remove the local file
|
| 115 |
-
|
|
|
|
| 116 |
|
| 117 |
return styled_message(
|
| 118 |
-
"Your request has been submitted to the evaluation queue
|
| 119 |
)
|
|
|
|
| 3 |
from datetime import datetime, timezone
|
| 4 |
|
| 5 |
from src.display.formatting import styled_error, styled_message, styled_warning
|
| 6 |
+
from src.envs import API, EVAL_REQUESTS_PATH, LOCAL_EVAL_REQUESTS_PATH, QUEUE_REPO, TOKEN, has_remote_backend
|
| 7 |
from src.submission.check_validity import (
|
| 8 |
already_submitted_models,
|
| 9 |
check_model_card,
|
|
|
|
| 16 |
|
| 17 |
def add_new_eval(
|
| 18 |
model: str,
|
|
|
|
| 19 |
revision: str,
|
| 20 |
precision: str,
|
|
|
|
|
|
|
| 21 |
):
|
| 22 |
global REQUESTED_MODELS
|
| 23 |
global USERS_TO_SUBMISSION_DATES
|
| 24 |
+
requests_path = EVAL_REQUESTS_PATH if has_remote_backend() else LOCAL_EVAL_REQUESTS_PATH
|
| 25 |
+
|
| 26 |
if not REQUESTED_MODELS:
|
| 27 |
+
REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(requests_path)
|
| 28 |
|
| 29 |
user_name = ""
|
| 30 |
model_path = model
|
|
|
|
| 35 |
precision = precision.split(" ")[0]
|
| 36 |
current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
if revision == "":
|
| 39 |
revision = "main"
|
| 40 |
|
| 41 |
+
# Seems good, creating the eval
|
| 42 |
+
print("Adding new eval")
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
+
license = "?"
|
| 45 |
+
model_size = 0
|
| 46 |
+
likes = 0
|
| 47 |
+
if has_remote_backend():
|
| 48 |
model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
|
| 49 |
if not model_on_hub:
|
| 50 |
return styled_error(f'Model "{model}" {error}')
|
| 51 |
|
| 52 |
+
try:
|
| 53 |
+
model_info = API.model_info(repo_id=model, revision=revision)
|
| 54 |
+
model_size = get_model_size(model_info=model_info, precision=precision)
|
| 55 |
+
likes = model_info.likes
|
| 56 |
+
license = model_info.cardData.get("license", "?")
|
| 57 |
+
modelcard_OK, error_msg = check_model_card(model)
|
| 58 |
+
if not modelcard_OK:
|
| 59 |
+
return styled_error(error_msg)
|
| 60 |
+
except Exception:
|
| 61 |
+
return styled_error("Could not get your model information from the Hub.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
eval_entry = {
|
| 64 |
"model": model,
|
|
|
|
| 65 |
"revision": revision,
|
| 66 |
"precision": precision,
|
|
|
|
| 67 |
"status": "PENDING",
|
| 68 |
"submitted_time": current_time,
|
| 69 |
+
"model_type": "pretrained",
|
| 70 |
+
"weight_type": "Original",
|
| 71 |
+
"likes": likes,
|
| 72 |
"params": model_size,
|
| 73 |
"license": license,
|
|
|
|
| 74 |
}
|
| 75 |
|
| 76 |
# Check for duplicate submission
|
|
|
|
| 78 |
return styled_warning("This model has been already submitted.")
|
| 79 |
|
| 80 |
print("Creating eval file")
|
| 81 |
+
OUT_DIR = f"{requests_path}/{user_name}"
|
| 82 |
os.makedirs(OUT_DIR, exist_ok=True)
|
| 83 |
+
out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_Original.json"
|
| 84 |
|
| 85 |
with open(out_path, "w") as f:
|
| 86 |
f.write(json.dumps(eval_entry))
|
| 87 |
|
| 88 |
print("Uploading eval file")
|
| 89 |
+
if has_remote_backend():
|
| 90 |
+
API.upload_file(
|
| 91 |
+
path_or_fileobj=out_path,
|
| 92 |
+
path_in_repo=out_path.split("eval-queue/")[1],
|
| 93 |
+
repo_id=QUEUE_REPO,
|
| 94 |
+
repo_type="dataset",
|
| 95 |
+
commit_message=f"Add {model} to eval queue",
|
| 96 |
+
)
|
| 97 |
|
| 98 |
# Remove the local file
|
| 99 |
+
if has_remote_backend():
|
| 100 |
+
os.remove(out_path)
|
| 101 |
|
| 102 |
return styled_message(
|
| 103 |
+
"Your request has been submitted to the evaluation queue."
|
| 104 |
)
|