al1808th commited on
Commit
69dc570
·
1 Parent(s): bc4255a

first commit

Browse files
README.md CHANGED
@@ -13,36 +13,60 @@ tags:
13
  - leaderboard
14
  ---
15
 
16
- # Start the configuration
17
 
18
- Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
19
 
20
- Results files should have the following format and be stored as json files:
 
 
 
 
 
 
 
21
  ```json
22
  {
23
  "config": {
24
- "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
25
- "model_name": "path of the model on the hub: org/model",
26
- "model_sha": "revision on the hub",
27
  },
28
  "results": {
29
- "task_name": {
30
- "metric_name": score,
 
 
 
 
 
 
31
  },
32
- "task_name2": {
33
- "metric_name": score,
34
  }
35
  }
36
  }
37
  ```
38
 
39
- Request files are created automatically by this tool.
 
 
40
 
41
- If you encounter problem on the space, don't hesitate to restart it to remove the create eval-queue, eval-queue-bk, eval-results and eval-results-bk created folder.
 
 
 
 
 
42
 
43
- # Code logic for more complex edits
44
-
45
- You'll find
46
- - the main table' columns names and properties in `src/display/utils.py`
47
- - the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
48
- - the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
 
 
 
 
 
13
  - leaderboard
14
  ---
15
 
16
+ # OCR leaderboard
17
 
18
+ This Space is customized for a two-level OCR benchmark on a single critical-edition page.
19
 
20
+ Inputs and gold outputs live under `data/lloyd-jones-soph-170/`:
21
+ - `png/lloyd-jones-fullpage.png`: hard task input
22
+ - `png/lloyd-jones-text.png`: easy task text crop
23
+ - `png/lloyd-jones-apparatus.png`: easy task apparatus crop
24
+ - `ocr/lloyd-jones-text.json`: gold main-text output
25
+ - `ocr/lloyd-jones-apparatus.json`: gold apparatus output
26
+
27
+ The leaderboard expects result files in the following format:
28
  ```json
29
  {
30
  "config": {
31
+ "model_dtype": "torch.float16",
32
+ "model_name": "org/model",
33
+ "model_sha": "main"
34
  },
35
  "results": {
36
+ "easy_levenshtein": {
37
+ "score": 91.23
38
+ },
39
+ "easy_bleu": {
40
+ "score": 84.56
41
+ },
42
+ "hard_levenshtein": {
43
+ "score": 79.10
44
  },
45
+ "hard_bleu": {
46
+ "score": 70.42
47
  }
48
  }
49
  }
50
  ```
51
 
52
+ The Space is local-first:
53
+ - If HF backend datasets are configured via env vars, it will sync from them.
54
+ - Otherwise it reads seeded queue/results data from `data/leaderboard/`.
55
 
56
+ Useful files:
57
+ - `src/about.py`: task definitions and benchmark copy
58
+ - `src/evaluation/metrics.py`: local OCR metric helpers
59
+ - `src/evaluation/build_result.py`: CLI to turn predicted OCR JSON files into a leaderboard result JSON
60
+ - `src/leaderboard/read_evals.py`: result ingestion
61
+ - `src/populate.py`: leaderboard and queue dataframe assembly
62
 
63
+ Example:
64
+ ```bash
65
+ python -m src.evaluation.build_result \
66
+ --model-name ibm-granite/granite-vision-3.3-2b \
67
+ --easy-text path/to/easy-text.json \
68
+ --easy-apparatus path/to/easy-apparatus.json \
69
+ --hard-text path/to/hard-text.json \
70
+ --hard-apparatus path/to/hard-apparatus.json \
71
+ --output data/leaderboard/results/ibm-granite/results_2026-03-28T00-00-00Z.json
72
+ ```
app.py CHANGED
@@ -19,47 +19,63 @@ from src.display.utils import (
19
  EVAL_COLS,
20
  EVAL_TYPES,
21
  AutoEvalColumn,
22
- ModelType,
23
  fields,
24
- WeightType,
25
- Precision
 
 
 
 
 
 
 
 
 
 
 
26
  )
27
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
  from src.submission.submit import add_new_eval
30
 
31
 
32
  def restart_space():
33
- API.restart_space(repo_id=REPO_ID)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  ### Space initialisation
36
- try:
37
- print(EVAL_REQUESTS_PATH)
38
- snapshot_download(
39
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
40
- )
41
- except Exception:
42
- restart_space()
43
- try:
44
- print(EVAL_RESULTS_PATH)
45
- snapshot_download(
46
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
47
- )
48
- except Exception:
49
- restart_space()
50
 
51
 
52
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
53
 
54
  (
55
  finished_eval_queue_df,
56
  running_eval_queue_df,
57
  pending_eval_queue_df,
58
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
 
60
  def init_leaderboard(dataframe):
61
- if dataframe is None or dataframe.empty:
62
- raise ValueError("Leaderboard DataFrame is empty or None.")
63
  return Leaderboard(
64
  value=dataframe,
65
  datatype=[c.type for c in fields(AutoEvalColumn)],
@@ -95,8 +111,15 @@ with demo:
95
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
 
97
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
- leaderboard = init_leaderboard(LEADERBOARD_DF)
 
 
 
 
 
 
 
100
 
101
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
@@ -146,15 +169,8 @@ with demo:
146
 
147
  with gr.Row():
148
  with gr.Column():
149
- model_name_textbox = gr.Textbox(label="Model name")
150
  revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
151
- model_type = gr.Dropdown(
152
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
153
- label="Model type",
154
- multiselect=False,
155
- value=None,
156
- interactive=True,
157
- )
158
 
159
  with gr.Column():
160
  precision = gr.Dropdown(
@@ -164,14 +180,6 @@ with demo:
164
  value="float16",
165
  interactive=True,
166
  )
167
- weight_type = gr.Dropdown(
168
- choices=[i.value.name for i in WeightType],
169
- label="Weights type",
170
- multiselect=False,
171
- value="Original",
172
- interactive=True,
173
- )
174
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
175
 
176
  submit_button = gr.Button("Submit Eval")
177
  submission_result = gr.Markdown()
@@ -179,11 +187,8 @@ with demo:
179
  add_new_eval,
180
  [
181
  model_name_textbox,
182
- base_model_name_textbox,
183
  revision_name_textbox,
184
  precision,
185
- weight_type,
186
- model_type,
187
  ],
188
  submission_result,
189
  )
@@ -201,4 +206,4 @@ with demo:
201
  scheduler = BackgroundScheduler()
202
  scheduler.add_job(restart_space, "interval", seconds=1800)
203
  scheduler.start()
204
- demo.queue(default_concurrency_limit=40).launch()
 
19
  EVAL_COLS,
20
  EVAL_TYPES,
21
  AutoEvalColumn,
 
22
  fields,
23
+ Precision,
24
+ )
25
+ from src.envs import (
26
+ API,
27
+ EVAL_REQUESTS_PATH,
28
+ EVAL_RESULTS_PATH,
29
+ LOCAL_EVAL_REQUESTS_PATH,
30
+ LOCAL_EVAL_RESULTS_PATH,
31
+ QUEUE_REPO,
32
+ REPO_ID,
33
+ RESULTS_REPO,
34
+ TOKEN,
35
+ has_remote_backend,
36
  )
 
37
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
38
  from src.submission.submit import add_new_eval
39
 
40
 
41
def restart_space():
    """Restart the hosting Space via the HF API.

    No-op when the Space id or token is missing (local preview mode),
    so the scheduler job is safe to run without a configured backend.
    """
    if REPO_ID and TOKEN:
        API.restart_space(repo_id=REPO_ID)
44
+
45
+
46
def sync_or_fallback(repo_id: str, local_dir: str, fallback_dir: str) -> str:
    """Sync a backend dataset into ``local_dir`` or fall back to seeded data.

    Returns the directory the caller should read from:
    - ``local_dir`` after a successful ``snapshot_download`` of ``repo_id``;
    - ``fallback_dir`` when no remote backend is configured, ``repo_id`` is
      empty, or the download fails for any reason.
    """
    if not has_remote_backend() or not repo_id:
        return fallback_dir

    try:
        snapshot_download(
            repo_id=repo_id,
            local_dir=local_dir,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )
        return local_dir
    except Exception:
        # Best-effort: any hub error (auth, network, missing repo) falls back
        # to the locally seeded queue/results instead of crashing startup.
        return fallback_dir
62
 
63
  ### Space initialisation
64
+ REQUESTS_PATH = sync_or_fallback(QUEUE_REPO, EVAL_REQUESTS_PATH, LOCAL_EVAL_REQUESTS_PATH)
65
+ RESULTS_PATH = sync_or_fallback(RESULTS_REPO, EVAL_RESULTS_PATH, LOCAL_EVAL_RESULTS_PATH)
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
 
68
+ LEADERBOARD_DF = get_leaderboard_df(RESULTS_PATH, REQUESTS_PATH, COLS, BENCHMARK_COLS)
69
 
70
  (
71
  finished_eval_queue_df,
72
  running_eval_queue_df,
73
  pending_eval_queue_df,
74
+ ) = get_evaluation_queue_df(REQUESTS_PATH, EVAL_COLS)
75
 
76
  def init_leaderboard(dataframe):
77
+ if dataframe is None:
78
+ dataframe = pd.DataFrame(columns=COLS)
79
  return Leaderboard(
80
  value=dataframe,
81
  datatype=[c.type for c in fields(AutoEvalColumn)],
 
111
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
112
 
113
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
114
+ with gr.TabItem("🏅 OCR Benchmark", elem_id="llm-benchmark-tab-table", id=0):
115
+ if LEADERBOARD_DF.empty:
116
+ gr.Markdown(
117
+ "No finished evaluations are available yet. The queue below is seeded with the first model submission.",
118
+ elem_classes="markdown-text",
119
+ )
120
+ gr.Dataframe(value=LEADERBOARD_DF, headers=COLS, interactive=False)
121
+ else:
122
+ leaderboard = init_leaderboard(LEADERBOARD_DF)
123
 
124
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
125
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
169
 
170
  with gr.Row():
171
  with gr.Column():
172
+ model_name_textbox = gr.Textbox(label="Model name", value="ibm-granite/granite-vision-3.3-2b")
173
  revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
 
 
 
 
 
 
 
174
 
175
  with gr.Column():
176
  precision = gr.Dropdown(
 
180
  value="float16",
181
  interactive=True,
182
  )
 
 
 
 
 
 
 
 
183
 
184
  submit_button = gr.Button("Submit Eval")
185
  submission_result = gr.Markdown()
 
187
  add_new_eval,
188
  [
189
  model_name_textbox,
 
190
  revision_name_textbox,
191
  precision,
 
 
192
  ],
193
  submission_result,
194
  )
 
206
  scheduler = BackgroundScheduler()
207
  scheduler.add_job(restart_space, "interval", seconds=1800)
208
  scheduler.start()
209
+ demo.queue(default_concurrency_limit=40).launch()
data/leaderboard/requests/ibm-granite/granite-vision-3.3-2b_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "ibm-granite/granite-vision-3.3-2b",
3
+ "revision": "main",
4
+ "precision": "float16",
5
+ "status": "PENDING",
6
+ "submitted_time": "2026-03-28T00:00:00Z",
7
+ "model_type": "pretrained",
8
+ "weight_type": "Original",
9
+ "likes": 0,
10
+ "params": 2.0,
11
+ "license": "?"
12
+ }
data/lloyd-jones-soph-170/ocr/lloyd-jones-apparatus.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "1270": "αὑτοῦ a: αὐ- codd. plerique",
3
+ "1271": "ὄψοιντο Cat: -οιτο Lrpa",
4
+ "1276": "ἐπαίρων] cf. Senecae Oed. 962sq.: πείρων Nauck",
5
+ "1278-9": "del. West",
6
+ "1279": "χαλάζης αἵματος] χάλαζά θ᾽ αἱματοῦσσ᾽ Porson: alii alia (αἵματός 〈θ᾽〉 Zrpct)",
7
+ "1280-1": "del. Dindorf",
8
+ "1280": "ἐκ] ἐς (... κάρα) Pearson οὐ μόνου κακά] οὐ μόνου κάτα C. Otto: οὐχ ἑνὸς μόνου Porson: alii alia",
9
+ "1284": "ἄτη Rpat: ἄται Gp: ἄτε l",
10
+ "1286": "τινι Mudge et Elmsley, teste Hermann: τίνι codd.",
11
+ "1287": "κλῄθρα Lpa: κλεῖθρα rpat",
12
+ "1291": "μενῶν Lat: μένων rp δόμοις ἀραῖος ὡς] δόμοισιν ἔνοχος οἷς Nauck",
13
+ "1294": "δείξει] δόξει Xr, coni. Reiske κλῇθρα L, P s.l., a: κλεῖθρα rpat γὰρ rpat: γε lp"
14
+ }
data/lloyd-jones-soph-170/ocr/lloyd-jones-text.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "1270": "ἄρας ἔπαισεν ἄρθρα τῶν αὑτοῦ κύκλων,",
3
+ "1271": "αὐδῶν τοιαῦθ᾽, ὁθούνεκ᾽ οὐκ ὄψοιντό νιν",
4
+ "1272": "οὔθ᾽ οἷ᾽ ἔπασχεν οὔθ᾽ ὁποῖ᾽ ἔδρα κακά,",
5
+ "1273": "ἀλλ᾽ ἐν σκότῳ τὸ λοιπὸν οὓς μὲν οὐκ ἔδει",
6
+ "1274": "ὀψοίαθ᾽, οὓς δ᾽ ἐχρῃζεν οὐ γνωσοίατο.",
7
+ "1275": "τοιαῦτ᾽ ἐφυμνῶν πολλάκις τε κοὐχ ἅπαξ",
8
+ "1276": "ἤρασσ᾽ ἐπαίρων βλέφαρα. φοίνιαι δ᾽ ὁμοῦ",
9
+ "1277": "γλῆναι γένει᾽ ἔτεγγον, οὐδ᾽ ἀνίεσαν.",
10
+ "1278": "[φόνου μυδώσας σταγόνας, ἀλλ᾽ ὁμοῦ μέλας",
11
+ "1279": "ὄμβρος †χαλάζης αἵματος† ἐτέγγετο.]",
12
+ "1280": "†τάδ᾽ ἐκ δυοῖν ἔρρωγεν οὐ μόνου κακά†",
13
+ "1281": "ἀλλ᾽ ἀνδρὶ καὶ γυναικὶ συμμιγῆ κακά.",
14
+ "1282": "ὁ πρὶν παλαιὸς δ᾽ ὄλβος ἦν πάροιθε μὲν",
15
+ "1283": "ὄλβος δικαίως, νῦν δὲ τῇδε θἠμέρᾳ",
16
+ "1284": "στεναγμός, ἄτη, θάνατος, αἰσχύνη, κακῶν",
17
+ "1285": "ὅσ᾽ ἐστὶ πάντων ὀνόματ᾽, οὐδὲν ἔστ᾽ ἀπόν.",
18
+ "1286": "νῦν δ᾽ ἔσθ᾽ ὁ τλήμων ἔν τινι σχολῇ κακοῦ;",
19
+ "1287": "βοᾷ διοίγειν κλῇθρα καὶ δηλοῦν τινα",
20
+ "1288": "τοῖς πᾶσι Καδμείοισι τὸν πατροκτόνον,",
21
+ "1289": "τὸν μητρός, αὐδῶν ἀνόσι᾽ οὐδὲ ῥητά μοι,",
22
+ "1290": "ὡς ἐκ χθονὸς ῥίψων ἑαυτόν, οὐδ᾽ ἔτι",
23
+ "1291": "μενῶν δόμοις ἀραῖος, ὡς ἠράσατο.",
24
+ "1292": "ρώμης γε μέντοι καὶ προηγητοῦ τινος",
25
+ "1293": "δεῖται· τὸ γὰρ νόσημα μεῖζον ἢ φέρειν.",
26
+ "1294": "δείξει δὲ καὶ σοί· κλῇθρα γὰρ πυλῶν τάδε"
27
+ }
data/lloyd-jones-soph-170/png/lloyd-jones-apparatus.png ADDED

Git LFS Details

  • SHA256: facc45bc80a10a402df2c81fb35f68e785e0263851cdd89ce7bc2dd221d40ab8
  • Pointer size: 131 Bytes
  • Size of remote file: 194 kB
data/lloyd-jones-soph-170/png/lloyd-jones-fullpage.png ADDED

Git LFS Details

  • SHA256: 590ddb2692f86b758dd58cfb69d702e73dea3a5f2da7f5f8190459cfb8ccaea8
  • Pointer size: 131 Bytes
  • Size of remote file: 650 kB
data/lloyd-jones-soph-170/png/lloyd-jones-text.png ADDED

Git LFS Details

  • SHA256: 7ef86d1ebd3c8a36e997fed06ad4b0c3ee7556c31346420904d3911942ae6ea3
  • Pointer size: 131 Bytes
  • Size of remote file: 430 kB
src/about.py CHANGED
@@ -1,6 +1,7 @@
1
  from dataclasses import dataclass
2
  from enum import Enum
3
 
 
4
  @dataclass
5
  class Task:
6
  benchmark: str
@@ -8,65 +9,61 @@ class Task:
8
  col_name: str
9
 
10
 
11
- # Select your tasks here
12
- # ---------------------------------------------------
13
  class Tasks(Enum):
14
- # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
- task0 = Task("anli_r1", "acc", "ANLI")
16
- task1 = Task("logiqa", "acc_norm", "LogiQA")
 
17
 
18
- NUM_FEWSHOT = 0 # Change with your few shot
19
- # ---------------------------------------------------
20
 
 
21
 
 
 
22
 
23
- # Your leaderboard name
24
- TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
 
25
 
26
- # What does your leaderboard evaluate?
27
- INTRODUCTION_TEXT = """
28
- Intro text
 
 
29
  """
30
 
31
- # Which evaluations are you running? how can people reproduce what you have?
32
- LLM_BENCHMARKS_TEXT = f"""
33
  ## How it works
34
 
35
- ## Reproducibility
36
- To reproduce our results, here is the commands you can run:
 
37
 
38
- """
39
 
40
- EVALUATION_QUEUE_TEXT = """
41
- ## Some good practices before submitting a model
 
 
42
 
43
- ### 1) Make sure you can load your model and tokenizer using AutoClasses:
44
- ```python
45
- from transformers import AutoConfig, AutoModel, AutoTokenizer
46
- config = AutoConfig.from_pretrained("your model name", revision=revision)
47
- model = AutoModel.from_pretrained("your model name", revision=revision)
48
- tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
49
- ```
50
- If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
51
 
52
- Note: make sure your model is public!
53
- Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
54
 
55
- ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
56
- It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
57
 
58
- ### 3) Make sure your model has an open license!
59
- This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
60
 
61
- ### 4) Fill up your model card
62
- When we add extra information about models to the leaderboard, it will be automatically taken from the model card
 
 
 
63
 
64
- ## In case of model failure
65
- If your model is displayed in the `FAILED` category, its execution stopped.
66
- Make sure you have followed the above steps first.
67
- If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
68
  """
69
 
70
- CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
71
  CITATION_BUTTON_TEXT = r"""
72
  """
 
1
  from dataclasses import dataclass
2
  from enum import Enum
3
 
4
+
5
  @dataclass
6
  class Task:
7
  benchmark: str
 
9
  col_name: str
10
 
11
 
 
 
12
  class Tasks(Enum):
13
+ easy_levenshtein = Task("easy_levenshtein", "score", "Easy Lev. ↑")
14
+ easy_bleu = Task("easy_bleu", "score", "Easy BLEU ↑")
15
+ hard_levenshtein = Task("hard_levenshtein", "score", "Hard Lev. ↑")
16
+ hard_bleu = Task("hard_bleu", "score", "Hard BLEU ↑")
17
 
 
 
18
 
19
+ TITLE = """<h1 align="center" id="space-title">Critical Apparatus OCR Leaderboard</h1>"""
20
 
21
+ INTRODUCTION_TEXT = """
22
+ This benchmark measures OCR quality on a Greek critical edition page from Lloyd-Jones' *Sophocles*.
23
 
24
+ Systems must emit two JSON files:
25
+ - `text.json`: the main text lines keyed by line number
26
+ - `apparatus.json`: the critical apparatus keyed by line number or range
27
 
28
+ There are two task variants:
29
+ - `Easy`: input is already split into `text` and `apparatus` crops
30
+ - `Hard`: input is the full page image and the system must separate the two outputs itself
31
+
32
+ Each variant is scored with normalized Levenshtein similarity and BLEU. The leaderboard average is the mean of those four scores.
33
  """
34
 
35
+ LLM_BENCHMARKS_TEXT = """
 
36
  ## How it works
37
 
38
+ - Gold data lives in `data/lloyd-jones-soph-170/ocr/`
39
+ - Hard input lives in `data/lloyd-jones-soph-170/png/lloyd-jones-fullpage.png`
40
+ - Easy inputs live in `data/lloyd-jones-soph-170/png/lloyd-jones-text.png` and `data/lloyd-jones-soph-170/png/lloyd-jones-apparatus.png`
41
 
42
+ The expected output schema is a JSON object mapping line numbers or ranges to OCR strings, matching the gold files already in the repo.
43
 
44
+ ## Metrics
45
+
46
+ - `Levenshtein`: character-level similarity after flattening each JSON file into a deterministic text representation
47
+ - `BLEU`: token-level overlap score on the same flattened representation
48
 
49
+ ## First seeded submission
 
 
 
 
 
 
 
50
 
51
+ The queue is pre-seeded with `ibm-granite/granite-vision-3.3-2b`. Once a real result JSON is added for that model, it will appear in the leaderboard automatically.
 
52
 
53
+ """
 
54
 
55
+ EVALUATION_QUEUE_TEXT = """
56
+ Submit the Hugging Face model repo and revision you want evaluated on this OCR task.
57
 
58
+ The evaluator should produce four scores in its result JSON:
59
+ - `easy_levenshtein.score`
60
+ - `easy_bleu.score`
61
+ - `hard_levenshtein.score`
62
+ - `hard_bleu.score`
63
 
64
+ The queue shown below is local-first, so the Space can be previewed before the dedicated backend datasets are configured.
 
 
 
65
  """
66
 
67
+ CITATION_BUTTON_LABEL = "Citation snippet"
68
  CITATION_BUTTON_TEXT = r"""
69
  """
src/display/utils.py CHANGED
@@ -1,10 +1,9 @@
1
  from dataclasses import dataclass, make_dataclass
2
  from enum import Enum
3
 
4
- import pandas as pd
5
-
6
  from src.about import Tasks
7
 
 
8
  def fields(raw_class):
9
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
10
 
@@ -25,11 +24,9 @@ auto_eval_column_dict = []
25
  # Init
26
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
28
- #Scores
29
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
30
  for task in Tasks:
31
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
32
- # Model information
33
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
34
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
35
  auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
@@ -48,9 +45,8 @@ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=
48
  class EvalQueueColumn: # Queue column
49
  model = ColumnContent("model", "markdown", True)
50
  revision = ColumnContent("revision", "str", True)
51
- private = ColumnContent("private", "bool", True)
52
  precision = ColumnContent("precision", "str", True)
53
- weight_type = ColumnContent("weight_type", "str", "Original")
54
  status = ColumnContent("status", "str", True)
55
 
56
  ## All the model information that we might need
@@ -62,10 +58,10 @@ class ModelDetails:
62
 
63
 
64
  class ModelType(Enum):
65
- PT = ModelDetails(name="pretrained", symbol="🟢")
66
- FT = ModelDetails(name="fine-tuned", symbol="🔶")
67
- IFT = ModelDetails(name="instruction-tuned", symbol="")
68
- RL = ModelDetails(name="RL-tuned", symbol="🟦")
69
  Unknown = ModelDetails(name="", symbol="?")
70
 
71
  def to_str(self, separator=" "):
@@ -107,4 +103,3 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
107
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
108
 
109
  BENCHMARK_COLS = [t.value.col_name for t in Tasks]
110
-
 
1
  from dataclasses import dataclass, make_dataclass
2
  from enum import Enum
3
 
 
 
4
  from src.about import Tasks
5
 
6
+
7
  def fields(raw_class):
8
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
9
 
 
24
  # Init
25
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
26
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 
27
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
28
  for task in Tasks:
29
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 
30
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
31
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
32
  auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
 
45
  class EvalQueueColumn: # Queue column
46
  model = ColumnContent("model", "markdown", True)
47
  revision = ColumnContent("revision", "str", True)
 
48
  precision = ColumnContent("precision", "str", True)
49
+ submitted_time = ColumnContent("submitted_time", "str", True)
50
  status = ColumnContent("status", "str", True)
51
 
52
  ## All the model information that we might need
 
58
 
59
 
60
  class ModelType(Enum):
61
+ PT = ModelDetails(name="pretrained", symbol="P")
62
+ FT = ModelDetails(name="fine-tuned", symbol="F")
63
+ IFT = ModelDetails(name="instruction-tuned", symbol="I")
64
+ RL = ModelDetails(name="RL-tuned", symbol="R")
65
  Unknown = ModelDetails(name="", symbol="?")
66
 
67
  def to_str(self, separator=" "):
 
103
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
104
 
105
  BENCHMARK_COLS = [t.value.col_name for t in Tasks]
 
src/envs.py CHANGED
@@ -2,24 +2,28 @@ import os
2
 
3
  from huggingface_hub import HfApi
4
 
5
- # Info to change for your repository
6
- # ----------------------------------
7
- TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
8
 
9
- OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
- # ----------------------------------
11
-
12
- REPO_ID = f"{OWNER}/leaderboard"
13
- QUEUE_REPO = f"{OWNER}/requests"
14
- RESULTS_REPO = f"{OWNER}/results"
15
 
16
  # If you setup a cache later, just change HF_HOME
17
- CACHE_PATH=os.getenv("HF_HOME", ".")
 
 
18
 
19
  # Local caches
20
  EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
21
  EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
22
  EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
23
  EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 
 
24
 
25
  API = HfApi(token=TOKEN)
 
 
 
 
 
2
 
3
  from huggingface_hub import HfApi
4
 
5
+ TOKEN = os.environ.get("HF_TOKEN")
6
+ OWNER = os.environ.get("HF_LEADERBOARD_OWNER", "")
 
7
 
8
+ REPO_ID = os.environ.get("SPACE_ID") or (f"{OWNER}/apparatus-ocr" if OWNER else "")
9
+ QUEUE_REPO = os.environ.get("HF_QUEUE_REPO") or (f"{OWNER}/requests" if OWNER else "")
10
+ RESULTS_REPO = os.environ.get("HF_RESULTS_REPO") or (f"{OWNER}/results" if OWNER else "")
 
 
 
11
 
12
  # If you setup a cache later, just change HF_HOME
13
+ CACHE_PATH = os.getenv("HF_HOME", ".")
14
+ REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
15
+ LOCAL_DATA_PATH = os.path.join(REPO_ROOT, "data", "leaderboard")
16
 
17
  # Local caches
18
  EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
19
  EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
20
  EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
21
  EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
22
+ LOCAL_EVAL_REQUESTS_PATH = os.path.join(LOCAL_DATA_PATH, "requests")
23
+ LOCAL_EVAL_RESULTS_PATH = os.path.join(LOCAL_DATA_PATH, "results")
24
 
25
  API = HfApi(token=TOKEN)
26
+
27
+
28
def has_remote_backend() -> bool:
    """True when a token and both backend dataset repos are configured.

    Used by the app to decide between syncing from the Hub and reading
    the locally seeded ``data/leaderboard/`` files.
    """
    return bool(TOKEN and QUEUE_REPO and RESULTS_REPO)
src/evaluation/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
"""Public surface of the OCR evaluation helpers."""
from src.evaluation.metrics import bleu_score, flatten_ocr_json, levenshtein_similarity, paired_ocr_metrics

__all__ = [
    "bleu_score",
    "flatten_ocr_json",
    "levenshtein_similarity",
    "paired_ocr_metrics",
]
src/evaluation/build_result.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+ from pathlib import Path
5
+
6
+ from src.evaluation.metrics import bleu_score, flatten_ocr_json, levenshtein_similarity
7
+
8
+
9
# Repository-relative locations of the gold OCR transcriptions used as
# references when scoring predictions (see data/lloyd-jones-soph-170/ocr/).
REPO_ROOT = Path(__file__).resolve().parents[2]
GOLD_ROOT = REPO_ROOT / "data" / "lloyd-jones-soph-170" / "ocr"
TEXT_GOLD_PATH = GOLD_ROOT / "lloyd-jones-text.json"
APPARATUS_GOLD_PATH = GOLD_ROOT / "lloyd-jones-apparatus.json"
13
+
14
+
15
def build_result_payload(
    model_name: str,
    revision: str,
    precision: str,
    easy_text_path: str,
    easy_apparatus_path: str,
    hard_text_path: str,
    hard_apparatus_path: str,
) -> dict:
    """Score predicted OCR JSON files against the bundled gold data.

    Returns a leaderboard-compatible payload with a ``config`` section and
    four ``results`` entries (easy/hard x levenshtein/bleu). Scores are
    stored on a 0-1 scale (the metric helpers return 0-100); presumably the
    leaderboard reader rescales for display — confirm against read_evals.
    """
    # Both variants are scored against the same gold page, so flatten it once.
    gold_blob = _join_sections(_load_json(TEXT_GOLD_PATH), _load_json(APPARATUS_GOLD_PATH))

    easy_blob = _join_sections(_load_json(easy_text_path), _load_json(easy_apparatus_path))
    hard_blob = _join_sections(_load_json(hard_text_path), _load_json(hard_apparatus_path))

    results = {
        "easy_levenshtein": {"score": levenshtein_similarity(gold_blob, easy_blob) / 100.0},
        "easy_bleu": {"score": bleu_score(gold_blob, easy_blob) / 100.0},
        "hard_levenshtein": {"score": levenshtein_similarity(gold_blob, hard_blob) / 100.0},
        "hard_bleu": {"score": bleu_score(gold_blob, hard_blob) / 100.0},
    }

    return {
        "config": {
            "model_dtype": _normalize_precision(precision),
            "model_name": model_name,
            "model_sha": revision,
        },
        "results": results,
    }
45
+
46
+
47
def main():
    """CLI entry point: score four predicted OCR JSON files and write a result JSON.

    Reads the gold data bundled in the repo, computes the four leaderboard
    scores via :func:`build_result_payload`, and writes the payload to
    ``--output`` (parent directories are created as needed).
    """
    parser = argparse.ArgumentParser(description="Build a leaderboard-compatible result JSON for the OCR benchmark.")
    parser.add_argument("--model-name", required=True)
    parser.add_argument("--revision", default="main")
    parser.add_argument("--precision", default="float16")
    parser.add_argument("--easy-text", required=True)
    parser.add_argument("--easy-apparatus", required=True)
    parser.add_argument("--hard-text", required=True)
    parser.add_argument("--hard-apparatus", required=True)
    parser.add_argument("--output", required=True)
    args = parser.parse_args()

    payload = build_result_payload(
        model_name=args.model_name,
        revision=args.revision,
        precision=args.precision,
        easy_text_path=args.easy_text,
        easy_apparatus_path=args.easy_apparatus,
        hard_text_path=args.hard_text,
        hard_apparatus_path=args.hard_apparatus,
    )

    # os.path.dirname returns "" for a bare filename, and makedirs("") raises
    # FileNotFoundError — only create directories when one is actually named.
    output_dir = os.path.dirname(args.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # The payload contains polytonic Greek and is dumped with
    # ensure_ascii=False, so pin UTF-8 instead of the locale default.
    with open(args.output, "w", encoding="utf-8") as handle:
        json.dump(payload, handle, ensure_ascii=False, indent=2)
72
+
73
+
74
+ def _load_json(path: str | Path) -> dict[str, str]:
75
+ with open(path) as handle:
76
+ return json.load(handle)
77
+
78
+
79
def _join_sections(text_json: dict[str, str], apparatus_json: dict[str, str]) -> str:
    """Concatenate the flattened text and apparatus payloads under header markers.

    Produces "[TEXT]\\n<flattened text>\\n[APPARATUS]\\n<flattened apparatus>",
    the single blob both metrics are computed over.
    """
    lines = []
    for header, payload in (("[TEXT]", text_json), ("[APPARATUS]", apparatus_json)):
        lines.append(header)
        lines.append(flatten_ocr_json(payload))
    return "\n".join(lines)
88
+
89
+
90
+ def _normalize_precision(precision: str) -> str:
91
+ if precision.startswith("torch."):
92
+ return precision
93
+ return f"torch.{precision}"
94
+
95
+
96
+ if __name__ == "__main__":
97
+ main()
src/evaluation/metrics.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from collections import Counter
3
+
4
+
5
+ def flatten_ocr_json(payload: dict[str, str]) -> str:
6
+ parts = [f"{key}\t{payload[key]}" for key in sorted(payload, key=_sort_key)]
7
+ return "\n".join(parts)
8
+
9
+
10
+ def _sort_key(value: str):
11
+ head, _, tail = value.partition("-")
12
+ try:
13
+ return (int(head), int(tail) if tail else -1, value)
14
+ except ValueError:
15
+ return (math.inf, math.inf, value)
16
+
17
+
18
def levenshtein_similarity(reference: str, prediction: str) -> float:
    """Character-level similarity on a 0-100 scale.

    Computed as ``100 * (1 - edit_distance / max(len(reference),
    len(prediction)))`` using the classic two-row dynamic-programming
    edit distance.
    """
    # Fast paths: identical strings (which covers both-empty) score 100,
    # while exactly one empty string scores 0.
    if reference == prediction:
        return 100.0
    if not reference and not prediction:
        return 100.0
    if not reference or not prediction:
        return 0.0

    previous_row = list(range(len(prediction) + 1))
    for row_number, ref_char in enumerate(reference, start=1):
        current_row = [row_number]
        for col_number, pred_char in enumerate(prediction, start=1):
            edit_cost = 0 if ref_char == pred_char else 1
            deletion = previous_row[col_number] + 1
            insertion = current_row[col_number - 1] + 1
            substitution = previous_row[col_number - 1] + edit_cost
            current_row.append(min(deletion, insertion, substitution))
        previous_row = current_row

    distance = previous_row[-1]
    longest = max(len(reference), len(prediction))
    return max(0.0, (1 - distance / longest) * 100.0)
42
+
43
+
44
+ def bleu_score(reference: str, prediction: str, max_order: int = 4) -> float:
45
+ ref_tokens = reference.split()
46
+ pred_tokens = prediction.split()
47
+
48
+ if not ref_tokens and not pred_tokens:
49
+ return 100.0
50
+ if not ref_tokens or not pred_tokens:
51
+ return 0.0
52
+
53
+ precisions = []
54
+ for order in range(1, max_order + 1):
55
+ ref_counts = _ngram_counts(ref_tokens, order)
56
+ pred_counts = _ngram_counts(pred_tokens, order)
57
+ overlap = sum(min(count, ref_counts[ngram]) for ngram, count in pred_counts.items())
58
+ total = max(sum(pred_counts.values()), 1)
59
+ precisions.append((overlap + 1) / (total + 1))
60
+
61
+ geo_mean = math.exp(sum(math.log(p) for p in precisions) / max_order)
62
+ bp = 1.0 if len(pred_tokens) > len(ref_tokens) else math.exp(1 - (len(ref_tokens) / len(pred_tokens)))
63
+ return geo_mean * bp * 100.0
64
+
65
+
66
+ def paired_ocr_metrics(reference: dict[str, str], prediction: dict[str, str]) -> dict[str, float]:
67
+ reference_text = flatten_ocr_json(reference)
68
+ prediction_text = flatten_ocr_json(prediction)
69
+ return {
70
+ "levenshtein": levenshtein_similarity(reference_text, prediction_text),
71
+ "bleu": bleu_score(reference_text, prediction_text),
72
+ }
73
+
74
+
75
+ def _ngram_counts(tokens: list[str], order: int) -> Counter:
76
+ if len(tokens) < order:
77
+ return Counter()
78
+ return Counter(tuple(tokens[i : i + order]) for i in range(len(tokens) - order + 1))
src/leaderboard/read_evals.py CHANGED
@@ -1,6 +1,5 @@
1
  import glob
2
  import json
3
- import math
4
  import os
5
  from dataclasses import dataclass
6
 
@@ -57,10 +56,16 @@ class EvalResult:
57
  result_key = f"{org}_{model}_{precision.value.name}"
58
  full_model = "/".join(org_and_model)
59
 
60
- still_on_hub, _, model_config = is_model_on_hub(
61
- full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
62
- )
63
  architecture = "?"
 
 
 
 
 
 
 
 
 
64
  if model_config is not None:
65
  architectures = getattr(model_config, "architectures", None)
66
  if architectures:
@@ -127,7 +132,7 @@ class EvalResult:
127
  }
128
 
129
  for task in Tasks:
130
- data_dict[task.value.col_name] = self.results[task.value.benchmark]
131
 
132
  return data_dict
133
 
@@ -146,10 +151,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
146
  for tmp_request_file in request_files:
147
  with open(tmp_request_file, "r") as f:
148
  req_content = json.load(f)
149
- if (
150
- req_content["status"] in ["FINISHED"]
151
- and req_content["precision"] == precision.split(".")[-1]
152
- ):
153
  request_file = tmp_request_file
154
  return request_file
155
 
 
1
  import glob
2
  import json
 
3
  import os
4
  from dataclasses import dataclass
5
 
 
56
  result_key = f"{org}_{model}_{precision.value.name}"
57
  full_model = "/".join(org_and_model)
58
 
 
 
 
59
  architecture = "?"
60
+ still_on_hub = False
61
+ model_config = None
62
+ try:
63
+ still_on_hub, _, model_config = is_model_on_hub(
64
+ full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
65
+ )
66
+ except Exception:
67
+ still_on_hub = False
68
+
69
  if model_config is not None:
70
  architectures = getattr(model_config, "architectures", None)
71
  if architectures:
 
132
  }
133
 
134
  for task in Tasks:
135
+ data_dict[task.value.col_name] = self.results.get(task.value.benchmark)
136
 
137
  return data_dict
138
 
 
151
  for tmp_request_file in request_files:
152
  with open(tmp_request_file, "r") as f:
153
  req_content = json.load(f)
154
+ if req_content["status"] in ["FINISHED", "FINISHED_MANUAL"] and req_content["precision"] == precision.split(".")[-1]:
 
 
 
155
  request_file = tmp_request_file
156
  return request_file
157
 
src/populate.py CHANGED
@@ -39,7 +39,11 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
39
  all_evals.append(data)
40
  elif ".md" not in entry:
41
  # this is a folder
42
- sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
 
 
 
 
43
  for sub_entry in sub_entries:
44
  file_path = os.path.join(save_path, entry, sub_entry)
45
  with open(file_path) as fp:
 
39
  all_evals.append(data)
40
  elif ".md" not in entry:
41
  # this is a folder
42
+ sub_entries = [
43
+ e
44
+ for e in os.listdir(f"{save_path}/{entry}")
45
+ if os.path.isfile(os.path.join(save_path, entry, e)) and not e.startswith(".")
46
+ ]
47
  for sub_entry in sub_entries:
48
  file_path = os.path.join(save_path, entry, sub_entry)
49
  with open(file_path) as fp:
src/submission/check_validity.py CHANGED
@@ -1,8 +1,6 @@
1
  import json
2
  import os
3
- import re
4
  from collections import defaultdict
5
- from datetime import datetime, timedelta, timezone
6
 
7
  import huggingface_hub
8
  from huggingface_hub import ModelCard
@@ -37,14 +35,14 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
37
  config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
38
  if test_tokenizer:
39
  try:
40
- tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
41
  except ValueError as e:
42
  return (
43
  False,
44
  f"uses a tokenizer which is not in a transformers release: {e}",
45
  None
46
  )
47
- except Exception as e:
48
  return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
49
  return True, None, config
50
 
@@ -55,7 +53,7 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
55
  None
56
  )
57
 
58
- except Exception as e:
59
  return False, "was not found on hub!", None
60
 
61
 
 
1
  import json
2
  import os
 
3
  from collections import defaultdict
 
4
 
5
  import huggingface_hub
6
  from huggingface_hub import ModelCard
 
35
  config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
36
  if test_tokenizer:
37
  try:
38
+ AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
39
  except ValueError as e:
40
  return (
41
  False,
42
  f"uses a tokenizer which is not in a transformers release: {e}",
43
  None
44
  )
45
+ except Exception:
46
  return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
47
  return True, None, config
48
 
 
53
  None
54
  )
55
 
56
+ except Exception:
57
  return False, "was not found on hub!", None
58
 
59
 
src/submission/submit.py CHANGED
@@ -3,7 +3,7 @@ import os
3
  from datetime import datetime, timezone
4
 
5
  from src.display.formatting import styled_error, styled_message, styled_warning
6
- from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
7
  from src.submission.check_validity import (
8
  already_submitted_models,
9
  check_model_card,
@@ -16,16 +16,15 @@ USERS_TO_SUBMISSION_DATES = None
16
 
17
  def add_new_eval(
18
  model: str,
19
- base_model: str,
20
  revision: str,
21
  precision: str,
22
- weight_type: str,
23
- model_type: str,
24
  ):
25
  global REQUESTED_MODELS
26
  global USERS_TO_SUBMISSION_DATES
 
 
27
  if not REQUESTED_MODELS:
28
- REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
29
 
30
  user_name = ""
31
  model_path = model
@@ -36,58 +35,42 @@ def add_new_eval(
36
  precision = precision.split(" ")[0]
37
  current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
38
 
39
- if model_type is None or model_type == "":
40
- return styled_error("Please select a model type.")
41
-
42
- # Does the model actually exist?
43
  if revision == "":
44
  revision = "main"
45
 
46
- # Is the model on the hub?
47
- if weight_type in ["Delta", "Adapter"]:
48
- base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
49
- if not base_model_on_hub:
50
- return styled_error(f'Base model "{base_model}" {error}')
51
 
52
- if not weight_type == "Adapter":
 
 
 
53
  model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
54
  if not model_on_hub:
55
  return styled_error(f'Model "{model}" {error}')
56
 
57
- # Is the model info correctly filled?
58
- try:
59
- model_info = API.model_info(repo_id=model, revision=revision)
60
- except Exception:
61
- return styled_error("Could not get your model information. Please fill it up properly.")
62
-
63
- model_size = get_model_size(model_info=model_info, precision=precision)
64
-
65
- # Were the model card and license filled?
66
- try:
67
- license = model_info.cardData["license"]
68
- except Exception:
69
- return styled_error("Please select a license for your model")
70
-
71
- modelcard_OK, error_msg = check_model_card(model)
72
- if not modelcard_OK:
73
- return styled_error(error_msg)
74
-
75
- # Seems good, creating the eval
76
- print("Adding new eval")
77
 
78
  eval_entry = {
79
  "model": model,
80
- "base_model": base_model,
81
  "revision": revision,
82
  "precision": precision,
83
- "weight_type": weight_type,
84
  "status": "PENDING",
85
  "submitted_time": current_time,
86
- "model_type": model_type,
87
- "likes": model_info.likes,
 
88
  "params": model_size,
89
  "license": license,
90
- "private": False,
91
  }
92
 
93
  # Check for duplicate submission
@@ -95,25 +78,27 @@ def add_new_eval(
95
  return styled_warning("This model has been already submitted.")
96
 
97
  print("Creating eval file")
98
- OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
99
  os.makedirs(OUT_DIR, exist_ok=True)
100
- out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
101
 
102
  with open(out_path, "w") as f:
103
  f.write(json.dumps(eval_entry))
104
 
105
  print("Uploading eval file")
106
- API.upload_file(
107
- path_or_fileobj=out_path,
108
- path_in_repo=out_path.split("eval-queue/")[1],
109
- repo_id=QUEUE_REPO,
110
- repo_type="dataset",
111
- commit_message=f"Add {model} to eval queue",
112
- )
 
113
 
114
  # Remove the local file
115
- os.remove(out_path)
 
116
 
117
  return styled_message(
118
- "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
119
  )
 
3
  from datetime import datetime, timezone
4
 
5
  from src.display.formatting import styled_error, styled_message, styled_warning
6
+ from src.envs import API, EVAL_REQUESTS_PATH, LOCAL_EVAL_REQUESTS_PATH, QUEUE_REPO, TOKEN, has_remote_backend
7
  from src.submission.check_validity import (
8
  already_submitted_models,
9
  check_model_card,
 
16
 
17
  def add_new_eval(
18
  model: str,
 
19
  revision: str,
20
  precision: str,
 
 
21
  ):
22
  global REQUESTED_MODELS
23
  global USERS_TO_SUBMISSION_DATES
24
+ requests_path = EVAL_REQUESTS_PATH if has_remote_backend() else LOCAL_EVAL_REQUESTS_PATH
25
+
26
  if not REQUESTED_MODELS:
27
+ REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(requests_path)
28
 
29
  user_name = ""
30
  model_path = model
 
35
  precision = precision.split(" ")[0]
36
  current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
37
 
 
 
 
 
38
  if revision == "":
39
  revision = "main"
40
 
41
+ # Seems good, creating the eval
42
+ print("Adding new eval")
 
 
 
43
 
44
+ license = "?"
45
+ model_size = 0
46
+ likes = 0
47
+ if has_remote_backend():
48
  model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
49
  if not model_on_hub:
50
  return styled_error(f'Model "{model}" {error}')
51
 
52
+ try:
53
+ model_info = API.model_info(repo_id=model, revision=revision)
54
+ model_size = get_model_size(model_info=model_info, precision=precision)
55
+ likes = model_info.likes
56
+ license = model_info.cardData.get("license", "?")
57
+ modelcard_OK, error_msg = check_model_card(model)
58
+ if not modelcard_OK:
59
+ return styled_error(error_msg)
60
+ except Exception:
61
+ return styled_error("Could not get your model information from the Hub.")
 
 
 
 
 
 
 
 
 
 
62
 
63
  eval_entry = {
64
  "model": model,
 
65
  "revision": revision,
66
  "precision": precision,
 
67
  "status": "PENDING",
68
  "submitted_time": current_time,
69
+ "model_type": "pretrained",
70
+ "weight_type": "Original",
71
+ "likes": likes,
72
  "params": model_size,
73
  "license": license,
 
74
  }
75
 
76
  # Check for duplicate submission
 
78
  return styled_warning("This model has been already submitted.")
79
 
80
  print("Creating eval file")
81
+ OUT_DIR = f"{requests_path}/{user_name}"
82
  os.makedirs(OUT_DIR, exist_ok=True)
83
+ out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_Original.json"
84
 
85
  with open(out_path, "w") as f:
86
  f.write(json.dumps(eval_entry))
87
 
88
  print("Uploading eval file")
89
+ if has_remote_backend():
90
+ API.upload_file(
91
+ path_or_fileobj=out_path,
92
+ path_in_repo=out_path.split("eval-queue/")[1],
93
+ repo_id=QUEUE_REPO,
94
+ repo_type="dataset",
95
+ commit_message=f"Add {model} to eval queue",
96
+ )
97
 
98
  # Remove the local file
99
+ if has_remote_backend():
100
+ os.remove(out_path)
101
 
102
  return styled_message(
103
+ "Your request has been submitted to the evaluation queue."
104
  )