davanstrien HF Staff committed on
Commit
fed5609
·
verified ·
1 Parent(s): 77c69a2

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ images/britannica-vol1-leaf60.jpg filter=lfs diff=lfs merge=lfs -text
37
+ images/britannica-vol1-leaf61.jpg filter=lfs diff=lfs merge=lfs -text
38
+ images/commoner-1901-01-23.jpg filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -16,6 +16,7 @@ from pathlib import Path
16
  import spaces
17
  import torch
18
  from fastapi.responses import HTMLResponse, JSONResponse
 
19
  from gradio import Server
20
  from transformers import (
21
  AutoModelForMultimodalLM,
@@ -258,5 +259,10 @@ async def get_results():
258
  return JSONResponse({"summary_md": summary, "per_passage": rows})
259
 
260
 
 
 
 
 
 
261
  if __name__ == "__main__":
262
  app.launch(show_error=True)
 
16
  import spaces
17
  import torch
18
  from fastapi.responses import HTMLResponse, JSONResponse
19
+ from fastapi.staticfiles import StaticFiles
20
  from gradio import Server
21
  from transformers import (
22
  AutoModelForMultimodalLM,
 
259
  return JSONResponse({"summary_md": summary, "per_passage": rows})
260
 
261
 
262
+ _images_dir = HERE / "images"
263
+ if _images_dir.is_dir():
264
+ app.mount("/static", StaticFiles(directory=str(_images_dir)), name="static")
265
+
266
+
267
  if __name__ == "__main__":
268
  app.launch(show_error=True)
examples.json CHANGED
@@ -32,31 +32,43 @@
32
  {
33
  "id": "commoner/ca-46032385-1901-01-23-ed-1-0001/0",
34
  "label": "The Commoner 1901-01-23: Had tho American consumers 1\u2026",
35
- "ocr_input": "Had tho American consumers 1 Jl . - 1 -L .. l-l! i.J1 i.L I oeen privuegea vo ouy ac wic cjuotunuus grauieu wio foreign buyers, the Americana would .have saved about 98010,080.97 on their purchase. \"\"\"More 'than eight millions of dollars! \"This measures tho extortion practiced upon the hard ware merchant, but this must bo increased by the merchant's profit, if his profit is estimated upon a percentage basis, before it measures the extor tion practiced upon the consumer."
 
 
36
  },
37
  {
38
  "id": "commoner/ca-46032385-1901-01-23-ed-1-0001/1",
39
  "label": "The Commoner 1901-01-23: A government resting on force\u2026",
40
- "ocr_input": "A government resting on force is, on tho other hand, ever unstable because it excites hatred rather than affection anjd is continually at war with human nature; it is in constant antag onism to that universal sentiment- which is de fined as the love of liberty; ' All history sustains tho self-evident truths which form the fonndafiibif -W1 government deriving its just powers from the consent of the governed. ' All' history condemns a political structure which appeals only to fear and- relief upon bayonets for its support."
 
 
41
  },
42
  {
43
  "id": "commoner/ca-46032385-1901-01-23-ed-1-0001/2",
44
  "label": "The Commoner 1901-01-23: It is riot necessary to\u2026",
45
- "ocr_input": "It is riot necessary to apologize for the use of a term which distinguishes the great body of the population from the comparatively few, who, for one reason or another, withdraw themselves from sympathetic connection with their fellows. Among the Greeks \"BLoi polloi\" was used to describe tho many, while among the Romans the word \"plebs\" was employed for the same purpose. These appellations, like \"the common people,\" have been assumed with pride by those to whom they were applied, while they have been used as terms of reproach by those who cdunted themselves n.TnrTify t.Tin."
 
 
46
  },
47
  {
48
  "id": "commoner/ca-46032385-1901-01-23-ed-1-0001/3",
49
  "label": "The Commoner 1901-01-23: This quotation is reproduced because\u2026",
50
- "ocr_input": "This quotation is reproduced because it fairly represents the views of those who criticize tho expression. It has, however, an eminently , respectable origin. In the same chapter in which Christ condensed man's duty to his fellows into the commandment: Thou shalt love thy neighbor as thyself; in the same chapter in which he de nounced those who devour widows' houses and for a pretense make long prayers in this same chapter it is said of Him: The common people heard Him gladly."
 
 
51
  },
52
  {
53
  "id": "britannica1771/vol1/leaf60",
54
  "label": "Britannica 1771 (long \u017f): At the thick end\u2026",
55
- "ocr_input": "At the thick end of the bean,there is a fmall hole vifible to the naked eye, immediatelyover the radicle or future root, that it may have a freepaffage into the foil. Plate IV. fig. I. A. When thefecoats are taken off, the body of the feed appears, whichis divided into two fmooth portions or lobes. Thefmoothnefs of the lobes is owing to a thin'filin or cuticlewith which they are covered.At the bafts of the bean is placed the radicle Qr futureroot, Plate IV. fig. 3. A."
 
 
56
  },
57
  {
58
  "id": "britannica1771/vol1/leaf61",
59
  "label": "Britannica 1771 (long \u017f): II. Of the Root.I\u2026",
60
- "ocr_input": "II. Of the Root.I n examining the root of plants, the firft thing thatprefents itfelf is the Ikin, which is of various colours indifferent plants. Every root, after it has arrived at acertain age, has a double fkin. The firfl is coeval withthe other parts, and exifls in the feed: but afterwardsthere is a ring fent off from the bark, and forms a fe-cond fit in; eg. in the root of the dandelion, towardsthe end of May, the original or outer fkin appears ftiri-veled, and is eafily feparated from the new one, whichis frefher, and adheres more firmly to the bark."
 
 
61
  }
62
  ]
 
32
  {
33
  "id": "commoner/ca-46032385-1901-01-23-ed-1-0001/0",
34
  "label": "The Commoner 1901-01-23: Had tho American consumers 1\u2026",
35
+ "ocr_input": "Had tho American consumers 1 Jl . - 1 -L .. l-l! i.J1 i.L I oeen privuegea vo ouy ac wic cjuotunuus grauieu wio foreign buyers, the Americana would .have saved about 98010,080.97 on their purchase. \"\"\"More 'than eight millions of dollars! \"This measures tho extortion practiced upon the hard ware merchant, but this must bo increased by the merchant's profit, if his profit is estimated upon a percentage basis, before it measures the extor tion practiced upon the consumer.",
36
+ "image": "static/commoner-1901-01-23.jpg",
37
+ "image_caption": "The Commoner (Lincoln, Nebraska), 23 Jan 1901, front page \u2014 via Chronicling America, Library of Congress"
38
  },
39
  {
40
  "id": "commoner/ca-46032385-1901-01-23-ed-1-0001/1",
41
  "label": "The Commoner 1901-01-23: A government resting on force\u2026",
42
+ "ocr_input": "A government resting on force is, on tho other hand, ever unstable because it excites hatred rather than affection anjd is continually at war with human nature; it is in constant antag onism to that universal sentiment- which is de fined as the love of liberty; ' All history sustains tho self-evident truths which form the fonndafiibif -W1 government deriving its just powers from the consent of the governed. ' All' history condemns a political structure which appeals only to fear and- relief upon bayonets for its support.",
43
+ "image": "static/commoner-1901-01-23.jpg",
44
+ "image_caption": "The Commoner (Lincoln, Nebraska), 23 Jan 1901, front page \u2014 via Chronicling America, Library of Congress"
45
  },
46
  {
47
  "id": "commoner/ca-46032385-1901-01-23-ed-1-0001/2",
48
  "label": "The Commoner 1901-01-23: It is riot necessary to\u2026",
49
+ "ocr_input": "It is riot necessary to apologize for the use of a term which distinguishes the great body of the population from the comparatively few, who, for one reason or another, withdraw themselves from sympathetic connection with their fellows. Among the Greeks \"BLoi polloi\" was used to describe tho many, while among the Romans the word \"plebs\" was employed for the same purpose. These appellations, like \"the common people,\" have been assumed with pride by those to whom they were applied, while they have been used as terms of reproach by those who cdunted themselves n.TnrTify t.Tin.",
50
+ "image": "static/commoner-1901-01-23.jpg",
51
+ "image_caption": "The Commoner (Lincoln, Nebraska), 23 Jan 1901, front page \u2014 via Chronicling America, Library of Congress"
52
  },
53
  {
54
  "id": "commoner/ca-46032385-1901-01-23-ed-1-0001/3",
55
  "label": "The Commoner 1901-01-23: This quotation is reproduced because\u2026",
56
+ "ocr_input": "This quotation is reproduced because it fairly represents the views of those who criticize tho expression. It has, however, an eminently , respectable origin. In the same chapter in which Christ condensed man's duty to his fellows into the commandment: Thou shalt love thy neighbor as thyself; in the same chapter in which he de nounced those who devour widows' houses and for a pretense make long prayers in this same chapter it is said of Him: The common people heard Him gladly.",
57
+ "image": "static/commoner-1901-01-23.jpg",
58
+ "image_caption": "The Commoner (Lincoln, Nebraska), 23 Jan 1901, front page \u2014 via Chronicling America, Library of Congress"
59
  },
60
  {
61
  "id": "britannica1771/vol1/leaf60",
62
  "label": "Britannica 1771 (long \u017f): At the thick end\u2026",
63
+ "ocr_input": "At the thick end of the bean,there is a fmall hole vifible to the naked eye, immediatelyover the radicle or future root, that it may have a freepaffage into the foil. Plate IV. fig. I. A. When thefecoats are taken off, the body of the feed appears, whichis divided into two fmooth portions or lobes. Thefmoothnefs of the lobes is owing to a thin'filin or cuticlewith which they are covered.At the bafts of the bean is placed the radicle Qr futureroot, Plate IV. fig. 3. A.",
64
+ "image": "static/britannica-vol1-leaf60.jpg",
65
+ "image_caption": "Encyclopaedia Britannica, 1st edition (1771), vol. 1 \u2014 National Library of Scotland digitisation"
66
  },
67
  {
68
  "id": "britannica1771/vol1/leaf61",
69
  "label": "Britannica 1771 (long \u017f): II. Of the Root.I\u2026",
70
+ "ocr_input": "II. Of the Root.I n examining the root of plants, the firft thing thatprefents itfelf is the Ikin, which is of various colours indifferent plants. Every root, after it has arrived at acertain age, has a double fkin. The firfl is coeval withthe other parts, and exifls in the feed: but afterwardsthere is a ring fent off from the bark, and forms a fe-cond fit in; eg. in the root of the dandelion, towardsthe end of May, the original or outer fkin appears ftiri-veled, and is eafily feparated from the new one, whichis frefher, and adheres more firmly to the bark.",
71
+ "image": "static/britannica-vol1-leaf61.jpg",
72
+ "image_caption": "Encyclopaedia Britannica, 1st edition (1771), vol. 1 \u2014 National Library of Scotland digitisation"
73
  }
74
  ]
images/britannica-vol1-leaf60.jpg ADDED

Git LFS Details

  • SHA256: b4bcd7943a6f0e4c87f156ef68c97dd08ac892f7c9c451ba023888f9047a4bb9
  • Pointer size: 131 Bytes
  • Size of remote file: 327 kB
images/britannica-vol1-leaf61.jpg ADDED

Git LFS Details

  • SHA256: 68128fb6aebb30e118ad526669a82b98cb21c8ba84fb896037c33bc2f26c7cab
  • Pointer size: 131 Bytes
  • Size of remote file: 349 kB
images/commoner-1901-01-23.jpg ADDED

Git LFS Details

  • SHA256: 4fa2ee21a53b01379547d1046b43372487374698e8fcddf3bea0843f9998354c
  • Pointer size: 131 Bytes
  • Size of remote file: 459 kB
index.html CHANGED
@@ -190,6 +190,13 @@
190
  <summary>Human transcription of this passage</summary>
191
  <div class="gold-text" id="gold-text"></div>
192
  </details>
 
 
 
 
 
 
 
193
  <div class="controls">
194
  <button class="run-btn" id="run">Correct the Proofs</button>
195
  <label class="toggle">
@@ -261,6 +268,21 @@
261
  </section>
262
 
263
  <section class="pane" id="pane-ledger">
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  <div id="results-table"><p style="text-align:center;font-style:italic">Fetching the ledger…</p></div>
265
  <div class="footnotes" id="results-notes"></div>
266
  </section>
@@ -325,6 +347,18 @@ function setGoldAvailable(gold) {
325
  for (const id of ["no-gold-note", "no-gold-note-press"])
326
  $(id).classList.toggle("visible", !gold);
327
  }
 
 
 
 
 
 
 
 
 
 
 
 
328
  function currentGold() {
329
  return activeExample && $("ocr-input").value === activeExample.ocr_input ? (activeExample.gold || "") : "";
330
  }
@@ -386,6 +420,7 @@ async function loadExamples() {
386
  activeExample = e;
387
  updateCount();
388
  setGoldAvailable(e.gold || "");
 
389
  if (e.cached) renderCached(e);
390
  };
391
  chips.appendChild(b);
@@ -416,7 +451,7 @@ function updateCount() {
416
  el.textContent = `${n} / ${MAX_CHARS}`;
417
  el.classList.toggle("over", n > MAX_CHARS);
418
  }
419
- $("ocr-input").addEventListener("input", () => { activeExample = null; updateCount(); setGoldAvailable(""); });
420
  updateCount();
421
 
422
  /* ---------- run ---------- */
@@ -505,9 +540,9 @@ async function loadResults() {
505
  const notes = lines.filter(l => /^(Micro|Mean)/.test(l)).map(l => `<p>${l}</p>`).join("");
506
  $("results-notes").innerHTML = notes +
507
  `<p><b>Bold</b> marks the better of the two main models on each measure.
508
- The OCR‑seeded‑canvas row is greyed out: it converges fastest but barely edits anything (a negative result),
509
- so highlighting its numbers would mislead. Over‑correction: share of already‑correct characters the model changed.
510
- Fix rate: share of wrong characters the model changed.</p>`;
511
  } catch { $("results-table").innerHTML = "<p style='text-align:center;font-style:italic'>The ledger could not be fetched.</p>"; }
512
  }
513
 
 
190
  <summary>Human transcription of this passage</summary>
191
  <div class="gold-text" id="gold-text"></div>
192
  </details>
193
+ <details class="authority" id="page-image">
194
+ <summary>View the original page (full page — the passage is one excerpt from it)</summary>
195
+ <div class="gold-text" style="text-align:center">
196
+ <img id="page-img" loading="lazy" alt="Scanned source page" style="max-width:100%;border:1px solid var(--rule)" />
197
+ <div id="page-caption" style="font-size:.78rem;font-style:italic;margin-top:.4rem"></div>
198
+ </div>
199
+ </details>
200
  <div class="controls">
201
  <button class="run-btn" id="run">Correct the Proofs</button>
202
  <label class="toggle">
 
268
  </section>
269
 
270
  <section class="pane" id="pane-ledger">
271
+ <div class="footnotes" style="margin-bottom:1.4rem">
272
+ <p><b>The data.</b> 75 passages from <a href="https://doi.org/10.15131/shef.data.25439023">BLN600</a>,
273
+ a corpus of 600 excerpts of 19th‑century London newspapers (largely crime reporting) from the British
274
+ Library's collections, each paired with both the original OCR and a careful <em>human transcription</em>.
275
+ That human transcription is the “right answer” every number below is measured against. Passages longer
276
+ than DiffusionGemma's 256‑token output block were trimmed at a point where OCR and transcription align,
277
+ so the pairs stay parallel. (BLN600 is CC‑BY‑NC, so the passages themselves aren't republished here — only
278
+ these metrics.)</p>
279
+ <p><b>The task.</b> Both models got the identical instruction — fix recognition errors only, don't modernise
280
+ or rephrase — one passage at a time on the same A100 GPU. <b>CER / WER</b>: how far the output remains from
281
+ the human transcription, by character / by word (the “OCR input” row is the damage before any correction).
282
+ <b>Relative CER reduction</b>: how much of that damage the model repaired.
283
+ <b>Over‑correction</b>: how much text that was already right the model needlessly changed.
284
+ <b>Fix rate</b>: how much of what was actually wrong it fixed.</p>
285
+ </div>
286
  <div id="results-table"><p style="text-align:center;font-style:italic">Fetching the ledger…</p></div>
287
  <div class="footnotes" id="results-notes"></div>
288
  </section>
 
347
  for (const id of ["no-gold-note", "no-gold-note-press"])
348
  $(id).classList.toggle("visible", !gold);
349
  }
350
+ function setPageImage(e) {
351
+ const panel = $("page-image");
352
+ if (e && e.image) {
353
+ panel.classList.add("visible");
354
+ panel.open = false;
355
+ $("page-img").src = e.image;
356
+ $("page-caption").textContent = e.image_caption || "";
357
+ } else {
358
+ panel.classList.remove("visible");
359
+ $("page-img").removeAttribute("src");
360
+ }
361
+ }
362
  function currentGold() {
363
  return activeExample && $("ocr-input").value === activeExample.ocr_input ? (activeExample.gold || "") : "";
364
  }
 
420
  activeExample = e;
421
  updateCount();
422
  setGoldAvailable(e.gold || "");
423
+ setPageImage(e);
424
  if (e.cached) renderCached(e);
425
  };
426
  chips.appendChild(b);
 
451
  el.textContent = `${n} / ${MAX_CHARS}`;
452
  el.classList.toggle("over", n > MAX_CHARS);
453
  }
454
+ $("ocr-input").addEventListener("input", () => { activeExample = null; updateCount(); setGoldAvailable(""); setPageImage(null); });
455
  updateCount();
456
 
457
  /* ---------- run ---------- */
 
540
  const notes = lines.filter(l => /^(Micro|Mean)/.test(l)).map(l => `<p>${l}</p>`).join("");
541
  $("results-notes").innerHTML = notes +
542
  `<p><b>Bold</b> marks the better of the two main models on each measure.
543
+ The OCR‑seeded‑canvas row is greyed out: it converges fastest but barely edits anything
544
+ (a negative result — see the repo notes), so highlighting its numbers would mislead.
545
+ Single run, one prompt, no significance testing — a pragmatic day‑one benchmark, not a study.</p>`;
546
  } catch { $("results-table").innerHTML = "<p style='text-align:center;font-style:italic'>The ledger could not be fetched.</p>"; }
547
  }
548