Spaces:
Running on Zero
Running on Zero
Upload folder using huggingface_hub
Browse files- app.py +125 -142
- diff_utils.py +0 -39
- examples.json +32 -0
- examples_cached.json +188 -0
- index.html +411 -0
- requirements.txt +2 -0
- results/per_passage_metrics.jsonl +75 -0
- results/summary.md +35 -0
app.py
CHANGED
|
@@ -1,18 +1,22 @@
|
|
| 1 |
"""DiffusionGemma vs Gemma-4 on post-OCR correction — ZeroGPU comparison Space.
|
| 2 |
|
| 3 |
-
|
|
|
|
| 4 |
experimental block-diffusion LLM (google/diffusiongemma-26B-A4B-it) and an
|
| 5 |
autoregressive baseline (google/gemma-4-E4B-it).
|
| 6 |
"""
|
| 7 |
|
|
|
|
| 8 |
import json
|
| 9 |
import os
|
|
|
|
| 10 |
import time
|
| 11 |
from pathlib import Path
|
| 12 |
|
| 13 |
-
import gradio as gr
|
| 14 |
import spaces
|
| 15 |
import torch
|
|
|
|
|
|
|
| 16 |
from transformers import (
|
| 17 |
AutoModelForMultimodalLM,
|
| 18 |
AutoProcessor,
|
|
@@ -20,7 +24,7 @@ from transformers import (
|
|
| 20 |
TextDiffusionStreamer,
|
| 21 |
)
|
| 22 |
|
| 23 |
-
|
| 24 |
|
| 25 |
# Keep in sync with benchmark.py PROMPT_TEMPLATE — the benchmark numbers in the
|
| 26 |
# results tab were produced with exactly this prompt.
|
|
@@ -35,15 +39,12 @@ OCR text:
|
|
| 35 |
{ocr}"""
|
| 36 |
|
| 37 |
MAX_INPUT_CHARS = 1200 # roughly the 220-token benchmark cap
|
|
|
|
| 38 |
|
| 39 |
|
| 40 |
def model_path(volume_path: str, model_id: str) -> str:
|
| 41 |
-
"""Prefer a mounted hf:// volume
|
| 42 |
-
|
| 43 |
-
Volume reads go over FUSE, which is sometimes slower for safetensors loading
|
| 44 |
-
than a fresh download to local disk — set USE_VOLUMES=0 (Space variable) to
|
| 45 |
-
force from_pretrained downloads for comparison.
|
| 46 |
-
"""
|
| 47 |
if os.environ.get("USE_VOLUMES", "1") == "0":
|
| 48 |
return model_id
|
| 49 |
return volume_path if os.path.isdir(volume_path) else model_id
|
|
@@ -64,7 +65,7 @@ g4_model = AutoModelForMultimodalLM.from_pretrained(G4_PATH, dtype=torch.bfloat1
|
|
| 64 |
print(f"Gemma-4 loaded in {time.perf_counter() - t0:.0f}s")
|
| 65 |
|
| 66 |
|
| 67 |
-
|
| 68 |
|
| 69 |
|
| 70 |
def extract_answer(raw: str) -> str:
|
|
@@ -79,6 +80,25 @@ def extract_answer(raw: str) -> str:
|
|
| 79 |
return raw.strip()
|
| 80 |
|
| 81 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
class SnapshotStreamer(TextDiffusionStreamer):
|
| 83 |
"""Captures the decoded canvas at each denoising step; suppresses the
|
| 84 |
parent's ANSI console printing."""
|
|
@@ -118,27 +138,66 @@ def _decode_generated(processor, output, input_len) -> str:
|
|
| 118 |
return extract_answer(raw)
|
| 119 |
|
| 120 |
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
inputs = _prepare_inputs(dg_processor, dg_model, ocr_text)
|
| 126 |
streamer = SnapshotStreamer(dg_processor.tokenizer)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
t0 = time.perf_counter()
|
| 128 |
-
output = dg_model.generate(**inputs,
|
| 129 |
torch.cuda.synchronize()
|
| 130 |
seconds = time.perf_counter() - t0
|
| 131 |
text = _decode_generated(dg_processor, output, inputs["input_ids"].shape[-1])
|
| 132 |
n_tokens = len(dg_processor.tokenizer(text)["input_ids"])
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
@spaces.GPU(duration=60, size="xlarge")
|
| 141 |
-
def run_gemma4(ocr_text: str):
|
|
|
|
|
|
|
|
|
|
| 142 |
inputs = _prepare_inputs(g4_processor, g4_model, ocr_text)
|
| 143 |
t0 = time.perf_counter()
|
| 144 |
output = g4_model.generate(**inputs, max_new_tokens=256, do_sample=False)
|
|
@@ -146,127 +205,51 @@ def run_gemma4(ocr_text: str):
|
|
| 146 |
seconds = time.perf_counter() - t0
|
| 147 |
text = _decode_generated(g4_processor, output, inputs["input_ids"].shape[-1])
|
| 148 |
n_tokens = len(g4_processor.tokenizer(text)["input_ids"])
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
json.loads(line)
|
| 169 |
-
for line in
|
| 170 |
if line.strip()
|
| 171 |
]
|
|
|
|
| 172 |
|
| 173 |
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
def check_length(text: str):
|
| 179 |
-
if len(text) > MAX_INPUT_CHARS:
|
| 180 |
-
raise gr.Error(
|
| 181 |
-
f"Input too long ({len(text)} chars). DiffusionGemma generates a single "
|
| 182 |
-
f"256-token block, so inputs are capped at ~{MAX_INPUT_CHARS} characters."
|
| 183 |
-
)
|
| 184 |
-
return text
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
def update_snapshot(snapshots: list[str], step: int) -> str:
|
| 188 |
-
if not snapshots:
|
| 189 |
-
return ""
|
| 190 |
-
return snapshots[min(int(step), len(snapshots) - 1)]
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
with gr.Blocks(title="DiffusionGemma vs Gemma-4: post-OCR correction") as demo:
|
| 194 |
-
gr.Markdown(
|
| 195 |
-
"# DiffusionGemma vs Gemma-4: post-OCR correction\n"
|
| 196 |
-
"Compare Google's **experimental diffusion LLM** "
|
| 197 |
-
"([google/diffusiongemma-26B-A4B-it](https://huggingface.co/google/diffusiongemma-26B-A4B-it), "
|
| 198 |
-
"26B MoE / 3.8B active, released 2026-06-10) against an autoregressive baseline "
|
| 199 |
-
"([google/gemma-4-E4B-it](https://huggingface.co/google/gemma-4-E4B-it)) on correcting "
|
| 200 |
-
"19th-century English newspaper OCR. Both run in bf16. Highlights show what each model "
|
| 201 |
-
"**changed relative to the OCR input** (yellow = changed, green = added, red ⌫ = deleted)."
|
| 202 |
-
)
|
| 203 |
-
|
| 204 |
-
with gr.Tab("Live comparison"):
|
| 205 |
-
with gr.Row():
|
| 206 |
-
example_dd = gr.Dropdown(
|
| 207 |
-
label="Example passages (ICDAR2019 post-OCR, CC-BY-4.0)",
|
| 208 |
-
choices=list(example_choices),
|
| 209 |
-
value=None,
|
| 210 |
-
scale=2,
|
| 211 |
-
)
|
| 212 |
-
ocr_box = gr.Textbox(
|
| 213 |
-
label="Noisy OCR text",
|
| 214 |
-
lines=6,
|
| 215 |
-
value=next(iter(example_choices.values()), ""),
|
| 216 |
-
max_length=MAX_INPUT_CHARS,
|
| 217 |
-
)
|
| 218 |
-
run_btn = gr.Button("Run both models", variant="primary")
|
| 219 |
-
with gr.Row():
|
| 220 |
-
with gr.Column():
|
| 221 |
-
gr.Markdown("### DiffusionGemma 26B-A4B (diffusion)")
|
| 222 |
-
dg_timing = gr.Markdown("")
|
| 223 |
-
dg_diff = gr.HighlightedText(
|
| 224 |
-
label="Output (diff vs input)", color_map=COLOR_MAP, combine_adjacent=True
|
| 225 |
-
)
|
| 226 |
-
with gr.Accordion("Raw output", open=False):
|
| 227 |
-
dg_raw = gr.Textbox(lines=6, show_label=False)
|
| 228 |
-
with gr.Column():
|
| 229 |
-
gr.Markdown("### Gemma-4-E4B (autoregressive)")
|
| 230 |
-
g4_timing = gr.Markdown("")
|
| 231 |
-
g4_diff = gr.HighlightedText(
|
| 232 |
-
label="Output (diff vs input)", color_map=COLOR_MAP, combine_adjacent=True
|
| 233 |
-
)
|
| 234 |
-
with gr.Accordion("Raw output", open=False):
|
| 235 |
-
g4_raw = gr.Textbox(lines=6, show_label=False)
|
| 236 |
-
|
| 237 |
-
snapshots_state = gr.State([])
|
| 238 |
-
example_dd.change(load_example, example_dd, ocr_box)
|
| 239 |
-
run_btn.click(check_length, ocr_box, ocr_box).success(
|
| 240 |
-
run_diffusiongemma, ocr_box, [dg_raw, dg_diff, dg_timing, snapshots_state]
|
| 241 |
-
).then(run_gemma4, ocr_box, [g4_raw, g4_diff, g4_timing])
|
| 242 |
-
|
| 243 |
-
with gr.Tab("Denoising progression"):
|
| 244 |
-
gr.Markdown(
|
| 245 |
-
"DiffusionGemma starts from a random 256-token canvas and iteratively denoises it. "
|
| 246 |
-
"Run a comparison first, then scrub through the intermediate canvas states."
|
| 247 |
-
)
|
| 248 |
-
step_slider = gr.Slider(0, 47, step=1, value=0, label="Denoising step")
|
| 249 |
-
snapshot_box = gr.Textbox(lines=10, label="Canvas at step", interactive=False)
|
| 250 |
-
step_slider.change(update_snapshot, [snapshots_state, step_slider], snapshot_box)
|
| 251 |
-
snapshots_state.change(
|
| 252 |
-
lambda s: (gr.Slider(0, max(len(s) - 1, 1), step=1, value=0), update_snapshot(s, 0)),
|
| 253 |
-
snapshots_state,
|
| 254 |
-
[step_slider, snapshot_box],
|
| 255 |
-
)
|
| 256 |
-
|
| 257 |
-
with gr.Tab("Benchmark results"):
|
| 258 |
-
gr.Markdown(summary_md)
|
| 259 |
-
if per_passage_rows:
|
| 260 |
-
gr.Markdown("### Per-passage metrics (BLN600, n=75)")
|
| 261 |
-
gr.DataFrame(
|
| 262 |
-
value=[[row.get(k) for k in per_passage_rows[0]] for row in per_passage_rows],
|
| 263 |
-
headers=list(per_passage_rows[0]),
|
| 264 |
-
interactive=False,
|
| 265 |
-
)
|
| 266 |
-
gr.Markdown(
|
| 267 |
-
"Benchmark texts come from [BLN600](https://doi.org/10.15131/shef.data.25439023) "
|
| 268 |
-
"(CC-BY-NC-4.0), so passage texts are not redistributed here — only metrics. "
|
| 269 |
-
"See the Space README for methodology and limitations."
|
| 270 |
-
)
|
| 271 |
-
|
| 272 |
-
demo.launch()
|
|
|
|
| 1 |
"""DiffusionGemma vs Gemma-4 on post-OCR correction — ZeroGPU comparison Space.
|
| 2 |
|
| 3 |
+
gradio.Server pattern: custom HTML frontend (index.html) + Gradio queuing
|
| 4 |
+
backend. Side-by-side correction of 19th-century English newspaper OCR by an
|
| 5 |
experimental block-diffusion LLM (google/diffusiongemma-26B-A4B-it) and an
|
| 6 |
autoregressive baseline (google/gemma-4-E4B-it).
|
| 7 |
"""
|
| 8 |
|
| 9 |
+
import difflib
|
| 10 |
import json
|
| 11 |
import os
|
| 12 |
+
import re
|
| 13 |
import time
|
| 14 |
from pathlib import Path
|
| 15 |
|
|
|
|
| 16 |
import spaces
|
| 17 |
import torch
|
| 18 |
+
from fastapi.responses import HTMLResponse, JSONResponse
|
| 19 |
+
from gradio import Server
|
| 20 |
from transformers import (
|
| 21 |
AutoModelForMultimodalLM,
|
| 22 |
AutoProcessor,
|
|
|
|
| 24 |
TextDiffusionStreamer,
|
| 25 |
)
|
| 26 |
|
| 27 |
+
HERE = Path(__file__).resolve().parent
|
| 28 |
|
| 29 |
# Keep in sync with benchmark.py PROMPT_TEMPLATE — the benchmark numbers in the
|
| 30 |
# results tab were produced with exactly this prompt.
|
|
|
|
| 39 |
{ocr}"""
|
| 40 |
|
| 41 |
MAX_INPUT_CHARS = 1200 # roughly the 220-token benchmark cap
|
| 42 |
+
STOP_MARKERS = ("<turn|>", "<eos>", "<end_of_turn>", "<pad>")
|
| 43 |
|
| 44 |
|
| 45 |
def model_path(volume_path: str, model_id: str) -> str:
    """Choose where a model's weights are loaded from.

    Returns ``volume_path`` when volume use is enabled and the path is an
    existing directory; otherwise falls back to ``model_id``. Setting the
    ``USE_VOLUMES`` environment variable to ``"0"`` disables the volume
    (FUSE reads can be slower for safetensors loading than a fresh download
    to local disk).
    """
    # Any value other than the literal "0" (including unset) keeps volumes on.
    volumes_enabled = os.environ.get("USE_VOLUMES", "1") != "0"
    if volumes_enabled and os.path.isdir(volume_path):
        return volume_path
    return model_id
|
|
|
|
| 65 |
print(f"Gemma-4 loaded in {time.perf_counter() - t0:.0f}s")
|
| 66 |
|
| 67 |
|
| 68 |
+
# ---------------------------------------------------------------- text utils
|
| 69 |
|
| 70 |
|
| 71 |
def extract_answer(raw: str) -> str:
|
|
|
|
| 80 |
return raw.strip()
|
| 81 |
|
| 82 |
|
| 83 |
+
def diff_segments(input_text: str, output_text: str) -> list[dict]:
    """Diff ``output_text`` against ``input_text`` at word/whitespace level.

    Both texts are split into alternating runs of non-whitespace and
    whitespace, so joining a text's tokens reproduces it exactly. Returns one
    ``{"text", "op"}`` dict per opcode, where ``op`` is ``same``, ``changed``,
    ``added`` or ``removed``. A ``removed`` segment carries the deleted input
    text; every other segment carries output text. Rendered by the frontend.
    """
    splitter = re.compile(r"\S+|\s+")
    source = splitter.findall(input_text)
    target = splitter.findall(output_text)
    # SequenceMatcher tag -> label used in the returned segments.
    labels = {"equal": "same", "replace": "changed", "insert": "added", "delete": "removed"}
    matcher = difflib.SequenceMatcher(None, source, target, autojunk=False)
    result = []
    for tag, a_lo, a_hi, b_lo, b_hi in matcher.get_opcodes():
        label = labels.get(tag)
        if label is None:
            continue
        # Only deletions have no output-side text, so they show the input text.
        pieces = source[a_lo:a_hi] if tag == "delete" else target[b_lo:b_hi]
        result.append({"text": "".join(pieces), "op": label})
    return result
|
| 100 |
+
|
| 101 |
+
|
| 102 |
class SnapshotStreamer(TextDiffusionStreamer):
|
| 103 |
"""Captures the decoded canvas at each denoising step; suppresses the
|
| 104 |
parent's ANSI console printing."""
|
|
|
|
| 138 |
return extract_answer(raw)
|
| 139 |
|
| 140 |
|
| 141 |
+
def _validate(ocr_text: str) -> str | None:
|
| 142 |
+
if not ocr_text or not ocr_text.strip():
|
| 143 |
+
return "Empty input."
|
| 144 |
+
if len(ocr_text) > MAX_INPUT_CHARS:
|
| 145 |
+
return (
|
| 146 |
+
f"Input too long ({len(ocr_text)} chars). DiffusionGemma generates a single "
|
| 147 |
+
f"256-token block, so inputs are capped at ~{MAX_INPUT_CHARS} characters."
|
| 148 |
+
)
|
| 149 |
+
return None
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
# ---------------------------------------------------------------- API
|
| 153 |
+
|
| 154 |
+
app = Server()
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
@app.api(name="run_diffusiongemma")
@spaces.GPU(duration=90, size="xlarge")
def run_diffusiongemma(ocr_text: str, canvas_init: bool = False) -> dict:
    """Correct OCR text with DiffusionGemma and report timing and snapshots.

    With ``canvas_init=True`` the first denoising canvas is seeded with the
    OCR text's own token ids (truncated to the canvas length and padded with
    random token ids) instead of pure random noise. This mode is experimental
    and under-corrects; see the results tab.

    Returns ``{"error": message}`` when validation fails. Otherwise returns the
    corrected text, its diff against the stripped input, wall-clock seconds,
    tokens per second, the number of denoising steps, the per-step snapshots,
    the ``canvas_init`` flag, and ``"error": None``.
    """
    problem = _validate(ocr_text)
    if problem:
        return {"error": problem}

    inputs = _prepare_inputs(dg_processor, dg_model, ocr_text)
    streamer = SnapshotStreamer(dg_processor.tokenizer)
    gen_kwargs: dict = {"max_new_tokens": 256, "streamer": streamer}
    if canvas_init:
        # Fall back to 256 when the generation config has no canvas_length.
        configured = getattr(dg_model.generation_config, "canvas_length", None)
        canvas_length = configured or 256
        encoded = dg_processor.tokenizer(ocr_text, add_special_tokens=False)
        ocr_ids = encoded["input_ids"][:canvas_length]
        vocab_size = dg_model.config.text_config.vocab_size
        # Fill the remainder of the canvas with random token ids.
        noise = torch.randint(vocab_size, (canvas_length - len(ocr_ids),))
        seeded = torch.cat([torch.tensor(ocr_ids, dtype=torch.long), noise])
        gen_kwargs["decoder_input_ids"] = seeded.unsqueeze(0).to(dg_model.device)

    started = time.perf_counter()
    output = dg_model.generate(**inputs, **gen_kwargs)
    # Wait for queued GPU work so the timing covers the whole generation.
    torch.cuda.synchronize()
    seconds = time.perf_counter() - started

    prompt_len = inputs["input_ids"].shape[-1]
    text = _decode_generated(dg_processor, output, prompt_len)
    # Token count is measured by re-tokenizing the cleaned answer text.
    n_tokens = len(dg_processor.tokenizer(text)["input_ids"])

    result = {
        "text": text,
        "diff": diff_segments(ocr_text.strip(), text),
        "seconds": round(seconds, 2),
        "tokens_per_second": round(n_tokens / seconds, 1),
        "denoising_steps": len(streamer.snapshots),
        "snapshots": [extract_answer(s) for s in streamer.snapshots],
        "canvas_init": canvas_init,
        "error": None,
    }
    return result
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
@app.api(name="run_gemma4")
|
| 196 |
@spaces.GPU(duration=60, size="xlarge")
|
| 197 |
+
def run_gemma4(ocr_text: str) -> dict:
|
| 198 |
+
"""Correct OCR text with the autoregressive Gemma-4-E4B baseline (greedy)."""
|
| 199 |
+
if err := _validate(ocr_text):
|
| 200 |
+
return {"error": err}
|
| 201 |
inputs = _prepare_inputs(g4_processor, g4_model, ocr_text)
|
| 202 |
t0 = time.perf_counter()
|
| 203 |
output = g4_model.generate(**inputs, max_new_tokens=256, do_sample=False)
|
|
|
|
| 205 |
seconds = time.perf_counter() - t0
|
| 206 |
text = _decode_generated(g4_processor, output, inputs["input_ids"].shape[-1])
|
| 207 |
n_tokens = len(g4_processor.tokenizer(text)["input_ids"])
|
| 208 |
+
return {
|
| 209 |
+
"text": text,
|
| 210 |
+
"diff": diff_segments(ocr_text.strip(), text),
|
| 211 |
+
"seconds": round(seconds, 2),
|
| 212 |
+
"tokens_per_second": round(n_tokens / seconds, 1),
|
| 213 |
+
"error": None,
|
| 214 |
+
}
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
# ---------------------------------------------------------------- static data
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
@app.get("/", response_class=HTMLResponse)
async def homepage():
    """Serve the custom frontend by returning the text of ``HERE / "index.html"``."""
    index_file = HERE / "index.html"
    return index_file.read_text(encoding="utf-8")
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
@app.get("/data/examples")
async def get_examples():
    """Return the example passages, each with its cached model outputs if any.

    Loads ``examples.json`` and, when present, ``examples_cached.json``. Each
    example gets a ``"cached"`` key holding the per-model outputs recorded for
    its ``id`` (or ``None`` when there are none). Every cached output has its
    ``"_raw"`` field dropped and gains a ``"diff"`` computed against the
    example's stripped OCR input.
    """
    # Read as UTF-8 explicitly (as homepage() does) rather than relying on the
    # process locale's default encoding.
    examples = json.loads((HERE / "examples.json").read_text(encoding="utf-8"))
    cached = {}
    cached_path = HERE / "examples_cached.json"
    if cached_path.exists():
        for entry in json.loads(cached_path.read_text(encoding="utf-8")):
            for out in entry["output"].values():
                out.pop("_raw", None)
            cached[entry["id"]] = entry["output"]
    for example in examples:
        example["cached"] = cached.get(example["id"])
        if example["cached"]:
            # The diff baseline is the same for every model's output.
            source = example["ocr_input"].strip()
            for out in example["cached"].values():
                out["diff"] = diff_segments(source, out["text"])
    return JSONResponse(examples)
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
@app.get("/data/results")
async def get_results():
    """Return the benchmark summary markdown and the per-passage metric rows.

    Reads ``results/summary.md`` verbatim and parses each non-blank line of
    ``results/per_passage_metrics.jsonl`` as one JSON object. The response is
    ``{"summary_md": <str>, "per_passage": <list of parsed rows>}``.
    """
    results_dir = HERE / "results"
    # Read as UTF-8 explicitly (as homepage() does) rather than relying on the
    # process locale's default encoding.
    summary = (results_dir / "summary.md").read_text(encoding="utf-8")
    metrics_text = (results_dir / "per_passage_metrics.jsonl").read_text(encoding="utf-8")
    rows = [
        json.loads(line)
        for line in metrics_text.splitlines()
        if line.strip()
    ]
    return JSONResponse({"summary_md": summary, "per_passage": rows})
|
| 252 |
|
| 253 |
|
| 254 |
+
if __name__ == "__main__":
|
| 255 |
+
app.launch(show_error=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
diff_utils.py
DELETED
|
@@ -1,39 +0,0 @@
|
|
| 1 |
-
"""Token-level diff between OCR input and model output for gr.HighlightedText."""
|
| 2 |
-
|
| 3 |
-
import difflib
|
| 4 |
-
import re
|
| 5 |
-
|
| 6 |
-
COLOR_MAP = {"changed": "yellow", "added": "green", "removed": "red"}
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
def diff_highlight(input_text: str, output_text: str) -> list[tuple[str, str | None]]:
|
| 10 |
-
"""Segments of `output_text` labelled by how they differ from `input_text`.
|
| 11 |
-
|
| 12 |
-
Word + whitespace tokenization (lossless), so highlights align with what
|
| 13 |
-
the reader sees. Deleted input text is marked with a small marker segment.
|
| 14 |
-
"""
|
| 15 |
-
tokens_in = re.findall(r"\S+|\s+", input_text)
|
| 16 |
-
tokens_out = re.findall(r"\S+|\s+", output_text)
|
| 17 |
-
sm = difflib.SequenceMatcher(None, tokens_in, tokens_out, autojunk=False)
|
| 18 |
-
segments: list[tuple[str, str | None]] = []
|
| 19 |
-
for op, i1, i2, j1, j2 in sm.get_opcodes():
|
| 20 |
-
if op == "equal":
|
| 21 |
-
segments.append(("".join(tokens_out[j1:j2]), None))
|
| 22 |
-
elif op == "replace":
|
| 23 |
-
segments.append(("".join(tokens_out[j1:j2]), "changed"))
|
| 24 |
-
elif op == "insert":
|
| 25 |
-
segments.append(("".join(tokens_out[j1:j2]), "added"))
|
| 26 |
-
elif op == "delete":
|
| 27 |
-
segments.append((" ⌫ ", "removed"))
|
| 28 |
-
return segments
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
if __name__ == "__main__":
|
| 32 |
-
segs = diff_highlight("the qvick brown fox jumps", "the quick brown fox")
|
| 33 |
-
print(segs)
|
| 34 |
-
assert ("the ", None) in segs or segs[0][1] is None
|
| 35 |
-
assert any(label == "changed" for _, label in segs)
|
| 36 |
-
assert any(label == "removed" for _, label in segs)
|
| 37 |
-
out = "".join(s for s, label in segs if label != "removed")
|
| 38 |
-
assert out == "the quick brown fox"
|
| 39 |
-
print("diff_utils ok")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
examples.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"id": "example/icdar2019/full_22M_without_Finnish/EN/EN1/107.txt",
|
| 4 |
+
"label": "para Htio HOrastua.foz Da bateffe of the\u2026",
|
| 5 |
+
"ocr_input": "para Htio HOrastua.foz Da bateffe of the npgf> ji to l)is djamtnc.tDitf) manp toufyte Ipgbt Sa (o tbe comtc.w Defer noeD ootbtie ztU bis maptic.stonopngc enuprovtnc iOt these hnpgbceo.baupng full grete ttoimorr 2lno of manboDc.be putte bett? fpist aforniDrr ji)cu; eommaunDpng.lphe a gentpll bpng Co leue ba strpte.anD"
|
| 6 |
+
},
|
| 7 |
+
{
|
| 8 |
+
"id": "example/icdar2019/evaluation_4M_without_Finnish/EN/EN1/2.txt",
|
| 9 |
+
"label": "26 The Academy of Sciences. into two\u2026",
|
| 10 |
+
"ocr_input": "26 The Academy of Sciences. into two unequal parts, as the two Tro\u2022 picks, and the two Polar Circles: Everj circle is divided into sixty parts, which they call first minutes; and each Minute likewise into sixty parts, which we call second Minutes. The Natural Sphere, or the Celestial Globe, befides the foregoing Circles, of fers to our view divers Constellations: The Antients reckoned eight and fort}, comprehending in this number all tk Stars to be seen in Greece, and all tk known Parts of the Wor Id ; 12 of those Constellations are contained in the %$\u2022 diack,i 1 are to be seen toward the Nortb ) and 15 towards the South; but of late there are twelve other Constellations discovered towards the South. The Systematical Astronomy, which others call the Theorical, is that part which by the help of some Engines and Orbs, ojfers to our view those C\u0153le/ltal Motions which are not so obvious to ever] common understanding."
|
| 11 |
+
},
|
| 12 |
+
{
|
| 13 |
+
"id": "example/icdar2019/full_22M_without_Finnish/EN/EN1/52.txt",
|
| 14 |
+
"label": "3 Remarty on Des-CartesV Part. I. to\u2026",
|
| 15 |
+
"ocr_input": "3 Remarty on Des-CartesV Part. I. to conceive, That whatsoever thinks does not Exist 4t the Iime of its Thinking: Wherefore he con- eludes, That he is, and that he Thinks, is of aU the most certain Truth that can be acquired by Philosophical Order. Who would not judge, That so wild an In vention as this, in order to Improve Know ledge , might not rather proceed from some Person in Bedlam, than from Des-Cartes ? Of whom if Interrogated, how he comes to sup pose, That a Man may point-blank, on the ac count of his Fiction, deny the Being of a God, together with all the visible Particulars already mention\u2019d ? As to the Existence of the Deity, he that considers the necessity of its Concession, as he beholds the wonderful Conduct of the Universe, with so many miraculous Objects as are contain\u2019d therein, will as soon give credit to any Forgery of"
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"id": "example/icdar2019/full_22M_without_Finnish/EN/EN1/39.txt",
|
| 19 |
+
"label": "43 2 AN ENGLISH ANTHOLOGY. [BK. IV.\u2026",
|
| 20 |
+
"ocr_input": "43 2 AN ENGLISH ANTHOLOGY. [BK. IV. CCCXXXIV. THE BURIAL OF SIR JOHN MOORE.* Not a drum was heard, not a funeral note, As his corpse to the rampart we hurried; Not a soldier discharged his farewell shot O\u2019er the grave where our hero we buried. We buried him darkly at dead of night, The sods with our bayonets turning; By the struggling moonbeam\u2019s misty light, And the lantern dimly burning. No useless coffin enclosed his breast, Not in sheet nor in shroud we wound him; But he lay like a warrior taking his rest, With his martial cloak around him. f Few and short were"
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"id": "example/icdar2019/full_22M_without_Finnish/EN/EN1/28.txt",
|
| 24 |
+
"label": "144 AN ENGLISH ANTHOLOGY. [BK A hollow\u2026",
|
| 25 |
+
"ocr_input": "144 AN ENGLISH ANTHOLOGY. [BK A hollow wind did seem to answer, No; Go seek elsewhere. I did ; and going did a rainbow note; Surely, thought I, This is the lace of Peace\u2019s coat; I will search out the matter. But while I looked the clouds immediately Did break and scatter. Then went I to a garden and did spy A gallant flower, The crown imperial; sure, said I, Peace at the root must dwell. But when I digged, I saw a worm devour What showed so well. At length I met a reverend good old man ; Whom when for Peace I did demand, he thus began : There was a Prince of old At Salem dwelt, who lived with good increase Of flock and fold. He sweetly lived ; yet sweetness did not save His life from foes. But after death out of his grave There sprang twelve stalks of wheat Which many wondering at, got some of those To plant and set. It prospered strangely, and did soon disperse Through all the earth ; For they that taste it do rehearse That"
|
| 26 |
+
},
|
| 27 |
+
{
|
| 28 |
+
"id": "example/icdar2019/full_22M_without_Finnish/EN/EN1/122.txt",
|
| 29 |
+
"label": "t Jeademia Scienthrum. I $ mltiplicare feu\u2026",
|
| 30 |
+
"ocr_input": "t Jeademia Scienthrum. I $ mltiplicare feu in aliumduc\u20acreut4 2 eft toties fiimere multiplicandum quoties sumi potest in multiplicatore unitas: Quare si 4 ducas in 2,fumma ituraest8. Unum numcrum divi- lere per aliura ut 8 per 2 nihil aliud [A quam invenire quoties 2 conti- leantur in 8. Numerus planus a duo- >us numeris in fe invicem ductis pro- lucitur v i2 igitur est-numerus planus juia producitur a numero 6in aducto. Solidus numerus a tribus numeris ultiplicatis oritur : Ejusmodi est 24, ;x tribus enim hisce numeris multipli ers emergit 2, 3, 4 ; si enim 2 duco n 3 habeo 6, & siduxero6 in 4,"
|
| 31 |
+
}
|
| 32 |
+
]
|
examples_cached.json
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"id": "example/icdar2019/full_22M_without_Finnish/EN/EN1/107.txt",
|
| 4 |
+
"ocr_input": "para Htio HOrastua.foz Da bateffe of the npgf> ji to l)is djamtnc.tDitf) manp toufyte Ipgbt Sa (o tbe comtc.w Defer noeD ootbtie ztU bis maptic.stonopngc enuprovtnc iOt these hnpgbceo.baupng full grete ttoimorr 2lno of manboDc.be putte bett? fpist aforniDrr ji)cu; eommaunDpng.lphe a gentpll bpng Co leue ba strpte.anD",
|
| 5 |
+
"gold": "Pars And Adrastus.for derkenesse of the nygh Fro his chambre.with many torches lyght In to the courte.is descended downe All his mayne.stondynge enuyprowne Of these knyghtes.hauyng full grete wounder And of manhode.he putte hem fyrst asounder Hem commaundyng.lyke a gentyll kyng To leue her stryte.and",
|
| 6 |
+
"output": {
|
| 7 |
+
"diffusiongemma": {
|
| 8 |
+
"text": "per Horatio Horace. The battle of the night is to distant. Many fought for so the contest de feared not but mastering upon import these hopes. being full great tomorrow no of man be put best first afford for commanding. the gentle being able to spate and",
|
| 9 |
+
"seconds": 2.364,
|
| 10 |
+
"tokens_generated": 53,
|
| 11 |
+
"denoising_steps": 13,
|
| 12 |
+
"tokens_per_forward": 4,
|
| 13 |
+
"thought_chars": 0
|
| 14 |
+
},
|
| 15 |
+
"diffusiongemma_canvas": {
|
| 16 |
+
"text": "para Htio HOrastua.foz Da bateffe of the npgf> ji to l)is djamtnc.tDitf) manp toufyte Ipgbt Sa (o tbe comtc.w Defer noeD ootbtie ztU bis maptic.stonopngc enuprovtnc iOt these hnpgbceo.baupng full grete ttoimorr 2lno of manboDc.be putte bett? fpist aforniDrr ji)cu; eommaunDpng.lphe a gentpll bpng Co leue ba strpte.anD\n\n***Note:*** *The provided OCR text is severely corrupted, containing many unintelligible character strings (e.g., \"npgf>\", \"djamtnc.tDitf)\", \"enuprovtnc\"). Without the source image or more reliable reference, it is impossible to accurately reconstruct the intended 19th-century words without guessing or rephrasing.*",
|
| 17 |
+
"seconds": 2.705,
|
| 18 |
+
"tokens_generated": 219,
|
| 19 |
+
"denoising_steps": 14,
|
| 20 |
+
"tokens_per_forward": 15,
|
| 21 |
+
"thought_chars": 0
|
| 22 |
+
},
|
| 23 |
+
"gemma4": {
|
| 24 |
+
"text": "para Htio Horastua.foz Da bateffe of the npgf> ji to l)is djamtnc.tDitf) manp toufyte Ipgbt Sa (o tbe comtc.w Defer noeD ootbtie ztU bis maptic.stonopngc enuprovtnc iOt these hnpgbceo.baupng full grete ttoimorr 2lno of manboDc.be putte bett? fpist aforniDrr ji)cu; eommaunDpng.lphe a gentpll bpng Co leue ba strpte.anD",
|
| 25 |
+
"seconds": 10.214,
|
| 26 |
+
"tokens_generated": 140,
|
| 27 |
+
"denoising_steps": null,
|
| 28 |
+
"tokens_per_forward": null,
|
| 29 |
+
"thought_chars": 0
|
| 30 |
+
}
|
| 31 |
+
}
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"id": "example/icdar2019/evaluation_4M_without_Finnish/EN/EN1/2.txt",
|
| 35 |
+
"ocr_input": "26 The Academy of Sciences. into two unequal parts, as the two Tro\u2022 picks, and the two Polar Circles: Everj circle is divided into sixty parts, which they call first minutes; and each Minute likewise into sixty parts, which we call second Minutes. The Natural Sphere, or the Celestial Globe, befides the foregoing Circles, of fers to our view divers Constellations: The Antients reckoned eight and fort}, comprehending in this number all tk Stars to be seen in Greece, and all tk known Parts of the Wor Id ; 12 of those Constellations are contained in the %$\u2022 diack,i 1 are to be seen toward the Nortb ) and 15 towards the South; but of late there are twelve other Constellations discovered towards the South. The Systematical Astronomy, which others call the Theorical, is that part which by the help of some Engines and Orbs, ojfers to our view those C\u0153le/ltal Motions which are not so obvious to ever] common understanding.",
|
| 36 |
+
"gold": "into two unequal parts, as the two Tro- picks, and the two Polar Circles : Every circle is divided into fixty parts, which they call first minutes ; and each Minute likewise into ffxty parts, which we call second Minutes. This 26 The Academy of Sciences. The Natural Sphere, or the C\u0153lestial Globe, beffdes the foregoing Circles, of- fers to our view divers Constellations : The Antients reckon\u2019d eight and forty, comprehending in this number all the Stars to be feen in Greece, and all the known Parts of the World ; 12 of those Constellations are contain\u2019d in the Zo- diack,21 are to be seen toward the North, and 15 towards the South ; but of late there are twelve other Constellations discovered towards the South. The Systematical Astronomy, which others call the Theorical, is that part which by the help of some Engines and Orbs, offers to our view those C\u0153lestial Motions which are not so obvious to every common understanding.",
|
| 37 |
+
"output": {
|
| 38 |
+
"diffusiongemma": {
|
| 39 |
+
"text": "26 The Academy of Sciences. into two unequal parts, as the two Tropics, and the two Polar Circles: Every circle is divided into sixty parts, which they call first minutes; and each Minute likewise into sixty parts, which we call second Minutes. The Natural Sphere, or the Celestial Globe, beside the foregoing Circles, offers to our view divers Constellations: The Ancients reckoned eight and forty, comprehending in this number all the Stars to be seen in Greece, and all the known Parts of the World; 12 of those Constellations are contained in the Zodiac, 6 are to be seen toward the North and 15 towards the South; but of late there are twelve other Constellations discovered towards the South. The Systematical Astronomy, which others call the Theoretical, is that part which by the help of some Engines and Orbs, offers to our view those Celestial Motions which are not so obvious to every common understanding.",
|
| 40 |
+
"seconds": 1.927,
|
| 41 |
+
"tokens_generated": 191,
|
| 42 |
+
"denoising_steps": 10,
|
| 43 |
+
"tokens_per_forward": 19,
|
| 44 |
+
"thought_chars": 0
|
| 45 |
+
},
|
| 46 |
+
"diffusiongemma_canvas": {
|
| 47 |
+
"text": "26 The Academy of Sciences. into two unequal parts, as the two Tro\u2022 picks, and the two Polar Circles: Everj circle is divided into sixty parts, which they call first minutes; and each Minute likewise into sixty parts, which we call second Minutes. The Natural Sphere, or the Celestial Globe, befides the foregoing Circles, of fers to our view divers Constellations: The Antients reckoned eight and fort}, comprehending in this number all tk Stars to be seen in Greece, and all tk known Parts of the Wor Id ; 12 of those Constellations are contained in the %$\u2022 diack,i 1 are to be seen toward the Nortb ) and 15 towards the South; but of late there are twelve other Constellations discovered towards the South. The Systematical Astronomy, which others call the Theorical, is that part which by the help of some Engines and Orbs, ojfers to our view those C\u0153le/ltal Motions which are not so obvious to ever] common understanding.",
|
| 48 |
+
"seconds": 1.058,
|
| 49 |
+
"tokens_generated": 211,
|
| 50 |
+
"denoising_steps": 5,
|
| 51 |
+
"tokens_per_forward": 42,
|
| 52 |
+
"thought_chars": 0
|
| 53 |
+
},
|
| 54 |
+
"gemma4": {
|
| 55 |
+
"text": "26 The Academy of Sciences. into two unequal parts, as the two Tropics, and the two Polar Circles: Every circle is divided into sixty parts, which they call first minutes; and each Minute likewise into sixty parts, which we call second Minutes. The Natural Sphere, or the Celestial Globe, besides the foregoing Circles, offers to our view divers Constellations: The Ancients reckoned eight and forty, comprehending in this number all the Stars to be seen in Greece, and all the known Parts of the World; 12 of those Constellations are contained in the Zodiac, 1 are to be seen toward the North and 15 towards the South; but of late there are twelve other Constellations discovered towards the South. The Systematical Astronomy, which others call the Theoretical, is that part which by the help of some Engines and Orbs, offers to our view those Celestial Motions which are not so obvious to every common understanding.",
|
| 56 |
+
"seconds": 14.017,
|
| 57 |
+
"tokens_generated": 192,
|
| 58 |
+
"denoising_steps": null,
|
| 59 |
+
"tokens_per_forward": null,
|
| 60 |
+
"thought_chars": 0
|
| 61 |
+
}
|
| 62 |
+
}
|
| 63 |
+
},
|
| 64 |
+
{
|
| 65 |
+
"id": "example/icdar2019/full_22M_without_Finnish/EN/EN1/52.txt",
|
| 66 |
+
"ocr_input": "3 Remarty on Des-CartesV Part. I. to conceive, That whatsoever thinks does not Exist 4t the Iime of its Thinking: Wherefore he con- eludes, That he is, and that he Thinks, is of aU the most certain Truth that can be acquired by Philosophical Order. Who would not judge, That so wild an In vention as this, in order to Improve Know ledge , might not rather proceed from some Person in Bedlam, than from Des-Cartes ? Of whom if Interrogated, how he comes to sup pose, That a Man may point-blank, on the ac count of his Fiction, deny the Being of a God, together with all the visible Particulars already mention\u2019d ? As to the Existence of the Deity, he that considers the necessity of its Concession, as he beholds the wonderful Conduct of the Universe, with so many miraculous Objects as are contain\u2019d therein, will as soon give credit to any Forgery of",
|
| 67 |
+
"gold": "8 Remarks on Des-Cartes's Part. I. to conceive, That whatsoever thinks does not Exist at the Time of its Thinking : Wherefore he con\u2011 cludes, That he is, and that he Thinks, is of all the most certain Truth that can be acquired by Philosophical Order. Who would not judge, That so wild an In\u2011 vention as this, in order to Improve Know\u2011 ledge , might not rather proceed from some Person in Bedlam , than from Des\u2011Cartes? Of whom if Interrogated, how he comes to sup\u2011 pose, That a Man may point\u2011blank, on the ac\u2011 count of his Fiction, deny the Being of a God, together with all the viffble Particulars already mention\u2019d ? As to the Existence of the Deity, he that conffders the nece\ueba7ty of its Conce\ueba7on, as he beholds the wonderful Conduct of the Universe, with so many miraculous Objects as are contain\u2019d therein, will as soon give credit to any Forgery of",
|
| 68 |
+
"output": {
|
| 69 |
+
"diffusiongemma": {
|
| 70 |
+
"text": "3 Remarks on Descartes. Part. I. to conceive, That whatsoever thinks does not Exist at the Time of its Thinking: Wherefore he concludes,, That he is, and that he Thinks, is of all the most certain Truth that can be acquired by Philosophical Order. Who would not judge, That so wild an Invention as this, in order to Improve Knowledge, might not rather proceed from some Person in Bedlam, than from Descartes? Of whom if Interrogated, how he comes to suppose, That a Man may point-blank, on the account of his Fiction, deny the Being of a God, together with all the visible Particulars already mention\u2019d? As to the Existence of the Deity, he that considers the necessity of its Concession, as he beholds the wonderful Conduct of the Universe, with so many miraculous Objects as are contain\u2019d therein, will as soon give credit to any Forgery of",
|
| 71 |
+
"seconds": 3.626,
|
| 72 |
+
"tokens_generated": 188,
|
| 73 |
+
"denoising_steps": 20,
|
| 74 |
+
"tokens_per_forward": 9,
|
| 75 |
+
"thought_chars": 0
|
| 76 |
+
},
|
| 77 |
+
"diffusiongemma_canvas": {
|
| 78 |
+
"text": "3 Remarty on Des-CartesV Part. I. to conceive, That whatsoever thinks does not Exist 4t the Iime of its Thinking: Wherefore he con- eludes, That he is, and that he Thinks, is of aU the most certain Truth that can be acquired by Philosophical Order. Who would not judge, That so wild an In vention as this, in order to Improve Know ledge , might not rather proceed from some Person in Bedlam, than from Des-Cartes ? Of whom if Interrogated, how he comes to sup pose, That a Man may point-blank, on the ac count of his Fiction, deny the Being of a God, together with all the visible Particulars already mention\u2019d ? As to the Existence of the Deity, he that considers the necessity of its Concession, as he beholds the wonderful Conduct of the Universe, with so many miraculous Objects as are contain\u2019d therein, will as soon give credit to any Forgery of",
|
| 79 |
+
"seconds": 0.701,
|
| 80 |
+
"tokens_generated": 206,
|
| 81 |
+
"denoising_steps": 3,
|
| 82 |
+
"tokens_per_forward": 69,
|
| 83 |
+
"thought_chars": 0
|
| 84 |
+
},
|
| 85 |
+
"gemma4": {
|
| 86 |
+
"text": "3 Remarty on Des-CartesV Part. I. to conceive, That whatsoever thinks does not Exist at the Time of its Thinking: Wherefore he con- eludes, That he is, and that he Thinks, is of aU the most certain Truth that can be acquired by Philosophical Order. Who would not judge, That so wild an In vention as this, in order to Improve Know ledge , might not rather proceed from some Person in Bedlam, than from Des-Cartes ? Of whom if Interrogated, how he comes to sup pose, That a Man may point-blank, on the ac count of his Fiction, deny the Being of a God, together with all the visible Particulars already mention\u2019d ? As to the Existence of the Deity, he that considers the necessity of its Concession, as he beholds the wonderful Conduct of the Universe, with so many miraculous Objects as are contain\u2019d therein, will as soon give credit to any Forgery of",
|
| 87 |
+
"seconds": 14.885,
|
| 88 |
+
"tokens_generated": 204,
|
| 89 |
+
"denoising_steps": null,
|
| 90 |
+
"tokens_per_forward": null,
|
| 91 |
+
"thought_chars": 0
|
| 92 |
+
}
|
| 93 |
+
}
|
| 94 |
+
},
|
| 95 |
+
{
|
| 96 |
+
"id": "example/icdar2019/full_22M_without_Finnish/EN/EN1/39.txt",
|
| 97 |
+
"ocr_input": "43 2 AN ENGLISH ANTHOLOGY. [BK. IV. CCCXXXIV. THE BURIAL OF SIR JOHN MOORE.* Not a drum was heard, not a funeral note, As his corpse to the rampart we hurried; Not a soldier discharged his farewell shot O\u2019er the grave where our hero we buried. We buried him darkly at dead of night, The sods with our bayonets turning; By the struggling moonbeam\u2019s misty light, And the lantern dimly burning. No useless coffin enclosed his breast, Not in sheet nor in shroud we wound him; But he lay like a warrior taking his rest, With his martial cloak around him. f Few and short were",
|
| 98 |
+
"gold": "432 * First published anonymously in the Newry Telegraph, 1817. \u2020 ' A question arose as to which was the most perfect ode that had been produced. Shelley contended for Coleridge's On Switzer- land; others named some of Moore's Melodies, and Campbell's Hohenlinden. Lord Byron said, \" I will show you an ode I consider little inferior to the best which the present prolific age has produced \". CCCXXXIV. THE BURIAL OF SIR JOHN MOORE.* Not a drum was heard, not a funeral note, As his corpse to the rampart we hurried ; Not a soldier discharged his farewell shot O'er the grave where our hero we buried. We buried him darkly at dead of night, The sods with our bayonets turning; By the struggling moonbeam's misty light, And the lantern dimly burning. No useless coffin enclosed his breast, Not in sheet nor in shroud we wound him; But he lay like a warrior taking his rest, With his martial cloak around him. \u2020 Few and short were",
|
| 99 |
+
"output": {
|
| 100 |
+
"diffusiongemma": {
|
| 101 |
+
"text": "432 AN ENGLISH ANTHOLOGY. [BK. IV. CCCXXXIV. THE BURIAL OF SIR JOHN MOORE.* Not a drum was heard, not a funeral note, As his corpse to the rampart we hurried; Not a soldier discharged his farewell shot O\u2019er the grave where our hero we buried. We buried him darkly at dead of night, The sods with our bayonets turning; By the struggling moonbeam\u2019s misty light, And the lantern dimly burning. No useless coffin enclosed his breast, Not in sheet nor in shroud we wound him; But he lay like a warrior taking his rest, With his martial cloak around him. Few and short were",
|
| 102 |
+
"seconds": 1.367,
|
| 103 |
+
"tokens_generated": 139,
|
| 104 |
+
"denoising_steps": 7,
|
| 105 |
+
"tokens_per_forward": 20,
|
| 106 |
+
"thought_chars": 0
|
| 107 |
+
},
|
| 108 |
+
"diffusiongemma_canvas": {
|
| 109 |
+
"text": "43 2 AN ENGLISH ANTHOLOGY. [BK. IV. CCCXXXIV. THE BURIAL OF SIR JOHN MOORE.* Not a drum was heard, not a funeral note, As his corpse to the rampart we hurried; Not a soldier discharged his farewell shot O\u2019er the grave where our hero we buried. We buried him darkly at dead of night, The sods with our bayonets turning; By the struggling moonbeam\u2019s misty light, And the lantern dimly burning. No useless coffin enclosed his breast, Not in sheet nor in shroud we wound him; But he lay like a warrior taking his rest, With his martial cloak around him. Few and short were",
|
| 110 |
+
"seconds": 0.701,
|
| 111 |
+
"tokens_generated": 140,
|
| 112 |
+
"denoising_steps": 3,
|
| 113 |
+
"tokens_per_forward": 47,
|
| 114 |
+
"thought_chars": 0
|
| 115 |
+
},
|
| 116 |
+
"gemma4": {
|
| 117 |
+
"text": "43 2 AN ENGLISH ANTHOLOGY. [BK. IV. CCCXXXIV. THE BURIAL OF SIR JOHN MOORE.* Not a drum was heard, not a funeral note, As his corpse to the rampart we hurried; Not a soldier discharged his farewell shot O\u2019er the grave where our hero we buried. We buried him darkly at dead of night, The sods with our bayonets turning; By the struggling moonbeam\u2019s misty light, And the lantern dimly burning. No useless coffin enclosed his breast, Not in sheet nor in shroud we wound him; But he lay like a warrior taking his rest, With his martial cloak around him. f Few and short were",
|
| 118 |
+
"seconds": 10.352,
|
| 119 |
+
"tokens_generated": 142,
|
| 120 |
+
"denoising_steps": null,
|
| 121 |
+
"tokens_per_forward": null,
|
| 122 |
+
"thought_chars": 0
|
| 123 |
+
}
|
| 124 |
+
}
|
| 125 |
+
},
|
| 126 |
+
{
|
| 127 |
+
"id": "example/icdar2019/full_22M_without_Finnish/EN/EN1/28.txt",
|
| 128 |
+
"ocr_input": "144 AN ENGLISH ANTHOLOGY. [BK A hollow wind did seem to answer, No; Go seek elsewhere. I did ; and going did a rainbow note; Surely, thought I, This is the lace of Peace\u2019s coat; I will search out the matter. But while I looked the clouds immediately Did break and scatter. Then went I to a garden and did spy A gallant flower, The crown imperial; sure, said I, Peace at the root must dwell. But when I digged, I saw a worm devour What showed so well. At length I met a reverend good old man ; Whom when for Peace I did demand, he thus began : There was a Prince of old At Salem dwelt, who lived with good increase Of flock and fold. He sweetly lived ; yet sweetness did not save His life from foes. But after death out of his grave There sprang twelve stalks of wheat Which many wondering at, got some of those To plant and set. It prospered strangely, and did soon disperse Through all the earth ; For they that taste it do rehearse That",
|
| 129 |
+
"gold": "144 A hollow wind did seem to answer, No ; Go seek elsewhere. I did ; and going did a rainbow note ; Surely, thought I, This is the lace of Peace's coat ; I will search out the matter. But while I looked the clouds immediately Did break and scatter. Then went I to a garden and did spy A gallant flower, The crown imperial ; sure, said I, Peace at the root must dwell. But when I digged, I saw a worm devour What showed so well. At length I met a reverend good old man ; Whom when for Peace I did demand, he thus began : There was a Prince of old At Salem dwelt, who lived with good increase Of flock and fold. He sweetly lived ; yet sweetness did not save His life from foes. But after death out of his grave There sprang twelve stalks of wheat ; Which many wondering at, got some of those To plant and set. It prospered strangely, and did soon disperse Through all the earth ; For they that taste it do rehearse That",
|
| 130 |
+
"output": {
|
| 131 |
+
"diffusiongemma": {
|
| 132 |
+
"text": "144 AN ENGLISH ANTHOLOGY. [A hollow wind did seem to answer, No; Go seek elsewhere. I did; and going did a rainbow note; Surely, thought I, This is the lace of Peace\u2019s coat; I will search out the matter. But while I looked the clouds immediately Did break and scatter. Then went I to a garden and did spy A gallant flower, The crown imperial; sure, said I, Peace at the root must dwell. But when I digged, I saw a worm devour What showed so well. At length I met a reverend good old man; Whom when for Peace I did demand, he thus began: There was a Prince of old At Salem dwelt, who lived with good increase Of flock and fold. He sweetly lived; yet sweetness did not save His life from foes. But after death out of his grave There sprang twelve stalks of wheat Which many wondering at, got some of those To plant and set. It prospered strangely, and did soon disperse Through all the earth; For they that taste it do rehearse That",
|
| 133 |
+
"seconds": 0.844,
|
| 134 |
+
"tokens_generated": 220,
|
| 135 |
+
"denoising_steps": 4,
|
| 136 |
+
"tokens_per_forward": 55,
|
| 137 |
+
"thought_chars": 0
|
| 138 |
+
},
|
| 139 |
+
"diffusiongemma_canvas": {
|
| 140 |
+
"text": "144 AN ENGLISH ANTHOLOGY. [BK A hollow wind did seem to answer, No; Go seek elsewhere. I did ; and going did a rainbow note; Surely, thought I, This is the lace of Peace\u2019s coat; I will search out the matter. But while I looked the clouds immediately Did break and scatter. Then went I to a garden and did spy A gallant flower, The crown imperial; sure, said I, Peace at the root must dwell. But when I digged, I saw a worm devour What showed so well. At length I met a reverend good old man ; Whom when for Peace I did demand, he thus began : There was a Prince of old At Salem dwelt, who lived with good increase Of flock and fold. He sweetly lived ; yet sweetness did not save His life from foes. But after death out of his grave There sprang twelve stalks of wheat Which many wondering at, got some of those To plant and set. It prospered strangely, and did soon disperse Through all the earth ; For they that taste it do rehearse That",
|
| 141 |
+
"seconds": 0.528,
|
| 142 |
+
"tokens_generated": 221,
|
| 143 |
+
"denoising_steps": 2,
|
| 144 |
+
"tokens_per_forward": 110,
|
| 145 |
+
"thought_chars": 0
|
| 146 |
+
},
|
| 147 |
+
"gemma4": {
|
| 148 |
+
"text": "144 AN ENGLISH ANTHOLOGY. [A hollow wind did seem to answer, No; Go seek elsewhere. I did; and going did a rainbow note; Surely, thought I, This is the lace of Peace\u2019s coat; I will search out the matter. But while I looked the clouds immediately Did break and scatter. Then went I to a garden and did spy A gallant flower, The crown imperial; sure, said I, Peace at the root must dwell. But when I digged, I saw a worm devour What showed so well. At length I met a reverend good old man; Whom when for Peace I did demand, he thus began: There was a Prince of old At Salem dwelt, who lived with good increase Of flock and fold. He sweetly lived; yet sweetness did not save His life from foes. But after death out of his grave There sprang twelve stalks of wheat Which many wondering at, got some of those To plant and set. It prospered strangely, and did soon disperse Through all the earth; For they that taste it do rehearse That",
|
| 149 |
+
"seconds": 16.111,
|
| 150 |
+
"tokens_generated": 220,
|
| 151 |
+
"denoising_steps": null,
|
| 152 |
+
"tokens_per_forward": null,
|
| 153 |
+
"thought_chars": 0
|
| 154 |
+
}
|
| 155 |
+
}
|
| 156 |
+
},
|
| 157 |
+
{
|
| 158 |
+
"id": "example/icdar2019/full_22M_without_Finnish/EN/EN1/122.txt",
|
| 159 |
+
"ocr_input": "t Jeademia Scienthrum. I $ mltiplicare feu in aliumduc\u20acreut4 2 eft toties fiimere multiplicandum quoties sumi potest in multiplicatore unitas: Quare si 4 ducas in 2,fumma ituraest8. Unum numcrum divi- lere per aliura ut 8 per 2 nihil aliud [A quam invenire quoties 2 conti- leantur in 8. Numerus planus a duo- >us numeris in fe invicem ductis pro- lucitur v i2 igitur est-numerus planus juia producitur a numero 6in aducto. Solidus numerus a tribus numeris ultiplicatis oritur : Ejusmodi est 24, ;x tribus enim hisce numeris multipli ers emergit 2, 3, 4 ; si enim 2 duco n 3 habeo 6, & siduxero6 in 4,",
|
| 160 |
+
"gold": "mltiplicare seu in alium ducere ut 4 2 est toties sumere multiplicandum quoties sumi potest in multiplicatore unitas: Quare ff 4 ducas in 2,summa utura est 8. Unum numerum divi- dere per alium ut 8 per 2 nihil aliud est quam invenire quoties 2 conti- neantur in 8. Numerus planus a duo- us numeris in se invicem ductis pro- ducitur, 12 igitur est numerus planus quia producitur a numero 6 in 2 ducto. Solidus numerus a tribus numeris multiplicatis oritur : Ejusmodi est 24, ex tribus enim hisce numeris multipli- catis emergit 2, 3, 4 ; ff enim 2 duco n 3 habeo 6, & ff duxero 6 in 4,",
|
| 161 |
+
"output": {
|
| 162 |
+
"diffusiongemma": {
|
| 163 |
+
"text": "In Academia Scientorum. Multiplicare feu in aliumducere est toties fines multiplicandum quoties sumi potest in multiplicatore unitas: Quare si duas in 2, summa itura est. Unum numerum dividere per alia ut 8 per 2 nihil aliud quam invenire quoties 2 continentur in 8. Numerus planus a duois numeris in se invicem ductis prolucitur; igitur est numerus planus quia producitur a numero 6 inducto. Solidus numerus a tribus numeris multiplicatis oritur: Ejusmodi est 24, quae ex tribus enim hisce numeris multiplicis emergit 2, 3, 4; si enim 2 duco 3 habeo 6, & si duco in 4,",
|
| 164 |
+
"seconds": 2.015,
|
| 165 |
+
"tokens_generated": 181,
|
| 166 |
+
"denoising_steps": 11,
|
| 167 |
+
"tokens_per_forward": 16,
|
| 168 |
+
"thought_chars": 0
|
| 169 |
+
},
|
| 170 |
+
"diffusiongemma_canvas": {
|
| 171 |
+
"text": "t Jeademia Scienthrum. I $ mltiplicare feu in aliumduc\u20acreut4 2 eft toties fiimere multiplicandum quoties sumi potest in multiplicatore unitas: Quare si 4 ducas in 2,fumma ituraest8. Unum numcrum divi- lere per aliura ut 8 per 2 nihil aliud [A quam invenire quoties 2 conti- leantur in 8. Numerus planus a duo- >us numeris in fe invicem ductis pro- lucitur v i2 igitur est-numerus planus juia producitur a numero 6in aducto. Solidus numerus a tribus numeris ultiplicatis oritur : Ejusmodi est 24, ;x tribus enim hisce numeris multipli ers emergit 2, 3, 4 ; si enim 2 duco n 3 habeo 6, & siduxero6 in 4,",
|
| 172 |
+
"seconds": 0.516,
|
| 173 |
+
"tokens_generated": 221,
|
| 174 |
+
"denoising_steps": 2,
|
| 175 |
+
"tokens_per_forward": 110,
|
| 176 |
+
"thought_chars": 0
|
| 177 |
+
},
|
| 178 |
+
"gemma4": {
|
| 179 |
+
"text": "De Academia Scientiarum. I multiplicare feu in alium ducereut a est toties simere multiplicandum quoties sumi potest in multiplicatore unitas: Quare si 4 ducas in 2, summa itura est. Unum numcrum dividere per aliura ut 8 per 2 nihil aliud [a quam invenire quoties 2 continentur in 8. Numerus planus a duo-us numeris in se invicem ductis pro-lucitur vi2 igitur est-numerus planus quia producitur a numero 6 in aducto. Solidus numerus a tribus numeris ultiplicatis oritur : Ejusmodi est 24, ;x tribus enim hisce numeris multipli er emergit 2, 3, 4 ; si enim 2 duco n 3 habeo 6, & siduxero6 in 4,",
|
| 180 |
+
"seconds": 14.6,
|
| 181 |
+
"tokens_generated": 200,
|
| 182 |
+
"denoising_steps": null,
|
| 183 |
+
"tokens_per_forward": null,
|
| 184 |
+
"thought_chars": 0
|
| 185 |
+
}
|
| 186 |
+
}
|
| 187 |
+
}
|
| 188 |
+
]
|
index.html
ADDED
|
@@ -0,0 +1,411 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="utf-8" />
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
| 6 |
+
<title>The Post-OCR Gazette — DiffusionGemma vs Gemma-4</title>
|
| 7 |
+
<link rel="preconnect" href="https://fonts.googleapis.com" />
|
| 8 |
+
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
|
| 9 |
+
<link href="https://fonts.googleapis.com/css2?family=UnifrakturCook:wght@700&family=Old+Standard+TT:ital,wght@0,400;0,700;1,400&family=Special+Elite&display=swap" rel="stylesheet" />
|
| 10 |
+
<style>
|
| 11 |
+
:root {
|
| 12 |
+
--paper: #f4ecd8;
|
| 13 |
+
--paper-deep: #ece1c6;
|
| 14 |
+
--ink: #1d1a14;
|
| 15 |
+
--ink-soft: #4a4339;
|
| 16 |
+
--rule: #2a251c;
|
| 17 |
+
--oxblood: #7a1f1f;
|
| 18 |
+
--ochre-bg: #e8d27a66;
|
| 19 |
+
--ochre-edge: #a8842a;
|
| 20 |
+
--green-ink: #2e5d34;
|
| 21 |
+
--red-ink: #9c2b22;
|
| 22 |
+
--col-gap: 2.5rem;
|
| 23 |
+
}
|
| 24 |
+
* { box-sizing: border-box; }
|
| 25 |
+
html { background: #d9cdaf; }
|
| 26 |
+
body {
|
| 27 |
+
margin: 0 auto;
|
| 28 |
+
max-width: 1180px;
|
| 29 |
+
padding: 2.2rem 2.4rem 4rem;
|
| 30 |
+
background:
|
| 31 |
+
url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='180' height='180'%3E%3Cfilter id='n'%3E%3CfeTurbulence type='fractalNoise' baseFrequency='0.9' numOctaves='2'/%3E%3CfeColorMatrix values='0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.05 0'/%3E%3C/filter%3E%3Crect width='180' height='180' filter='url(%23n)'/%3E%3C/svg%3E"),
|
| 32 |
+
var(--paper);
|
| 33 |
+
color: var(--ink);
|
| 34 |
+
font-family: "Old Standard TT", "Iowan Old Style", Georgia, serif;
|
| 35 |
+
font-size: 17px;
|
| 36 |
+
line-height: 1.55;
|
| 37 |
+
box-shadow: 0 0 60px rgba(0,0,0,.35);
|
| 38 |
+
min-height: 100vh;
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
/* ---------- masthead ---------- */
|
| 42 |
+
header { text-align: center; animation: settle .7s ease-out both; }
|
| 43 |
+
.gazette-name {
|
| 44 |
+
font-family: "UnifrakturCook", serif;
|
| 45 |
+
font-size: clamp(2.4rem, 5.5vw, 4.2rem);
|
| 46 |
+
margin: 0;
|
| 47 |
+
letter-spacing: .01em;
|
| 48 |
+
}
|
| 49 |
+
.gazette-name .amp { color: var(--oxblood); }
|
| 50 |
+
.dateline {
|
| 51 |
+
display: flex; align-items: center; gap: 1rem;
|
| 52 |
+
margin: .65rem 0 0;
|
| 53 |
+
font-variant: small-caps; letter-spacing: .22em; font-size: .8rem; color: var(--ink-soft);
|
| 54 |
+
}
|
| 55 |
+
.dateline::before, .dateline::after { content: ""; flex: 1; border-top: 1px solid var(--rule); }
|
| 56 |
+
.double-rule { border: 0; border-top: 3px double var(--rule); margin: .8rem 0 0; }
|
| 57 |
+
.standfirst {
|
| 58 |
+
max-width: 62ch; margin: 1.1rem auto 0; font-style: italic; color: var(--ink-soft); font-size: 1.02rem;
|
| 59 |
+
}
|
| 60 |
+
.standfirst a { color: var(--oxblood); }
|
| 61 |
+
|
| 62 |
+
/* ---------- composing desk ---------- */
|
| 63 |
+
.desk { margin-top: 2rem; animation: settle .7s .12s ease-out both; }
|
| 64 |
+
.desk-head {
|
| 65 |
+
font-variant: small-caps; letter-spacing: .3em; font-size: .78rem;
|
| 66 |
+
color: var(--ink-soft); text-align: center; margin-bottom: .8rem;
|
| 67 |
+
}
|
| 68 |
+
.desk-head::before { content: "❧ "; color: var(--oxblood); }
|
| 69 |
+
.desk-head::after { content: " ☙"; color: var(--oxblood); }
|
| 70 |
+
.chips { display: flex; flex-wrap: wrap; gap: .5rem; justify-content: center; }
|
| 71 |
+
.chip {
|
| 72 |
+
font-family: inherit; font-size: .82rem; cursor: pointer;
|
| 73 |
+
background: transparent; color: var(--ink);
|
| 74 |
+
border: 1px solid var(--rule); border-radius: 0; padding: .3rem .7rem;
|
| 75 |
+
transition: background .15s, color .15s;
|
| 76 |
+
}
|
| 77 |
+
.chip:hover { background: var(--ink); color: var(--paper); }
|
| 78 |
+
.chip.active { background: var(--oxblood); color: var(--paper); border-color: var(--oxblood); }
|
| 79 |
+
textarea {
|
| 80 |
+
width: 100%; min-height: 9.5rem; margin-top: 1rem; padding: 1rem 1.1rem;
|
| 81 |
+
font-family: "Special Elite", "Courier New", monospace; font-size: .92rem; line-height: 1.7;
|
| 82 |
+
color: var(--ink); background: var(--paper-deep);
|
| 83 |
+
border: 1px solid var(--rule); outline: none; resize: vertical;
|
| 84 |
+
box-shadow: inset 0 1px 6px rgba(0,0,0,.12);
|
| 85 |
+
}
|
| 86 |
+
textarea:focus { border-color: var(--oxblood); }
|
| 87 |
+
.controls { display: flex; flex-wrap: wrap; align-items: center; gap: 1.2rem; margin-top: .9rem; }
|
| 88 |
+
.run-btn {
|
| 89 |
+
font-family: inherit; font-variant: small-caps; letter-spacing: .18em; font-size: 1rem;
|
| 90 |
+
background: var(--ink); color: var(--paper); border: 1px solid var(--ink);
|
| 91 |
+
padding: .55rem 1.6rem; cursor: pointer; transition: background .15s, transform .05s;
|
| 92 |
+
}
|
| 93 |
+
.run-btn:hover { background: var(--oxblood); border-color: var(--oxblood); }
|
| 94 |
+
.run-btn:active { transform: translateY(1px); }
|
| 95 |
+
.run-btn[disabled] { opacity: .45; cursor: wait; }
|
| 96 |
+
label.toggle {
|
| 97 |
+
display: inline-flex; align-items: center; gap: .45rem; font-size: .88rem; color: var(--ink-soft); cursor: pointer;
|
| 98 |
+
}
|
| 99 |
+
label.toggle input { accent-color: var(--oxblood); }
|
| 100 |
+
.toggle .tag {
|
| 101 |
+
font-variant: small-caps; font-size: .68rem; letter-spacing: .12em;
|
| 102 |
+
border: 1px solid var(--ochre-edge); color: var(--ochre-edge); padding: 0 .35rem;
|
| 103 |
+
}
|
| 104 |
+
.charcount { margin-left: auto; font-size: .78rem; color: var(--ink-soft); font-variant: small-caps; letter-spacing: .1em; }
|
| 105 |
+
.charcount.over { color: var(--red-ink); }
|
| 106 |
+
|
| 107 |
+
/* ---------- columns ---------- */
|
| 108 |
+
.columns {
|
| 109 |
+
display: grid; grid-template-columns: 1fr 1fr; gap: 0 var(--col-gap);
|
| 110 |
+
margin-top: 2.4rem; position: relative; animation: settle .7s .22s ease-out both;
|
| 111 |
+
}
|
| 112 |
+
.columns::before {
|
| 113 |
+
content: ""; position: absolute; top: 0; bottom: 0; left: 50%;
|
| 114 |
+
border-left: 1px solid var(--rule);
|
| 115 |
+
}
|
| 116 |
+
@media (max-width: 760px) {
|
| 117 |
+
.columns { grid-template-columns: 1fr; gap: 2.2rem; }
|
| 118 |
+
.columns::before { display: none; }
|
| 119 |
+
}
|
| 120 |
+
.col-head { border-top: 3px double var(--rule); border-bottom: 1px solid var(--rule); padding: .45rem 0 .4rem; text-align: center; }
|
| 121 |
+
.col-head h2 { margin: 0; font-size: 1.25rem; letter-spacing: .04em; font-weight: 700; }
|
| 122 |
+
.col-head .col-sub { font-variant: small-caps; letter-spacing: .25em; font-size: .7rem; color: var(--ink-soft); }
|
| 123 |
+
.col-head .col-sub a { color: inherit; text-decoration: none; border-bottom: 1px dotted var(--ink-soft); }
|
| 124 |
+
.statline {
|
| 125 |
+
display: flex; justify-content: center; gap: 1.6rem; padding: .55rem 0;
|
| 126 |
+
border-bottom: 1px solid var(--rule); font-size: .8rem; font-variant: small-caps; letter-spacing: .08em;
|
| 127 |
+
color: var(--ink-soft); min-height: 2.2rem; align-items: baseline;
|
| 128 |
+
}
|
| 129 |
+
.statline b { font-size: 1.25rem; color: var(--ink); font-variant: normal; letter-spacing: 0; }
|
| 130 |
+
.statline .cached-tag { color: var(--oxblood); font-style: italic; font-variant: normal; letter-spacing: 0; }
|
| 131 |
+
.proof {
|
| 132 |
+
padding: 1rem .2rem 0; min-height: 8rem; font-size: 1rem; line-height: 1.75; word-wrap: break-word;
|
| 133 |
+
}
|
| 134 |
+
.proof .placeholder { color: var(--ink-soft); font-style: italic; opacity: .7; }
|
| 135 |
+
.proof .spinner { font-style: italic; color: var(--oxblood); }
|
| 136 |
+
.proof .spinner::after { content: ""; animation: dots 1.2s steps(4) infinite; }
|
| 137 |
+
@keyframes dots { 0% { content: ""; } 25% { content: "."; } 50% { content: ".."; } 75% { content: "..."; } }
|
| 138 |
+
.seg-changed { background: var(--ochre-bg); border-bottom: 2px solid var(--ochre-edge); }
|
| 139 |
+
.seg-added { color: var(--green-ink); border-bottom: 2px solid var(--green-ink); font-weight: 700; }
|
| 140 |
+
.seg-removed { color: var(--red-ink); text-decoration: line-through; opacity: .75; }
|
| 141 |
+
.error-box {
|
| 142 |
+
border: 1px solid var(--red-ink); color: var(--red-ink); padding: .6rem .9rem; font-style: italic; margin-top: 1rem;
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
/* ---------- press (denoising scrubber) ---------- */
|
| 146 |
+
.press { margin-top: 1.4rem; border-top: 1px solid var(--rule); padding-top: .8rem; display: none; }
|
| 147 |
+
.press.visible { display: block; }
|
| 148 |
+
.press .press-head { font-variant: small-caps; letter-spacing: .2em; font-size: .72rem; color: var(--ink-soft); display: flex; justify-content: space-between; }
|
| 149 |
+
.press input[type=range] { width: 100%; accent-color: var(--oxblood); margin: .5rem 0; }
|
| 150 |
+
.press .canvas-view {
|
| 151 |
+
font-family: "Special Elite", monospace; font-size: .8rem; line-height: 1.65;
|
| 152 |
+
background: var(--paper-deep); border: 1px dashed var(--rule);
|
| 153 |
+
padding: .8rem .9rem; min-height: 5.5rem; max-height: 14rem; overflow-y: auto; white-space: pre-wrap;
|
| 154 |
+
}
|
| 155 |
+
|
| 156 |
+
/* ---------- legend ---------- */
|
| 157 |
+
.legend { display: flex; gap: 1.6rem; justify-content: center; margin-top: 2rem; font-size: .8rem; color: var(--ink-soft); flex-wrap: wrap; }
|
| 158 |
+
.legend span.swatch { padding: 0 .4rem; }
|
| 159 |
+
|
| 160 |
+
/* ---------- results ---------- */
|
| 161 |
+
.results { margin-top: 3rem; animation: settle .7s .3s ease-out both; }
|
| 162 |
+
.section-head {
|
| 163 |
+
text-align: center; border-top: 3px double var(--rule); padding-top: .6rem; margin-bottom: 1.1rem;
|
| 164 |
+
}
|
| 165 |
+
.section-head h2 { margin: 0; font-size: 1.45rem; letter-spacing: .03em; }
|
| 166 |
+
.section-head .col-sub { font-variant: small-caps; letter-spacing: .25em; font-size: .72rem; color: var(--ink-soft); }
|
| 167 |
+
table { border-collapse: collapse; margin: 0 auto; font-size: .92rem; }
|
| 168 |
+
th, td { padding: .45rem .9rem; border-bottom: 1px solid #00000022; text-align: right; }
|
| 169 |
+
th:first-child, td:first-child { text-align: left; }
|
| 170 |
+
thead th { border-bottom: 2px solid var(--rule); font-variant: small-caps; letter-spacing: .06em; font-weight: 700; }
|
| 171 |
+
tbody tr:first-child td { color: var(--ink-soft); font-style: italic; }
|
| 172 |
+
tbody tr:nth-child(2) td:first-child::before { content: "★ "; color: var(--oxblood); }
|
| 173 |
+
.footnotes { max-width: 75ch; margin: 1.3rem auto 0; font-size: .85rem; color: var(--ink-soft); }
|
| 174 |
+
.footnotes p { margin: .3rem 0; }
|
| 175 |
+
|
| 176 |
+
footer {
|
| 177 |
+
margin-top: 3.5rem; border-top: 3px double var(--rule); padding-top: 1rem;
|
| 178 |
+
text-align: center; font-size: .82rem; color: var(--ink-soft);
|
| 179 |
+
}
|
| 180 |
+
footer a { color: var(--oxblood); }
|
| 181 |
+
|
| 182 |
+
@keyframes settle { from { opacity: 0; transform: translateY(14px); } to { opacity: 1; transform: none; } }
|
| 183 |
+
@media (prefers-reduced-motion: reduce) { * { animation: none !important; transition: none !important; } }
|
| 184 |
+
</style>
|
| 185 |
+
</head>
|
| 186 |
+
<body>
|
| 187 |
+
|
| 188 |
+
<header>
|
| 189 |
+
<h1 class="gazette-name">The Post‑OCR Gazette</h1>
|
| 190 |
+
<div class="dateline"><span>Vol. I — Diffusion <span style="font-variant:normal">vs</span> Autoregression — Price one GPU‑second</span></div>
|
| 191 |
+
<hr class="double-rule" />
|
| 192 |
+
<p class="standfirst">
|
| 193 |
+
In which <a href="https://huggingface.co/google/diffusiongemma-26B-A4B-it">DiffusionGemma‑26B‑A4B</a>,
|
| 194 |
+
an experimental block‑diffusion language model, and
|
| 195 |
+
<a href="https://huggingface.co/google/gemma-4-E4B-it">Gemma‑4‑E4B</a>, its autoregressive cousin,
|
| 196 |
+
are set against one another in the correction of nineteenth‑century newspaper OCR.
|
| 197 |
+
Marks upon the proofs show what each model <em>changed</em> in your text.
|
| 198 |
+
</p>
|
| 199 |
+
</header>
|
| 200 |
+
|
| 201 |
+
<section class="desk">
|
| 202 |
+
<div class="desk-head">The Composing Desk</div>
|
| 203 |
+
<div class="chips" id="chips"></div>
|
| 204 |
+
<textarea id="ocr-input" spellcheck="false" placeholder="Paste noisy OCR text here, or pick a passage above…"></textarea>
|
| 205 |
+
<div class="controls">
|
| 206 |
+
<button class="run-btn" id="run">Correct the Proofs</button>
|
| 207 |
+
<label class="toggle">
|
| 208 |
+
<input type="checkbox" id="canvas-toggle" />
|
| 209 |
+
seed canvas with OCR text <span class="tag">experimental — under‑corrects</span>
|
| 210 |
+
</label>
|
| 211 |
+
<span class="charcount" id="charcount"></span>
|
| 212 |
+
</div>
|
| 213 |
+
</section>
|
| 214 |
+
|
| 215 |
+
<section class="columns">
|
| 216 |
+
<div class="col" id="col-dg">
|
| 217 |
+
<div class="col-head">
|
| 218 |
+
<h2>The Diffusionist</h2>
|
| 219 |
+
<div class="col-sub"><a href="https://huggingface.co/google/diffusiongemma-26B-A4B-it">DiffusionGemma‑26B‑A4B‑it</a> · denoises 256 tokens in parallel</div>
|
| 220 |
+
</div>
|
| 221 |
+
<div class="statline" id="stats-dg"><span class="placeholder">awaiting copy</span></div>
|
| 222 |
+
<div class="proof" id="proof-dg"><span class="placeholder">The diffusion model’s corrected proof will appear here.</span></div>
|
| 223 |
+
<div class="press" id="press">
|
| 224 |
+
<div class="press-head"><span>The Press — denoising, step by step</span><span id="press-step"></span></div>
|
| 225 |
+
<input type="range" id="press-slider" min="0" max="0" value="0" step="1" />
|
| 226 |
+
<div class="canvas-view" id="press-canvas"></div>
|
| 227 |
+
</div>
|
| 228 |
+
</div>
|
| 229 |
+
<div class="col" id="col-g4">
|
| 230 |
+
<div class="col-head">
|
| 231 |
+
<h2>The Autoregressive</h2>
|
| 232 |
+
<div class="col-sub"><a href="https://huggingface.co/google/gemma-4-E4B-it">Gemma‑4‑E4B‑it</a> · one token after another, greedily</div>
|
| 233 |
+
</div>
|
| 234 |
+
<div class="statline" id="stats-g4"><span class="placeholder">awaiting copy</span></div>
|
| 235 |
+
<div class="proof" id="proof-g4"><span class="placeholder">The autoregressive model’s corrected proof will appear here.</span></div>
|
| 236 |
+
</div>
|
| 237 |
+
</section>
|
| 238 |
+
|
| 239 |
+
<div class="legend">
|
| 240 |
+
<span><span class="swatch seg-changed">changed</span> — altered from the input</span>
|
| 241 |
+
<span><span class="swatch seg-added">added</span> — inserted text</span>
|
| 242 |
+
<span><span class="swatch seg-removed">removed</span> — deleted from the input</span>
|
| 243 |
+
</div>
|
| 244 |
+
|
| 245 |
+
<section class="results">
|
| 246 |
+
<div class="section-head">
|
| 247 |
+
<h2>The Ledger</h2>
|
| 248 |
+
<div class="col-sub">75 passages of BLN600 · A100‑80GB · full methodology in the repository</div>
|
| 249 |
+
</div>
|
| 250 |
+
<div id="results-table"><p style="text-align:center;font-style:italic">Fetching the ledger…</p></div>
|
| 251 |
+
<div class="footnotes" id="results-notes"></div>
|
| 252 |
+
</section>
|
| 253 |
+
|
| 254 |
+
<footer>
|
| 255 |
+
<p>
|
| 256 |
+
Benchmark texts: <a href="https://doi.org/10.15131/shef.data.25439023">BLN600</a> (CC‑BY‑NC — metrics only republished here).
|
| 257 |
+
Demo passages: ICDAR2019 post‑OCR (CC‑BY‑4.0). DiffusionGemma is experimental and one day old at press time;
|
| 258 |
+
its sampler has no greedy mode, so the comparison is defaults‑vs‑greedy. Single run, no significance testing.
|
| 259 |
+
</p>
|
| 260 |
+
<p>Set in Old Standard & Special Elite · powered by ZeroGPU · by <a href="https://huggingface.co/davanstrien">davanstrien</a></p>
|
| 261 |
+
</footer>
|
| 262 |
+
|
| 263 |
+
<script type="module">
|
| 264 |
+
import { Client } from "https://cdn.jsdelivr.net/npm/@gradio/client/dist/index.min.js";
|
| 265 |
+
|
| 266 |
+
// Shorthand for document.getElementById, used throughout this script.
const $ = (id) => document.getElementById(id);
|
| 267 |
+
// Client-side input cap in characters; same value as MAX_INPUT_CHARS in app.py.
const MAX_CHARS = 1200;
|
| 268 |
+
// Module-level state: the Gradio client (set once connected), the loaded example
// passages, the example currently selected via a chip (null after manual typing),
// and the denoising snapshots shown by the press scrubber.
let client = null, examples = [], activeExample = null, snapshots = [];
|
| 269 |
+
|
| 270 |
+
// Start connecting to the Gradio backend at this page's origin as soon as the
// module loads; the resulting promise is awaited by the run handler before it
// calls client.predict.
// NOTE(review): no rejection handler is attached here, so a failed connection
// surfaces only where this promise is awaited.
const connect = (async () => { client = await Client.connect(window.location.origin); })();
|
| 271 |
+
|
| 272 |
+
/* ---------- helpers ---------- */
|
| 273 |
+
/**
 * Replace the contents of `el` with one <span> per diff segment.
 *
 * Each segment is an object with `op` and `text`. Segments whose `op` is not
 * "same" receive the CSS class `seg-<op>`; text is always assigned through
 * textContent, never parsed as HTML.
 */
function renderDiff(el, segs) {
  el.innerHTML = "";
  for (const { op, text } of segs) {
    const node = document.createElement("span");
    if (op !== "same") {
      node.className = "seg-" + op;
    }
    let label = text;
    // A removed run made only of whitespace would be invisible, so a marker
    // glyph is shown in its place.
    if (op === "removed" && !text.trim()) {
      label = " ⌫ ";
    }
    node.textContent = label;
    el.appendChild(node);
  }
}
|
| 282 |
+
/**
 * Fill a stat line with the timing figures of one run.
 *
 * `r` supplies `seconds`, `tokens_per_second` and, optionally,
 * `denoising_steps` (omitted from the output when falsy). `extra` is appended
 * verbatim as HTML after the figures.
 */
function stats(el, r, extra = "") {
  const cells = [
    `<span><b>${r.seconds}</b> s</span>`,
    `<span><b>${r.tokens_per_second}</b> tok/s</span>`,
  ];
  if (r.denoising_steps) {
    cells.push(`<span><b>${r.denoising_steps}</b> steps</span>`);
  }
  el.innerHTML = cells.join("") + extra;
}
|
| 286 |
+
// Show an in-progress message in `el`, wrapped in a span carrying the
// "spinner" class. `msg` is inserted as HTML.
function spinner(el, msg) {
  const markup = '<span class="spinner">' + msg + "</span>";
  el.innerHTML = markup;
}
|
| 287 |
+
/**
 * Show an error message inside `el`, wrapped in a div carrying the
 * "error-box" class.
 *
 * `msg` may originate from the backend or from a thrown exception, so it is
 * treated as plain text: HTML metacharacters are escaped before the string is
 * placed into innerHTML.
 */
function showError(el, msg) {
  const entities = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
  const safe = String(msg).replace(/[&<>"']/g, (ch) => entities[ch]);
  el.innerHTML = `<div class="error-box">${safe}</div>`;
}
|
| 288 |
+
|
| 289 |
+
/**
 * Install a new list of denoising snapshots and sync the press panel to it.
 *
 * With no snapshots the panel is hidden. Otherwise it is shown, the slider
 * range is set to cover every snapshot, and the slider is parked on the last
 * one before the view is refreshed.
 */
function showSnapshots(snaps) {
  snapshots = snaps || [];
  const panel = $("press");
  if (snapshots.length) {
    panel.classList.add("visible");
    const scrubber = $("press-slider");
    const lastIndex = snapshots.length - 1;
    // max is widened before value is assigned so the new value is in range.
    scrubber.max = lastIndex;
    scrubber.value = lastIndex;
    updatePress();
  } else {
    panel.classList.remove("visible");
  }
}
|
| 299 |
+
// Redraw the press panel for the slider's current position: the step label
// ("step i of n", 1-based) and the snapshot text for that step.
function updatePress() {
  const index = Number($("press-slider").value);
  const total = snapshots.length;
  $("press-step").textContent = "step " + (index + 1) + " of " + total;
  const frame = snapshots[index];
  $("press-canvas").textContent = frame || "";
}
// Scrubbing the slider redraws the panel.
$("press-slider").addEventListener("input", updatePress);
|
| 305 |
+
|
| 306 |
+
/* ---------- examples ---------- */
|
| 307 |
+
/**
 * Fetch the example passages and build one chip button per example.
 *
 * Clicking a chip marks it active, copies its OCR text into the textarea,
 * records it as the active example, refreshes the character counter and, when
 * the example carries cached results, renders them. The first chip is clicked
 * automatically so the page opens with a passage loaded.
 *
 * A failed request, a non-JSON body or a non-array payload is logged and
 * leaves the chip row empty instead of raising an unhandled rejection.
 */
async function loadExamples() {
  let loaded;
  try {
    const res = await fetch("data/examples");
    // fetch() resolves on HTTP error statuses too, so check explicitly.
    if (!res.ok) throw new Error(`HTTP ${res.status}`);
    loaded = await res.json();
  } catch (err) {
    console.error("Could not load example passages:", err);
    return;
  }
  if (!Array.isArray(loaded)) {
    console.error("Example passages payload is not a list:", loaded);
    return;
  }
  examples = loaded;
  const chips = $("chips");
  examples.forEach((e, i) => {
    const b = document.createElement("button");
    b.className = "chip";
    b.textContent = e.label;
    b.onclick = () => {
      document.querySelectorAll(".chip").forEach((c) => c.classList.remove("active"));
      b.classList.add("active");
      // Assigning .value does not fire an "input" event, so the counter is
      // refreshed by hand.
      $("ocr-input").value = e.ocr_input;
      activeExample = e;
      updateCount();
      if (e.cached) renderCached(e);
    };
    chips.appendChild(b);
    if (i === 0) b.click();
  });
}
|
| 326 |
+
/**
 * Render an example's pre-computed results into both columns.
 *
 * `e.cached` may hold a `diffusiongemma` and/or a `gemma4` entry; each entry
 * present fills its column's stat line (tagged as pre-computed) and proof.
 * Throughput is derived as tokens_generated / seconds, rounded. The press
 * panel is always cleared afterwards.
 */
function renderCached(e) {
  const { diffusiongemma: dg, gemma4: g4 } = e.cached;
  const tag = `<span class="cached-tag">— from the morning edition (pre‑computed)</span>`;
  const rate = (run) => Math.round(run.tokens_generated / run.seconds);
  if (dg) {
    const summary = {
      seconds: dg.seconds,
      tokens_per_second: rate(dg),
      denoising_steps: dg.denoising_steps,
    };
    stats($("stats-dg"), summary, tag);
    renderDiff($("proof-dg"), dg.diff);
  }
  if (g4) {
    const summary = {
      seconds: g4.seconds,
      tokens_per_second: rate(g4),
    };
    stats($("stats-g4"), summary, tag);
    renderDiff($("proof-g4"), g4.diff);
  }
  showSnapshots([]);
}
|
| 339 |
+
|
| 340 |
+
/* ---------- character count ---------- */
|
| 341 |
+
// Refresh the "n / MAX_CHARS characters" counter from the textarea and flag
// it with the "over" class when the cap is exceeded.
function updateCount() {
  const used = $("ocr-input").value.length;
  const counter = $("charcount");
  counter.textContent = used + " / " + MAX_CHARS + " characters";
  const overLimit = used > MAX_CHARS;
  counter.classList.toggle("over", overLimit);
}
// Manual typing detaches the text from whichever example chip was selected.
$("ocr-input").addEventListener("input", () => {
  activeExample = null;
  updateCount();
});
// Show the counter once at start-up.
updateCount();
|
| 349 |
+
|
| 350 |
+
/* ---------- run ---------- */
// Click handler for the run button. Sends the trimmed textarea contents to
// the diffusion endpoint and then to the autoregressive endpoint, one after
// the other, rendering each result as it arrives.
//
// Everything that can fail, including waiting for the client connection, runs
// inside the try block so the button is always re-enabled. A failure is
// reported in the column whose request was in flight, so a Gemma-4 failure no
// longer erases a DiffusionGemma result that was already rendered.
$("run").addEventListener("click", async () => {
  const text = $("ocr-input").value.trim();
  if (!text) return;
  if (text.length > MAX_CHARS) { showError($("proof-dg"), `Input too long (${text.length} chars; the cap is ${MAX_CHARS} — DiffusionGemma writes a single 256‑token block).`); return; }
  const btn = $("run");
  btn.disabled = true;
  // Id of the proof element belonging to the request currently in flight.
  let failingProof = "proof-dg";
  try {
    await connect;
    spinner($("proof-dg"), "denoising the canvas"); $("stats-dg").innerHTML = "";
    spinner($("proof-g4"), "queued behind the diffusionist"); $("stats-g4").innerHTML = "";
    showSnapshots([]);
    const dg = (await client.predict("/run_diffusiongemma", {
      ocr_text: text, canvas_init: $("canvas-toggle").checked,
    })).data[0];
    if (dg.error) showError($("proof-dg"), dg.error);
    else {
      stats($("stats-dg"), dg, dg.canvas_init ? `<span class="cached-tag">— OCR‑seeded canvas</span>` : "");
      renderDiff($("proof-dg"), dg.diff);
      showSnapshots(dg.snapshots);
    }
    failingProof = "proof-g4";
    spinner($("proof-g4"), "composing, one token at a time");
    const g4 = (await client.predict("/run_gemma4", { ocr_text: text })).data[0];
    if (g4.error) showError($("proof-g4"), g4.error);
    else { stats($("stats-g4"), g4); renderDiff($("proof-g4"), g4.diff); }
  } catch (err) {
    showError($(failingProof), `The press has jammed: ${(err && err.message) || err}`);
    // If the first request failed, the second never ran: clear its spinner.
    if (failingProof === "proof-dg") $("proof-g4").innerHTML = "";
  } finally {
    btn.disabled = false;
  }
});
|
| 382 |
+
|
| 383 |
+
/* ---------- results ledger ---------- */
|
| 384 |
+
/**
 * Fetch the benchmark summary and render it into the ledger section.
 *
 * The response's `summary_md` is markdown. Lines starting with "|" form the
 * table: the first is the header, the second is assumed to be the separator
 * row, the rest are body rows. Lines starting with "Micro" or "Mean" become
 * footnotes, followed by a fixed explanatory note.
 *
 * All text taken from the markdown is HTML-escaped before it is placed into
 * innerHTML. A summary without a table replaces the loading placeholder with
 * a notice; any failure is logged and reported in the table area.
 */
async function loadResults() {
  const entities = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
  const esc = (s) => String(s).replace(/[&<>"']/g, (ch) => entities[ch]);
  try {
    const res = await fetch("data/results");
    // fetch() resolves on HTTP error statuses too, so check explicitly.
    if (!res.ok) throw new Error(`HTTP ${res.status}`);
    const data = await res.json();
    const lines = data.summary_md.split("\n");
    const tableLines = lines.filter(l => l.trim().startsWith("|"));
    if (tableLines.length > 2) {
      // Drop the empty pieces outside the leading and trailing pipes.
      const parse = (l) => l.split("|").slice(1, -1).map(c => esc(c.trim()));
      const head = parse(tableLines[0]);
      const rows = tableLines.slice(2).map(parse);
      let html = "<table><thead><tr>" + head.map(h => `<th>${h}</th>`).join("") + "</tr></thead><tbody>";
      for (const r of rows) html += "<tr>" + r.map(c => `<td>${c}</td>`).join("") + "</tr>";
      html += "</tbody></table>";
      $("results-table").innerHTML = html;
    } else {
      // Without this the "Fetching the ledger…" placeholder would stay forever.
      $("results-table").innerHTML = "<p style='text-align:center;font-style:italic'>The ledger holds no table.</p>";
    }
    const notes = lines.filter(l => /^(Micro|Mean)/.test(l)).map(l => `<p>${esc(l)}</p>`).join("");
    $("results-notes").innerHTML = notes +
      `<p>Over‑correction: of input characters already correct (vs the gold transcription), the share the model changed.
Fix rate: of input characters that were wrong, the share the model changed.
The OCR‑seeded‑canvas condition (toggle above) converges in 2–5 steps but barely edits — a negative result, reported honestly.</p>`;
  } catch (err) {
    console.error("Could not load the results ledger:", err);
    $("results-table").innerHTML = "<p style='text-align:center;font-style:italic'>The ledger could not be fetched.</p>";
  }
}

// Kick off both loaders at start-up; they run independently of each other.
loadExamples();
loadResults();
|
| 409 |
+
</script>
|
| 410 |
+
</body>
|
| 411 |
+
</html>
|
requirements.txt
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
|
|
| 1 |
transformers>=5.11,<6
|
| 2 |
accelerate
|
| 3 |
pillow
|
| 4 |
torchvision
|
|
|
|
|
|
| 1 |
+
gradio>=6.16.0
|
| 2 |
transformers>=5.11,<6
|
| 3 |
accelerate
|
| 4 |
pillow
|
| 5 |
torchvision
|
| 6 |
+
spaces
|
results/per_passage_metrics.jsonl
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"id": "bln600/3200810915", "n_chars_gold": 980, "cer_input": 0.02040816326530612, "wer_input": 0.09027777777777778, "cer_dg": 0.00510204081632653, "wer_dg": 0.034722222222222224, "rel_cer_red_dg": 0.75, "overcorr_dg": 0.002068252326783868, "fixrate_dg": 1.0, "seconds_dg": 1.051, "tok_s_dg": 194.10085632730733, "denoising_steps_dg": 5, "cer_dgc": 0.02040816326530612, "wer_dgc": 0.09027777777777778, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 1.069, "tok_s_dgc": 202.99345182413472, "denoising_steps_dgc": 5, "cer_g4": 0.0020408163265306124, "wer_g4": 0.013888888888888888, "rel_cer_red_g4": 0.9, "overcorr_g4": 0.0, "fixrate_g4": 1.0, "seconds_g4": 14.924, "tok_s_g4": 13.669257571696596}
|
| 2 |
+
{"id": "bln600/3200807901", "n_chars_gold": 911, "cer_input": 0.02854006586169045, "wer_input": 0.15441176470588236, "cer_dg": 0.010976948408342482, "wer_dg": 0.03676470588235294, "rel_cer_red_dg": 0.6153846153846154, "overcorr_dg": 0.012168141592920354, "fixrate_dg": 0.7916666666666666, "seconds_dg": 1.057, "tok_s_dg": 188.2686849574267, "denoising_steps_dg": 5, "cer_dgc": 0.02854006586169045, "wer_dgc": 0.15441176470588236, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.537, "tok_s_dgc": 407.8212290502793, "denoising_steps_dgc": 2, "cer_g4": 0.005488474204171241, "wer_g4": 0.029411764705882353, "rel_cer_red_g4": 0.8076923076923076, "overcorr_g4": 0.00331858407079646, "fixrate_g4": 0.7083333333333334, "seconds_g4": 15.093, "tok_s_g4": 13.714967203339297}
|
| 3 |
+
{"id": "bln600/3206205267", "n_chars_gold": 759, "cer_input": 0.11594202898550725, "wer_input": 0.27972027972027974, "cer_dg": 0.08959156785243742, "wer_dg": 0.14685314685314685, "rel_cer_red_dg": 0.22727272727272724, "overcorr_dg": 0.008275862068965517, "fixrate_dg": 0.5555555555555556, "seconds_dg": 3.012, "tok_s_dg": 63.41301460823373, "denoising_steps_dg": 16, "cer_dgc": 0.11594202898550725, "wer_dgc": 0.27972027972027974, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.538, "tok_s_dgc": 407.0631970260223, "denoising_steps_dgc": 2, "cer_g4": 0.10408432147562582, "wer_g4": 0.2097902097902098, "rel_cer_red_g4": 0.10227272727272728, "overcorr_g4": 0.001379310344827586, "fixrate_g4": 0.18055555555555555, "seconds_g4": 15.211, "tok_s_g4": 13.674314640720532}
|
| 4 |
+
{"id": "bln600/3206195717", "n_chars_gold": 903, "cer_input": 0.028792912513842746, "wer_input": 0.11409395973154363, "cer_dg": 0.015503875968992248, "wer_dg": 0.06711409395973154, "rel_cer_red_dg": 0.46153846153846156, "overcorr_dg": 0.001128668171557562, "fixrate_dg": 0.5909090909090909, "seconds_dg": 1.041, "tok_s_dg": 193.0835734870317, "denoising_steps_dg": 5, "cer_dgc": 0.028792912513842746, "wer_dgc": 0.11409395973154363, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.534, "tok_s_dgc": 402.6217228464419, "denoising_steps_dgc": 2, "cer_g4": 0.015503875968992248, "wer_g4": 0.0738255033557047, "rel_cer_red_g4": 0.46153846153846156, "overcorr_g4": 0.0, "fixrate_g4": 0.5454545454545454, "seconds_g4": 14.775, "tok_s_g4": 13.739424703891709}
|
| 5 |
+
{"id": "bln600/3205317819", "n_chars_gold": 879, "cer_input": 0.04664391353811149, "wer_input": 0.1375, "cer_dg": 0.030716723549488054, "wer_dg": 0.08125, "rel_cer_red_dg": 0.3414634146341463, "overcorr_dg": 0.012895662368112544, "fixrate_dg": 0.8648648648648649, "seconds_dg": 1.041, "tok_s_dg": 187.31988472622479, "denoising_steps_dg": 5, "cer_dgc": 0.04664391353811149, "wer_dgc": 0.1375, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.535, "tok_s_dgc": 413.0841121495327, "denoising_steps_dgc": 2, "cer_g4": 0.026166097838452786, "wer_g4": 0.05625, "rel_cer_red_g4": 0.43902439024390244, "overcorr_g4": 0.0035169988276670576, "fixrate_g4": 0.7837837837837838, "seconds_g4": 14.724, "tok_s_g4": 13.71909807117631}
|
| 6 |
+
{"id": "bln600/3200811336", "n_chars_gold": 750, "cer_input": 0.07333333333333333, "wer_input": 0.3007518796992481, "cer_dg": 0.014666666666666666, "wer_dg": 0.05263157894736842, "rel_cer_red_dg": 0.7999999999999999, "overcorr_dg": 0.006983240223463687, "fixrate_dg": 1.0, "seconds_dg": 1.765, "tok_s_dg": 103.68271954674222, "denoising_steps_dg": 9, "cer_dgc": 0.27066666666666667, "wer_dgc": 0.44360902255639095, "rel_cer_red_dgc": -2.690909090909091, "overcorr_dgc": 0.0, "fixrate_dgc": 0.02040816326530612, "seconds_dgc": 1.995, "tok_s_dgc": 128.32080200501252, "denoising_steps_dgc": 10, "cer_g4": 0.02666666666666667, "wer_g4": 0.12030075187969924, "rel_cer_red_g4": 0.6363636363636362, "overcorr_g4": 0.0, "fixrate_g4": 0.7551020408163265, "seconds_g4": 14.135, "tok_s_g4": 13.724796604174037}
|
| 7 |
+
{"id": "bln600/3200810789", "n_chars_gold": 720, "cer_input": 0.09444444444444444, "wer_input": 0.25925925925925924, "cer_dg": 0.043055555555555555, "wer_dg": 0.07407407407407407, "rel_cer_red_dg": 0.5441176470588235, "overcorr_dg": 0.019005847953216373, "fixrate_dg": 0.9253731343283582, "seconds_dg": 1.736, "tok_s_dg": 95.6221198156682, "denoising_steps_dg": 9, "cer_dgc": 0.09444444444444444, "wer_dgc": 0.25925925925925924, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.53, "tok_s_dgc": 411.3207547169811, "denoising_steps_dgc": 2, "cer_g4": 0.0625, "wer_g4": 0.1388888888888889, "rel_cer_red_g4": 0.338235294117647, "overcorr_g4": 0.011695906432748537, "fixrate_g4": 0.47761194029850745, "seconds_g4": 13.862, "tok_s_g4": 13.706535853412205}
|
| 8 |
+
{"id": "bln600/3206321007", "n_chars_gold": 938, "cer_input": 0.011727078891257996, "wer_input": 0.0625, "cer_dg": 0.0021321961620469083, "wer_dg": 0.0125, "rel_cer_red_dg": 0.8181818181818182, "overcorr_dg": 0.001072961373390558, "fixrate_dg": 1.0, "seconds_dg": 1.047, "tok_s_dg": 193.8872970391595, "denoising_steps_dg": 5, "cer_dgc": 0.011727078891257996, "wer_dgc": 0.0625, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.704, "tok_s_dgc": 296.875, "denoising_steps_dgc": 3, "cer_g4": 0.0031982942430703624, "wer_g4": 0.025, "rel_cer_red_g4": 0.7272727272727272, "overcorr_g4": 0.0, "fixrate_g4": 0.8, "seconds_g4": 14.837, "tok_s_g4": 13.749410258138438}
|
| 9 |
+
{"id": "bln600/3200810696", "n_chars_gold": 785, "cer_input": 0.08789808917197452, "wer_input": 0.20454545454545456, "cer_dg": 0.02038216560509554, "wer_dg": 0.06060606060606061, "rel_cer_red_dg": 0.7681159420289856, "overcorr_dg": 0.003911342894393742, "fixrate_dg": 0.8260869565217391, "seconds_dg": 2.126, "tok_s_dg": 86.5475070555033, "denoising_steps_dg": 11, "cer_dgc": 0.08789808917197452, "wer_dgc": 0.20454545454545456, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 1.238, "tok_s_dgc": 177.7059773828756, "denoising_steps_dgc": 6, "cer_g4": 0.06878980891719745, "wer_g4": 0.16666666666666666, "rel_cer_red_g4": 0.21739130434782605, "overcorr_g4": 0.001303780964797914, "fixrate_g4": 0.30434782608695654, "seconds_g4": 15.031, "tok_s_g4": 13.70500964673009}
|
| 10 |
+
{"id": "bln600/3206247303", "n_chars_gold": 737, "cer_input": 0.09905020352781546, "wer_input": 0.28, "cer_dg": 0.06377204884667571, "wer_dg": 0.144, "rel_cer_red_dg": 0.35616438356164376, "overcorr_dg": 0.031294452347083924, "fixrate_dg": 0.9130434782608695, "seconds_dg": 2.99, "tok_s_dg": 58.8628762541806, "denoising_steps_dg": 16, "cer_dgc": 0.09905020352781546, "wer_dgc": 0.28, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.71, "tok_s_dgc": 309.85915492957747, "denoising_steps_dgc": 3, "cer_g4": 0.05970149253731343, "wer_g4": 0.16, "rel_cer_red_g4": 0.3972602739726027, "overcorr_g4": 0.007112375533428165, "fixrate_g4": 0.6956521739130435, "seconds_g4": 14.213, "tok_s_g4": 13.719833954830086}
|
| 11 |
+
{"id": "bln600/3200808401", "n_chars_gold": 724, "cer_input": 0.11602209944751381, "wer_input": 0.31451612903225806, "cer_dg": 0.03867403314917127, "wer_dg": 0.08064516129032258, "rel_cer_red_dg": 0.6666666666666666, "overcorr_dg": 0.012949640287769784, "fixrate_dg": 0.8795180722891566, "seconds_dg": 1.936, "tok_s_dg": 90.39256198347108, "denoising_steps_dg": 10, "cer_dgc": 0.11602209944751381, "wer_dgc": 0.31451612903225806, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.707, "tok_s_dgc": 311.1739745403112, "denoising_steps_dgc": 3, "cer_g4": 0.062154696132596686, "wer_g4": 0.1532258064516129, "rel_cer_red_g4": 0.46428571428571425, "overcorr_g4": 0.0028776978417266188, "fixrate_g4": 0.4819277108433735, "seconds_g4": 14.12, "tok_s_g4": 13.739376770538245}
|
| 12 |
+
{"id": "bln600/3200808398", "n_chars_gold": 756, "cer_input": 0.10449735449735449, "wer_input": 0.37815126050420167, "cer_dg": 0.03968253968253968, "wer_dg": 0.08403361344537816, "rel_cer_red_dg": 0.620253164556962, "overcorr_dg": 0.02631578947368421, "fixrate_dg": 0.9210526315789473, "seconds_dg": 1.562, "tok_s_dg": 103.0729833546735, "denoising_steps_dg": 8, "cer_dgc": 0.10449735449735449, "wer_dgc": 0.37815126050420167, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.532, "tok_s_dgc": 415.4135338345864, "denoising_steps_dgc": 2, "cer_g4": 0.062169312169312166, "wer_g4": 0.17647058823529413, "rel_cer_red_g4": 0.4050632911392405, "overcorr_g4": 0.002770083102493075, "fixrate_g4": 0.4868421052631579, "seconds_g4": 13.707, "tok_s_g4": 13.715619756328882}
|
| 13 |
+
{"id": "bln600/3200810738", "n_chars_gold": 890, "cer_input": 0.06404494382022471, "wer_input": 0.16666666666666666, "cer_dg": 0.04157303370786517, "wer_dg": 0.05555555555555555, "rel_cer_red_dg": 0.35087719298245607, "overcorr_dg": 0.00813953488372093, "fixrate_dg": 0.6590909090909091, "seconds_dg": 1.41, "tok_s_dg": 141.13475177304966, "denoising_steps_dg": 7, "cer_dgc": 0.06404494382022471, "wer_dgc": 0.16666666666666666, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.536, "tok_s_dgc": 412.3134328358209, "denoising_steps_dgc": 2, "cer_g4": 0.03932584269662921, "wer_g4": 0.05555555555555555, "rel_cer_red_g4": 0.38596491228070173, "overcorr_g4": 0.005813953488372093, "fixrate_g4": 0.6136363636363636, "seconds_g4": 14.664, "tok_s_g4": 13.707037643207856}
|
| 14 |
+
{"id": "bln600/3205295659", "n_chars_gold": 863, "cer_input": 0.0776361529548088, "wer_input": 0.2517482517482518, "cer_dg": 0.04055619930475087, "wer_dg": 0.0979020979020979, "rel_cer_red_dg": 0.4776119402985074, "overcorr_dg": 0.019559902200488997, "fixrate_dg": 0.9047619047619048, "seconds_dg": 1.749, "tok_s_dg": 101.20068610634648, "denoising_steps_dg": 9, "cer_dgc": 0.0776361529548088, "wer_dgc": 0.2517482517482518, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.535, "tok_s_dgc": 403.7383177570093, "denoising_steps_dgc": 2, "cer_g4": 0.03707995365005794, "wer_g4": 0.0979020979020979, "rel_cer_red_g4": 0.5223880597014925, "overcorr_g4": 0.008557457212713936, "fixrate_g4": 0.6825396825396826, "seconds_g4": 12.979, "tok_s_g4": 13.714461822944758}
|
| 15 |
+
{"id": "bln600/3206191683", "n_chars_gold": 930, "cer_input": 0.021505376344086023, "wer_input": 0.10457516339869281, "cer_dg": 0.012903225806451613, "wer_dg": 0.06535947712418301, "rel_cer_red_dg": 0.4, "overcorr_dg": 0.004347826086956522, "fixrate_dg": 0.8, "seconds_dg": 1.025, "tok_s_dg": 199.02439024390245, "denoising_steps_dg": 5, "cer_dgc": 0.021505376344086023, "wer_dgc": 0.10457516339869281, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.53, "tok_s_dgc": 411.3207547169811, "denoising_steps_dgc": 2, "cer_g4": 0.010752688172043012, "wer_g4": 0.0457516339869281, "rel_cer_red_g4": 0.5, "overcorr_g4": 0.002173913043478261, "fixrate_g4": 0.8, "seconds_g4": 14.743, "tok_s_g4": 13.701417621922268}
|
| 16 |
+
{"id": "bln600/3206277695", "n_chars_gold": 809, "cer_input": 0.08776266996291718, "wer_input": 0.3387096774193548, "cer_dg": 0.049443757725587144, "wer_dg": 0.11290322580645161, "rel_cer_red_dg": 0.43661971830985913, "overcorr_dg": 0.02340702210663199, "fixrate_dg": 0.9056603773584906, "seconds_dg": 7.955, "tok_s_dg": 20.86737900691389, "denoising_steps_dg": 44, "cer_dgc": 0.242274412855377, "wer_dgc": 0.4596774193548387, "rel_cer_red_dgc": -1.7605633802816902, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 1.808, "tok_s_dgc": 141.5929203539823, "denoising_steps_dgc": 9, "cer_g4": 0.0754017305315204, "wer_g4": 0.2903225806451613, "rel_cer_red_g4": 0.14084507042253516, "overcorr_g4": 0.0039011703511053317, "fixrate_g4": 0.20754716981132076, "seconds_g4": 15.438, "tok_s_g4": 13.667573519885995}
|
| 17 |
+
{"id": "bln600/3200807974", "n_chars_gold": 958, "cer_input": 0.022964509394572025, "wer_input": 0.1258741258741259, "cer_dg": 0.003131524008350731, "wer_dg": 0.02097902097902098, "rel_cer_red_dg": 0.8636363636363636, "overcorr_dg": 0.0020942408376963353, "fixrate_dg": 1.0, "seconds_dg": 1.031, "tok_s_dg": 194.95635305528614, "denoising_steps_dg": 5, "cer_dgc": 0.021920668058455117, "wer_dgc": 0.11888111888111888, "rel_cer_red_dgc": 0.04545454545454539, "overcorr_dgc": 0.0, "fixrate_dgc": 0.047619047619047616, "seconds_dgc": 0.706, "tok_s_dgc": 310.19830028328613, "denoising_steps_dgc": 3, "cer_g4": 0.003131524008350731, "wer_g4": 0.027972027972027972, "rel_cer_red_g4": 0.8636363636363636, "overcorr_g4": 0.0010471204188481676, "fixrate_g4": 1.0, "seconds_g4": 14.642, "tok_s_g4": 13.727632837044121}
|
| 18 |
+
{"id": "bln600/3206328498", "n_chars_gold": 834, "cer_input": 0.06115107913669065, "wer_input": 0.2462686567164179, "cer_dg": 0.03597122302158273, "wer_dg": 0.09701492537313433, "rel_cer_red_dg": 0.411764705882353, "overcorr_dg": 0.01373283395755306, "fixrate_dg": 0.9722222222222222, "seconds_dg": 1.768, "tok_s_dg": 109.72850678733032, "denoising_steps_dg": 9, "cer_dgc": 0.06115107913669065, "wer_dgc": 0.2462686567164179, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.536, "tok_s_dgc": 404.8507462686567, "denoising_steps_dgc": 2, "cer_g4": 0.045563549160671464, "wer_g4": 0.13432835820895522, "rel_cer_red_g4": 0.2549019607843137, "overcorr_g4": 0.0149812734082397, "fixrate_g4": 0.6111111111111112, "seconds_g4": 14.601, "tok_s_g4": 13.697691938908292}
|
| 19 |
+
{"id": "bln600/3200813273", "n_chars_gold": 794, "cer_input": 0.08438287153652393, "wer_input": 0.3492063492063492, "cer_dg": 0.0491183879093199, "wer_dg": 0.11904761904761904, "rel_cer_red_dg": 0.41791044776119407, "overcorr_dg": 0.02230971128608924, "fixrate_dg": 0.9047619047619048, "seconds_dg": 1.947, "tok_s_dg": 88.34103749357986, "denoising_steps_dg": 10, "cer_dgc": 0.08438287153652393, "wer_dgc": 0.3492063492063492, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.537, "tok_s_dgc": 411.5456238361266, "denoising_steps_dgc": 2, "cer_g4": 0.027707808564231738, "wer_g4": 0.09523809523809523, "rel_cer_red_g4": 0.6716417910447762, "overcorr_g4": 0.003937007874015748, "fixrate_g4": 0.7619047619047619, "seconds_g4": 13.188, "tok_s_g4": 13.724598119502577}
|
| 20 |
+
{"id": "bln600/3207167129", "n_chars_gold": 869, "cer_input": 0.04142692750287687, "wer_input": 0.16326530612244897, "cer_dg": 0.00805523590333717, "wer_dg": 0.04081632653061224, "rel_cer_red_dg": 0.8055555555555556, "overcorr_dg": 0.0035671819262782403, "fixrate_dg": 0.9655172413793104, "seconds_dg": 2.292, "tok_s_dg": 86.82373472949389, "denoising_steps_dg": 12, "cer_dgc": 0.04142692750287687, "wer_dgc": 0.16326530612244897, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.533, "tok_s_dgc": 414.6341463414634, "denoising_steps_dgc": 2, "cer_g4": 0.010356731875719217, "wer_g4": 0.047619047619047616, "rel_cer_red_g4": 0.75, "overcorr_g4": 0.0023781212841854932, "fixrate_g4": 0.896551724137931, "seconds_g4": 14.677, "tok_s_g4": 13.694896777270559}
|
| 21 |
+
{"id": "bln600/3206247284", "n_chars_gold": 794, "cer_input": 0.08060453400503778, "wer_input": 0.2619047619047619, "cer_dg": 0.05667506297229219, "wer_dg": 0.1349206349206349, "rel_cer_red_dg": 0.296875, "overcorr_dg": 0.03439153439153439, "fixrate_dg": 0.8703703703703703, "seconds_dg": 3.406, "tok_s_dg": 51.08631826189078, "denoising_steps_dg": 18, "cer_dgc": 0.08060453400503778, "wer_dgc": 0.2619047619047619, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.533, "tok_s_dgc": 412.75797373358347, "denoising_steps_dgc": 2, "cer_g4": 0.04408060453400504, "wer_g4": 0.10317460317460317, "rel_cer_red_g4": 0.453125, "overcorr_g4": 0.013227513227513227, "fixrate_g4": 0.8148148148148148, "seconds_g4": 13.427, "tok_s_g4": 13.703731287703881}
|
| 22 |
+
{"id": "bln600/3205297535", "n_chars_gold": 860, "cer_input": 0.015116279069767442, "wer_input": 0.041379310344827586, "cer_dg": 0.002325581395348837, "wer_dg": 0.020689655172413793, "rel_cer_red_dg": 0.8461538461538461, "overcorr_dg": 0.002328288707799767, "fixrate_dg": 0.9230769230769231, "seconds_dg": 0.866, "tok_s_dg": 235.56581986143186, "denoising_steps_dg": 4, "cer_dgc": 0.015116279069767442, "wer_dgc": 0.041379310344827586, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.534, "tok_s_dgc": 400.74906367041194, "denoising_steps_dgc": 2, "cer_g4": 0.010465116279069767, "wer_g4": 0.027586206896551724, "rel_cer_red_g4": 0.3076923076923077, "overcorr_g4": 0.0011641443538998836, "fixrate_g4": 0.3076923076923077, "seconds_g4": 15.283, "tok_s_g4": 13.740757704639142}
|
| 23 |
+
{"id": "bln600/3206257745", "n_chars_gold": 922, "cer_input": 0.027114967462039046, "wer_input": 0.11320754716981132, "cer_dg": 0.004338394793926247, "wer_dg": 0.031446540880503145, "rel_cer_red_dg": 0.8400000000000001, "overcorr_dg": 0.0, "fixrate_dg": 0.9545454545454546, "seconds_dg": 1.221, "tok_s_dg": 167.07616707616705, "denoising_steps_dg": 6, "cer_dgc": 0.027114967462039046, "wer_dgc": 0.11320754716981132, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 1.063, "tok_s_dgc": 203.19849482596427, "denoising_steps_dgc": 5, "cer_g4": 0.020607375271149676, "wer_g4": 0.07547169811320754, "rel_cer_red_g4": 0.23999999999999996, "overcorr_g4": 0.0, "fixrate_g4": 0.22727272727272727, "seconds_g4": 15.492, "tok_s_g4": 13.684482313452104}
|
| 24 |
+
{"id": "bln600/3206206317", "n_chars_gold": 894, "cer_input": 0.02796420581655481, "wer_input": 0.10738255033557047, "cer_dg": 0.02348993288590604, "wer_dg": 0.053691275167785234, "rel_cer_red_dg": 0.15999999999999995, "overcorr_dg": 0.011428571428571429, "fixrate_dg": 0.6818181818181818, "seconds_dg": 1.561, "tok_s_dg": 128.76361306854582, "denoising_steps_dg": 8, "cer_dgc": 0.02796420581655481, "wer_dgc": 0.10738255033557047, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.535, "tok_s_dgc": 401.86915887850466, "denoising_steps_dgc": 2, "cer_g4": 0.01901565995525727, "wer_g4": 0.053691275167785234, "rel_cer_red_g4": 0.32, "overcorr_g4": 0.0034285714285714284, "fixrate_g4": 0.6363636363636364, "seconds_g4": 14.841, "tok_s_g4": 13.745704467353953}
|
| 25 |
+
{"id": "bln600/3200801615", "n_chars_gold": 656, "cer_input": 0.14786585365853658, "wer_input": 0.40540540540540543, "cer_dg": 0.053353658536585365, "wer_dg": 0.0990990990990991, "rel_cer_red_dg": 0.6391752577319588, "overcorr_dg": 0.014469453376205787, "fixrate_dg": 0.9, "seconds_dg": 2.285, "tok_s_dg": 72.21006564551422, "denoising_steps_dg": 12, "cer_dgc": 0.14786585365853658, "wer_dgc": 0.40540540540540543, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.531, "tok_s_dgc": 412.42937853107344, "denoising_steps_dgc": 2, "cer_g4": 0.12804878048780488, "wer_g4": 0.34234234234234234, "rel_cer_red_g4": 0.13402061855670097, "overcorr_g4": 0.006430868167202572, "fixrate_g4": 0.15555555555555556, "seconds_g4": 14.788, "tok_s_g4": 13.727346497159859}
|
| 26 |
+
{"id": "bln600/3200811798", "n_chars_gold": 751, "cer_input": 0.10119840213049268, "wer_input": 0.3790322580645161, "cer_dg": 0.07856191744340879, "wer_dg": 0.13709677419354838, "rel_cer_red_dg": 0.2236842105263158, "overcorr_dg": 0.05270655270655271, "fixrate_dg": 0.9384615384615385, "seconds_dg": 1.395, "tok_s_dg": 113.97849462365592, "denoising_steps_dg": 7, "cer_dgc": 0.28495339547270304, "wer_dgc": 0.5161290322580645, "rel_cer_red_dgc": -1.8157894736842102, "overcorr_dgc": 0.0014245014245014246, "fixrate_dgc": 0.03076923076923077, "seconds_dgc": 1.641, "tok_s_dgc": 156.00243753808653, "denoising_steps_dgc": 8, "cer_g4": 0.05059920106524634, "wer_g4": 0.12096774193548387, "rel_cer_red_g4": 0.5, "overcorr_g4": 0.019943019943019943, "fixrate_g4": 0.9076923076923077, "seconds_g4": 12.041, "tok_s_g4": 13.703180798936964}
|
| 27 |
+
{"id": "bln600/3207166294", "n_chars_gold": 542, "cer_input": 0.31365313653136534, "wer_input": 0.7865168539325843, "cer_dg": 0.22140221402214022, "wer_dg": 0.3146067415730337, "rel_cer_red_dg": 0.2941176470588236, "overcorr_dg": 0.10983981693363844, "fixrate_dg": 0.7861635220125787, "seconds_dg": 2.109, "tok_s_dg": 56.42484589853011, "denoising_steps_dg": 11, "cer_dgc": 0.31365313653136534, "wer_dgc": 0.7865168539325843, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.872, "tok_s_dgc": 251.14678899082568, "denoising_steps_dgc": 4, "cer_g4": 0.31365313653136534, "wer_g4": 0.7865168539325843, "rel_cer_red_g4": 0.0, "overcorr_g4": 0.0, "fixrate_g4": 0.0, "seconds_g4": 16.109, "tok_s_g4": 13.656961946737846}
|
| 28 |
+
{"id": "bln600/3206227029", "n_chars_gold": 745, "cer_input": 0.07919463087248323, "wer_input": 0.3170731707317073, "cer_dg": 0.028187919463087248, "wer_dg": 0.0975609756097561, "rel_cer_red_dg": 0.6440677966101696, "overcorr_dg": 0.013850415512465374, "fixrate_dg": 0.9230769230769231, "seconds_dg": 1.418, "tok_s_dg": 125.52891396332863, "denoising_steps_dg": 7, "cer_dgc": 0.07919463087248323, "wer_dgc": 0.3170731707317073, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.539, "tok_s_dgc": 402.59740259740255, "denoising_steps_dgc": 2, "cer_g4": 0.026845637583892617, "wer_g4": 0.08130081300813008, "rel_cer_red_g4": 0.6610169491525424, "overcorr_g4": 0.012465373961218837, "fixrate_g4": 0.9038461538461539, "seconds_g4": 13.098, "tok_s_g4": 13.74255611543747}
|
| 29 |
+
{"id": "bln600/3206328810", "n_chars_gold": 778, "cer_input": 0.032133676092544985, "wer_input": 0.14960629921259844, "cer_dg": 0.030848329048843187, "wer_dg": 0.08661417322834646, "rel_cer_red_dg": 0.039999999999999966, "overcorr_dg": 0.015665796344647518, "fixrate_dg": 0.8260869565217391, "seconds_dg": 2.102, "tok_s_dg": 96.0989533777355, "denoising_steps_dg": 11, "cer_dgc": 0.032133676092544985, "wer_dgc": 0.14960629921259844, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.71, "tok_s_dgc": 309.85915492957747, "denoising_steps_dgc": 3, "cer_g4": 0.021850899742930592, "wer_g4": 0.09448818897637795, "rel_cer_red_g4": 0.31999999999999995, "overcorr_g4": 0.0039164490861618795, "fixrate_g4": 0.4782608695652174, "seconds_g4": 15.514, "tok_s_g4": 13.665076704911693}
|
| 30 |
+
{"id": "bln600/3200811642", "n_chars_gold": 864, "cer_input": 0.052083333333333336, "wer_input": 0.2, "cer_dg": 0.02546296296296296, "wer_dg": 0.11428571428571428, "rel_cer_red_dg": 0.5111111111111112, "overcorr_dg": 0.006045949214026602, "fixrate_dg": 0.7714285714285715, "seconds_dg": 3.179, "tok_s_dg": 61.9691726958163, "denoising_steps_dg": 17, "cer_dgc": 0.05092592592592592, "wer_dgc": 0.2, "rel_cer_red_dgc": 0.02222222222222232, "overcorr_dgc": 0.0, "fixrate_dgc": 0.02857142857142857, "seconds_dgc": 1.057, "tok_s_dgc": 203.4058656575213, "denoising_steps_dgc": 5, "cer_g4": 0.03125, "wer_g4": 0.1, "rel_cer_red_g4": 0.4, "overcorr_g4": 0.006045949214026602, "fixrate_g4": 0.6571428571428571, "seconds_g4": 14.206, "tok_s_g4": 13.726594396733775}
|
| 31 |
+
{"id": "bln600/3202982093", "n_chars_gold": 852, "cer_input": 0.03403755868544601, "wer_input": 0.1232876712328767, "cer_dg": 0.01995305164319249, "wer_dg": 0.0547945205479452, "rel_cer_red_dg": 0.41379310344827586, "overcorr_dg": 0.0024009603841536613, "fixrate_dg": 0.625, "seconds_dg": 1.211, "tok_s_dg": 166.804293971924, "denoising_steps_dg": 6, "cer_dgc": 0.03403755868544601, "wer_dgc": 0.1232876712328767, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.532, "tok_s_dgc": 400.37593984962405, "denoising_steps_dgc": 2, "cer_g4": 0.02112676056338028, "wer_g4": 0.07534246575342465, "rel_cer_red_g4": 0.3793103448275862, "overcorr_g4": 0.0012004801920768306, "fixrate_g4": 0.4166666666666667, "seconds_g4": 14.959, "tok_s_g4": 13.704124607259844}
|
| 32 |
+
{"id": "bln600/3206225744", "n_chars_gold": 921, "cer_input": 0.0249728555917481, "wer_input": 0.12578616352201258, "cer_dg": 0.018458197611292075, "wer_dg": 0.0440251572327044, "rel_cer_red_dg": 0.2608695652173913, "overcorr_dg": 0.005512679162072767, "fixrate_dg": 0.65, "seconds_dg": 4.989, "tok_s_dg": 40.28863499699339, "denoising_steps_dg": 27, "cer_dgc": 0.0249728555917481, "wer_dgc": 0.12578616352201258, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.537, "tok_s_dgc": 405.95903165735564, "denoising_steps_dgc": 2, "cer_g4": 0.014115092290988056, "wer_g4": 0.03773584905660377, "rel_cer_red_g4": 0.4347826086956522, "overcorr_g4": 0.004410143329658214, "fixrate_g4": 0.7, "seconds_g4": 14.716, "tok_s_g4": 13.726556129382985}
|
| 33 |
+
{"id": "bln600/3207642541", "n_chars_gold": 781, "cer_input": 0.07554417413572344, "wer_input": 0.23622047244094488, "cer_dg": 0.056338028169014086, "wer_dg": 0.11023622047244094, "rel_cer_red_dg": 0.25423728813559326, "overcorr_dg": 0.021361815754339118, "fixrate_dg": 0.8928571428571429, "seconds_dg": 1.926, "tok_s_dg": 93.45794392523365, "denoising_steps_dg": 10, "cer_dgc": 0.07554417413572344, "wer_dgc": 0.23622047244094488, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.877, "tok_s_dgc": 251.99543899657925, "denoising_steps_dgc": 4, "cer_g4": 0.0499359795134443, "wer_g4": 0.13385826771653545, "rel_cer_red_g4": 0.3389830508474577, "overcorr_g4": 0.014686248331108143, "fixrate_g4": 0.5357142857142857, "seconds_g4": 13.935, "tok_s_g4": 13.706494438464299}
|
| 34 |
+
{"id": "bln600/3200810720", "n_chars_gold": 956, "cer_input": 0.09518828451882845, "wer_input": 0.13043478260869565, "cer_dg": 0.08054393305439331, "wer_dg": 0.08074534161490683, "rel_cer_red_dg": 0.1538461538461538, "overcorr_dg": 0.007625272331154684, "fixrate_dg": 0.4262295081967213, "seconds_dg": 3.336, "tok_s_dg": 61.15107913669065, "denoising_steps_dg": 18, "cer_dgc": 0.09518828451882845, "wer_dgc": 0.13043478260869565, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.538, "tok_s_dgc": 408.92193308550185, "denoising_steps_dgc": 2, "cer_g4": 0.08891213389121339, "wer_g4": 0.11180124223602485, "rel_cer_red_g4": 0.06593406593406596, "overcorr_g4": 0.0, "fixrate_g4": 0.21311475409836064, "seconds_g4": 15.435, "tok_s_g4": 13.67022999676061}
|
| 35 |
+
{"id": "bln600/3206237533", "n_chars_gold": 959, "cer_input": 0.04692387904066736, "wer_input": 0.12101910828025478, "cer_dg": 0.029197080291970802, "wer_dg": 0.01910828025477707, "rel_cer_red_dg": 0.37777777777777777, "overcorr_dg": 0.0010775862068965517, "fixrate_dg": 0.8823529411764706, "seconds_dg": 1.234, "tok_s_dg": 155.59157212317666, "denoising_steps_dg": 6, "cer_dgc": 0.04692387904066736, "wer_dgc": 0.12101910828025478, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.71, "tok_s_dgc": 290.14084507042253, "denoising_steps_dgc": 3, "cer_g4": 0.035453597497393116, "wer_g4": 0.05732484076433121, "rel_cer_red_g4": 0.24444444444444446, "overcorr_g4": 0.0, "fixrate_g4": 0.5882352941176471, "seconds_g4": 14.2, "tok_s_g4": 13.732394366197184}
|
| 36 |
+
{"id": "bln600/3200810766", "n_chars_gold": 913, "cer_input": 0.023001095290251915, "wer_input": 0.12101910828025478, "cer_dg": 0.002190580503833516, "wer_dg": 0.012738853503184714, "rel_cer_red_dg": 0.9047619047619047, "overcorr_dg": 0.0, "fixrate_dg": 0.95, "seconds_dg": 0.868, "tok_s_dg": 228.110599078341, "denoising_steps_dg": 4, "cer_dgc": 0.023001095290251915, "wer_dgc": 0.12101910828025478, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.535, "tok_s_dgc": 413.0841121495327, "denoising_steps_dgc": 2, "cer_g4": 0.0032858707557502738, "wer_g4": 0.01910828025477707, "rel_cer_red_g4": 0.8571428571428571, "overcorr_g4": 0.0, "fixrate_g4": 0.9, "seconds_g4": 14.418, "tok_s_g4": 13.73283395755306}
|
| 37 |
+
{"id": "bln600/3206232849", "n_chars_gold": 797, "cer_input": 0.037641154328732745, "wer_input": 0.1310344827586207, "cer_dg": 0.01756587202007528, "wer_dg": 0.020689655172413793, "rel_cer_red_dg": 0.5333333333333333, "overcorr_dg": 0.00903225806451613, "fixrate_dg": 1.0, "seconds_dg": 1.207, "tok_s_dg": 166.52858326429163, "denoising_steps_dg": 6, "cer_dgc": 0.1794228356336261, "wer_dgc": 0.23448275862068965, "rel_cer_red_dgc": -3.766666666666667, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 1.616, "tok_s_dgc": 158.41584158415841, "denoising_steps_dgc": 8, "cer_g4": 0.026348808030112924, "wer_g4": 0.06896551724137931, "rel_cer_red_g4": 0.29999999999999993, "overcorr_g4": 0.0, "fixrate_g4": 0.5454545454545454, "seconds_g4": 15.383, "tok_s_g4": 13.716440226223755}
|
| 38 |
+
{"id": "bln600/3206227140", "n_chars_gold": 946, "cer_input": 0.02959830866807611, "wer_input": 0.11538461538461539, "cer_dg": 0.017970401691331923, "wer_dg": 0.02564102564102564, "rel_cer_red_dg": 0.39285714285714285, "overcorr_dg": 0.006514657980456026, "fixrate_dg": 0.9444444444444444, "seconds_dg": 2.277, "tok_s_dg": 86.5173473869126, "denoising_steps_dg": 12, "cer_dgc": 0.02959830866807611, "wer_dgc": 0.11538461538461539, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.526, "tok_s_dgc": 393.5361216730038, "denoising_steps_dgc": 2, "cer_g4": 0.014799154334038054, "wer_g4": 0.02564102564102564, "rel_cer_red_g4": 0.5, "overcorr_g4": 0.0, "fixrate_g4": 0.7777777777777778, "seconds_g4": 14.725, "tok_s_g4": 13.718166383701188}
|
| 39 |
+
{"id": "bln600/3206201441", "n_chars_gold": 887, "cer_input": 0.018038331454340473, "wer_input": 0.09395973154362416, "cer_dg": 0.010146561443066516, "wer_dg": 0.040268456375838924, "rel_cer_red_dg": 0.4375, "overcorr_dg": 0.004555808656036446, "fixrate_dg": 0.75, "seconds_dg": 1.577, "tok_s_dg": 128.72542802790107, "denoising_steps_dg": 8, "cer_dgc": 0.018038331454340473, "wer_dgc": 0.09395973154362416, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.705, "tok_s_dgc": 304.96453900709224, "denoising_steps_dgc": 3, "cer_g4": 0.007891770011273957, "wer_g4": 0.040268456375838924, "rel_cer_red_g4": 0.5625, "overcorr_g4": 0.0011389521640091116, "fixrate_g4": 0.625, "seconds_g4": 15.086, "tok_s_g4": 13.721331035397057}
|
| 40 |
+
{"id": "bln600/3200810140", "n_chars_gold": 688, "cer_input": 0.16715116279069767, "wer_input": 0.4112903225806452, "cer_dg": 0.059593023255813955, "wer_dg": 0.11290322580645161, "rel_cer_red_dg": 0.6434782608695653, "overcorr_dg": 0.013740458015267175, "fixrate_dg": 0.8407079646017699, "seconds_dg": 1.748, "tok_s_dg": 92.10526315789474, "denoising_steps_dg": 9, "cer_dgc": 0.32848837209302323, "wer_dgc": 0.532258064516129, "rel_cer_red_dgc": -0.9652173913043478, "overcorr_dgc": 0.0, "fixrate_dgc": 0.008849557522123894, "seconds_dgc": 1.641, "tok_s_dgc": 156.00243753808653, "denoising_steps_dgc": 8, "cer_g4": 0.10901162790697674, "wer_g4": 0.20161290322580644, "rel_cer_red_g4": 0.34782608695652173, "overcorr_g4": 0.004580152671755725, "fixrate_g4": 0.48672566371681414, "seconds_g4": 13.021, "tok_s_g4": 13.670225021119728}
|
| 41 |
+
{"id": "bln600/3206260927", "n_chars_gold": 931, "cer_input": 0.03007518796992481, "wer_input": 0.14814814814814814, "cer_dg": 0.01288936627282492, "wer_dg": 0.037037037037037035, "rel_cer_red_dg": 0.5714285714285713, "overcorr_dg": 0.004395604395604396, "fixrate_dg": 0.8636363636363636, "seconds_dg": 1.601, "tok_s_dg": 126.17114303560275, "denoising_steps_dg": 8, "cer_dgc": 0.03007518796992481, "wer_dgc": 0.14814814814814814, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.045454545454545456, "seconds_dgc": 0.889, "tok_s_dgc": 241.84476940382453, "denoising_steps_dgc": 4, "cer_g4": 0.00966702470461869, "wer_g4": 0.04938271604938271, "rel_cer_red_g4": 0.6785714285714286, "overcorr_g4": 0.001098901098901099, "fixrate_g4": 0.8181818181818182, "seconds_g4": 14.839, "tok_s_g4": 13.680167127164903}
|
| 42 |
+
{"id": "bln600/3206319497", "n_chars_gold": 732, "cer_input": 0.1051912568306011, "wer_input": 0.3492063492063492, "cer_dg": 0.056010928961748634, "wer_dg": 0.11904761904761904, "rel_cer_red_dg": 0.4675324675324676, "overcorr_dg": 0.02180232558139535, "fixrate_dg": 0.9193548387096774, "seconds_dg": 2.111, "tok_s_dg": 79.58313595452391, "denoising_steps_dg": 11, "cer_dgc": 0.32650273224043713, "wer_dgc": 0.5714285714285714, "rel_cer_red_dgc": -2.1038961038961035, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 2.354, "tok_s_dgc": 108.75106202209005, "denoising_steps_dgc": 12, "cer_g4": 0.05874316939890711, "wer_g4": 0.1349206349206349, "rel_cer_red_g4": 0.44155844155844154, "overcorr_g4": 0.00436046511627907, "fixrate_g4": 0.5161290322580645, "seconds_g4": 13.76, "tok_s_g4": 13.73546511627907}
|
| 43 |
+
{"id": "bln600/3200811142", "n_chars_gold": 802, "cer_input": 0.07605985037406483, "wer_input": 0.22556390977443608, "cer_dg": 0.02743142144638404, "wer_dg": 0.05263157894736842, "rel_cer_red_dg": 0.639344262295082, "overcorr_dg": 0.0037831021437578815, "fixrate_dg": 0.8360655737704918, "seconds_dg": 2.271, "tok_s_dg": 83.22324966974901, "denoising_steps_dg": 12, "cer_dgc": 0.07605985037406483, "wer_dgc": 0.22556390977443608, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.879, "tok_s_dgc": 251.42207053469852, "denoising_steps_dgc": 4, "cer_g4": 0.03117206982543641, "wer_g4": 0.06015037593984962, "rel_cer_red_g4": 0.5901639344262295, "overcorr_g4": 0.008827238335435058, "fixrate_g4": 0.7377049180327869, "seconds_g4": 13.801, "tok_s_g4": 13.694659807260344}
|
| 44 |
+
{"id": "bln600/3206237352", "n_chars_gold": 807, "cer_input": 0.07311028500619579, "wer_input": 0.35036496350364965, "cer_dg": 0.022304832713754646, "wer_dg": 0.058394160583941604, "rel_cer_red_dg": 0.6949152542372882, "overcorr_dg": 0.007741935483870968, "fixrate_dg": 0.9464285714285714, "seconds_dg": 2.821, "tok_s_dg": 58.84438142502658, "denoising_steps_dg": 15, "cer_dgc": 0.07187112763320942, "wer_dgc": 0.34306569343065696, "rel_cer_red_dgc": 0.01694915254237287, "overcorr_dgc": 0.0, "fixrate_dgc": 0.017857142857142856, "seconds_dgc": 0.53, "tok_s_dgc": 405.66037735849056, "denoising_steps_dgc": 2, "cer_g4": 0.01858736059479554, "wer_g4": 0.06569343065693431, "rel_cer_red_g4": 0.7457627118644067, "overcorr_g4": 0.003870967741935484, "fixrate_g4": 0.8214285714285714, "seconds_g4": 12.59, "tok_s_g4": 13.741064336775219}
|
| 45 |
+
{"id": "bln600/3200810602", "n_chars_gold": 837, "cer_input": 0.02986857825567503, "wer_input": 0.13380281690140844, "cer_dg": 0.017921146953405017, "wer_dg": 0.04929577464788732, "rel_cer_red_dg": 0.4, "overcorr_dg": 0.012165450121654502, "fixrate_dg": 0.8333333333333334, "seconds_dg": 2.3, "tok_s_dg": 84.78260869565219, "denoising_steps_dg": 12, "cer_dgc": 0.02986857825567503, "wer_dgc": 0.13380281690140844, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.535, "tok_s_dgc": 413.0841121495327, "denoising_steps_dgc": 2, "cer_g4": 0.005973715651135006, "wer_g4": 0.04225352112676056, "rel_cer_red_g4": 0.8, "overcorr_g4": 0.0012165450121654502, "fixrate_g4": 0.8333333333333334, "seconds_g4": 14.433, "tok_s_g4": 13.718561629598836}
|
| 46 |
+
{"id": "bln600/3206211444", "n_chars_gold": 811, "cer_input": 0.06165228113440197, "wer_input": 0.2116788321167883, "cer_dg": 0.040690505548705305, "wer_dg": 0.08029197080291971, "rel_cer_red_dg": 0.3399999999999999, "overcorr_dg": 0.021711366538952746, "fixrate_dg": 0.9523809523809523, "seconds_dg": 2.459, "tok_s_dg": 74.82716551443676, "denoising_steps_dg": 13, "cer_dgc": 0.06535141800246609, "wer_dgc": 0.2116788321167883, "rel_cer_red_dgc": -0.06000000000000002, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 1.045, "tok_s_dgc": 210.5263157894737, "denoising_steps_dgc": 5, "cer_g4": 0.02342786683107275, "wer_g4": 0.08759124087591241, "rel_cer_red_g4": 0.62, "overcorr_g4": 0.0038314176245210726, "fixrate_g4": 0.7380952380952381, "seconds_g4": 14.124, "tok_s_g4": 13.73548569810252}
|
| 47 |
+
{"id": "bln600/3206232857", "n_chars_gold": 840, "cer_input": 0.06666666666666667, "wer_input": 0.2808219178082192, "cer_dg": 0.058333333333333334, "wer_dg": 0.08904109589041095, "rel_cer_red_dg": 0.12499999999999997, "overcorr_dg": 0.04084158415841584, "fixrate_dg": 0.8571428571428571, "seconds_dg": 1.924, "tok_s_dg": 96.15384615384616, "denoising_steps_dg": 10, "cer_dgc": 0.06666666666666667, "wer_dgc": 0.2808219178082192, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.875, "tok_s_dgc": 252.57142857142858, "denoising_steps_dgc": 4, "cer_g4": 0.02857142857142857, "wer_g4": 0.10273972602739725, "rel_cer_red_g4": 0.5714285714285715, "overcorr_g4": 0.0037128712871287127, "fixrate_g4": 0.6428571428571429, "seconds_g4": 14.063, "tok_s_g4": 13.723956481547322}
|
| 48 |
+
{"id": "bln600/3200812731", "n_chars_gold": 949, "cer_input": 0.030558482613277135, "wer_input": 0.15060240963855423, "cer_dg": 0.01791359325605901, "wer_dg": 0.03614457831325301, "rel_cer_red_dg": 0.4137931034482759, "overcorr_dg": 0.006458557588805167, "fixrate_dg": 0.9230769230769231, "seconds_dg": 1.579, "tok_s_dg": 126.6624445851805, "denoising_steps_dg": 8, "cer_dgc": 0.030558482613277135, "wer_dgc": 0.15060240963855423, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.711, "tok_s_dgc": 305.2039381153305, "denoising_steps_dgc": 3, "cer_g4": 0.015806111696522657, "wer_g4": 0.05421686746987952, "rel_cer_red_g4": 0.48275862068965514, "overcorr_g4": 0.004305705059203444, "fixrate_g4": 0.8076923076923077, "seconds_g4": 14.808, "tok_s_g4": 13.70880605078336}
|
| 49 |
+
{"id": "bln600/3200810449", "n_chars_gold": 711, "cer_input": 0.10970464135021098, "wer_input": 0.37168141592920356, "cer_dg": 0.046413502109704644, "wer_dg": 0.11504424778761062, "rel_cer_red_dg": 0.576923076923077, "overcorr_dg": 0.014705882352941176, "fixrate_dg": 0.8571428571428571, "seconds_dg": 2.289, "tok_s_dg": 77.32634338138925, "denoising_steps_dg": 12, "cer_dgc": 0.1111111111111111, "wer_dgc": 0.37168141592920356, "rel_cer_red_dgc": -0.012820512820512706, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.537, "tok_s_dgc": 405.95903165735564, "denoising_steps_dgc": 2, "cer_g4": 0.052039381153305204, "wer_g4": 0.13274336283185842, "rel_cer_red_g4": 0.5256410256410257, "overcorr_g4": 0.010294117647058823, "fixrate_g4": 0.7571428571428571, "seconds_g4": 13.032, "tok_s_g4": 13.735420503376304}
|
| 50 |
+
{"id": "bln600/3200810146", "n_chars_gold": 810, "cer_input": 0.06296296296296296, "wer_input": 0.2265625, "cer_dg": 0.016049382716049384, "wer_dg": 0.078125, "rel_cer_red_dg": 0.7450980392156862, "overcorr_dg": 0.008871989860583017, "fixrate_dg": 0.94, "seconds_dg": 1.761, "tok_s_dg": 101.64679159568428, "denoising_steps_dg": 9, "cer_dgc": 0.06296296296296296, "wer_dgc": 0.2265625, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 1.062, "tok_s_dgc": 206.21468926553672, "denoising_steps_dgc": 5, "cer_g4": 0.019753086419753086, "wer_g4": 0.09375, "rel_cer_red_g4": 0.6862745098039216, "overcorr_g4": 0.0025348542458808617, "fixrate_g4": 0.76, "seconds_g4": 13.71, "tok_s_g4": 13.712618526622903}
|
| 51 |
+
{"id": "bln600/3206190899", "n_chars_gold": 891, "cer_input": 0.03928170594837262, "wer_input": 0.14383561643835616, "cer_dg": 0.026936026936026935, "wer_dg": 0.0821917808219178, "rel_cer_red_dg": 0.31428571428571433, "overcorr_dg": 0.008101851851851851, "fixrate_dg": 0.8214285714285714, "seconds_dg": 1.226, "tok_s_dg": 161.50081566068516, "denoising_steps_dg": 6, "cer_dgc": 0.03928170594837262, "wer_dgc": 0.14383561643835616, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.884, "tok_s_dgc": 243.21266968325793, "denoising_steps_dgc": 4, "cer_g4": 0.026936026936026935, "wer_g4": 0.08904109589041095, "rel_cer_red_g4": 0.31428571428571433, "overcorr_g4": 0.0023148148148148147, "fixrate_g4": 0.5, "seconds_g4": 14.978, "tok_s_g4": 13.68674055281079}
|
| 52 |
+
{"id": "bln600/3206210894", "n_chars_gold": 945, "cer_input": 0.039153439153439155, "wer_input": 0.16149068322981366, "cer_dg": 0.008465608465608466, "wer_dg": 0.024844720496894408, "rel_cer_red_dg": 0.7837837837837838, "overcorr_dg": 0.003264417845484222, "fixrate_dg": 0.9696969696969697, "seconds_dg": 2.442, "tok_s_dg": 80.26208026208026, "denoising_steps_dg": 13, "cer_dgc": 0.039153439153439155, "wer_dgc": 0.16149068322981366, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.702, "tok_s_dgc": 313.3903133903134, "denoising_steps_dgc": 3, "cer_g4": 0.005291005291005291, "wer_g4": 0.024844720496894408, "rel_cer_red_g4": 0.8648648648648649, "overcorr_g4": 0.0, "fixrate_g4": 0.8484848484848485, "seconds_g4": 14.386, "tok_s_g4": 13.693869039343808}
|
| 53 |
+
{"id": "bln600/3200810641", "n_chars_gold": 667, "cer_input": 0.18440779610194902, "wer_input": 0.375, "cer_dg": 0.029985007496251874, "wer_dg": 0.07142857142857142, "rel_cer_red_dg": 0.8373983739837398, "overcorr_dg": 0.009216589861751152, "fixrate_dg": 0.9274193548387096, "seconds_dg": 4.74, "tok_s_dg": 37.130801687763714, "denoising_steps_dg": 26, "cer_dgc": 0.18440779610194902, "wer_dgc": 0.375, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.704, "tok_s_dgc": 311.0795454545455, "denoising_steps_dgc": 3, "cer_g4": 0.12143928035982009, "wer_g4": 0.16071428571428573, "rel_cer_red_g4": 0.3414634146341463, "overcorr_g4": 0.0030721966205837174, "fixrate_g4": 0.532258064516129, "seconds_g4": 14.058, "tok_s_g4": 13.65770379854887}
|
| 54 |
+
{"id": "bln600/3206329327", "n_chars_gold": 905, "cer_input": 0.01878453038674033, "wer_input": 0.09316770186335403, "cer_dg": 0.011049723756906077, "wer_dg": 0.037267080745341616, "rel_cer_red_dg": 0.411764705882353, "overcorr_dg": 0.004479283314669653, "fixrate_dg": 0.7857142857142857, "seconds_dg": 1.037, "tok_s_dg": 198.6499517839923, "denoising_steps_dg": 5, "cer_dgc": 0.01878453038674033, "wer_dgc": 0.09316770186335403, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.704, "tok_s_dgc": 305.3977272727273, "denoising_steps_dgc": 3, "cer_g4": 0.013259668508287293, "wer_g4": 0.049689440993788817, "rel_cer_red_g4": 0.29411764705882354, "overcorr_g4": 0.0011198208286674132, "fixrate_g4": 0.5714285714285714, "seconds_g4": 15.226, "tok_s_g4": 13.72652042558781}
|
| 55 |
+
{"id": "bln600/3200810779", "n_chars_gold": 868, "cer_input": 0.026497695852534562, "wer_input": 0.10596026490066225, "cer_dg": 0.013824884792626729, "wer_dg": 0.059602649006622516, "rel_cer_red_dg": 0.47826086956521735, "overcorr_dg": 0.005847953216374269, "fixrate_dg": 0.8260869565217391, "seconds_dg": 1.225, "tok_s_dg": 159.18367346938774, "denoising_steps_dg": 6, "cer_dgc": 0.026497695852534562, "wer_dgc": 0.10596026490066225, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.54, "tok_s_dgc": 407.4074074074074, "denoising_steps_dgc": 2, "cer_g4": 0.016129032258064516, "wer_g4": 0.052980132450331126, "rel_cer_red_g4": 0.391304347826087, "overcorr_g4": 0.0035087719298245615, "fixrate_g4": 0.6521739130434783, "seconds_g4": 14.838, "tok_s_g4": 13.68108909556544}
|
| 56 |
+
{"id": "bln600/3206323884", "n_chars_gold": 867, "cer_input": 0.04498269896193772, "wer_input": 0.19727891156462585, "cer_dg": 0.02191464821222607, "wer_dg": 0.06802721088435375, "rel_cer_red_dg": 0.5128205128205128, "overcorr_dg": 0.0059594755661501785, "fixrate_dg": 0.8518518518518519, "seconds_dg": 2.108, "tok_s_dg": 92.50474383301707, "denoising_steps_dg": 11, "cer_dgc": 0.04498269896193772, "wer_dgc": 0.19727891156462585, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.71, "tok_s_dgc": 309.85915492957747, "denoising_steps_dgc": 3, "cer_g4": 0.025374855824682813, "wer_g4": 0.08163265306122448, "rel_cer_red_g4": 0.43589743589743596, "overcorr_g4": 0.0023837902264600714, "fixrate_g4": 0.5925925925925926, "seconds_g4": 14.647, "tok_s_g4": 13.722946678500717}
|
| 57 |
+
{"id": "bln600/3206327837", "n_chars_gold": 831, "cer_input": 0.1552346570397112, "wer_input": 0.3897058823529412, "cer_dg": 0.08904933814681107, "wer_dg": 0.1323529411764706, "rel_cer_red_dg": 0.42635658914728686, "overcorr_dg": 0.010825439783491205, "fixrate_dg": 0.7634408602150538, "seconds_dg": 1.573, "tok_s_dg": 102.98792116973935, "denoising_steps_dg": 8, "cer_dgc": 0.1552346570397112, "wer_dgc": 0.3897058823529412, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.529, "tok_s_dgc": 417.76937618147446, "denoising_steps_dgc": 2, "cer_g4": 0.10469314079422383, "wer_g4": 0.16911764705882354, "rel_cer_red_g4": 0.32558139534883723, "overcorr_g4": 0.005412719891745603, "fixrate_g4": 0.5591397849462365, "seconds_g4": 13.069, "tok_s_g4": 13.696533782232764}
|
| 58 |
+
{"id": "bln600/3206258468", "n_chars_gold": 889, "cer_input": 0.012373453318335208, "wer_input": 0.07051282051282051, "cer_dg": 0.01124859392575928, "wer_dg": 0.038461538461538464, "rel_cer_red_dg": 0.09090909090909098, "overcorr_dg": 0.00340522133938706, "fixrate_dg": 0.4444444444444444, "seconds_dg": 1.383, "tok_s_dg": 146.059291395517, "denoising_steps_dg": 7, "cer_dgc": 0.010123734533183352, "wer_dgc": 0.057692307692307696, "rel_cer_red_dgc": 0.18181818181818182, "overcorr_dgc": 0.0, "fixrate_dgc": 0.2222222222222222, "seconds_dgc": 0.71, "tok_s_dgc": 288.7323943661972, "denoising_steps_dgc": 3, "cer_g4": 0.006749156355455568, "wer_g4": 0.03205128205128205, "rel_cer_red_g4": 0.4545454545454546, "overcorr_g4": 0.0, "fixrate_g4": 0.4444444444444444, "seconds_g4": 14.849, "tok_s_g4": 13.73829887534514}
|
| 59 |
+
{"id": "bln600/3206234047", "n_chars_gold": 919, "cer_input": 0.009793253536452665, "wer_input": 0.05952380952380952, "cer_dg": 0.001088139281828074, "wer_dg": 0.005952380952380952, "rel_cer_red_dg": 0.888888888888889, "overcorr_dg": 0.0, "fixrate_dg": 0.8888888888888888, "seconds_dg": 1.229, "tok_s_dg": 168.42961757526442, "denoising_steps_dg": 6, "cer_dgc": 0.009793253536452665, "wer_dgc": 0.05952380952380952, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.538, "tok_s_dgc": 392.19330855018586, "denoising_steps_dgc": 2, "cer_g4": 0.003264417845484222, "wer_g4": 0.017857142857142856, "rel_cer_red_g4": 0.6666666666666666, "overcorr_g4": 0.0, "fixrate_g4": 0.6666666666666666, "seconds_g4": 15.196, "tok_s_g4": 13.68781258225849}
|
| 60 |
+
{"id": "bln600/3200811894", "n_chars_gold": 907, "cer_input": 0.04630650496141125, "wer_input": 0.125, "cer_dg": 0.022050716648291068, "wer_dg": 0.05625, "rel_cer_red_dg": 0.5238095238095238, "overcorr_dg": 0.010112359550561797, "fixrate_dg": 0.8536585365853658, "seconds_dg": 1.4, "tok_s_dg": 140.0, "denoising_steps_dg": 7, "cer_dgc": 0.04630650496141125, "wer_dgc": 0.125, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.535, "tok_s_dgc": 411.21495327102804, "denoising_steps_dgc": 2, "cer_g4": 0.03858875413450937, "wer_g4": 0.0625, "rel_cer_red_g4": 0.16666666666666669, "overcorr_g4": 0.011235955056179775, "fixrate_g4": 0.43902439024390244, "seconds_g4": 14.813, "tok_s_g4": 13.704178761898332}
|
| 61 |
+
{"id": "bln600/3206235678", "n_chars_gold": 962, "cer_input": 0.02182952182952183, "wer_input": 0.09876543209876543, "cer_dg": 0.015592515592515593, "wer_dg": 0.04938271604938271, "rel_cer_red_dg": 0.2857142857142857, "overcorr_dg": 0.005296610169491525, "fixrate_dg": 0.7058823529411765, "seconds_dg": 1.38, "tok_s_dg": 145.6521739130435, "denoising_steps_dg": 7, "cer_dgc": 0.02182952182952183, "wer_dgc": 0.09876543209876543, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.526, "tok_s_dgc": 408.74524714828897, "denoising_steps_dgc": 2, "cer_g4": 0.011434511434511435, "wer_g4": 0.05555555555555555, "rel_cer_red_g4": 0.47619047619047616, "overcorr_g4": 0.0, "fixrate_g4": 0.6470588235294118, "seconds_g4": 15.092, "tok_s_g4": 13.71587596077392}
|
| 62 |
+
{"id": "bln600/3206232665", "n_chars_gold": 877, "cer_input": 0.027366020524515394, "wer_input": 0.1437908496732026, "cer_dg": 0.005701254275940707, "wer_dg": 0.032679738562091505, "rel_cer_red_dg": 0.7916666666666666, "overcorr_dg": 0.0011614401858304297, "fixrate_dg": 0.9, "seconds_dg": 1.399, "tok_s_dg": 146.53323802716227, "denoising_steps_dg": 7, "cer_dgc": 0.027366020524515394, "wer_dgc": 0.1437908496732026, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.53, "tok_s_dgc": 409.4339622641509, "denoising_steps_dgc": 2, "cer_g4": 0.0034207525655644243, "wer_g4": 0.026143790849673203, "rel_cer_red_g4": 0.875, "overcorr_g4": 0.0011614401858304297, "fixrate_g4": 0.95, "seconds_g4": 15.092, "tok_s_g4": 13.71587596077392}
|
| 63 |
+
{"id": "bln600/3202413243", "n_chars_gold": 815, "cer_input": 0.023312883435582823, "wer_input": 0.11564625850340136, "cer_dg": 0.007361963190184049, "wer_dg": 0.027210884353741496, "rel_cer_red_dg": 0.6842105263157895, "overcorr_dg": 0.0025031289111389237, "fixrate_dg": 0.9230769230769231, "seconds_dg": 1.216, "tok_s_dg": 162.82894736842107, "denoising_steps_dg": 6, "cer_dgc": 0.023312883435582823, "wer_dgc": 0.11564625850340136, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.53, "tok_s_dgc": 394.33962264150944, "denoising_steps_dgc": 2, "cer_g4": 0.011042944785276074, "wer_g4": 0.061224489795918366, "rel_cer_red_g4": 0.5263157894736842, "overcorr_g4": 0.0025031289111389237, "fixrate_g4": 0.5384615384615384, "seconds_g4": 14.542, "tok_s_g4": 13.684500068766333}
|
| 64 |
+
{"id": "bln600/3206201960", "n_chars_gold": 997, "cer_input": 0.01905717151454363, "wer_input": 0.08092485549132948, "cer_dg": 0.004012036108324975, "wer_dg": 0.017341040462427744, "rel_cer_red_dg": 0.7894736842105263, "overcorr_dg": 0.0, "fixrate_dg": 0.8823529411764706, "seconds_dg": 1.027, "tok_s_dg": 198.63680623174295, "denoising_steps_dg": 5, "cer_dgc": 0.01905717151454363, "wer_dgc": 0.08092485549132948, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.531, "tok_s_dgc": 412.42937853107344, "denoising_steps_dgc": 2, "cer_g4": 0.010030090270812437, "wer_g4": 0.03468208092485549, "rel_cer_red_g4": 0.4736842105263158, "overcorr_g4": 0.0, "fixrate_g4": 0.47058823529411764, "seconds_g4": 15.017, "tok_s_g4": 13.717786508623561}
|
| 65 |
+
{"id": "bln600/3200810452", "n_chars_gold": 676, "cer_input": 0.1893491124260355, "wer_input": 0.4444444444444444, "cer_dg": 0.11094674556213018, "wer_dg": 0.1623931623931624, "rel_cer_red_dg": 0.4140625, "overcorr_dg": 0.06570512820512821, "fixrate_dg": 0.9243697478991597, "seconds_dg": 2.112, "tok_s_dg": 67.70833333333333, "denoising_steps_dg": 11, "cer_dgc": 0.1893491124260355, "wer_dgc": 0.4444444444444444, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.711, "tok_s_dgc": 309.42334739803096, "denoising_steps_dgc": 3, "cer_g4": 0.1849112426035503, "wer_g4": 0.4444444444444444, "rel_cer_red_g4": 0.023437499999999972, "overcorr_g4": 0.0016025641025641025, "fixrate_g4": 0.058823529411764705, "seconds_g4": 15.685, "tok_s_g4": 13.57985336308575}
|
| 66 |
+
{"id": "bln600/3200812176", "n_chars_gold": 758, "cer_input": 0.18601583113456466, "wer_input": 0.3888888888888889, "cer_dg": 0.12532981530343007, "wer_dg": 0.19047619047619047, "rel_cer_red_dg": 0.32624113475177313, "overcorr_dg": 0.05597579425113464, "fixrate_dg": 0.8282828282828283, "seconds_dg": 1.925, "tok_s_dg": 74.28571428571428, "denoising_steps_dg": 10, "cer_dgc": 0.18601583113456466, "wer_dgc": 0.3888888888888889, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.538, "tok_s_dgc": 408.92193308550185, "denoising_steps_dgc": 2, "cer_g4": 0.1266490765171504, "wer_g4": 0.2222222222222222, "rel_cer_red_g4": 0.3191489361702128, "overcorr_g4": 0.037821482602118005, "fixrate_g4": 0.8383838383838383, "seconds_g4": 10.814, "tok_s_g4": 13.685962641020899}
|
| 67 |
+
{"id": "bln600/3207644658", "n_chars_gold": 1018, "cer_input": 0.03339882121807466, "wer_input": 0.1696969696969697, "cer_dg": 0.0137524557956778, "wer_dg": 0.03636363636363636, "rel_cer_red_dg": 0.5882352941176471, "overcorr_dg": 0.005045408678102927, "fixrate_dg": 1.0, "seconds_dg": 1.215, "tok_s_dg": 164.60905349794237, "denoising_steps_dg": 6, "cer_dgc": 0.03241650294695481, "wer_dgc": 0.16363636363636364, "rel_cer_red_dgc": 0.029411764705882408, "overcorr_dgc": 0.0, "fixrate_dgc": 0.03571428571428571, "seconds_dgc": 0.883, "tok_s_dgc": 248.01812004530012, "denoising_steps_dgc": 4, "cer_g4": 0.015717092337917484, "wer_g4": 0.06060606060606061, "rel_cer_red_g4": 0.5294117647058824, "overcorr_g4": 0.0030272452068617556, "fixrate_g4": 0.7142857142857143, "seconds_g4": 14.625, "tok_s_g4": 13.675213675213675}
|
| 68 |
+
{"id": "bln600/3200811895", "n_chars_gold": 819, "cer_input": 0.07326007326007326, "wer_input": 0.22962962962962963, "cer_dg": 0.045177045177045176, "wer_dg": 0.06666666666666667, "rel_cer_red_dg": 0.38333333333333336, "overcorr_dg": 0.02746566791510612, "fixrate_dg": 0.9491525423728814, "seconds_dg": 1.761, "tok_s_dg": 100.51107325383306, "denoising_steps_dg": 9, "cer_dgc": 0.07326007326007326, "wer_dgc": 0.22962962962962963, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.714, "tok_s_dgc": 309.5238095238095, "denoising_steps_dgc": 3, "cer_g4": 0.03785103785103785, "wer_g4": 0.08888888888888889, "rel_cer_red_g4": 0.4833333333333334, "overcorr_g4": 0.0, "fixrate_g4": 0.6949152542372882, "seconds_g4": 14.63, "tok_s_g4": 13.670539986329459}
|
| 69 |
+
{"id": "bln600/3206260984", "n_chars_gold": 865, "cer_input": 0.03468208092485549, "wer_input": 0.1510791366906475, "cer_dg": 0.015028901734104046, "wer_dg": 0.05755395683453238, "rel_cer_red_dg": 0.5666666666666667, "overcorr_dg": 0.005868544600938967, "fixrate_dg": 0.8148148148148148, "seconds_dg": 1.394, "tok_s_dg": 144.1893830703013, "denoising_steps_dg": 7, "cer_dgc": 0.03468208092485549, "wer_dgc": 0.1510791366906475, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.706, "tok_s_dgc": 310.19830028328613, "denoising_steps_dgc": 3, "cer_g4": 0.013872832369942197, "wer_g4": 0.05755395683453238, "rel_cer_red_g4": 0.6, "overcorr_g4": 0.0011737089201877935, "fixrate_g4": 0.6666666666666666, "seconds_g4": 15.071, "tok_s_g4": 13.668635127065224}
|
| 70 |
+
{"id": "bln600/3206237358", "n_chars_gold": 864, "cer_input": 0.06018518518518518, "wer_input": 0.24, "cer_dg": 0.017361111111111112, "wer_dg": 0.05333333333333334, "rel_cer_red_dg": 0.7115384615384616, "overcorr_dg": 0.013205282112845138, "fixrate_dg": 0.9777777777777777, "seconds_dg": 1.749, "tok_s_dg": 103.48770726129216, "denoising_steps_dg": 9, "cer_dgc": 0.06018518518518518, "wer_dgc": 0.24, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.698, "tok_s_dgc": 315.18624641833816, "denoising_steps_dgc": 3, "cer_g4": 0.016203703703703703, "wer_g4": 0.03333333333333333, "rel_cer_red_g4": 0.7307692307692308, "overcorr_g4": 0.009603841536614645, "fixrate_g4": 0.9111111111111111, "seconds_g4": 13.466, "tok_s_g4": 13.664042774394773}
|
| 71 |
+
{"id": "bln600/3206203527", "n_chars_gold": 867, "cer_input": 0.03229527104959631, "wer_input": 0.13836477987421383, "cer_dg": 0.02306805074971165, "wer_dg": 0.05660377358490566, "rel_cer_red_dg": 0.28571428571428564, "overcorr_dg": 0.009478672985781991, "fixrate_dg": 0.85, "seconds_dg": 1.224, "tok_s_dg": 164.2156862745098, "denoising_steps_dg": 6, "cer_dgc": 0.03229527104959631, "wer_dgc": 0.13836477987421383, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.706, "tok_s_dgc": 307.36543909348444, "denoising_steps_dgc": 3, "cer_g4": 0.012687427912341407, "wer_g4": 0.05660377358490566, "rel_cer_red_g4": 0.6071428571428572, "overcorr_g4": 0.001184834123222749, "fixrate_g4": 0.75, "seconds_g4": 14.975, "tok_s_g4": 13.689482470784641}
|
| 72 |
+
{"id": "bln600/3205295999", "n_chars_gold": 748, "cer_input": 0.12834224598930483, "wer_input": 0.42276422764227645, "cer_dg": 0.07887700534759358, "wer_dg": 0.17886178861788618, "rel_cer_red_dg": 0.38541666666666674, "overcorr_dg": 0.03654970760233918, "fixrate_dg": 0.8860759493670886, "seconds_dg": 1.753, "tok_s_dg": 88.9903023388477, "denoising_steps_dg": 9, "cer_dgc": 0.12834224598930483, "wer_dgc": 0.42276422764227645, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.702, "tok_s_dgc": 313.3903133903134, "denoising_steps_dgc": 3, "cer_g4": 0.07754010695187166, "wer_g4": 0.16260162601626016, "rel_cer_red_g4": 0.39583333333333337, "overcorr_g4": 0.027777777777777776, "fixrate_g4": 0.8354430379746836, "seconds_g4": 11.884, "tok_s_g4": 13.631773813530797}
|
| 73 |
+
{"id": "bln600/3206223444", "n_chars_gold": 845, "cer_input": 0.13254437869822486, "wer_input": 0.3191489361702128, "cer_dg": 0.1467455621301775, "wer_dg": 0.24113475177304963, "rel_cer_red_dg": -0.10714285714285701, "overcorr_dg": 0.05548037889039242, "fixrate_dg": 0.7878787878787878, "seconds_dg": 5.699, "tok_s_dg": 30.707141603790138, "denoising_steps_dg": 31, "cer_dgc": 0.13254437869822486, "wer_dgc": 0.3191489361702128, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.709, "tok_s_dgc": 310.29619181946407, "denoising_steps_dgc": 3, "cer_g4": 0.12662721893491125, "wer_g4": 0.28368794326241137, "rel_cer_red_g4": 0.04464285714285716, "overcorr_g4": 0.0040595399188092015, "fixrate_g4": 0.19696969696969696, "seconds_g4": 15.486, "tok_s_g4": 13.625209866976624}
|
| 74 |
+
{"id": "bln600/3200810270", "n_chars_gold": 814, "cer_input": 0.038083538083538086, "wer_input": 0.16666666666666666, "cer_dg": 0.01597051597051597, "wer_dg": 0.08333333333333333, "rel_cer_red_dg": 0.5806451612903226, "overcorr_dg": 0.005025125628140704, "fixrate_dg": 0.7857142857142857, "seconds_dg": 1.383, "tok_s_dg": 145.3362255965293, "denoising_steps_dg": 7, "cer_dgc": 0.038083538083538086, "wer_dgc": 0.16666666666666666, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 1.055, "tok_s_dgc": 209.478672985782, "denoising_steps_dgc": 5, "cer_g4": 0.0171990171990172, "wer_g4": 0.06818181818181818, "rel_cer_red_g4": 0.5483870967741936, "overcorr_g4": 0.0, "fixrate_g4": 0.75, "seconds_g4": 14.589, "tok_s_g4": 13.708958804578792}
|
| 75 |
+
{"id": "bln600/3206190914", "n_chars_gold": 879, "cer_input": 0.04664391353811149, "wer_input": 0.12666666666666668, "cer_dg": 0.03981797497155859, "wer_dg": 0.08666666666666667, "rel_cer_red_dg": 0.1463414634146341, "overcorr_dg": 0.010588235294117647, "fixrate_dg": 0.6764705882352942, "seconds_dg": 1.392, "tok_s_dg": 142.24137931034483, "denoising_steps_dg": 7, "cer_dgc": 0.04664391353811149, "wer_dgc": 0.12666666666666668, "rel_cer_red_dgc": 0.0, "overcorr_dgc": 0.0, "fixrate_dgc": 0.0, "seconds_dgc": 0.881, "tok_s_dgc": 240.6356413166856, "denoising_steps_dgc": 4, "cer_g4": 0.04323094425483504, "wer_g4": 0.1, "rel_cer_red_g4": 0.07317073170731697, "overcorr_g4": 0.002352941176470588, "fixrate_g4": 0.17647058823529413, "seconds_g4": 15.216, "tok_s_g4": 13.669821240799159}
|
results/summary.md
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Benchmark summary
|
| 2 |
+
|
| 3 |
+
Passages: 75 · macro means over passages (micro CER in footnote)
|
| 4 |
+
|
| 5 |
+
| Model | CER ↓ | WER ↓ | Rel. CER reduction ↑ | Over-correction ↓ | Fix rate ↑ | Median s/passage | tok/s |
|
| 6 |
+
|---|---|---|---|---|---|---|---|
|
| 7 |
+
| OCR input (uncorrected) | 0.066 | 0.215 | — | — | — | — | — |
|
| 8 |
+
| DiffusionGemma 26B-A4B-it | 0.036 | 0.076 | 49.4% | 1.4% | 85.2% | 1.74 | 118.7 |
|
| 9 |
+
| DiffusionGemma (OCR-seeded canvas) | 0.081 | 0.226 | -17.2% | 0.0% | 0.6% | 0.70 | 323.2 |
|
| 10 |
+
| Gemma-4-E4B-it | 0.042 | 0.107 | 45.9% | 0.4% | 61.5% | 14.68 | 13.7 |
|
| 11 |
+
|
| 12 |
+
Micro (corpus-level) CER — input: 0.062, DiffusionGemma 26B-A4B-it: 0.033, DiffusionGemma (OCR-seeded canvas): 0.075, Gemma-4-E4B-it: 0.038.
|
| 13 |
+
Mean denoising steps, DiffusionGemma 26B-A4B-it: 10.1 (max 48).
|
| 14 |
+
Mean denoising steps, DiffusionGemma (OCR-seeded canvas): 3.3 (max 48).
|
| 15 |
+
|
| 16 |
+
## Config
|
| 17 |
+
|
| 18 |
+
```json
|
| 19 |
+
{
|
| 20 |
+
"date": "2026-06-10",
|
| 21 |
+
"dataset": "bln600",
|
| 22 |
+
"n": 75,
|
| 23 |
+
"seed": 42,
|
| 24 |
+
"max_passage_tokens": 220,
|
| 25 |
+
"prompt": "Correct the OCR errors in the following text from a 19th-century English newspaper.\nFix only recognition errors (wrong, missing, or extra characters). Do not modernise spelling, do not rephrase, and do not add or remove content. Preserve the original punctuation unless it is clearly an OCR error.\nOutput only the corrected text, with no commentary or preamble.\n\nOCR text:\n{ocr}",
|
| 26 |
+
"transformers": "5.11.0",
|
| 27 |
+
"torch": "2.9.1+cu128",
|
| 28 |
+
"gpu": "NVIDIA A100-SXM4-80GB",
|
| 29 |
+
"generation": {
|
| 30 |
+
"diffusiongemma": "generation_config defaults (entropy sampler), max_new_tokens=256",
|
| 31 |
+
"diffusiongemma_canvas": "as diffusiongemma, but first canvas seeded with the OCR text via decoder_input_ids (random tail padding, seed 0)",
|
| 32 |
+
"gemma4": "do_sample=False (greedy), max_new_tokens=256"
|
| 33 |
+
}
|
| 34 |
+
}
|
| 35 |
+
```
|