Spaces:

davanstrien
/

diffusiongemma-ocr-correction

Running on Zero

App Files Files Community

davanstrien (HF Staff) committed 1 day ago

Commit

163634f

verified ·

1 Parent(s): 7b9f964

Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

README.md +47 -7
app.py +262 -0
diff_utils.py +39 -0
requirements.txt +4 -0

README.md CHANGED Viewed

@@ -1,13 +1,53 @@
 ---
-title: Diffusiongemma Ocr
-emoji: 🏃
-colorFrom: purple
-colorTo: indigo
 sdk: gradio
-sdk_version: 6.17.3
-python_version: '3.13'
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: DiffusionGemma vs Gemma-4 — Post-OCR Correction
+emoji: 📰
+colorFrom: yellow
+colorTo: red
 sdk: gradio
+sdk_version: "5.49.1"
 app_file: app.py
 pinned: false
+license: apache-2.0
+short_description: Diffusion vs autoregressive LLM on historical OCR cleanup
+models:
+  - google/diffusiongemma-26B-A4B-it
+  - google/gemma-4-E4B-it
 ---
+# DiffusionGemma vs Gemma-4: post-OCR correction
+A pragmatic first-pass comparison of Google's **experimental diffusion LLM**
+[DiffusionGemma-26B-A4B-it](https://huggingface.co/google/diffusiongemma-26B-A4B-it)
+(released 2026-06-10; 26B MoE, 3.8B active; generates 256-token blocks by iterative
+denoising) against an autoregressive baseline,
+[Gemma-4-E4B-it](https://huggingface.co/google/gemma-4-E4B-it) (~4.5B effective),
+on **post-OCR correction of 19th-century English newspaper text**.
+**Hypothesis**: a diffusion LM treats correction as denoising, so it may be
+(a) faster and (b) less prone to *over-correction* — rewriting text that was
+already correct — than an autoregressive model, possibly at some accuracy cost.
+## Method (v1, pragmatic)
+- 75 passages from [BLN600](https://doi.org/10.15131/shef.data.25439023)
+  (19th-c British Library newspapers, aligned OCR + human gold transcription),
+  align-trimmed to ≤220 Gemma tokens so outputs fit DiffusionGemma's single
+  256-token block. Identical prompt for both models; thinking mode off; bf16;
+  batch size 1; A100-80GB.
+- Gemma-4 decodes greedily. DiffusionGemma uses its generation-config default
+  entropy sampler (**no greedy equivalent exists** for the diffusion sampler —
+  this is an unavoidable asymmetry, not a tuning choice).
+- **Over-correction rate**: of input characters that were already correct
+  (per input↔gold character alignment), the fraction the model changed
+  (per input↔output alignment). **Fix rate**: of input characters that were
+  wrong, the fraction the model changed. Text NFC-normalized, whitespace
+  collapsed, before all metrics. CER/WER via jiwer.
+## Limitations
+n=75, a single prompt, and one run (no seeds or significance testing). The
+256-token block caps passage length. Tokens/sec for DiffusionGemma is computed
+over denoising the whole block. DiffusionGemma is experimental and was one day
+old at benchmark time. Live demo examples are from ICDAR2019 post-OCR
+(CC-BY-4.0) because BLN600's CC-BY-NC license doesn't permit redistribution
+here; benchmark passage texts are likewise not republished — only per-passage
+metrics.

app.py ADDED Viewed

	@@ -0,0 +1,262 @@

+"""DiffusionGemma vs Gemma-4 on post-OCR correction — ZeroGPU comparison Space.
+Side-by-side correction of 19th-century English newspaper OCR by an
+experimental block-diffusion LLM (google/diffusiongemma-26B-A4B-it) and an
+autoregressive baseline (google/gemma-4-E4B-it).
+"""
+import json
+import os
+import time
+from pathlib import Path
+import gradio as gr
+import spaces
+import torch
+from transformers import (
+    AutoModelForMultimodalLM,
+    AutoProcessor,
+    DiffusionGemmaForBlockDiffusion,
+    TextDiffusionStreamer,
+)
+from diff_utils import COLOR_MAP, diff_highlight
+# Keep in sync with benchmark.py PROMPT_TEMPLATE — the benchmark numbers in the
+# results tab were produced with exactly this prompt.
+# `{ocr}` is the only placeholder; it is filled via str.format() in
+# _prepare_inputs(). A trailing backslash inside the literal joins that line
+# with the next one, so those wrapped lines form a single paragraph.
+PROMPT_TEMPLATE = """\
+Correct the OCR errors in the following text from a 19th-century English newspaper.
+Fix only recognition errors (wrong, missing, or extra characters). Do not modernise \
+spelling, do not rephrase, and do not add or remove content. Preserve the original \
+punctuation unless it is clearly an OCR error.
+Output only the corrected text, with no commentary or preamble.
+OCR text:
+{ocr}"""
+# Character limit on the input box, enforced by check_length() and by the
+# textbox's max_length.
+MAX_INPUT_CHARS = 1200  # roughly the 220-token benchmark cap
+def model_path(volume_path: str, model_id: str) -> str:
+    """Prefer a mounted hf:// volume (see `hf spaces volumes`) over a download."""
+    return volume_path if os.path.isdir(volume_path) else model_id
+# Resolve each checkpoint: the mounted directory if it exists, else the Hub id.
+DG_PATH = model_path("/models/dg", "google/diffusiongemma-26B-A4B-it")
+G4_PATH = model_path("/models/gemma", "google/gemma-4-E4B-it")
+# Both models are loaded once at import time, in bfloat16, and moved to "cuda";
+# the run_* functions below reuse these module-level objects.
+print(f"loading DiffusionGemma from {DG_PATH} ...")
+dg_processor = AutoProcessor.from_pretrained(DG_PATH)
+dg_model = DiffusionGemmaForBlockDiffusion.from_pretrained(DG_PATH, dtype=torch.bfloat16).to("cuda")
+print(f"loading Gemma-4 from {G4_PATH} ...")
+g4_processor = AutoProcessor.from_pretrained(G4_PATH)
+g4_model = AutoModelForMultimodalLM.from_pretrained(G4_PATH, dtype=torch.bfloat16).to("cuda")
+print("models loaded")
+STOP_MARKERS = ("<turn|>", "<eos>", "<end_of_turn>", "<pad>")
+def extract_answer(raw: str) -> str:
+    """DiffusionGemma's block looks like `<|channel>thought\\n<channel|>ANSWER<turn|>...`
+    even with thinking off — the answer is the text after the last `<channel|>`.
+    Gemma-4 emits plain text; we just cut at the first stop marker."""
+    stops = [i for m in STOP_MARKERS if (i := raw.find(m)) != -1]
+    if stops:
+        raw = raw[: min(stops)]
+    if "<channel|>" in raw:
+        raw = raw.rpartition("<channel|>")[2]
+    return raw.strip()
+class SnapshotStreamer(TextDiffusionStreamer):
+    """Captures the decoded canvas at each denoising step; suppresses the
+    parent's ANSI console printing.
+    After generation, `snapshots` holds one decoded string per `put_draft`
+    call that decoded successfully, in call order."""
+    def __init__(self, tokenizer):
+        super().__init__(tokenizer=tokenizer)
+        self.tok = tokenizer  # own reference, used for decoding in put_draft
+        self.snapshots: list[str] = []  # decoded canvases, oldest first
+    def put_draft(self, value, **kwargs):
+        """Decode one draft canvas and append it to `snapshots`.
+        Special tokens are kept in the decoded text. Extra keyword arguments
+        are accepted and ignored. Any exception is swallowed, so a draft
+        that cannot be decoded is skipped rather than recorded."""
+        try:
+            # Use the first row when `value` has more than one dimension.
+            ids = value[0] if value.ndim > 1 else value
+            self.snapshots.append(self.tok.decode(ids, skip_special_tokens=False))
+        except Exception:
+            # Best-effort capture: this snapshot is dropped silently.
+            pass
+    def put(self, value):
+        # No-op override. NOTE(review): presumably this is what silences the
+        # parent's console output — confirm against TextDiffusionStreamer.
+        pass
+    def end(self):
+        # No-op override; nothing to finalize for snapshot capture.
+        pass
+def _prepare_inputs(processor, model, ocr_text: str):
+    message = [{"role": "user", "content": PROMPT_TEMPLATE.format(ocr=ocr_text.strip())}]
+    return processor.apply_chat_template(
+        message, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
+    ).to(model.device)
+def _decode_generated(processor, output, input_len) -> str:
+    # DiffusionGemma returns a DiffusionGemmaGenerationOutput whose .sequences
+    # includes the prompt (like AR generate, which returns a plain tensor).
+    seq = output.sequences if hasattr(output, "sequences") else output
+    generated = seq[0][input_len:] if seq.shape[-1] > input_len else seq[0]
+    raw = processor.tokenizer.decode(generated, skip_special_tokens=False)
+    return extract_answer(raw)
+# size="xlarge" (96GB) on both: total module-level CUDA state is ~68GB bf16,
+# which exceeds the default 48GB ZeroGPU slice.
+@spaces.GPU(duration=120, size="xlarge")
+def run_diffusiongemma(ocr_text: str):
+    """Correct `ocr_text` with DiffusionGemma and time the generation.
+    Returns a 4-tuple: the extracted answer text, diff segments of the answer
+    against `ocr_text`, a Markdown timing line, and the list of decoded
+    canvas snapshots captured by the streamer during generation."""
+    inputs = _prepare_inputs(dg_processor, dg_model, ocr_text)
+    streamer = SnapshotStreamer(dg_processor.tokenizer)
+    t0 = time.perf_counter()
+    output = dg_model.generate(**inputs, max_new_tokens=256, streamer=streamer)
+    # Wait for queued GPU work so the elapsed time covers the whole generate call.
+    torch.cuda.synchronize()
+    seconds = time.perf_counter() - t0
+    text = _decode_generated(dg_processor, output, inputs["input_ids"].shape[-1])
+    # Throughput = tokens in the re-tokenized answer / wall-clock seconds above.
+    n_tokens = len(dg_processor.tokenizer(text)["input_ids"])
+    # The "denoising steps" figure is the number of snapshots actually captured.
+    timing = (
+        f"**{seconds:.1f}s** · ~{n_tokens / seconds:.0f} tok/s · "
+        f"{len(streamer.snapshots)} denoising steps"
+    )
+    return text, diff_highlight(ocr_text, text), timing, streamer.snapshots
+@spaces.GPU(duration=60, size="xlarge")
+def run_gemma4(ocr_text: str):
+    """Correct `ocr_text` with the Gemma-4 baseline and time the generation.
+    Sampling is disabled (`do_sample=False`). Returns a 3-tuple: the extracted
+    answer text, diff segments of the answer against `ocr_text`, and a
+    Markdown timing line."""
+    inputs = _prepare_inputs(g4_processor, g4_model, ocr_text)
+    t0 = time.perf_counter()
+    output = g4_model.generate(**inputs, max_new_tokens=256, do_sample=False)
+    # Wait for queued GPU work so the elapsed time covers the whole generate call.
+    torch.cuda.synchronize()
+    seconds = time.perf_counter() - t0
+    text = _decode_generated(g4_processor, output, inputs["input_ids"].shape[-1])
+    # Throughput = tokens in the re-tokenized answer / wall-clock seconds above.
+    n_tokens = len(g4_processor.tokenizer(text)["input_ids"])
+    timing = f"**{seconds:.1f}s** · ~{n_tokens / seconds:.0f} tok/s (greedy)"
+    return text, diff_highlight(ocr_text, text), timing
+# ---------------------------------------------------------------- UI data
+examples: list[dict] = []
+examples_path = Path("examples.json")
+if examples_path.exists():
+    examples = json.loads(examples_path.read_text())
+example_choices = {e["label"]: e["ocr_input"] for e in examples}
+summary_md = "*Benchmark results pending — see the repo for methodology.*"
+if Path("results/summary.md").exists():
+    summary_md = Path("results/summary.md").read_text()
+per_passage_rows = []
+if Path("results/per_passage_metrics.jsonl").exists():
+    per_passage_rows = [
+        json.loads(line)
+        for line in Path("results/per_passage_metrics.jsonl").read_text().splitlines()
+        if line.strip()
+    ]
+def load_example(label: str) -> str:
+    return example_choices.get(label, "")
+def check_length(text: str):
+    if len(text) > MAX_INPUT_CHARS:
+        raise gr.Error(
+            f"Input too long ({len(text)} chars). DiffusionGemma generates a single "
+            f"256-token block, so inputs are capped at ~{MAX_INPUT_CHARS} characters."
+        )
+    return text
+def update_snapshot(snapshots: list[str], step: int) -> str:
+    if not snapshots:
+        return ""
+    return snapshots[min(int(step), len(snapshots) - 1)]
+# Build the UI: a live comparison tab, a denoising-progression tab fed by the
+# snapshots captured during the DiffusionGemma run, and a benchmark results tab.
+with gr.Blocks(title="DiffusionGemma vs Gemma-4: post-OCR correction") as demo:
+    gr.Markdown(
+        "# DiffusionGemma vs Gemma-4: post-OCR correction\n"
+        "Compare Google's **experimental diffusion LLM** "
+        "([google/diffusiongemma-26B-A4B-it](https://huggingface.co/google/diffusiongemma-26B-A4B-it), "
+        "26B MoE / 3.8B active, released 2026-06-10) against an autoregressive baseline "
+        "([google/gemma-4-E4B-it](https://huggingface.co/google/gemma-4-E4B-it)) on correcting "
+        "19th-century English newspaper OCR. Both run in bf16. Highlights show what each model "
+        "**changed relative to the OCR input** (yellow = changed, green = added, red ⌫ = deleted)."
+    )
+    with gr.Tab("Live comparison"):
+        with gr.Row():
+            example_dd = gr.Dropdown(
+                label="Example passages (ICDAR2019 post-OCR, CC-BY-4.0)",
+                choices=list(example_choices),
+                value=None,
+                scale=2,
+            )
+        # Pre-filled with the first example's text, or empty if none were loaded.
+        ocr_box = gr.Textbox(
+            label="Noisy OCR text",
+            lines=6,
+            value=next(iter(example_choices.values()), ""),
+            max_length=MAX_INPUT_CHARS,
+        )
+        run_btn = gr.Button("Run both models", variant="primary")
+        with gr.Row():
+            with gr.Column():
+                gr.Markdown("### DiffusionGemma 26B-A4B (diffusion)")
+                dg_timing = gr.Markdown("")
+                dg_diff = gr.HighlightedText(
+                    label="Output (diff vs input)", color_map=COLOR_MAP, combine_adjacent=True
+                )
+                with gr.Accordion("Raw output", open=False):
+                    dg_raw = gr.Textbox(lines=6, show_label=False)
+            with gr.Column():
+                gr.Markdown("### Gemma-4-E4B (autoregressive)")
+                g4_timing = gr.Markdown("")
+                g4_diff = gr.HighlightedText(
+                    label="Output (diff vs input)", color_map=COLOR_MAP, combine_adjacent=True
+                )
+                with gr.Accordion("Raw output", open=False):
+                    g4_raw = gr.Textbox(lines=6, show_label=False)
+        # Holds the decoded canvas snapshots returned by run_diffusiongemma.
+        snapshots_state = gr.State([])
+        example_dd.change(load_example, example_dd, ocr_box)
+        # Chain: length check first, DiffusionGemma via .success(), then Gemma-4
+        # via .then(). The two models run one after the other, not in parallel.
+        run_btn.click(check_length, ocr_box, ocr_box).success(
+            run_diffusiongemma, ocr_box, [dg_raw, dg_diff, dg_timing, snapshots_state]
+        ).then(run_gemma4, ocr_box, [g4_raw, g4_diff, g4_timing])
+    with gr.Tab("Denoising progression"):
+        gr.Markdown(
+            "DiffusionGemma starts from a random 256-token canvas and iteratively denoises it. "
+            "Run a comparison first, then scrub through the intermediate canvas states."
+        )
+        # 0-47 is only the initial range; it is replaced below once snapshots exist.
+        step_slider = gr.Slider(0, 47, step=1, value=0, label="Denoising step")
+        snapshot_box = gr.Textbox(lines=10, label="Canvas at step", interactive=False)
+        step_slider.change(update_snapshot, [snapshots_state, step_slider], snapshot_box)
+        # On new snapshots: resize the slider to the captured step count (upper
+        # bound kept at 1 or more) and show the first snapshot.
+        snapshots_state.change(
+            lambda s: (gr.Slider(0, max(len(s) - 1, 1), step=1, value=0), update_snapshot(s, 0)),
+            snapshots_state,
+            [step_slider, snapshot_box],
+        )
+    with gr.Tab("Benchmark results"):
+        gr.Markdown(summary_md)
+        # The table is shown only when per-passage metrics were loaded.
+        if per_passage_rows:
+            gr.Markdown("### Per-passage metrics (BLN600, n=75)")
+            # Columns follow the first row's keys; a key missing from a later
+            # row yields None in that cell.
+            gr.DataFrame(
+                value=[[row.get(k) for k in per_passage_rows[0]] for row in per_passage_rows],
+                headers=list(per_passage_rows[0]),
+                interactive=False,
+            )
+        gr.Markdown(
+            "Benchmark texts come from [BLN600](https://doi.org/10.15131/shef.data.25439023) "
+            "(CC-BY-NC-4.0), so passage texts are not redistributed here — only metrics. "
+            "See the Space README for methodology and limitations."
+        )
+# Launched unconditionally at import time (no `__main__` guard).
+demo.launch()

diff_utils.py ADDED Viewed

	@@ -0,0 +1,39 @@

+"""Token-level diff between OCR input and model output for gr.HighlightedText."""
+import difflib
+import re
+COLOR_MAP = {"changed": "yellow", "added": "green", "removed": "red"}
+def diff_highlight(input_text: str, output_text: str) -> list[tuple[str, str | None]]:
+    """Segments of `output_text` labelled by how they differ from `input_text`.
+    Word + whitespace tokenization (lossless), so highlights align with what
+    the reader sees. Deleted input text is marked with a small marker segment.
+    """
+    tokens_in = re.findall(r"\S+|\s+", input_text)
+    tokens_out = re.findall(r"\S+|\s+", output_text)
+    sm = difflib.SequenceMatcher(None, tokens_in, tokens_out, autojunk=False)
+    segments: list[tuple[str, str | None]] = []
+    for op, i1, i2, j1, j2 in sm.get_opcodes():
+        if op == "equal":
+            segments.append(("".join(tokens_out[j1:j2]), None))
+        elif op == "replace":
+            segments.append(("".join(tokens_out[j1:j2]), "changed"))
+        elif op == "insert":
+            segments.append(("".join(tokens_out[j1:j2]), "added"))
+        elif op == "delete":
+            segments.append((" ⌫ ", "removed"))
+    return segments
+if __name__ == "__main__":
+    segs = diff_highlight("the qvick brown fox jumps", "the quick brown fox")
+    print(segs)
+    assert ("the ", None) in segs or segs[0][1] is None
+    assert any(label == "changed" for _, label in segs)
+    assert any(label == "removed" for _, label in segs)
+    out = "".join(s for s, label in segs if label != "removed")
+    assert out == "the quick brown fox"
+    print("diff_utils ok")

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+transformers>=5.11,<6
+accelerate
+pillow
+torchvision