Geraldine committed
Commit 136b6ad · verified · 1 Parent(s): f4fb569

Update app.py

Files changed (1): app.py (+81 −19)
app.py CHANGED
@@ -16,6 +16,8 @@ from PIL import Image
 from huggingface_hub import snapshot_download
 
 from transformers import (
+    LightOnOcrForConditionalGeneration,
+    LightOnOcrProcessor,
     Qwen2VLForConditionalGeneration,
     Qwen3VLForConditionalGeneration,
     Qwen2_5_VLForConditionalGeneration,
@@ -130,14 +132,12 @@ model_v = load_model_with_attention_fallback(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-MODEL_ID_Y = "rednote-hilab/dots.ocr"
-MODEL_PATH_Y = resolve_dots_ocr_model_path(MODEL_ID_Y)
-processor_y = AutoProcessor.from_pretrained(MODEL_PATH_Y, trust_remote_code=True)
-model_y = load_model_with_attention_fallback(
-    AutoModelForCausalLM,
-    MODEL_PATH_Y,
-    trust_remote_code=True,
-    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
+MODEL_ID_Y = "lightonai/LightOnOCR-2-1B"
+LIGHTON_DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32
+processor_y = LightOnOcrProcessor.from_pretrained(MODEL_ID_Y)
+model_y = LightOnOcrForConditionalGeneration.from_pretrained(
+    MODEL_ID_Y,
+    torch_dtype=LIGHTON_DTYPE,
 ).to(device).eval()
 
 MODEL_ID_X = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
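
Note: the new loading path above can be smoke-tested outside the Space. A minimal sketch, assuming a transformers build that ships the LightOnOcr* classes; `page.png` is an illustrative path, not a file from this repo:

```python
# Standalone check of the LightOnOCR-2-1B load + generate path used in app.py.
import torch
from PIL import Image
from transformers import LightOnOcrForConditionalGeneration, LightOnOcrProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

processor = LightOnOcrProcessor.from_pretrained("lightonai/LightOnOCR-2-1B")
model = LightOnOcrForConditionalGeneration.from_pretrained(
    "lightonai/LightOnOCR-2-1B", torch_dtype=dtype
).to(device).eval()

# Prompt-free usage: the conversation carries only the image.
conversation = [{"role": "user", "content": [{"type": "image", "image": Image.open("page.png")}]}]
inputs = processor.apply_chat_template(
    conversation, add_generation_prompt=True, tokenize=True,
    return_dict=True, return_tensors="pt",
)
# Move tensors to the device, casting floating-point ones to the model dtype.
inputs = {
    k: v.to(device=device, dtype=dtype) if torch.is_tensor(v) and v.is_floating_point() else v.to(device)
    for k, v in inputs.items()
}

with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=512)
print(processor.decode(out[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True))
```
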
@@ -168,7 +168,7 @@ model_m = load_model_with_attention_fallback(
 
 MODEL_MAP = {
     "Nanonets-OCR2-3B": (processor_v, model_v),
-    "dots.OCR": (processor_y, model_y),
+    "LightOnOCR-2-1B": (processor_y, model_y),
     "olmOCR-7B-0725": (processor_w, model_w),
     "Qwen3-VL-4B-Instruct": (processor_m, model_m),
     "Qwen2-VL-OCR-2B": (processor_x, model_x),
@@ -184,7 +184,7 @@ PROMPTS = {
         "icon": "📄"
     },
     "MARKDOWN": {
-        "name": "Markdown Conversion",
+        "name": "Simple Markdown Conversion",
         "description": "Convert document to Markdown format",
         "prompt": "Convert this document to Markdown. Preserve headings, lists, and formatting.",
         "icon": "📝"
@@ -192,7 +192,7 @@ PROMPTS = {
     "MARKDOWN_OCR": {
         "name": "Markdown OCR",
         "description": "Perform OCR and convert to Markdown",
-        "prompt": "Perform OCR on this document and convert to Markdown. Preserve headings, lists, and formatting.",
+        "prompt": "Perform OCR including inside images and logos and convert to Markdown.",
         "icon": "🔍"
     },
     "TITLE_JSON": {
@@ -255,6 +255,30 @@ Return ONLY valid JSON with this exact structure:
 IMPORTANT: Return null for any field where information is NOT clearly visible.
 Return ONLY the JSON, no explanation.""",
         "icon": "📄"
+    },
+    "NUEXTRACT_SCHEMA_JSON": {
+        "name": "NuExtract Json Schema",
+        "description": "Strict data extraction following deterministic JSON schema",
+        "prompt": """{
+    "title": "verbatim-string",
+    "subtitle": "verbatim-string",
+    "author": "verbatim-string",
+    "degree_type": "verbatim-string",
+    "discipline": [["Mathématiques", "Physique", "Autres"]],
+    "granting_institution": ["verbatim-string"],
+    "doctoral_school": ["verbatim-string"],
+    "co_tutelle_institutions": ["verbatim-string"],
+    "partner_institutions": ["verbatim-string"],
+    "defense_year": "integer",
+    "thesis_advisor": ["verbatim-string"],
+    "co_advisors": ["verbatim-string"],
+    "jury_president": "verbatim-string",
+    "reviewers": ["verbatim-string"],
+    "other_jury_members": ["verbatim-string"],
+    "language": "verbatim-string",
+    "confidence": "float"
+}""",
+        "icon": "📄"
     }
 }
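
The NUEXTRACT_SCHEMA_JSON prompt doubles as a machine-readable template, since it is itself valid JSON. A hedged post-processing sketch (the `validate_extraction` helper is illustrative, not part of app.py):

```python
import json

def validate_extraction(raw_output: str, template: str) -> dict | None:
    """Coerce model output to the shape of the NuExtract template.

    `template` is the NUEXTRACT_SCHEMA_JSON prompt above (itself valid JSON).
    Returns None when the output is not a JSON object.
    """
    try:
        data = json.loads(raw_output)
    except json.JSONDecodeError:
        return None
    if not isinstance(data, dict):
        return None
    # Drop keys the model invented; fill missing keys with None so callers
    # always see the same structure.
    return {k: data.get(k) for k in json.loads(template)}

# Usage sketch: validate_extraction(output_text, PROMPTS["NUEXTRACT_SCHEMA_JSON"]["prompt"])
```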
 
@@ -264,7 +288,7 @@ image_examples = [
     {"query": PROMPTS["TITLE_JSON"]["prompt"], "image": "examples/ephesvt_theses_doc13.jpg", "model": "Qwen3-VL-4B-Instruct"},
     {"query": PROMPTS["LOCATED_TITLE_JSON"]["prompt"], "image": "examples/memoires_cridaf_doc07.jpg", "model": "Qwen2-VL-OCR-2B"},
     {"query": PROMPTS["GROUNDED_TITLE_JSON"]["prompt"], "image": "examples/thesefr_2015PA010690.png", "model": "Qwen2-VL-OCR-2B"},
-    {"query": PROMPTS["FULL_SCHEMA_JSON"]["prompt"], "image": "examples/thesefr_2015PA010690.png", "model": "dots.OCR"},
+    {"query": "", "image": "examples/thesefr_2015PA010690.png", "model": "LightOnOCR-2-1B"},
 ]
 
 
@@ -415,6 +439,10 @@ def align_inputs_to_model_dtype(inputs, model):
     return inputs
 
 
+def model_requires_text_prompt(model_name: str) -> bool:
+    return model_name != "LightOnOCR-2-1B"
+
+
 @spaces.GPU(duration=calc_timeout_duration)
 def generate_image(model_name, text, image, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_timeout):
     try:
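
The helper's contract, spelled out against the MODEL_MAP keys above:

```python
# Every model except LightOnOCR-2-1B still requires a text instruction.
assert model_requires_text_prompt("Qwen2-VL-OCR-2B")
assert not model_requires_text_prompt("LightOnOCR-2-1B")
```
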
@@ -425,7 +453,9 @@ def generate_image(model_name, text, image, max_new_tokens, temperature, top_p,
             yield "[ERROR] Please upload an image."
             return
         text = str(text or "").strip()
-        if not text:
+        if not model_requires_text_prompt(model_name):
+            text = ""
+        if model_requires_text_prompt(model_name) and not text:
             yield "[ERROR] Please enter your OCR/query instruction."
             return
         if len(str(text)) > MAX_INPUT_TOKEN_LENGTH * 8:
@@ -434,6 +464,30 @@ def generate_image(model_name, text, image, max_new_tokens, temperature, top_p,
 
         processor, model = select_model(model_name)
 
+        if model_name == "LightOnOCR-2-1B":
+            conversation = [{"role": "user", "content": [{"type": "image", "image": image}]}]
+            inputs = processor.apply_chat_template(
+                conversation,
+                add_generation_prompt=True,
+                tokenize=True,
+                return_dict=True,
+                return_tensors="pt",
+            )
+            inputs = {
+                k: v.to(device=device, dtype=LIGHTON_DTYPE) if torch.is_tensor(v) and v.is_floating_point() else v.to(device)
+                for k, v in inputs.items()
+            }
+
+            output_ids = model.generate(**inputs, max_new_tokens=int(max_new_tokens))
+            generated_ids = output_ids[0, inputs["input_ids"].shape[1]:]
+            output_text = processor.decode(generated_ids, skip_special_tokens=True)
+
+            if output_text.strip():
+                yield output_text
+            else:
+                yield "[ERROR] No output was generated."
+            return
+
         streamer = TextIteratorStreamer(
             processor.tokenizer if hasattr(processor, "tokenizer") else processor,
             skip_prompt=True,
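
Note that this branch yields the decoded text once, while the other models stream through TextIteratorStreamer below. If incremental output were wanted here too, a sketch along these lines could reuse the streamer pattern — assuming LightOnOcrProcessor exposes a `tokenizer` attribute like the app's other processors (an assumption the branch above deliberately avoids relying on):

```python
from threading import Thread

from transformers import TextIteratorStreamer

def stream_lighton_ocr(model, processor, inputs, max_new_tokens):
    """Streaming variant of the blocking generate() call above."""
    streamer = TextIteratorStreamer(
        processor.tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    Thread(
        target=model.generate,
        kwargs={**inputs, "max_new_tokens": int(max_new_tokens), "streamer": streamer},
    ).start()
    buffer = ""
    for chunk in streamer:
        buffer += chunk
        yield buffer  # Gradio re-renders the growing buffer on each yield.
```
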
@@ -724,6 +778,7 @@ footer{display:none!important}
   padding:10px 14px;font-family:'Inter',sans-serif;font-size:14px;color:#e4e4e7;
   resize:none;outline:none;min-height:100px;transition:border-color .2s;
 }
+.is-hidden{display:none!important}
 .modern-textarea:focus{border-color:#ADFF2F;box-shadow:0 0 0 3px rgba(173,255,47,.14)}
 .modern-textarea::placeholder{color:#3f3f46}
 .modern-textarea.error-flash{
@@ -883,6 +938,8 @@ function init() {
   const btnUpload = document.getElementById('preview-upload-btn');
   const btnClear = document.getElementById('preview-clear-btn');
   const promptInput = document.getElementById('custom-query-input');
+  const promptPanel = document.getElementById('prompt-panel');
+  const promptTabsBar = document.getElementById('prompt-tabs-bar');
   const runBtnEl = document.getElementById('custom-run-btn');
   const outputArea = document.getElementById('custom-output-textarea');
   const imgStatus = document.getElementById('sb-image-status');
@@ -982,7 +1039,8 @@ function init() {
     if (imgStatus) imgStatus.textContent = txt;
   }
   function syncPromptToGradio() {
-    setGradioValue('prompt-gradio-input', promptInput.value);
+    const activeModel = (document.querySelector('.model-tab.active') || {}).dataset?.model;
+    setGradioValue('prompt-gradio-input', activeModel === 'LightOnOCR-2-1B' ? '' : promptInput.value);
   }
   function syncModelToGradio(name) {
     setGradioValue('hidden-model-name', name);
@@ -1039,6 +1097,9 @@ function init() {
     document.querySelectorAll('.model-tab[data-model]').forEach(btn => {
       btn.classList.toggle('active', btn.getAttribute('data-model') === name);
     });
+    const hidePrompt = name === 'LightOnOCR-2-1B';
+    if (promptPanel) promptPanel.classList.toggle('is-hidden', hidePrompt);
+    if (promptTabsBar) promptTabsBar.classList.toggle('is-hidden', hidePrompt);
     syncModelToGradio(name);
     syncPromptToGradio();
   }
@@ -1105,7 +1166,9 @@ function init() {
   syncSlider('custom-gpu-duration', 'gradio-gpu-duration');
   function validateBeforeRun() {
     const promptVal = promptInput.value.trim();
-    if (!imageState && !promptVal) {
+    const currentModel = (document.querySelector('.model-tab.active') || {}).dataset?.model;
+    const requiresPrompt = currentModel !== 'LightOnOCR-2-1B';
+    if (!imageState && !promptVal && requiresPrompt) {
       showToast('Please upload an image and enter your OCR instruction', 'error');
       flashPromptError();
       return false;
@@ -1114,12 +1177,11 @@ function init() {
       showToast('Please upload an image', 'error');
       return false;
     }
-    if (!promptVal) {
+    if (requiresPrompt && !promptVal) {
       showToast('Please enter your OCR/query instruction', 'warning');
       flashPromptError();
       return false;
     }
-    const currentModel = (document.querySelector('.model-tab.active') || {}).dataset?.model;
     if (!currentModel) {
       showToast('Please select a model', 'error');
       return false;
@@ -1383,7 +1445,7 @@ with gr.Blocks() as demo:
     <div class="model-tabs-bar">
       {MODEL_TABS_HTML}
     </div>
-    <div class="model-tabs-bar">
+    <div id="prompt-tabs-bar" class="model-tabs-bar">
       {PROMPT_TABS_HTML}
     </div>
     <div class="app-main-row">
@@ -1420,7 +1482,7 @@ with gr.Blocks() as demo:
       </div>
     </div>
     <div class="app-main-right">
-      <div class="panel-card">
+      <div id="prompt-panel" class="panel-card">
         <div class="panel-card-title">OCR / Vision Instruction</div>
         <div class="panel-card-body">
           <label class="modern-label" for="custom-query-input">Query Input</label>
 