Spaces:

prithivMLmods
/

Multimodal-OCR3

Running on Zero

App Files Files Community

prithivMLmods committed on Oct 18, 2025

Commit

3396a9a

verified ·

1 Parent(s): da84e63

Update app.py

Browse files

Files changed (1) hide show

app.py +156 -147

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ import random
 import uuid
 import json
 import time
 from threading import Thread
 from typing import Iterable
@@ -10,22 +11,30 @@ import gradio as gr
 import spaces
 import torch
 import numpy as np
-from PIL import Image
 import cv2
 from transformers import (
-    Qwen2_5_VLForConditionalGeneration,
-    AutoModelForCausalLM, # Added for PaddleOCR-VL
     AutoProcessor,
     TextIteratorStreamer,
 )
 from transformers.image_utils import load_image
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
 # --- Theme and CSS Definition ---
-# Define the SteelBlue color palette
 colors.steel_blue = colors.Color(
     name="steel_blue",
     c50="#EBF3F8",
@@ -73,14 +82,8 @@ class SteelBlueTheme(Soft):
             button_primary_text_color_hover="white",
             button_primary_background_fill="linear-gradient(90deg, *secondary_500, *secondary_600)",
             button_primary_background_fill_hover="linear-gradient(90deg, *secondary_600, *secondary_700)",
-            button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_800)",
-            button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_500)",
-            button_secondary_text_color="black",
-            button_secondary_text_color_hover="white",
-            button_secondary_background_fill="linear-gradient(90deg, *primary_300, *primary_300)",
-            button_secondary_background_fill_hover="linear-gradient(90deg, *primary_400, *primary_400)",
-            button_secondary_background_fill_dark="linear-gradient(90deg, *primary_500, *primary_600)",
-            button_secondary_background_fill_hover_dark="linear-gradient(90deg, *primary_500, *primary_500)",
             slider_color="*secondary_500",
             slider_color_dark="*secondary_600",
             block_title_text_weight="600",
@@ -92,7 +95,6 @@ class SteelBlueTheme(Soft):
             block_label_background_fill="*primary_200",
         )
-# Instantiate the new theme
 steel_blue_theme = SteelBlueTheme()
 css = """
@@ -105,179 +107,186 @@ css = """
 """
 # Constants for text generation
-MAX_MAX_NEW_TOKENS = 4096
-DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
-print("torch.__version__ =", torch.__version__)
-print("torch.version.cuda =", torch.version.cuda)
-print("cuda available:", torch.cuda.is_available())
-print("cuda device count:", torch.cuda.device_count())
-if torch.cuda.is_available():
-    print("current device:", torch.cuda.current_device())
-    print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
-print("Using device:", device)
-# --- Model Loading ---
 # Load Nanonets-OCR2-3B
-MODEL_ID_V = "nanonets/Nanonets-OCR2-3B"
-processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
-model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_V,
     trust_remote_code=True,
-    torch_dtype=torch.float16
-).to(device).eval()
-# Load PaddleOCR-VL
-# Using the corrected model path from your previous attempt
-MODEL_ID_P = "strangervisionhf/paddle"
-processor_p = AutoProcessor.from_pretrained(MODEL_ID_P, trust_remote_code=True)
-model_p = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID_P,
     trust_remote_code=True,
-    torch_dtype=torch.float16,
-).to(device).eval()
-# --- Task Prompts for PaddleOCR-VL ---
-PROMPTS = {
-    "ocr": "OCR:",
-    "table": "Table Recognition:",
-    "chart": "Chart Recognition:",
-    "formula": "Formula Recognition:",
-}
 @spaces.GPU
-def generate_image(model_name: str, text: str, image: Image.Image,
-                   max_new_tokens: int, temperature: float, top_p: float,
-                   top_k: int, repetition_penalty: float):
-    """
-    Generates responses using the selected model for image input.
-    Yields raw text and Markdown-formatted text.
-    """
-    if image is None:
-        yield "Please upload an image.", "Please upload an image."
         return
-    if model_name == "Nanonets-OCR2-3B":
-        processor = processor_v
-        model = model_v
-        messages = [{
             "role": "user",
-            "content": [
-                {"type": "image"},
-                {"type": "text", "text": text},
             ]
-        }]
-        prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        inputs = processor(
-            text=[prompt_full],
-            images=[image],
-            return_tensors="pt",
-            padding=True).to(device)
-        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {
-            **inputs,
-            "streamer": streamer,
-            "max_new_tokens": max_new_tokens,
-            "do_sample": True,
-            "temperature": temperature,
-            "top_p": top_p,
-            "top_k": top_k,
-            "repetition_penalty": repetition_penalty,
         }
-        thread = Thread(target=model.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        for new_text in streamer:
-            buffer += new_text
-            buffer = buffer.replace("<|im_end|>", "")
-            time.sleep(0.01)
-            yield buffer, buffer
-    elif model_name == "PaddleOCR-VL":
-        processor = processor_p
-        model = model_p
-        # --- CORRECTED LOGIC FOR PADDLEOCR-VL ---
-        # It expects a simple string content, not a list of dicts.
-        # The user's input `text` should be one of the specific prompts.
-        messages = [{"role": "user", "content": text}]
-        prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        inputs = processor(text=[prompt_full], images=[image], return_tensors="pt").to(device)
-        generation_kwargs = {
-            **inputs,
-            "max_new_tokens": max_new_tokens,
-            "do_sample": False, # As per the reference script for best results
-            "use_cache": True,
-        }
-        with torch.inference_mode():
-            generated_ids = model.generate(**generation_kwargs)
-        resp = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-        # Extract only the model's answer, excluding the prompt
-        answer = resp.split(prompt_full)[-1].strip()
-        yield answer, answer
-    else:
-        yield "Invalid model selected.", "Invalid model selected."
-        return
-# Define examples for image inference, updated for both models
 image_examples = [
-    ["OCR:", "images/ocr.png"],
-    ["Table Recognition:", "images/4.png"],
-    ["Extract the content of this invoice.", "images/0.png"]
 ]
 # Create the Gradio Interface
 with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
-    gr.Markdown("# **Multimodal OCR**", elem_id="main-title")
     with gr.Row():
         with gr.Column(scale=2):
-            image_query = gr.Textbox(label="Query Input", placeholder="Enter query. For PaddleOCR, use 'OCR:', 'Table Recognition:', etc.")
-            image_upload = gr.Image(type="pil", label="Upload Image", height=290)
-            image_submit = gr.Button("Submit", variant="primary")
-            gr.Examples(
-                examples=image_examples,
-                inputs=[image_query, image_upload]
-            )
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
-                temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.7)
                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
-                repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.1)
         with gr.Column(scale=3):
-                gr.Markdown("## Output", elem_id="output-title")
-                output = gr.Textbox(label="Raw Output", interactive=False, lines=11, show_copy_button=True)
-                with gr.Accordion("(Result.md)", open=False):
-                    markdown_output = gr.Markdown(label="(Result.Md)")
-                model_choice = gr.Radio(
-                    choices=["Nanonets-OCR2-3B", "PaddleOCR-VL"],
-                    label="Select Model",
-                    value="Nanonets-OCR2-3B"
-                )
     image_submit.click(
         fn=generate_image,
         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-        outputs=[output, markdown_output]
     )
 if __name__ == "__main__":
-    demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)

 import uuid
 import json
 import time
+import asyncio
 from threading import Thread
 from typing import Iterable
 import spaces
 import torch
 import numpy as np
+from PIL import Image, ImageOps
 import cv2
+import requests
 from transformers import (
+    AutoTokenizer,
     AutoProcessor,
     TextIteratorStreamer,
 )
 from transformers.image_utils import load_image
+# The custom model class is imported via trust_remote_code=True
+from transformers import AutoModelForImageTextToText
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
+from docling_core.types.doc import DoclingDocument, DocTagsDocument
+import re
+import ast
+import html
 # --- Theme and CSS Definition ---
 colors.steel_blue = colors.Color(
     name="steel_blue",
     c50="#EBF3F8",
             button_primary_text_color_hover="white",
             button_primary_background_fill="linear-gradient(90deg, *secondary_500, *secondary_600)",
             button_primary_background_fill_hover="linear-gradient(90deg, *secondary_600, *secondary_700)",
+            button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_700)",
+            button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_600)",
             slider_color="*secondary_500",
             slider_color_dark="*secondary_600",
             block_title_text_weight="600",
             block_label_background_fill="*primary_200",
         )
 steel_blue_theme = SteelBlueTheme()
 css = """
 """
 # Constants for text generation
+# MAX_MAX_NEW_TOKENS / DEFAULT_MAX_NEW_TOKENS are the upper bound and the
+# initial value of the "Max new tokens" slider in the UI below.
+MAX_MAX_NEW_TOKENS = 5120
+DEFAULT_MAX_NEW_TOKENS = 3072
+# Overridable through the environment; defaults to 4096.
+# NOTE(review): not referenced anywhere in the visible part of this file.
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+# Check for CUDA availability
+# NOTE(review): `device` is not used in the visible code; both models below
+# are placed through device_map="auto" instead.
+device = "cuda" if torch.cuda.is_available() else "cpu"
 # Load Nanonets-OCR2-3B
+# Both checkpoints are loaded once at import time and switched to eval mode.
+# NOTE(review): attn_implementation="flash_attention_2" presumably requires
+# the flash-attn package and a supported GPU -- confirm for the deployment.
+MODEL_ID_3B = "nanonets/Nanonets-OCR2-3B"
+processor_3b = AutoProcessor.from_pretrained(MODEL_ID_3B, trust_remote_code=True)
+model_3b = AutoModelForImageTextToText.from_pretrained(
+    MODEL_ID_3B,
+    torch_dtype="auto",
+    device_map="auto",
     trust_remote_code=True,
+    attn_implementation="flash_attention_2"
+).eval()
+# Load Nanonets-OCR2-1.5B-exp
+# Same loading options as the 3B model; selected at request time by name.
+MODEL_ID_1_5B = "nanonets/Nanonets-OCR2-1.5B-exp"
+processor_1_5b = AutoProcessor.from_pretrained(MODEL_ID_1_5B, trust_remote_code=True)
+model_1_5b = AutoModelForImageTextToText.from_pretrained(
+    MODEL_ID_1_5B,
+    torch_dtype="auto",
+    device_map="auto",
     trust_remote_code=True,
+    attn_implementation="flash_attention_2"
+).eval()
+def downsample_video(video_path, num_frames=10):
+    """Sample evenly spaced frames from a video.
+
+    Args:
+        video_path: Path (or other source) accepted by ``cv2.VideoCapture``.
+        num_frames: Maximum number of frames to sample. Kept small by
+            default to avoid overwhelming the model.
+
+    Returns:
+        A list of ``(PIL.Image, timestamp_seconds)`` tuples in playback
+        order. Frames that fail to decode are skipped, so the list may be
+        shorter than ``num_frames`` and is empty when the video reports no
+        frames. The timestamp is ``0.0`` when the container reports no
+        usable frame rate.
+    """
+    vidcap = cv2.VideoCapture(video_path)
+    frames = []
+    try:
+        total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
+        fps = vidcap.get(cv2.CAP_PROP_FPS)
+        # An unreadable file reports a non-positive frame count; bail out
+        # before np.linspace is asked for a negative number of samples.
+        if total_frames <= 0 or num_frames <= 0:
+            return frames
+        frame_indices = np.linspace(0, total_frames - 1, min(total_frames, num_frames), dtype=int)
+        for i in frame_indices:
+            vidcap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
+            success, image = vidcap.read()
+            if not success:
+                continue
+            # OpenCV decodes to BGR; PIL expects RGB.
+            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+            pil_image = Image.fromarray(image)
+            # Guard against a zero/NaN frame rate instead of producing inf/nan.
+            timestamp = round(int(i) / fps, 2) if fps > 0 else 0.0
+            frames.append((pil_image, timestamp))
+    finally:
+        # Always free the capture handle, even if decoding raised.
+        vidcap.release()
+    return frames
 @spaces.GPU
+def generate(model_name: str, text: str, media_input, media_type: str,
+             max_new_tokens: int = 1024,
+             temperature: float = 0.6,
+             top_p: float = 0.9,
+             top_k: int = 50,
+             repetition_penalty: float = 1.2):
+    """Stream a model answer for a text query about an image or a video.
+
+    Args:
+        model_name: "Nanonets-OCR2-3B" or "Nanonets-OCR2-1.5B-exp".
+        text: The user query, placed after the image placeholders.
+        media_input: A single image when ``media_type`` is "image", or a
+            value accepted by ``downsample_video`` when it is "video".
+        media_type: Either "image" or "video".
+        max_new_tokens: Upper bound on the number of generated tokens.
+        temperature: Sampling temperature.
+        top_p: Nucleus-sampling probability mass.
+        top_k: Number of highest-probability tokens kept when sampling.
+        repetition_penalty: Penalty applied to already generated tokens.
+
+    Yields:
+        ``(raw_text, markdown_text)`` tuples holding all text generated so
+        far; both elements are the same string. Invalid input yields a
+        single pair carrying an error message instead.
+    """
+    if model_name == "Nanonets-OCR2-3B":
+        processor, model = processor_3b, model_3b
+    elif model_name == "Nanonets-OCR2-1.5B-exp":
+        processor, model = processor_1_5b, model_1_5b
+    else:
+        yield "Invalid model selected.", "Invalid model selected."
         return
+    if media_input is None:
+        # Pick the article so the message reads "an image" / "a video".
+        article = "an" if media_type == "image" else "a"
+        message = f"Please upload {article} {media_type}."
+        yield message, message
+        return
+    if media_type == "image":
+        images = [media_input]
+    elif media_type == "video":
+        images = [frame for frame, _ in downsample_video(media_input)]
+        if not images:
+            # Nothing could be decoded; an empty image list would only fail
+            # later inside the processor with a less helpful error.
+            message = "Could not read any frames from the video."
+            yield message, message
+            return
+    else:
+        yield "Invalid media type.", "Invalid media type."
+        return
+    # One image placeholder per frame, followed by the user's query.
+    messages = [
+        {
             "role": "user",
+            "content": [{"type": "image"} for _ in images] + [
+                {"type": "text", "text": text}
             ]
         }
+    ]
+    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+    # device_map="auto" places the model weights only; the processor still
+    # returns CPU tensors, so move them to the model's device explicitly.
+    inputs = processor(text=prompt, images=images, return_tensors="pt").to(model.device)
+    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = {
+        **inputs,
+        "streamer": streamer,
+        "max_new_tokens": max_new_tokens,
+        # Enable sampling explicitly so the temperature / top-p / top-k
+        # controls are honoured regardless of the model's default config.
+        "do_sample": True,
+        "temperature": temperature,
+        "top_p": top_p,
+        "top_k": top_k,
+        "repetition_penalty": repetition_penalty,
+    }
+    # model.generate blocks until it finishes, so it runs in a worker thread
+    # while this generator drains the streamer.
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    buffer = ""
+    for new_text in streamer:
+        # Strip the end marker from the accumulated text so it is removed
+        # even when it arrives split across two chunks.
+        buffer = (buffer + new_text).replace("<|im_end|>", "")
+        yield buffer, buffer
+    thread.join()
+# Wrapper functions for Gradio clarity
+def generate_image(*args):
+    """Stream ``generate`` output for an image request.
+
+    ``args`` is ``(model_name, text, image, *sampling_controls)`` in the
+    order the click handler supplies its inputs.
+    """
+    model_name, text, media_input, *sampling = args
+    # media_type is the 4th positional parameter of generate(); passing it in
+    # place keeps the sampling controls aligned with their parameters and
+    # avoids supplying media_input twice (positionally and by keyword).
+    yield from generate(model_name, text, media_input, "image", *sampling)
+def generate_video(*args):
+    """Stream ``generate`` output for a video request.
+
+    ``args`` is ``(model_name, text, video, *sampling_controls)`` in the
+    order the click handler supplies its inputs.
+    """
+    model_name, text, media_input, *sampling = args
+    yield from generate(model_name, text, media_input, "video", *sampling)
+# Define examples for image and video inference
+# Each entry is [query text, media path], matching the order of the
+# [query, upload] inputs the examples are bound to in the UI.
 image_examples = [
+    ["Reconstruct the doc [table] as it is.", "images/0.png"],
+    ["Describe the image!", "images/8.png"],
+    ["OCR the image", "images/2.jpg"],
+    ["Convert this page to docling", "images/1.png"],
+    ["Convert this page to docling", "images/3.png"],
+    ["Convert chart to OTSL.", "images/4.png"],
+    ["Convert code to text", "images/5.jpg"],
+    ["Convert this table to OTSL.", "images/6.jpg"],
+    ["Convert formula to latex.", "images/7.jpg"],
 ]
+video_examples = [
+    ["Explain the video in detail.", "videos/1.mp4"],
+    ["Explain the video in detail.", "videos/2.mp4"]
+]
 # Create the Gradio Interface
+# Layout: a left column with the image/video input tabs and the shared
+# sampling controls, and a right column with the outputs and model selector.
 with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
+    gr.Markdown("# **Multimodal OCR3**", elem_id="main-title")
     with gr.Row():
         with gr.Column(scale=2):
+            with gr.Tabs():
+                with gr.TabItem("Image Inference"):
+                    image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
+                    # type="pil" asks Gradio to hand the upload over as a PIL image.
+                    image_upload = gr.Image(type="pil", label="Upload Image", height=290)
+                    image_submit = gr.Button("Submit", variant="primary")
+                    gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
+                with gr.TabItem("Video Inference"):
+                    video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
+                    # NOTE(review): the "<= 30s" limit is only stated in the label;
+                    # nothing in the visible code enforces it.
+                    video_upload = gr.Video(label="Upload Video (<= 30s)", height=290)
+                    video_submit = gr.Button("Submit", variant="primary")
+                    gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
+            # Sampling controls shared by both tabs.
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
+                temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
+                repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
         with gr.Column(scale=3):
+            gr.Markdown("## Output", elem_id="output-title")
+            # The same streamed text is shown twice: verbatim and rendered as Markdown.
+            raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=11, show_copy_button=True)
+            with gr.Accordion("(Result.md)", open=True):
+                formatted_output = gr.Markdown(label="(Result.md)")
+            # Choices must match the model names checked inside generate().
+            model_choice = gr.Radio(
+                choices=["Nanonets-OCR2-3B", "Nanonets-OCR2-1.5B-exp"],
+                label="Select Model",
+                value="Nanonets-OCR2-3B"
+            )
+    # Both buttons pass (model, query, media, sampling controls...) in this
+    # order to their callback and stream into the same two output widgets.
     image_submit.click(
         fn=generate_image,
         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+        outputs=[raw_output, formatted_output]
+    )
+    video_submit.click(
+        fn=generate_video,
+        inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+        outputs=[raw_output, formatted_output]
     )
 if __name__ == "__main__":
+    # Queue up to 50 pending requests and surface errors in the UI.
+    demo.queue(max_size=50).launch(ssr_mode=False, show_error=True)