testingLightOnOCR2cpu

Paused

App Files Files Community

Bapt120 committed on Jan 19

Commit

a86e158

verified ·

1 Parent(s): d40348f

Update app.py

Browse files

Files changed (1) hide show

app.py +179 -82

app.py CHANGED Viewed

@@ -1,24 +1,28 @@
 #!/usr/bin/env python3
 import os
 import subprocess
 import sys
 import threading
 import spaces
 import torch
-import gradio as gr
 from PIL import Image
-from io import BytesIO
-import pypdfium2 as pdfium
 from transformers import (
     LightOnOcrForConditionalGeneration,
     LightOnOcrProcessor,
     TextIteratorStreamer,
 )
-import re
-import base64
-from collections import OrderedDict
 # Model Registry with all supported models
 MODEL_REGISTRY = {
@@ -26,11 +30,13 @@ MODEL_REGISTRY = {
         "model_id": "lightonai/LightOnOCR-2-1B",
         "has_bbox": False,
         "description": "Best overall OCR performance",
     },
     "LightOnOCR-2-1B-bbox (Best Bbox)": {
         "model_id": "lightonai/LightOnOCR-2-1B-bbox",
         "has_bbox": True,
         "description": "Best bounding box detection",
     },
     "LightOnOCR-2-1B-base": {
         "model_id": "lightonai/LightOnOCR-2-1B-base",
@@ -102,18 +108,20 @@ class ModelManager:
         # Load new model
         print(f"Loading model: {model_name} ({model_id})...")
         hf_token = os.environ.get("HF_TOKEN")
-        model = LightOnOcrForConditionalGeneration.from_pretrained(
-            model_id,
-            attn_implementation=attn_implementation,
-            torch_dtype=dtype,
-            trust_remote_code=True,
-            token=hf_token
-        ).to(device).eval()
         processor = LightOnOcrProcessor.from_pretrained(
-            model_id,
-            trust_remote_code=True,
-            token=hf_token
         )
         # Add to cache
@@ -147,10 +155,10 @@ def process_pdf(pdf_path, page_num=1):
     pdf = pdfium.PdfDocument(pdf_path)
     total_pages = len(pdf)
     page_idx = min(max(int(page_num) - 1, 0), total_pages - 1)
     page = pdf[page_idx]
     img = render_pdf_page(page)
     pdf.close()
     return img, total_pages, page_idx + 1
@@ -159,31 +167,31 @@ def clean_output_text(text):
     """Remove chat template artifacts from output."""
     # Remove common chat template markers
     markers_to_remove = ["system", "user", "assistant"]
     # Split by lines and filter
-    lines = text.split('\n')
     cleaned_lines = []
     for line in lines:
         stripped = line.strip()
         # Skip lines that are just template markers
         if stripped.lower() not in markers_to_remove:
             cleaned_lines.append(line)
     # Join back and strip leading/trailing whitespace
-    cleaned = '\n'.join(cleaned_lines).strip()
     # Alternative approach: if there's an "assistant" marker, take everything after it
     if "assistant" in text.lower():
         parts = text.split("assistant", 1)
         if len(parts) > 1:
             cleaned = parts[1].strip()
     return cleaned
 # Bbox parsing pattern: ![image](image_N.png)x1,y1,x2,y2 (optional whitespace before the coordinates)
-BBOX_PATTERN = r'!\[image\]\((image_\d+\.png)\)\s*(\d+),(\d+),(\d+),(\d+)'
 def parse_bbox_output(text):
@@ -191,12 +199,11 @@ def parse_bbox_output(text):
     detections = []
     for match in re.finditer(BBOX_PATTERN, text):
         image_ref, x1, y1, x2, y2 = match.groups()
-        detections.append({
-            "ref": image_ref,
-            "coords": (int(x1), int(y1), int(x2), int(y2))
-        })
     # Clean text: remove coordinates, keep markdown image refs
-    cleaned = re.sub(BBOX_PATTERN, r'![image](\1)', text)
     return cleaned, detections
@@ -226,6 +233,71 @@ def image_to_data_uri(image):
     return f"data:image/png;base64,{b64}"
 def render_bbox_with_crops(raw_output, source_image):
     """Replace markdown image placeholders with actual cropped images."""
     cleaned, detections = parse_bbox_output(raw_output)
@@ -236,8 +308,7 @@ def render_bbox_with_crops(raw_output, source_image):
             data_uri = image_to_data_uri(cropped)
             # Replace ![image](image_N.png) with ![Cropped region](data:...)
             cleaned = cleaned.replace(
-                f"![image]({bbox['ref']})",
-                f"![Cropped region]({data_uri})"
             )
         except Exception as e:
             print(f"Error cropping bbox {bbox}: {e}")
@@ -250,6 +321,13 @@ def render_bbox_with_crops(raw_output, source_image):
 @spaces.GPU
 def extract_text_from_image(image, model_name, temperature=0.2, stream=False):
     """Extract text from image using LightOnOCR model."""
     # Get model and processor from cache or load
     model, processor = model_manager.get_model(model_name)
@@ -269,13 +347,16 @@ def extract_text_from_image(image, model_name, temperature=0.2, stream=False):
         add_generation_prompt=True,
         tokenize=True,
         return_dict=True,
-        return_tensors="pt"
     )
     # Move tensors to the device; cast only floating-point tensors to the model dtype
     inputs = {
-        k: v.to(device=device, dtype=dtype) if isinstance(v, torch.Tensor) and v.dtype in [torch.float32, torch.float16, torch.bfloat16]
-        else v.to(device) if isinstance(v, torch.Tensor)
         else v
         for k, v in inputs.items()
     }
@@ -293,9 +374,7 @@ def extract_text_from_image(image, model_name, temperature=0.2, stream=False):
     if stream:
         # Setup streamer for streaming generation
         streamer = TextIteratorStreamer(
-            processor.tokenizer,
-            skip_prompt=True,
-            skip_special_tokens=True
         )
         generation_kwargs["streamer"] = streamer
@@ -338,9 +417,11 @@ def process_input(file_input, model_name, temperature, page_num, enable_streamin
     file_path = file_input if isinstance(file_input, str) else file_input.name
     # Handle PDF files
-    if file_path.lower().endswith('.pdf'):
         try:
-            image_to_process, total_pages, actual_page = process_pdf(file_path, int(page_num))
             page_info = f"Processing page {actual_page} of {total_pages}"
         except Exception as e:
             yield f"Error processing PDF: {str(e)}", "", "", None, gr.update()
@@ -360,13 +441,21 @@ def process_input(file_input, model_name, temperature, page_num, enable_streamin
     try:
         # Extract text using LightOnOCR with optional streaming
-        for extracted_text in extract_text_from_image(image_to_process, model_name, temperature, stream=enable_streaming):
             # For bbox models, render cropped images inline
             if has_bbox:
                 rendered_text = render_bbox_with_crops(extracted_text, image_to_process)
             else:
                 rendered_text = extracted_text
-            yield rendered_text, extracted_text, page_info, image_to_process, gr.update()
     except Exception as e:
         error_msg = f"Error during text extraction: {str(e)}"
@@ -377,10 +466,10 @@ def update_slider(file_input):
     """Update page slider based on PDF page count."""
     if file_input is None:
         return gr.update(maximum=20, value=1)
     file_path = file_input if isinstance(file_input, str) else file_input.name
-    if file_path.lower().endswith('.pdf'):
         try:
             pdf = pdfium.PdfDocument(file_path)
             total_pages = len(pdf)
@@ -396,7 +485,11 @@ def update_slider(file_input):
 def get_model_info_text(model_name):
     """Return formatted model info string."""
     info = MODEL_REGISTRY.get(model_name, {})
-    has_bbox = "Yes - will show cropped regions inline" if info.get("has_bbox", False) else "No"
     return f"**Description:** {info.get('description', 'N/A')}\n**Bounding Box Detection:** {has_bbox}"
@@ -415,29 +508,25 @@ with gr.Blocks(title="LightOnOCR-2 Multi-Model OCR") as demo:
 **Device:** {device.upper()} | **Attention:** {attn_implementation}
 """)
     with gr.Row():
         with gr.Column(scale=1):
             model_selector = gr.Dropdown(
                 choices=list(MODEL_REGISTRY.keys()),
                 value=DEFAULT_MODEL,
                 label="Model",
-                info="Select OCR model variant"
             )
             model_info = gr.Markdown(
-                value=get_model_info_text(DEFAULT_MODEL),
-                label="Model Info"
             )
             file_input = gr.File(
                 label="Upload Image or PDF",
                 file_types=[".pdf", ".png", ".jpg", ".jpeg"],
-                type="filepath"
             )
             rendered_image = gr.Image(
-                label="Preview",
-                type="pil",
-                height=400,
-                interactive=False
             )
             num_pages = gr.Slider(
                 minimum=1,
@@ -445,68 +534,76 @@ with gr.Blocks(title="LightOnOCR-2 Multi-Model OCR") as demo:
                 value=1,
                 step=1,
                 label="PDF: Page Number",
-                info="Select which page to extract"
-            )
-            page_info = gr.Textbox(
-                label="Processing Info",
-                value="",
-                interactive=False
             )
             temperature = gr.Slider(
                 minimum=0.0,
                 maximum=1.0,
                 value=0.2,
                 step=0.05,
                 label="Temperature",
-                info="0.0 = deterministic, Higher = more varied"
             )
             enable_streaming = gr.Checkbox(
                 label="Enable Streaming",
                 value=True,
-                info="Show text progressively as it's generated"
             )
             submit_btn = gr.Button("Extract Text", variant="primary")
             clear_btn = gr.Button("Clear", variant="secondary")
         with gr.Column(scale=2):
             output_text = gr.Markdown(
                 label="📄 Extracted Text (Rendered)",
-                value="*Extracted text will appear here...*"
             )
     with gr.Row():
         with gr.Column():
             raw_output = gr.Textbox(
                 label="Raw Markdown Output",
                 placeholder="Raw text will appear here...",
                 lines=20,
-                max_lines=30
             )
     # Event handlers
     submit_btn.click(
         fn=process_input,
         inputs=[file_input, model_selector, temperature, num_pages, enable_streaming],
-        outputs=[output_text, raw_output, page_info, rendered_image, num_pages]
     )
-    file_input.change(
-        fn=update_slider,
-        inputs=[file_input],
-        outputs=[num_pages]
-    )
     model_selector.change(
-        fn=get_model_info_text,
-        inputs=[model_selector],
-        outputs=[model_info]
     )
     clear_btn.click(
-        fn=lambda: (None, DEFAULT_MODEL, get_model_info_text(DEFAULT_MODEL), "*Extracted text will appear here...*", "", "", None, 1),
-        outputs=[file_input, model_selector, model_info, output_text, raw_output, page_info, rendered_image, num_pages]
     )
 if __name__ == "__main__":
-    demo.launch(theme=gr.themes.Soft())

 #!/usr/bin/env python3
+import base64
 import os
+import re
 import subprocess
 import sys
 import threading
+from collections import OrderedDict
+from io import BytesIO
+import gradio as gr
+import pypdfium2 as pdfium
 import spaces
 import torch
+from openai import OpenAI
 from PIL import Image
 from transformers import (
     LightOnOcrForConditionalGeneration,
     LightOnOcrProcessor,
     TextIteratorStreamer,
 )
+# vLLM endpoint configuration from environment variables
+VLLM_ENDPOINT_OCR = os.environ.get("VLLM_ENDPOINT_OCR")
+VLLM_ENDPOINT_BBOX = os.environ.get("VLLM_ENDPOINT_BBOX")
 # Model Registry with all supported models
 MODEL_REGISTRY = {
         "model_id": "lightonai/LightOnOCR-2-1B",
         "has_bbox": False,
         "description": "Best overall OCR performance",
+        "vllm_endpoint": VLLM_ENDPOINT_OCR,
     },
     "LightOnOCR-2-1B-bbox (Best Bbox)": {
         "model_id": "lightonai/LightOnOCR-2-1B-bbox",
         "has_bbox": True,
         "description": "Best bounding box detection",
+        "vllm_endpoint": VLLM_ENDPOINT_BBOX,
     },
     "LightOnOCR-2-1B-base": {
         "model_id": "lightonai/LightOnOCR-2-1B-base",
         # Load new model
         print(f"Loading model: {model_name} ({model_id})...")
         hf_token = os.environ.get("HF_TOKEN")
+        model = (
+            LightOnOcrForConditionalGeneration.from_pretrained(
+                model_id,
+                attn_implementation=attn_implementation,
+                torch_dtype=dtype,
+                trust_remote_code=True,
+                token=hf_token,
+            )
+            .to(device)
+            .eval()
+        )
         processor = LightOnOcrProcessor.from_pretrained(
+            model_id, trust_remote_code=True, token=hf_token
         )
         # Add to cache
     pdf = pdfium.PdfDocument(pdf_path)
     total_pages = len(pdf)
     page_idx = min(max(int(page_num) - 1, 0), total_pages - 1)
     page = pdf[page_idx]
     img = render_pdf_page(page)
     pdf.close()
     return img, total_pages, page_idx + 1
     """Remove chat template artifacts from output."""
     # Remove common chat template markers
     markers_to_remove = ["system", "user", "assistant"]
     # Split by lines and filter
+    lines = text.split("\n")
     cleaned_lines = []
     for line in lines:
         stripped = line.strip()
         # Skip lines that are just template markers
         if stripped.lower() not in markers_to_remove:
             cleaned_lines.append(line)
     # Join back and strip leading/trailing whitespace
+    cleaned = "\n".join(cleaned_lines).strip()
     # Alternative approach: if there's an "assistant" marker, take everything after it
     if "assistant" in text.lower():
         parts = text.split("assistant", 1)
         if len(parts) > 1:
             cleaned = parts[1].strip()
     return cleaned
 # Bbox parsing pattern: ![image](image_N.png)x1,y1,x2,y2 (optional whitespace before the coordinates)
+BBOX_PATTERN = r"!\[image\]\((image_\d+\.png)\)\s*(\d+),(\d+),(\d+),(\d+)"
 def parse_bbox_output(text):
     detections = []
     for match in re.finditer(BBOX_PATTERN, text):
         image_ref, x1, y1, x2, y2 = match.groups()
+        detections.append(
+            {"ref": image_ref, "coords": (int(x1), int(y1), int(x2), int(y2))}
+        )
     # Clean text: remove coordinates, keep markdown image refs
+    cleaned = re.sub(BBOX_PATTERN, r"![image](\1)", text)
     return cleaned, detections
     return f"data:image/png;base64,{b64}"
+def extract_text_via_vllm(image, model_name, temperature=0.2, stream=False):
+    """Extract text from image using vLLM endpoint."""
+    config = MODEL_REGISTRY.get(model_name)
+    if config is None:
+        raise ValueError(f"Unknown model: {model_name}")
+    endpoint = config.get("vllm_endpoint")
+    if endpoint is None:
+        raise ValueError(f"Model {model_name} does not have a vLLM endpoint")
+    model_id = config["model_id"]
+    # Convert image to base64 data URI
+    if isinstance(image, Image.Image):
+        image_uri = image_to_data_uri(image)
+    else:
+        # Assume it's already a data URI or URL
+        image_uri = image
+    # Create OpenAI client pointing to vLLM endpoint
+    client = OpenAI(base_url=endpoint, api_key="not-needed")
+    # Prepare the message with image
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image_url", "image_url": {"url": image_uri}},
+            ],
+        }
+    ]
+    if stream:
+        # Streaming response
+        response = client.chat.completions.create(
+            model=model_id,
+            messages=messages,
+            max_tokens=2048,
+            temperature=temperature if temperature > 0 else 0.0,
+            top_p=0.9,
+            stream=True,
+        )
+        full_text = ""
+        for chunk in response:
+            if chunk.choices and chunk.choices[0].delta.content:
+                full_text += chunk.choices[0].delta.content
+                cleaned_text = clean_output_text(full_text)
+                yield cleaned_text
+    else:
+        # Non-streaming response
+        response = client.chat.completions.create(
+            model=model_id,
+            messages=messages,
+            max_tokens=2048,
+            temperature=temperature if temperature > 0 else 0.0,
+            top_p=0.9,
+            stream=False,
+        )
+        output_text = response.choices[0].message.content
+        cleaned_text = clean_output_text(output_text)
+        yield cleaned_text
 def render_bbox_with_crops(raw_output, source_image):
     """Replace markdown image placeholders with actual cropped images."""
     cleaned, detections = parse_bbox_output(raw_output)
             data_uri = image_to_data_uri(cropped)
             # Replace ![image](image_N.png) with ![Cropped region](data:...)
             cleaned = cleaned.replace(
+                f"![image]({bbox['ref']})", f"![Cropped region]({data_uri})"
             )
         except Exception as e:
             print(f"Error cropping bbox {bbox}: {e}")
 @spaces.GPU
 def extract_text_from_image(image, model_name, temperature=0.2, stream=False):
     """Extract text from image using LightOnOCR model."""
+    # Check if model has a vLLM endpoint configured
+    config = MODEL_REGISTRY.get(model_name, {})
+    if config.get("vllm_endpoint"):
+        # Use vLLM endpoint instead of local model
+        yield from extract_text_via_vllm(image, model_name, temperature, stream)
+        return
     # Get model and processor from cache or load
     model, processor = model_manager.get_model(model_name)
         add_generation_prompt=True,
         tokenize=True,
         return_dict=True,
+        return_tensors="pt",
     )
     # Move tensors to the device; cast only floating-point tensors to the model dtype
     inputs = {
+        k: v.to(device=device, dtype=dtype)
+        if isinstance(v, torch.Tensor)
+        and v.dtype in [torch.float32, torch.float16, torch.bfloat16]
+        else v.to(device)
+        if isinstance(v, torch.Tensor)
         else v
         for k, v in inputs.items()
     }
     if stream:
         # Setup streamer for streaming generation
         streamer = TextIteratorStreamer(
+            processor.tokenizer, skip_prompt=True, skip_special_tokens=True
         )
         generation_kwargs["streamer"] = streamer
     file_path = file_input if isinstance(file_input, str) else file_input.name
     # Handle PDF files
+    if file_path.lower().endswith(".pdf"):
         try:
+            image_to_process, total_pages, actual_page = process_pdf(
+                file_path, int(page_num)
+            )
             page_info = f"Processing page {actual_page} of {total_pages}"
         except Exception as e:
             yield f"Error processing PDF: {str(e)}", "", "", None, gr.update()
     try:
         # Extract text using LightOnOCR with optional streaming
+        for extracted_text in extract_text_from_image(
+            image_to_process, model_name, temperature, stream=enable_streaming
+        ):
             # For bbox models, render cropped images inline
             if has_bbox:
                 rendered_text = render_bbox_with_crops(extracted_text, image_to_process)
             else:
                 rendered_text = extracted_text
+            yield (
+                rendered_text,
+                extracted_text,
+                page_info,
+                image_to_process,
+                gr.update(),
+            )
     except Exception as e:
         error_msg = f"Error during text extraction: {str(e)}"
     """Update page slider based on PDF page count."""
     if file_input is None:
         return gr.update(maximum=20, value=1)
     file_path = file_input if isinstance(file_input, str) else file_input.name
+    if file_path.lower().endswith(".pdf"):
         try:
             pdf = pdfium.PdfDocument(file_path)
             total_pages = len(pdf)
 def get_model_info_text(model_name):
     """Return formatted model info string."""
     info = MODEL_REGISTRY.get(model_name, {})
+    has_bbox = (
+        "Yes - will show cropped regions inline"
+        if info.get("has_bbox", False)
+        else "No"
+    )
     return f"**Description:** {info.get('description', 'N/A')}\n**Bounding Box Detection:** {has_bbox}"
 **Device:** {device.upper()} | **Attention:** {attn_implementation}
 """)
     with gr.Row():
         with gr.Column(scale=1):
             model_selector = gr.Dropdown(
                 choices=list(MODEL_REGISTRY.keys()),
                 value=DEFAULT_MODEL,
                 label="Model",
+                info="Select OCR model variant",
             )
             model_info = gr.Markdown(
+                value=get_model_info_text(DEFAULT_MODEL), label="Model Info"
             )
             file_input = gr.File(
                 label="Upload Image or PDF",
                 file_types=[".pdf", ".png", ".jpg", ".jpeg"],
+                type="filepath",
             )
             rendered_image = gr.Image(
+                label="Preview", type="pil", height=400, interactive=False
             )
             num_pages = gr.Slider(
                 minimum=1,
                 value=1,
                 step=1,
                 label="PDF: Page Number",
+                info="Select which page to extract",
             )
+            page_info = gr.Textbox(label="Processing Info", value="", interactive=False)
             temperature = gr.Slider(
                 minimum=0.0,
                 maximum=1.0,
                 value=0.2,
                 step=0.05,
                 label="Temperature",
+                info="0.0 = deterministic, Higher = more varied",
             )
             enable_streaming = gr.Checkbox(
                 label="Enable Streaming",
                 value=True,
+                info="Show text progressively as it's generated",
             )
             submit_btn = gr.Button("Extract Text", variant="primary")
             clear_btn = gr.Button("Clear", variant="secondary")
         with gr.Column(scale=2):
             output_text = gr.Markdown(
                 label="📄 Extracted Text (Rendered)",
+                value="*Extracted text will appear here...*",
             )
     with gr.Row():
         with gr.Column():
             raw_output = gr.Textbox(
                 label="Raw Markdown Output",
                 placeholder="Raw text will appear here...",
                 lines=20,
+                max_lines=30,
             )
     # Event handlers
     submit_btn.click(
         fn=process_input,
         inputs=[file_input, model_selector, temperature, num_pages, enable_streaming],
+        outputs=[output_text, raw_output, page_info, rendered_image, num_pages],
     )
+    file_input.change(fn=update_slider, inputs=[file_input], outputs=[num_pages])
     model_selector.change(
+        fn=get_model_info_text, inputs=[model_selector], outputs=[model_info]
     )
     clear_btn.click(
+        fn=lambda: (
+            None,
+            DEFAULT_MODEL,
+            get_model_info_text(DEFAULT_MODEL),
+            "*Extracted text will appear here...*",
+            "",
+            "",
+            None,
+            1,
+        ),
+        outputs=[
+            file_input,
+            model_selector,
+            model_info,
+            output_text,
+            raw_output,
+            page_info,
+            rendered_image,
+            num_pages,
+        ],
     )
 if __name__ == "__main__":
+    demo.launch(theme=gr.themes.Soft())