Update app.py
app.py
CHANGED
@@ -1,29 +1,51 @@
 #!/usr/bin/env python3
-import base64
-import json
-import os
-import requests
+import subprocess
+import sys
+import threading
+
+import spaces
+import torch
+
 import gradio as gr
 from PIL import Image
 from io import BytesIO
 import pypdfium2 as pdfium
+from transformers import (
+    LightOnOCRForConditionalGeneration,
+    LightOnOCRProcessor,
+    TextIteratorStreamer,
+)

-MODEL = os.environ.get("VLLM_MODEL")
+device = "cuda" if torch.cuda.is_available() else "cpu"

+# Choose best attention implementation based on device
+if device == "cuda":
+    attn_implementation = "sdpa"
+    dtype = torch.bfloat16
+    print("Using sdpa for GPU")
+else:
+    attn_implementation = "eager"  # Best for CPU
+    dtype = torch.float32
+    print("Using eager attention for CPU")

+# Initialize the LightOnOCR model and processor
+print(f"Loading model on {device} with {attn_implementation} attention...")
+model = LightOnOCRForConditionalGeneration.from_pretrained(
+    "lightonai/LightOnOCR-1B-1025",
+    attn_implementation=attn_implementation,
+    torch_dtype=dtype,
+    trust_remote_code=True
+).to(device).eval()

-def image_to_base64(image):
-    buffered = BytesIO()
-    image.save(buffered, format="PNG")
-    return base64.b64encode(buffered.getvalue()).decode("utf-8")
+processor = LightOnOCRProcessor.from_pretrained(
+    "lightonai/LightOnOCR-1B-1025",
+    trust_remote_code=True
+)
+print("Model loaded successfully!")


 def render_pdf_page(page, max_resolution=1540, scale=2.77):
+    """Render a PDF page to PIL Image."""
     width, height = page.get_size()
     pixel_width = width * scale
     pixel_height = height * scale
@@ -33,6 +55,7 @@ def render_pdf_page(page, max_resolution=1540, scale=2.77):


 def process_pdf(pdf_path, page_num=1):
+    """Extract a specific page from PDF."""
     pdf = pdfium.PdfDocument(pdf_path)
     total_pages = len(pdf)
     page_idx = min(max(int(page_num) - 1, 0), total_pages - 1)
@@ -44,7 +67,109 @@ def process_pdf(pdf_path, page_num=1):
     return img, total_pages, page_idx + 1


+def clean_output_text(text):
+    """Remove chat template artifacts from output."""
+    # Remove common chat template markers
+    markers_to_remove = ["system", "user", "assistant"]
+
+    # Split by lines and filter
+    lines = text.split('\n')
+    cleaned_lines = []
+
+    for line in lines:
+        stripped = line.strip()
+        # Skip lines that are just template markers
+        if stripped.lower() not in markers_to_remove:
+            cleaned_lines.append(line)
+
+    # Join back and strip leading/trailing whitespace
+    cleaned = '\n'.join(cleaned_lines).strip()
+
+    # Alternative approach: if there's an "assistant" marker, take everything after it
+    if "assistant" in text.lower():
+        parts = text.split("assistant", 1)
+        if len(parts) > 1:
+            cleaned = parts[1].strip()
+
+    return cleaned
+
+
+@spaces.GPU
+def extract_text_from_image(image, temperature=0.2, stream=False):
+    """Extract text from image using LightOnOCR model."""
+    # Prepare the chat format
+    chat = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "url": image},
+            ],
+        }
+    ]
+
+    # Apply chat template and tokenize
+    inputs = processor.apply_chat_template(
+        chat,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt"
+    )
+
+    # Move inputs to device AND convert to the correct dtype
+    inputs = {
+        k: v.to(device=device, dtype=dtype) if isinstance(v, torch.Tensor) and v.dtype in [torch.float32, torch.float16, torch.bfloat16]
+        else v.to(device) if isinstance(v, torch.Tensor)
+        else v
+        for k, v in inputs.items()
+    }
+
+    generation_kwargs = dict(
+        **inputs,
+        max_new_tokens=2048,
+        temperature=temperature if temperature > 0 else 0.0,
+        use_cache=True,
+        do_sample=temperature > 0,
+    )
+
+    if stream:
+        # Setup streamer for streaming generation
+        streamer = TextIteratorStreamer(
+            processor.tokenizer,
+            skip_prompt=True,
+            skip_special_tokens=True
+        )
+        generation_kwargs["streamer"] = streamer
+
+        # Run generation in a separate thread
+        thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+
+        # Yield chunks as they arrive
+        full_text = ""
+        for new_text in streamer:
+            full_text += new_text
+            # Clean the accumulated text
+            cleaned_text = clean_output_text(full_text)
+            yield cleaned_text
+
+        thread.join()
+    else:
+        # Non-streaming generation
+        with torch.no_grad():
+            outputs = model.generate(**generation_kwargs)
+
+        # Decode the output
+        output_text = processor.decode(outputs[0], skip_special_tokens=True)
+
+        # Clean the output
+        cleaned_text = clean_output_text(output_text)
+
+        yield cleaned_text


 def process_input(file_input, temperature, page_num):
+    """Process uploaded file (image or PDF) and extract text with streaming."""
     if file_input is None:
         yield "Please upload an image or PDF first.", "", "", None, gr.update()
         return
@@ -54,78 +179,35 @@ def process_input(file_input, temperature, page_num):

     file_path = file_input if isinstance(file_input, str) else file_input.name

+    # Handle PDF files
     if file_path.lower().endswith('.pdf'):
         try:
             image_to_process, total_pages, actual_page = process_pdf(file_path, int(page_num))
             page_info = f"Processing page {actual_page} of {total_pages}"
         except Exception as e:
-            yield f"Error processing PDF", "", "", None, gr.update()
+            yield f"Error processing PDF: {str(e)}", "", "", None, gr.update()
             return
+    # Handle image files
     else:
         try:
             image_to_process = Image.open(file_path)
             page_info = "Processing image"
         except Exception as e:
-            yield f"Error opening image", "", "", None, gr.update()
+            yield f"Error opening image: {str(e)}", "", "", None, gr.update()
             return

-    content = [
-        {"type": "text", "text": ""},
-        {
-            "type": "image_url",
-            "image_url": {"url": f"data:image/png;base64,{image_to_base64(image_to_process)}"}
-        }
-    ]
-
-    payload = {
-        "model": MODEL,
-        "messages": [{"role": "user", "content": content}],
-        "temperature": temperature,
-        "stream": True
-    }
-
     try:
-        response = requests.post(
-            data=json.dumps(payload),
-            stream=True
-        )
-        response.raise_for_status()
-
-        accumulated_response = ""
-        first_chunk = True
+        # Extract text using LightOnOCR with streaming
+        for extracted_text in extract_text_from_image(image_to_process, temperature, stream=True):
+            yield extracted_text, extracted_text, page_info, image_to_process, gr.update()

-        for line in response.iter_lines():
-            if line:
-                line = line.decode('utf-8')
-                if line.startswith('data: '):
-                    line = line[6:]
-
-                if line.strip() == '[DONE]':
-                    break
-
-                try:
-                    chunk = json.loads(line)
-                    if 'choices' in chunk and len(chunk['choices']) > 0:
-                        delta = chunk['choices'][0].get('delta', {})
-                        content_delta = delta.get('content', '')
-                        if content_delta:
-                            accumulated_response += content_delta
-                            if first_chunk:
-                                yield accumulated_response, accumulated_response, page_info, image_to_process, gr.update()
-                                first_chunk = False
-                            else:
-                                yield accumulated_response, accumulated_response, page_info, gr.update(), gr.update()
-                except json.JSONDecodeError:
-                    continue
-
     except Exception as e:
-        error_msg = f"Error"
+        error_msg = f"Error during text extraction: {str(e)}"
         yield error_msg, error_msg, page_info, image_to_process, gr.update()


 def update_slider(file_input):
+    """Update page slider based on PDF page count."""
     if file_input is None:
         return gr.update(maximum=20, value=1)
@@ -143,17 +225,22 @@ def update_slider(file_input):
     return gr.update(maximum=1, value=1)


-with gr.Blocks(title="📖 Image/PDF OCR", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("""
-    # 📖 Image/PDF to Text Extraction
+# Create Gradio interface
+with gr.Blocks(title="📖 Image/PDF OCR with LightOnOCR", theme=gr.themes.Soft()) as demo:
+    gr.Markdown(f"""
+    # 📖 Image/PDF to Text Extraction with LightOnOCR

     **💡 How to use:**
     1. Upload an image or PDF
     2. For PDFs: select which page to extract (1-20)
-    3. Adjust temperature if needed
+    3. Adjust temperature if needed (0.0 for deterministic, higher for more varied output)
     4. Click "Extract Text"

-    **Note:** The Markdown rendering for tables
+    **Note:** The Markdown rendering for tables may not always be perfect. Check the raw output for complex tables!
+
+    **Model:** LightOnOCR-1B-1025 by LightOn AI
+    **Device:** {device.upper()}
+    **Attention:** {attn_implementation}
     """)

     with gr.Row():
@@ -183,11 +270,12 @@ with gr.Blocks(title="📖 Image/PDF OCR", theme=gr.themes.Soft()) as demo:
                 interactive=False
             )
             temperature = gr.Slider(
-                minimum=0.
+                minimum=0.0,
                 maximum=1.0,
                 value=0.2,
                 step=0.05,
-                label="Temperature"
+                label="Temperature",
+                info="0.0 = deterministic, Higher = more varied"
             )
             submit_btn = gr.Button("Extract Text", variant="primary")
             clear_btn = gr.Button("Clear", variant="secondary")
@@ -208,6 +296,7 @@ with gr.Blocks(title="📖 Image/PDF OCR", theme=gr.themes.Soft()) as demo:
                 show_copy_button=True
             )

+    # Event handlers
     submit_btn.click(
         fn=process_input,
         inputs=[file_input, temperature, num_pages],
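
The diff truncates `render_pdf_page` after its first three lines, so the part that applies `max_resolution` is not visible. As a rough guide, a helper with this signature is typically written as below. This is a hedged sketch, not the Space's exact code: the clamping rule, the helper name, and the `.render(...).to_pil()` call are assumptions based on the pypdfium2 v4 API.

import pypdfium2 as pdfium


def render_pdf_page_sketch(page, max_resolution=1540, scale=2.77):
    """Render a pdfium page to a PIL image, capping the longest side (assumed logic)."""
    width, height = page.get_size()  # page size in PDF points
    pixel_width = width * scale
    pixel_height = height * scale
    # Assumption: shrink the scale so the longest side fits within max_resolution
    longest = max(pixel_width, pixel_height)
    if longest > max_resolution:
        scale *= max_resolution / longest
    bitmap = page.render(scale=scale)  # pypdfium2 renders the page to a bitmap
    return bitmap.to_pil()  # convert to a PIL.Image for the processor


# Usage (hypothetical): pdf = pdfium.PdfDocument("doc.pdf"); img = render_pdf_page_sketch(pdf[0])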
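
For readers who want to exercise the new inference path outside Gradio, here is a minimal non-streaming sketch distilled from the code above. It assumes the same `lightonai/LightOnOCR-1B-1025` checkpoint and a transformers build that ships the `LightOnOCR*` classes; `sample_page.png` is a placeholder input, not a file from this Space.

import torch
from PIL import Image
from transformers import (
    LightOnOCRForConditionalGeneration,
    LightOnOCRProcessor,
)

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.bfloat16 if device == "cuda" else torch.float32

model = LightOnOCRForConditionalGeneration.from_pretrained(
    "lightonai/LightOnOCR-1B-1025",
    torch_dtype=dtype,
    trust_remote_code=True,
).to(device).eval()
processor = LightOnOCRProcessor.from_pretrained(
    "lightonai/LightOnOCR-1B-1025",
    trust_remote_code=True,
)

# Same chat-template flow as extract_text_from_image, without streaming
image = Image.open("sample_page.png")  # placeholder document image
chat = [{"role": "user", "content": [{"type": "image", "url": image}]}]
inputs = processor.apply_chat_template(
    chat,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
# Float tensors (pixel values) must match the model dtype, as in the app
inputs = {
    k: v.to(device=device, dtype=dtype)
    if isinstance(v, torch.Tensor) and v.is_floating_point()
    else v.to(device) if isinstance(v, torch.Tensor) else v
    for k, v in inputs.items()
}

with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=2048, do_sample=False)
# The app additionally strips chat-template markers with clean_output_text
print(processor.decode(outputs[0], skip_special_tokens=True))

Streaming works the same way the Space does it: attach a `TextIteratorStreamer` to the generation kwargs, run `model.generate` in a background thread, and consume the streamer in the foreground.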