Spaces:

lightonai
/

LightOnOCR-1B-Demo

Running

App Files Files Community

staghado committed on Oct 22

Commit

6804c82

verified ·

1 Parent(s): 3c5f2af

Update app.py

Browse files

Files changed (1) hide show

app.py +74 -24

app.py CHANGED Viewed

@@ -6,12 +6,14 @@ import requests
 import gradio as gr
 from PIL import Image
 from io import BytesIO
 ENDPOINT = os.environ.get("VLLM_ENDPOINT")
 MODEL = os.environ.get("VLLM_MODEL")
 if not ENDPOINT or not MODEL:
-    raise ValueError("VLLM_ENDPOINT and VLLM_MODEL environment variables must be set. Please add them as secrets in your Space settings.")
 def image_to_base64(image):
@@ -20,22 +22,63 @@ def image_to_base64(image):
     return base64.b64encode(buffered.getvalue()).decode("utf-8")
-def process_image(image, temperature):
-    if image is None:
-        yield "Please upload an image first.", ""
         return
-    b64_image = image_to_base64(image)
     payload = {
         "model": MODEL,
         "messages": [
             {
                 "role": "user",
-                "content": [
-                    {"type": "text", "text": ""},
-                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64_image}"}}
-                ]
             }
         ],
         "temperature": temperature,
@@ -66,9 +109,9 @@ def process_image(image, temperature):
                     chunk = json.loads(line)
                     if 'choices' in chunk and len(chunk['choices']) > 0:
                         delta = chunk['choices'][0].get('delta', {})
-                        content = delta.get('content', '')
-                        if content:
-                            accumulated_response += content
                             yield accumulated_response, accumulated_response
                 except json.JSONDecodeError:
                     continue
@@ -78,27 +121,33 @@ def process_image(image, temperature):
         yield error_msg, error_msg
-with gr.Blocks(title="📖 Image OCR", theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
-        # 📖 Image to Text Extraction
         **💡 How to use:**
-        1. Upload an image using the upload box
         2. Adjust temperature if needed
         3. Click "Extract Text" to process
-        The model will extract and format text from your image.
         """
     )
     with gr.Row():
-        with gr.Column():
             image_input = gr.Image(
                 type="pil",
                 label="🖼️ Upload Image",
                 sources=["upload", "clipboard"],
-                height=600
             )
             temperature = gr.Slider(
                 minimum=0.1,
                 maximum=1.0,
@@ -109,7 +158,7 @@ with gr.Blocks(title="📖 Image OCR", theme=gr.themes.Soft()) as demo:
             submit_btn = gr.Button("Extract Text", variant="primary")
             clear_btn = gr.Button("Clear", variant="secondary")
-        with gr.Column():
             output_text = gr.Markdown(
                 label="📄 Extracted Text (Rendered)",
                 value="<div style='min-height: 600px; padding: 10px; border: 1px solid #e0e0e0; border-radius: 4px; background-color: #f9f9f9;'><em>Extracted text will appear here...</em></div>",
@@ -121,19 +170,20 @@ with gr.Blocks(title="📖 Image OCR", theme=gr.themes.Soft()) as demo:
             raw_output = gr.Textbox(
                 label="Raw Markdown Output",
                 placeholder="Raw text will appear here...",
-                lines=30,
                 show_copy_button=True
             )
     submit_btn.click(
-        fn=process_image,
-        inputs=[image_input, temperature],
         outputs=[output_text, raw_output]
     )
     clear_btn.click(
-        fn=lambda: (None, "", ""),
-        outputs=[image_input, output_text, raw_output]
     )

 import gradio as gr
 from PIL import Image
 from io import BytesIO
+import pypdfium2 as pdfium
+from pathlib import Path
 ENDPOINT = os.environ.get("VLLM_ENDPOINT")
 MODEL = os.environ.get("VLLM_MODEL")
 if not ENDPOINT or not MODEL:
+    raise ValueError("VLLM_ENDPOINT and VLLM_MODEL environment variables must be set.")
 def image_to_base64(image):
     return base64.b64encode(buffered.getvalue()).decode("utf-8")
def render_pdf_page(page, max_resolution=1540, scale=2.77):
    """Render one PDF page to a PIL image, capping its pixel dimensions.

    Args:
        page: Page object exposing ``get_size()`` and ``render()``
            (presumably a pypdfium2 ``PdfPage`` -- confirm against caller).
        max_resolution: Largest allowed width or height of the rendered
            image, in pixels.
        scale: Preferred render scale applied to the page size before the
            cap is taken into account.

    Returns:
        Whatever ``to_pil()`` yields for the rendered page bitmap.
    """
    page_width, page_height = page.get_size()

    # Pixel size the page would have at the preferred scale.
    scaled_width = page_width * scale
    scaled_height = page_height * scale

    # Never upscale (factor capped at 1); shrink just enough that both
    # sides fit inside max_resolution.
    shrink = min(
        1,
        max_resolution / scaled_width,
        max_resolution / scaled_height,
    )

    bitmap = page.render(scale=scale * shrink, rev_byteorder=True)
    return bitmap.to_pil()
def process_pdf(pdf_path, max_pages=5):
    """Render the leading pages of a PDF into images.

    Args:
        pdf_path: Location of the PDF, passed unchanged to
            ``pdfium.PdfDocument``.
        max_pages: Upper bound on the number of pages rendered; pages
            beyond it are ignored.

    Returns:
        A list with one rendered image per processed page, in page order.
        The list is empty when the document has no pages or ``max_pages``
        is not positive.
    """
    pdf = pdfium.PdfDocument(pdf_path)
    try:
        num_pages = min(len(pdf), max_pages)
        return [render_pdf_page(pdf[i]) for i in range(num_pages)]
    finally:
        # Close the document even when a page fails to render, so the
        # underlying handle is not leaked on the error path.
        pdf.close()
+def process_input(image, pdf_file, temperature):
+    if image is None and pdf_file is None:
+        yield "Please upload an image or PDF first.", ""
         return
+    images_to_process = []
+    if pdf_file is not None:
+        try:
+            images_to_process = process_pdf(pdf_file, max_pages=5)
+            if len(images_to_process) == 0:
+                yield "Error: Could not extract pages from PDF.", ""
+                return
+        except Exception as e:
+            yield f"Error processing PDF: {str(e)}", ""
+            return
+    elif image is not None:
+        images_to_process = [image]
+    content = [{"type": "text", "text": ""}]
+    for img in images_to_process:
+        b64_image = image_to_base64(img)
+        content.append({
+            "type": "image_url",
+            "image_url": {"url": f"data:image/png;base64,{b64_image}"}
+        })
     payload = {
         "model": MODEL,
         "messages": [
             {
                 "role": "user",
+                "content": content
             }
         ],
         "temperature": temperature,
                     chunk = json.loads(line)
                     if 'choices' in chunk and len(chunk['choices']) > 0:
                         delta = chunk['choices'][0].get('delta', {})
+                        content_delta = delta.get('content', '')
+                        if content_delta:
+                            accumulated_response += content_delta
                             yield accumulated_response, accumulated_response
                 except json.JSONDecodeError:
                     continue
         yield error_msg, error_msg
+with gr.Blocks(title="📖 Image/PDF OCR", theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
+        # 📖 Image/PDF to Text Extraction
         **💡 How to use:**
+        1. Upload an image OR a PDF (max 5 pages)
         2. Adjust temperature if needed
         3. Click "Extract Text" to process
+        The model will extract and format text from your document.
         """
     )
     with gr.Row():
+        with gr.Column(scale=1):
             image_input = gr.Image(
                 type="pil",
                 label="🖼️ Upload Image",
                 sources=["upload", "clipboard"],
+                height=400
+            )
+            pdf_input = gr.File(
+                label="📄 Upload PDF (max 5 pages)",
+                file_types=[".pdf"],
+                type="filepath"
             )
+            gr.Markdown("*Upload either an image or PDF, not both*")
             temperature = gr.Slider(
                 minimum=0.1,
                 maximum=1.0,
             submit_btn = gr.Button("Extract Text", variant="primary")
             clear_btn = gr.Button("Clear", variant="secondary")
+        with gr.Column(scale=2):
             output_text = gr.Markdown(
                 label="📄 Extracted Text (Rendered)",
                 value="<div style='min-height: 600px; padding: 10px; border: 1px solid #e0e0e0; border-radius: 4px; background-color: #f9f9f9;'><em>Extracted text will appear here...</em></div>",
             raw_output = gr.Textbox(
                 label="Raw Markdown Output",
                 placeholder="Raw text will appear here...",
+                lines=20,
+                max_lines=30,
                 show_copy_button=True
             )
     submit_btn.click(
+        fn=process_input,
+        inputs=[image_input, pdf_input, temperature],
         outputs=[output_text, raw_output]
     )
     clear_btn.click(
+        fn=lambda: (None, None, "", ""),
+        outputs=[image_input, pdf_input, output_text, raw_output]
     )