coderprabhat committed
Commit 322bbf8 · 1 Parent(s): 49129f9

Add olmOCR Gradio app for Hugging Face Spaces deployment

Files changed (2):
  1. app.py +169 -0
  2. requirements.txt +1 -0
app.py ADDED
@@ -0,0 +1,169 @@
+import torch
+import base64
+import gradio as gr
+from io import BytesIO
+from PIL import Image
+from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
+from olmocr.data.renderpdf import render_pdf_to_base64png
+from olmocr.prompts import build_no_anchoring_v4_yaml_prompt
+import warnings
+warnings.filterwarnings('ignore')
+
+# Initialize the model with CPU optimizations
+print("Loading model... This may take a few minutes on CPU")
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    "allenai/olmOCR-2-7B-1025",
+    torch_dtype=torch.float32,  # Use float32 for CPU
+    low_cpu_mem_usage=True,  # Optimize memory usage
+).eval()
+
+processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
+device = torch.device("cpu")
+model.to(device)
+print("Model loaded successfully")
+
+def process_document(file, page_number, max_tokens):
+    """
+    Process a PDF or image file and extract text using olmOCR
+
+    Args:
+        file: Uploaded file (PDF, PNG, or JPEG)
+        page_number: Page number to process (for PDFs)
+        max_tokens: Maximum number of tokens to generate
+
+    Returns:
+        Extracted text output and processed image
+    """
+    if file is None:
+        return "Please upload a file first.", None
+
+    try:
+        # Handle different file types (case-insensitive extension check)
+        if file.name.lower().endswith('.pdf'):
+            # Render PDF page to base64 image with smaller size for CPU
+            image_base64 = render_pdf_to_base64png(
+                file.name,
+                page_number,
+                target_longest_image_dim=1024  # Reduced from 1288 for CPU
+            )
+            main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
+        else:
+            # Handle image files directly
+            main_image = Image.open(file.name)
+            # Resize large images for CPU efficiency
+            max_size = 1024
+            if max(main_image.size) > max_size:
+                main_image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
+
+            buffered = BytesIO()
+            main_image.save(buffered, format="PNG")
+            image_base64 = base64.b64encode(buffered.getvalue()).decode()
+
+        # Build the full prompt
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": build_no_anchoring_v4_yaml_prompt()},
+                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
+                ],
+            }
+        ]
+
+        # Apply the chat template and processor
+        text = processor.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+
+        inputs = processor(
+            text=[text],
+            images=[main_image],
+            padding=True,
+            return_tensors="pt",
+        )
+        inputs = {key: value.to(device) for (key, value) in inputs.items()}
+
+        # Generate with CPU-optimized settings
+        with torch.no_grad():  # Disable gradient computation for inference
+            output = model.generate(
+                **inputs,
+                max_new_tokens=max_tokens,
+                num_return_sequences=1,
+                do_sample=False,  # Greedy decoding is faster on CPU
+                num_beams=1,  # No beam search for speed
+            )
+
+        # Decode the output
+        prompt_length = inputs["input_ids"].shape[1]
+        new_tokens = output[:, prompt_length:]
+        text_output = processor.tokenizer.batch_decode(
+            new_tokens, skip_special_tokens=True
+        )
+
+        return text_output[0], main_image
+
+    except Exception as e:
+        return f"Error processing file: {str(e)}", None
+
+# Create Gradio interface
+with gr.Blocks(title="olmOCR - Document OCR (CPU)") as demo:
+    gr.Markdown("# olmOCR: Document OCR with Vision Language Models")
+    gr.Markdown("""
+    Upload a PDF or image file to extract text using the olmOCR model.
+
+    ⚠️ **Note**: Running on CPU - processing may take 30-90 seconds per page.
+    """)
+
+    with gr.Row():
+        with gr.Column():
+            file_input = gr.File(
+                label="Upload Document (PDF, PNG, or JPEG)",
+                file_types=[".pdf", ".png", ".jpg", ".jpeg"]
+            )
+            page_number = gr.Slider(
+                minimum=1,
+                maximum=50,
+                value=1,
+                step=1,
+                label="Page Number (for PDFs)"
+            )
+            max_tokens = gr.Slider(
+                minimum=100,
+                maximum=1024,  # Reduced max for CPU
+                value=512,
+                step=50,
+                label="Max Tokens"
+            )
+            process_btn = gr.Button("Extract Text", variant="primary")
+
+            gr.Markdown("""
+            ### Tips for CPU Usage:
+            - Smaller images process faster
+            - First run may be slower (model loading)
+            - Reduce max tokens for faster results
+            """)
+
+        with gr.Column():
+            output_text = gr.Textbox(
+                label="Extracted Text",
+                lines=20,
+                placeholder="Extracted text will appear here...\n\nProcessing on CPU may take 30-90 seconds."
+            )
+            output_image = gr.Image(label="Processed Image")
+
+    process_btn.click(
+        fn=process_document,
+        inputs=[file_input, page_number, max_tokens],
+        outputs=[output_text, output_image]
+    )
+
+    gr.Examples(
+        examples=[],
+        inputs=[file_input]
+    )
+
+if __name__ == "__main__":
+    demo.queue(max_size=3)  # Limit queue to prevent overload
+    demo.launch(server_name="0.0.0.0", server_port=7860)
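
For a quick local check outside the Gradio UI, process_document can be called directly. Below is a minimal sketch (not part of this commit): "sample.pdf" is a placeholder path, and SimpleNamespace stands in for Gradio's upload object, of which process_document only reads the .name attribute. Note that importing app loads the 7B model in float32, which needs roughly 28 GB of RAM for the weights alone and several minutes on CPU.

# smoke_test.py - illustrative sketch, not part of this commit
from types import SimpleNamespace

from app import process_document  # loads the model at import time

fake_upload = SimpleNamespace(name="sample.pdf")  # placeholder path
text, image = process_document(fake_upload, page_number=1, max_tokens=256)
print(text)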
requirements.txt ADDED
@@ -0,0 +1 @@
+olmocr>=0.4.0
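
Note that requirements.txt lists only olmocr: torch and transformers come in transitively as olmocr dependencies, and a Space built on the Gradio SDK preinstalls gradio itself. Running the app outside Spaces would additionally require installing gradio.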