# Spaces: coderprabhat/olmOCR
# Runtime error
# File size: 5,864 Bytes
# 322bbf8

import torch
import base64
import gradio as gr
from io import BytesIO
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_no_anchoring_v4_yaml_prompt
import warnings
warnings.filterwarnings('ignore')

# One-time, import-time setup: everything below runs on the CPU.
device = torch.device("cpu")

print("Loading model... This may take a few minutes on CPU")
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-2-7B-1025",
    low_cpu_mem_usage=True,     # keep peak memory down while loading
    torch_dtype=torch.float32,  # float32 is the dtype chosen for CPU inference
)
# Switch to inference mode and pin the weights to the CPU device.
model.eval()
model.to(device)

# The processor (tokenizer + image preprocessing) comes from the base Qwen checkpoint.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
print("Model loaded successfully")

def process_document(file, page_number, max_tokens):
    """
    Process a PDF or image file and extract text using olmOCR.

    Args:
        file: Uploaded file (PDF, PNG, or JPEG). May be a plain path string or
            an object exposing the path as ``.name``; ``None`` means nothing
            was uploaded.
        page_number: Page number to process (for PDFs). Coerced to ``int``
            because UI sliders may deliver floats.
        max_tokens: Maximum number of tokens to generate. Coerced to ``int``.

    Returns:
        A ``(text, image)`` tuple: the extracted text and the image that was
        fed to the model. On failure, ``("Error processing file: ...", None)``;
        when no file was given, ``("Please upload a file first.", None)``.
    """
    if file is None:
        return "Please upload a file first.", None

    try:
        # Accept both a bare path string and a file wrapper carrying ``.name``.
        file_path = file if isinstance(file, str) else file.name

        # Slider values can arrive as floats; the renderer and generate() need ints.
        page_number = int(page_number)
        max_tokens = int(max_tokens)

        # Case-insensitive so that "scan.PDF" is not mistaken for an image.
        if file_path.lower().endswith('.pdf'):
            # Render PDF page to base64 image with smaller size for CPU
            image_base64 = render_pdf_to_base64png(
                file_path,
                page_number,
                target_longest_image_dim=1024,  # Reduced from 1288 for CPU
            )
            main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
        else:
            # Load the image fully and release the file handle right away.
            # Converting to RGB lets CMYK/palette/alpha inputs be saved as PNG
            # and resized with a proper filter.
            with Image.open(file_path) as source_image:
                main_image = source_image.convert("RGB")

            # Resize large images for CPU efficiency (in place, keeps aspect ratio)
            max_size = 1024
            if max(main_image.size) > max_size:
                main_image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)

            buffered = BytesIO()
            main_image.save(buffered, format="PNG")
            image_base64 = base64.b64encode(buffered.getvalue()).decode()

        # Build the full prompt: instruction text plus the page image
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": build_no_anchoring_v4_yaml_prompt()},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
                ],
            }
        ]

        # Apply the chat template, then tokenize text and preprocess the image
        text = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        inputs = processor(
            text=[text],
            images=[main_image],
            padding=True,
            return_tensors="pt",
        )
        inputs = {key: value.to(device) for (key, value) in inputs.items()}

        # Greedy decoding; no sampling parameters are passed because
        # do_sample=False would ignore them anyway.
        with torch.no_grad():  # Disable gradient computation for inference
            output = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                num_return_sequences=1,
                do_sample=False,  # Greedy decoding is faster on CPU
                num_beams=1,      # No beam search for speed
            )

        # generate() returns prompt + continuation; keep only the new tokens
        prompt_length = inputs["input_ids"].shape[1]
        new_tokens = output[:, prompt_length:]
        text_output = processor.tokenizer.batch_decode(
            new_tokens, skip_special_tokens=True
        )

        return text_output[0], main_image

    except Exception as e:
        # UI boundary: report the failure in the text box instead of crashing
        return f"Error processing file: {str(e)}", None

# Create Gradio interface: inputs on the left, OCR results on the right
with gr.Blocks(title="olmOCR - Document OCR (CPU)") as demo:
    gr.Markdown("# olmOCR: Document OCR with Vision Language Models")
    gr.Markdown("""
    Upload a PDF or image file to extract text using the olmOCR model.
    
    ⚠️ **Note**: Running on CPU - processing may take 30-90 seconds per page.
    """)
    
    with gr.Row():
        # Left column: upload, page/token controls and the trigger button
        with gr.Column():
            file_input = gr.File(
                label="Upload Document (PDF, PNG, or JPEG)",
                file_types=[".pdf", ".png", ".jpg", ".jpeg"]
            )
            page_number = gr.Slider(
                minimum=1, 
                maximum=50, 
                value=1, 
                step=1,
                label="Page Number (for PDFs)"
            )
            max_tokens = gr.Slider(
                minimum=100, 
                maximum=1024,  # Reduced max for CPU
                value=512, 
                step=50,
                label="Max Tokens"
            )
            process_btn = gr.Button("Extract Text", variant="primary")
            
            gr.Markdown("""
            ### Tips for CPU Usage:
            - Smaller images process faster
            - First run may be slower (model loading)
            - Reduce max tokens for faster results
            """)
        
        # Right column: extracted text and the image that was sent to the model
        with gr.Column():
            output_text = gr.Textbox(
                label="Extracted Text", 
                lines=20,
                placeholder="Extracted text will appear here...\n\nProcessing on CPU may take 30-90 seconds."
            )
            output_image = gr.Image(label="Processed Image")
    
    # process_document returns (text, image), matching the two outputs in order
    process_btn.click(
        fn=process_document,
        inputs=[file_input, page_number, max_tokens],
        outputs=[output_text, output_image]
    )

    # NOTE: no gr.Examples component is created because there are no example
    # files to offer; add one here only together with a non-empty examples list.

if __name__ == "__main__":
    # Bound the request queue to three entries to prevent overload, then
    # serve on every network interface at port 7860.
    demo.queue(max_size=3).launch(server_name="0.0.0.0", server_port=7860)