import base64
import warnings
from io import BytesIO

import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_no_anchoring_v4_yaml_prompt

warnings.filterwarnings("ignore")

# Initialize the model with CPU optimizations
print("Loading model... This may take a few minutes on CPU")
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-2-7B-1025",
    torch_dtype=torch.float32,  # Use float32 for CPU
    low_cpu_mem_usage=True,     # Stream weights to reduce peak memory
).eval()
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
device = torch.device("cpu")
model.to(device)
print("Model loaded successfully")


def process_document(file, page_number, max_tokens):
    """Process a PDF or image file and extract its text using olmOCR.

    Args:
        file: Uploaded file (PDF, PNG, or JPEG).
        page_number: Page number to process (PDFs only, 1-indexed).
        max_tokens: Maximum number of tokens to generate.

    Returns:
        A tuple of (extracted text, processed PIL image).
    """
    if file is None:
        return "Please upload a file first.", None

    try:
        # Handle different file types
        if file.name.lower().endswith(".pdf"):
            # Render the requested PDF page to a base64 PNG,
            # downscaled from olmOCR's default 1288 px for CPU speed
            image_base64 = render_pdf_to_base64png(
                file.name,
                int(page_number),
                target_longest_image_dim=1024,
            )
            main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
        else:
            # Load image files directly
            main_image = Image.open(file.name)

            # Downscale large images for CPU efficiency
            max_size = 1024
            if max(main_image.size) > max_size:
                main_image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)

            # Re-encode as base64 PNG for the prompt
            buffered = BytesIO()
            main_image.save(buffered, format="PNG")
            image_base64 = base64.b64encode(buffered.getvalue()).decode()

        # Build the full prompt: olmOCR's no-anchoring instruction plus the page image
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": build_no_anchoring_v4_yaml_prompt()},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{image_base64}"},
                    },
                ],
            }
        ]

        # Apply the chat template, then tokenize text and image together
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = processor(
            text=[text],
            images=[main_image],
            padding=True,
            return_tensors="pt",
        )
        inputs = {key: value.to(device) for key, value in inputs.items()}

        # Generate with CPU-friendly settings: greedy decoding, no beam search.
        # Sampling parameters such as temperature are omitted because they are
        # ignored when do_sample=False.
        with torch.no_grad():  # Disable gradient tracking for inference
            output = model.generate(
                **inputs,
                max_new_tokens=int(max_tokens),
                num_return_sequences=1,
                do_sample=False,  # Greedy decoding is faster on CPU
                num_beams=1,      # No beam search for speed
            )

        # Decode only the newly generated tokens, skipping the prompt
        prompt_length = inputs["input_ids"].shape[1]
        new_tokens = output[:, prompt_length:]
        text_output = processor.tokenizer.batch_decode(
            new_tokens, skip_special_tokens=True
        )

        return text_output[0], main_image

    except Exception as e:
        return f"Error processing file: {e}", None


# Build the Gradio interface
with gr.Blocks(title="olmOCR - Document OCR (CPU)") as demo:
    gr.Markdown("# olmOCR: Document OCR with Vision Language Models")
    gr.Markdown(
        """
        Upload a PDF or image file to extract text using the olmOCR model.

        ⚠️ **Note**: Running on CPU; processing may take 30-90 seconds per page.
        """
    )

    with gr.Row():
        with gr.Column():
            file_input = gr.File(
                label="Upload Document (PDF, PNG, or JPEG)",
                file_types=[".pdf", ".png", ".jpg", ".jpeg"],
            )
            page_number = gr.Slider(
                minimum=1, maximum=50, value=1, step=1,
                label="Page Number (for PDFs)",
            )
            max_tokens = gr.Slider(
                minimum=100, maximum=1024,  # Capped below the usual limit for CPU
                value=512, step=50,
                label="Max Tokens",
            )
            process_btn = gr.Button("Extract Text", variant="primary")
            gr.Markdown(
                """
                ### Tips for CPU usage
                - Smaller images process faster
                - The first run may be slower (model loading)
                - Reduce max tokens for faster results
                """
            )
        with gr.Column():
            output_text = gr.Textbox(
                label="Extracted Text",
                lines=20,
                placeholder=(
                    "Extracted text will appear here...\n\n"
                    "Processing on CPU may take 30-90 seconds."
                ),
            )
            output_image = gr.Image(label="Processed Image")

    process_btn.click(
        fn=process_document,
        inputs=[file_input, page_number, max_tokens],
        outputs=[output_text, output_image],
    )

    # gr.Examples can be wired up here once sample documents are bundled;
    # an empty examples list is omitted because it renders nothing useful.

if __name__ == "__main__":
    demo.queue(max_size=3)  # Cap queued requests to prevent overload
    demo.launch(server_name="0.0.0.0", server_port=7860)