import torch
import base64
import gradio as gr
from io import BytesIO
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_no_anchoring_v4_yaml_prompt
import warnings

warnings.filterwarnings('ignore')

# Initialize the model with CPU optimizations
print("Loading model... This may take a few minutes on CPU")
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-2-7B-1025",
    torch_dtype=torch.float32,  # Use float32 for CPU
    low_cpu_mem_usage=True,     # Optimize memory usage
).eval()
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
device = torch.device("cpu")
model.to(device)
print("Model loaded successfully")

def process_document(file, page_number, max_tokens):
    """
    Process a PDF or image file and extract text using olmOCR.

    Args:
        file: Uploaded file (PDF, PNG, or JPEG)
        page_number: Page number to process (for PDFs)
        max_tokens: Maximum number of tokens to generate

    Returns:
        Extracted text output and the processed image
    """
    if file is None:
        return "Please upload a file first.", None

    try:
        # Handle different file types (case-insensitive extension check)
        if file.name.lower().endswith('.pdf'):
            # Render the PDF page to a base64 PNG, downscaled for CPU
            image_base64 = render_pdf_to_base64png(
                file.name,
                int(page_number),  # Gradio sliders may return floats
                target_longest_image_dim=1024,  # Reduced from 1288 for CPU
            )
            main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
        else:
            # Handle image files directly
            main_image = Image.open(file.name)
            # Resize large images for CPU efficiency
            max_size = 1024
            if max(main_image.size) > max_size:
                main_image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
            buffered = BytesIO()
            main_image.save(buffered, format="PNG")
            image_base64 = base64.b64encode(buffered.getvalue()).decode()

        # Build the full prompt
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": build_no_anchoring_v4_yaml_prompt()},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
                ],
            }
        ]
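        # Note: with tokenize=False below, the chat template uses this entry only to
        # place an image token in the prompt text; the actual pixels are supplied to
        # the processor via images=[main_image]. This mirrors the message format
        # used in olmOCR's own examples for its Qwen2.5-VL-based checkpoints.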

        # Apply the chat template and processor
        text = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        inputs = processor(
            text=[text],
            images=[main_image],
            padding=True,
            return_tensors="pt",
        )
        inputs = {key: value.to(device) for (key, value) in inputs.items()}

        # Generate with CPU-optimized settings. No temperature is passed:
        # sampling parameters are ignored (with a warning) when do_sample=False.
        with torch.no_grad():  # Disable gradient computation for inference
            output = model.generate(
                **inputs,
                max_new_tokens=int(max_tokens),
                num_return_sequences=1,
                do_sample=False,  # Greedy decoding is deterministic and faster on CPU
                num_beams=1,      # No beam search, for speed
            )

        # Decode only the newly generated tokens (skip the prompt)
        prompt_length = inputs["input_ids"].shape[1]
        new_tokens = output[:, prompt_length:]
        text_output = processor.tokenizer.batch_decode(
            new_tokens, skip_special_tokens=True
        )
        return text_output[0], main_image
    except Exception as e:
        return f"Error processing file: {str(e)}", None

# Create Gradio interface
with gr.Blocks(title="olmOCR - Document OCR (CPU)") as demo:
    gr.Markdown("# olmOCR: Document OCR with Vision Language Models")
    gr.Markdown("""
    Upload a PDF or image file to extract text using the olmOCR model.

    ⚠️ **Note**: Running on CPU - processing may take 30-90 seconds per page.
    """)

    with gr.Row():
        with gr.Column():
            file_input = gr.File(
                label="Upload Document (PDF, PNG, or JPEG)",
                file_types=[".pdf", ".png", ".jpg", ".jpeg"],
            )
            page_number = gr.Slider(
                minimum=1,
                maximum=50,
                value=1,
                step=1,
                label="Page Number (for PDFs)",
            )
            max_tokens = gr.Slider(
                minimum=100,
                maximum=1024,  # Reduced maximum for CPU
                value=512,
                step=50,
                label="Max Tokens",
            )
            process_btn = gr.Button("Extract Text", variant="primary")
            gr.Markdown("""
            ### Tips for CPU Usage:
            - Smaller images process faster
            - First run may be slower (model loading)
            - Reduce max tokens for faster results
            """)
        with gr.Column():
            output_text = gr.Textbox(
                label="Extracted Text",
                lines=20,
                placeholder="Extracted text will appear here...\n\nProcessing on CPU may take 30-90 seconds.",
            )
            output_image = gr.Image(label="Processed Image")

    process_btn.click(
        fn=process_document,
        inputs=[file_input, page_number, max_tokens],
        outputs=[output_text, output_image],
    )
    # Placeholder for sample documents: re-enable gr.Examples with real file
    # paths once example files are bundled with the Space. Instantiating it
    # with an empty examples list adds nothing to the UI.

if __name__ == "__main__":
    demo.queue(max_size=3)  # Limit the queue to prevent overload
    demo.launch(server_name="0.0.0.0", server_port=7860)
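
# Hugging Face Spaces expects the app to listen on 0.0.0.0:7860, which is why
# the server arguments are pinned above. For local testing they can be dropped
# (an assumption about your setup, not part of the original app):
#
#     demo.launch()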