import base64
import warnings
from io import BytesIO

import gradio as gr
import torch
from PIL import Image
from transformers import (
    AutoProcessor,
    BitsAndBytesConfig,
    Qwen2_5_VLForConditionalGeneration,
)

from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_no_anchoring_v4_yaml_prompt

warnings.filterwarnings("ignore")

# 8-bit quantization to cut the model's memory footprint; layers that do not
# fit on the GPU are offloaded to the CPU in fp32.
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

print("Loading model with 8-bit quantization...")
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-2-7B-1025",
    quantization_config=quantization_config,
    device_map="auto",
    low_cpu_mem_usage=True,
).eval()
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
print("Model loaded successfully")


def process_document(file, page_number, max_tokens):
    if file is None:
        return "Please upload a file first.", None

    try:
        # gr.File yields a str path (Gradio 4.x) or a tempfile wrapper with a
        # .name attribute (Gradio 3.x); handle both.
        file_path = file if isinstance(file, str) else file.name

        # Render the requested PDF page, or load the image directly.
        if file_path.lower().endswith(".pdf"):
            image_base64 = render_pdf_to_base64png(
                file_path,
                page_number,
                target_longest_image_dim=896,  # reduced resolution to save memory
            )
            main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
        else:
            main_image = Image.open(file_path)
            max_size = 896  # cap the longest side to save memory
            if max(main_image.size) > max_size:
                main_image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
            buffered = BytesIO()
            main_image.save(buffered, format="PNG")
            image_base64 = base64.b64encode(buffered.getvalue()).decode()

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": build_no_anchoring_v4_yaml_prompt()},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{image_base64}"},
                    },
                ],
            }
        ]

        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = processor(
            text=[text],
            images=[main_image],
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(model.device)

        # Greedy decoding (do_sample=False, so no temperature is needed) with a
        # hard token cap to keep generation time bounded.
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=min(max_tokens, 256),
                num_return_sequences=1,
                do_sample=False,
            )

        # Strip the prompt tokens and decode only the newly generated text.
        prompt_length = inputs["input_ids"].shape[1]
        new_tokens = output[:, prompt_length:]
        text_output = processor.tokenizer.batch_decode(
            new_tokens, skip_special_tokens=True
        )

        return text_output[0], main_image

    except Exception as e:
        return f"Error: {e}", None


# Build the Gradio interface.
with gr.Blocks(title="olmOCR - Document OCR (Quantized)") as demo:
    gr.Markdown("# olmOCR: Document OCR (Quantized)")
    gr.Markdown(
        "⚠️ **Note**: The model runs with 8-bit quantization and fp32 CPU "
        "offload to fit in limited memory. Processing may take 60-120 seconds."
    )

    with gr.Row():
        with gr.Column():
            file_input = gr.File(
                label="Upload Document (PDF, PNG, or JPEG)",
                file_types=[".pdf", ".png", ".jpg", ".jpeg"],
            )
            page_number = gr.Slider(1, 20, value=1, step=1, label="Page Number")
            max_tokens = gr.Slider(50, 256, value=128, step=16, label="Max Tokens")
            process_btn = gr.Button("Extract Text", variant="primary")
        with gr.Column():
            output_text = gr.Textbox(label="Extracted Text", lines=20)
            output_image = gr.Image(label="Processed Image")

    process_btn.click(
        fn=process_document,
        inputs=[file_input, page_number, max_tokens],
        outputs=[output_text, output_image],
    )

if __name__ == "__main__":
    demo.queue(max_size=2)
    demo.launch(server_name="0.0.0.0", server_port=7860)
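
# ---------------------------------------------------------------------------
# Optional: if the 8-bit model still exceeds available GPU memory, 4-bit NF4
# quantization roughly halves the footprint again, at some accuracy cost.
# A minimal sketch (assumes a recent bitsandbytes build); swap it in for the
# 8-bit config above before loading the model:
#
#   quantization_config = BitsAndBytesConfig(
#       load_in_4bit=True,
#       bnb_4bit_quant_type="nf4",
#       bnb_4bit_compute_dtype=torch.float16,
#       bnb_4bit_use_double_quant=True,
#   )
# ---------------------------------------------------------------------------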
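
# ---------------------------------------------------------------------------
# Optional: a quick client-side smoke test once the app is running. This is a
# hypothetical usage sketch: it assumes `gradio_client` is installed, that a
# file named sample.pdf exists next to the client script, and that the click
# endpoint keeps the default api_name derived from the function name.
#
#   from gradio_client import Client, handle_file
#
#   client = Client("http://127.0.0.1:7860/")
#   text, image = client.predict(
#       handle_file("sample.pdf"), 1, 128, api_name="/process_document"
#   )
#   print(text)
# ---------------------------------------------------------------------------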