# Hugging Face Space: olmOCR document OCR demo
# (scrape banner preserved for the record: "Spaces: Runtime error" — page
# status text from the hosting UI, not part of the program)
"""olmOCR demo: 8-bit quantized Qwen2.5-VL OCR model behind a Gradio UI."""
import base64
import warnings
from io import BytesIO

import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor, BitsAndBytesConfig, Qwen2_5_VLForConditionalGeneration

from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_no_anchoring_v4_yaml_prompt

# Silence noisy library warnings in the hosted demo's logs.
warnings.filterwarnings('ignore')
# Model and processor checkpoints. olmOCR is a Qwen2.5-VL fine-tune, so it
# is paired here with the base model's processor checkpoint.
MODEL_ID = "allenai/olmOCR-2-7B-1025"
PROCESSOR_ID = "Qwen/Qwen2.5-VL-7B-Instruct"

# 8-bit weight quantization keeps the 7B model inside the Space's memory
# budget; fp32 CPU offload lets layers that do not fit spill to system RAM.
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

print("Loading model with 8-bit quantization...")
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=quantization_config,
    device_map="auto",          # let accelerate place layers automatically
    low_cpu_mem_usage=True,
).eval()                        # inference only — disable dropout etc.
processor = AutoProcessor.from_pretrained(PROCESSOR_ID)
print("Model loaded successfully")
def process_document(file, page_number, max_tokens):
    """Run olmOCR on one page of an uploaded document.

    Parameters
    ----------
    file : Gradio file wrapper exposing a ``.name`` path (PDF, PNG, or JPEG),
        or ``None`` when nothing has been uploaded yet.
    page_number : int
        1-based page to render when the upload is a PDF (ignored for images).
    max_tokens : int
        Requested generation budget; hard-capped at 256 for the Space.

    Returns
    -------
    tuple[str, PIL.Image.Image | None]
        The extracted text (or an error message) and the image that was fed
        to the model (``None`` on failure).
    """
    if file is None:
        return "Please upload a file first.", None
    try:
        # Render PDFs to a page image; open raster uploads directly.
        # .lower() so an uppercase ".PDF" extension is not routed to the
        # raster branch, where PIL cannot open it.
        if file.name.lower().endswith('.pdf'):
            image_base64 = render_pdf_to_base64png(
                file.name,
                page_number,
                target_longest_image_dim=896  # keep the memory footprint small
            )
            main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
        else:
            main_image = Image.open(file.name)
            max_size = 896  # match the PDF render resolution
            if max(main_image.size) > max_size:
                main_image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
            buffered = BytesIO()
            main_image.save(buffered, format="PNG")
            image_base64 = base64.b64encode(buffered.getvalue()).decode()

        # olmOCR prompt plus the page image, in the chat format the
        # Qwen2.5-VL processor expects.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": build_no_anchoring_v4_yaml_prompt()},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
                ],
            }
        ]
        text = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        inputs = processor(
            text=[text],
            images=[main_image],
            padding=True,
            return_tensors="pt",
        )
        # BUG FIX: with device_map="auto" the model's layers may live off-CPU
        # while the processor returns CPU tensors; generate() then fails with
        # a device-mismatch error. Move every input tensor to the model.
        inputs = {key: value.to(model.device) for key, value in inputs.items()}

        # Generate greedily (deterministic). `temperature` was removed: it is
        # ignored — and triggers a warning — when do_sample=False.
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=min(max_tokens, 256),  # hard cap for the Space
                num_return_sequences=1,
                do_sample=False,
            )

        # Strip the prompt tokens and decode only the newly generated text.
        prompt_length = inputs["input_ids"].shape[1]
        new_tokens = output[:, prompt_length:]
        text_output = processor.tokenizer.batch_decode(
            new_tokens, skip_special_tokens=True
        )
        return text_output[0], main_image
    except Exception as e:
        # Surface the failure in the UI instead of crashing the worker.
        return f"Error: {str(e)}", None
# --- Gradio interface ------------------------------------------------------
with gr.Blocks(title="olmOCR - Document OCR (CPU)") as demo:
    gr.Markdown("# olmOCR: Document OCR (Quantized)")
    gr.Markdown("⚠️ **Note**: Using 8-bit quantization for CPU compatibility. Processing may take 60-120 seconds.")

    with gr.Row():
        # Left column: upload, page/token controls, and the trigger button.
        with gr.Column():
            file_input = gr.File(
                label="Upload Document (PDF, PNG, or JPEG)",
                file_types=[".pdf", ".png", ".jpg", ".jpeg"],
            )
            page_number = gr.Slider(1, 20, value=1, step=1, label="Page Number")
            max_tokens = gr.Slider(50, 256, value=128, step=16, label="Max Tokens")
            process_btn = gr.Button("Extract Text", variant="primary")
        # Right column: extracted text and the page image that was processed.
        with gr.Column():
            output_text = gr.Textbox(label="Extracted Text", lines=20)
            output_image = gr.Image(label="Processed Image")

    process_btn.click(
        fn=process_document,
        inputs=[file_input, page_number, max_tokens],
        outputs=[output_text, output_image],
    )

if __name__ == "__main__":
    # Small request queue so concurrent visitors don't exhaust memory.
    demo.queue(max_size=2)
    demo.launch(server_name="0.0.0.0", server_port=7860)