# olmOCR / app.py
# Author: coderprabhat
# Commit: 55a0a6c ("fix : bugs")
import torch
import base64
import gradio as gr
from io import BytesIO
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, BitsAndBytesConfig
from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_no_anchoring_v4_yaml_prompt
import warnings
warnings.filterwarnings('ignore')
# Configure 8-bit quantization to reduce memory
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    # Allow fp32 modules that don't fit on the GPU to spill onto the CPU.
    llm_int8_enable_fp32_cpu_offload=True
)
print("Loading model with 8-bit quantization...")
# olmOCR-2-7B is a Qwen2.5-VL fine-tune, hence the Qwen2.5-VL model class.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-2-7B-1025",
    quantization_config=quantization_config,
    device_map="auto",  # let accelerate place layers across available devices
    low_cpu_mem_usage=True,
).eval()  # inference only — disables dropout and other train-time behavior
# NOTE(review): processor is loaded from the base Qwen checkpoint rather than
# the fine-tuned one — presumably the olmOCR repo ships no processor config;
# confirm against the olmOCR model card.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
print("Model loaded successfully")
def process_document(file, page_number, max_tokens):
    """Run olmOCR on one page of an uploaded document.

    Parameters
    ----------
    file : Gradio file object exposing a ``.name`` path, or ``None``.
    page_number : 1-based page to render when the upload is a PDF
        (ignored for plain image uploads). May arrive as a float from
        the Gradio slider.
    max_tokens : requested generation budget; clamped to 256 tokens.

    Returns
    -------
    tuple[str, PIL.Image.Image | None]
        The extracted text (or an error message) and the image that was
        fed to the model (``None`` on failure).
    """
    if file is None:
        return "Please upload a file first.", None
    try:
        # Case-insensitive extension check so '.PDF' uploads are treated
        # as PDFs instead of falling through to the image branch.
        if file.name.lower().endswith('.pdf'):
            image_base64 = render_pdf_to_base64png(
                file.name,
                int(page_number),  # sliders can deliver floats; renderer wants an int page
                target_longest_image_dim=896  # Further reduced for memory
            )
            main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
        else:
            main_image = Image.open(file.name)
            max_size = 896  # Reduced image size to match the PDF render dimension
            if max(main_image.size) > max_size:
                main_image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
            buffered = BytesIO()
            main_image.save(buffered, format="PNG")
            image_base64 = base64.b64encode(buffered.getvalue()).decode()
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": build_no_anchoring_v4_yaml_prompt()},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
                ],
            }
        ]
        text = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        inputs = processor(
            text=[text],
            images=[main_image],
            padding=True,
            return_tensors="pt",
        )
        # Move tensors to wherever device_map="auto" placed the model's
        # embedding layer; without this, generate() fails whenever any
        # part of the model landed on a GPU.
        inputs = inputs.to(model.device)
        # Generate with memory optimization
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=min(int(max_tokens), 256),  # Limit tokens
                num_return_sequences=1,
                # Greedy decoding; `temperature` is intentionally omitted —
                # it is ignored (with a warning) when do_sample=False.
                do_sample=False,
            )
        # Strip the prompt tokens so only newly generated text is decoded.
        prompt_length = inputs["input_ids"].shape[1]
        new_tokens = output[:, prompt_length:]
        text_output = processor.tokenizer.batch_decode(
            new_tokens, skip_special_tokens=True
        )
        return text_output[0], main_image
    except Exception as e:
        # Top-level UI boundary: surface the failure as text instead of a 500.
        return f"Error: {str(e)}", None
# Create Gradio interface (same as before, but update max_tokens)
with gr.Blocks(title="olmOCR - Document OCR (CPU)") as demo:
    gr.Markdown("# olmOCR: Document OCR (Quantized)")
    gr.Markdown("⚠️ **Note**: Using 8-bit quantization for CPU compatibility. Processing may take 60-120 seconds.")
    with gr.Row():
        # Left column: user inputs.
        with gr.Column():
            file_input = gr.File(
                label="Upload Document (PDF, PNG, or JPEG)",
                file_types=[".pdf", ".png", ".jpg", ".jpeg"]
            )
            # Page selector only affects PDF uploads; plain images use the file as-is.
            page_number = gr.Slider(1, 20, value=1, step=1, label="Page Number")
            # Upper bound of 256 mirrors the min(max_tokens, 256) clamp in process_document.
            max_tokens = gr.Slider(50, 256, value=128, step=16, label="Max Tokens")
            process_btn = gr.Button("Extract Text", variant="primary")
        # Right column: OCR output and a preview of the processed page.
        with gr.Column():
            output_text = gr.Textbox(label="Extracted Text", lines=20)
            output_image = gr.Image(label="Processed Image")
    # Wire the button to the OCR function: (file, page, token budget) -> (text, image).
    process_btn.click(
        fn=process_document,
        inputs=[file_input, page_number, max_tokens],
        outputs=[output_text, output_image]
    )
if __name__ == "__main__":
    # Small request queue keeps concurrent OCR jobs from exhausting memory.
    demo.queue(max_size=2)
    # Bind to all interfaces so the app is reachable inside a container.
    demo.launch(server_name="0.0.0.0", server_port=7860)