# olmOCR / app.py
# Author: coderprabhat
# Commit: 55a0a6c ("fix : bugs")
import torch
import base64
import gradio as gr
from io import BytesIO
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, BitsAndBytesConfig
from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_no_anchoring_v4_yaml_prompt
import warnings
warnings.filterwarnings('ignore')
# Configure 8-bit quantization to reduce memory
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    # Allow fp32 modules that don't fit on the GPU to spill onto the CPU.
    llm_int8_enable_fp32_cpu_offload=True
)
print("Loading model with 8-bit quantization...")
# olmOCR-2-7B is a Qwen2.5-VL fine-tune, hence the Qwen2.5-VL model class.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-2-7B-1025",
    quantization_config=quantization_config,
    device_map="auto",  # let accelerate place layers across available devices
    low_cpu_mem_usage=True,
).eval()  # inference only — disables dropout and other train-time behavior
# NOTE(review): processor is loaded from the base Qwen checkpoint rather than
# the fine-tuned one — presumably the olmOCR repo ships no processor config;
# confirm against the olmOCR model card.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
print("Model loaded successfully")
def process_document(file, page_number, max_tokens):
    """Run olmOCR on one page of an uploaded document.

    Parameters
    ----------
    file : Gradio file object exposing a ``.name`` path, or ``None``.
    page_number : 1-based page to render when the upload is a PDF
        (ignored for plain image uploads). May arrive as a float from
        the Gradio slider.
    max_tokens : requested generation budget; clamped to 256 tokens.

    Returns
    -------
    tuple[str, PIL.Image.Image | None]
        The extracted text (or an error message) and the image that was
        fed to the model (``None`` on failure).
    """
    if file is None:
        return "Please upload a file first.", None
    try:
        # Case-insensitive extension check so '.PDF' uploads are treated
        # as PDFs instead of falling through to the image branch.
        if file.name.lower().endswith('.pdf'):
            image_base64 = render_pdf_to_base64png(
                file.name,
                int(page_number),  # sliders can deliver floats; renderer wants an int page
                target_longest_image_dim=896  # Further reduced for memory
            )
            main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
        else:
            main_image = Image.open(file.name)
            max_size = 896  # Reduced image size to match the PDF render dimension
            if max(main_image.size) > max_size:
                main_image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
            buffered = BytesIO()
            main_image.save(buffered, format="PNG")
            image_base64 = base64.b64encode(buffered.getvalue()).decode()
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": build_no_anchoring_v4_yaml_prompt()},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
                ],
            }
        ]
        text = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        inputs = processor(
            text=[text],
            images=[main_image],
            padding=True,
            return_tensors="pt",
        )
        # Move tensors to wherever device_map="auto" placed the model's
        # embedding layer; without this, generate() fails whenever any
        # part of the model landed on a GPU.
        inputs = inputs.to(model.device)
        # Generate with memory optimization
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=min(int(max_tokens), 256),  # Limit tokens
                num_return_sequences=1,
                # Greedy decoding; `temperature` is intentionally omitted —
                # it is ignored (with a warning) when do_sample=False.
                do_sample=False,
            )
        # Strip the prompt tokens so only newly generated text is decoded.
        prompt_length = inputs["input_ids"].shape[1]
        new_tokens = output[:, prompt_length:]
        text_output = processor.tokenizer.batch_decode(
            new_tokens, skip_special_tokens=True
        )
        return text_output[0], main_image
    except Exception as e:
        # Top-level UI boundary: surface the failure as text instead of a 500.
        return f"Error: {str(e)}", None
# Create Gradio interface (same as before, but update max_tokens)
with gr.Blocks(title="olmOCR - Document OCR (CPU)") as demo:
    gr.Markdown("# olmOCR: Document OCR (Quantized)")
    gr.Markdown("⚠️ **Note**: Using 8-bit quantization for CPU compatibility. Processing may take 60-120 seconds.")
    with gr.Row():
        # Left column: user inputs.
        with gr.Column():
            file_input = gr.File(
                label="Upload Document (PDF, PNG, or JPEG)",
                file_types=[".pdf", ".png", ".jpg", ".jpeg"]
            )
            # Page selector only affects PDF uploads; plain images use the file as-is.
            page_number = gr.Slider(1, 20, value=1, step=1, label="Page Number")
            # Upper bound of 256 mirrors the min(max_tokens, 256) clamp in process_document.
            max_tokens = gr.Slider(50, 256, value=128, step=16, label="Max Tokens")
            process_btn = gr.Button("Extract Text", variant="primary")
        # Right column: OCR output and a preview of the processed page.
        with gr.Column():
            output_text = gr.Textbox(label="Extracted Text", lines=20)
            output_image = gr.Image(label="Processed Image")
    # Wire the button to the OCR function: (file, page, token budget) -> (text, image).
    process_btn.click(
        fn=process_document,
        inputs=[file_input, page_number, max_tokens],
        outputs=[output_text, output_image]
    )
if __name__ == "__main__":
    # Small request queue keeps concurrent OCR jobs from exhausting memory.
    demo.queue(max_size=2)
    # Bind to all interfaces so the app is reachable inside a container.
    demo.launch(server_name="0.0.0.0", server_port=7860)