# OCR / app.py — NuMarkdown-8B OCR Gradio Space (author: pankti07, commit 3f44afb)
import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
from pdf2image import convert_from_path
import os
# --- CONFIGURATION ---
MODEL_ID = "numind/NuMarkdown-8B-Thinking"

# 1. Hardware Detection
# We explicitly check for CUDA. If not found, we default to CPU with float32.
if torch.cuda.is_available():
    device = "cuda"
    # Prefer bfloat16 on GPUs that support it; otherwise fall back to float16.
    dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    print(f"🚀 Running on GPU ({torch.cuda.get_device_name(0)}) with {dtype}")
else:
    device = "cpu"
    dtype = torch.float32  # CPU must use float32 to avoid "Layer not implemented" errors
    print("⚠️ Running on CPU. This will be SLOW (2-5 mins per page). Requires ~32GB RAM.")

# 2. Load Model & Processor
# trust_remote_code is required because the repo ships custom model code.
print("Loading model... (This may take a while)")
try:
    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
    model = AutoModelForImageTextToText.from_pretrained(
        MODEL_ID,
        torch_dtype=dtype,
        trust_remote_code=True,
        low_cpu_mem_usage=True,  # optimized loading
    ).to(device)
    model.eval()  # inference only — disable dropout etc.
    print("✅ Model loaded successfully.")
except ValueError:
    # An old transformers release raises ValueError for unknown architectures.
    print("\n🛑 CRITICAL ERROR: Transformers version is too old.")
    print("You MUST install from source: pip install git+https://github.com/huggingface/transformers.git\n")
    raise  # bare raise keeps the original traceback intact
def process_textbook(pdf_file, start_page, end_page):
    """OCR a page range of an uploaded PDF into Markdown.

    Args:
        pdf_file: Gradio file object (exposes a ``.name`` filesystem path),
            or None if nothing was uploaded.
        start_page: First page to process (1-based, inclusive).
        end_page: Last page to process (1-based, inclusive).

    Returns:
        A single Markdown string with one "### Page N" section per page,
        separated by horizontal rules — or a human-readable error string
        on failure (the Gradio UI renders whatever is returned).
    """
    if pdf_file is None:
        return "Error: Please upload a PDF file."
    try:
        first, last = int(start_page), int(end_page)
        # Guard against an empty/inverted range before spawning pdf2image.
        if first < 1 or last < first:
            return "Error: Invalid page range. Start must be >= 1 and <= End."

        # Convert PDF pages to images. 150 dpi is a speed/quality trade-off.
        images = convert_from_path(
            pdf_file.name,
            first_page=first,
            last_page=last,
            dpi=150,
        )

        # 3. Construct the Qwen2.5-VL / NuMarkdown prompt. It is identical
        # for every page, so build and template it once, outside the loop.
        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": "Convert this image to markdown."},
                ],
            }
        ]
        text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

        extracted_text = []
        for i, page_image in enumerate(images):
            page_num = first + i
            print(f"Processing page {page_num}...")

            # Prepare inputs for this page.
            inputs = processor(
                text=[text_prompt],
                images=[page_image],
                return_tensors="pt",
            ).to(device)
            # Cast pixel values to the model's dtype. On CPU `dtype` is
            # float32, so a single cast covers both branches of the original.
            inputs["pixel_values"] = inputs["pixel_values"].to(dtype)

            # 4. Generate (greedy decoding; no gradients needed for inference).
            with torch.no_grad():
                generated_ids = model.generate(
                    **inputs,
                    max_new_tokens=4096,
                    do_sample=False,
                )

            # 5. Decode — slice off the prompt tokens so only the new
            # completion is returned.
            input_len = inputs.input_ids.shape[1]
            response = processor.batch_decode(
                generated_ids[:, input_len:],
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )[0]
            extracted_text.append(f"### Page {page_num}\n\n{response}")

        return "\n\n---\n\n".join(extracted_text)
    except Exception as e:
        import traceback
        traceback.print_exc()
        return f"Error processing file: {str(e)}\n\n(If on CPU, check if you ran out of RAM. You need ~32GB for this model.)"
# 3. UI Layout
with gr.Blocks() as demo:
    gr.Markdown("## 📚 NuMarkdown-8B OCR (Qwen2.5-VL Architecture)")
    # Warn up-front when inference will be CPU-bound.
    if device == "cpu":
        gr.Markdown("⚠️ **WARNING: Running on CPU.** Expect very slow performance (minutes per page).")
    with gr.Row():
        with gr.Column():
            file_input = gr.File(label="Upload Textbook PDF")
            with gr.Row():
                # precision=0 forces integer page numbers.
                s = gr.Number(value=1, label="Start Page", precision=0)
                e = gr.Number(value=1, label="End Page", precision=0)
            btn = gr.Button("Extract Text", variant="primary")
        with gr.Column():
            output = gr.Markdown()
    btn.click(fn=process_textbook, inputs=[file_input, s, e], outputs=output)

if __name__ == "__main__":
    demo.launch()