# OCR / app.py — NuMarkdown-8B OCR Gradio Space (author: pankti07, commit 3f44afb)
import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
from pdf2image import convert_from_path
import os
# --- CONFIGURATION ---
MODEL_ID = "numind/NuMarkdown-8B-Thinking"

# 1. Hardware Detection
# We explicitly check for CUDA. If not found, we default to CPU with float32.
if torch.cuda.is_available():
    device = "cuda"
    # Prefer bfloat16 on GPUs that support it; otherwise fall back to float16.
    dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    print(f"🚀 Running on GPU ({torch.cuda.get_device_name(0)}) with {dtype}")
else:
    device = "cpu"
    dtype = torch.float32  # CPU must use float32 to avoid "Layer not implemented" errors
    print("⚠️ Running on CPU. This will be SLOW (2-5 mins per page). Requires ~32GB RAM.")

# 2. Load Model & Processor
# trust_remote_code is required because the repo ships custom model code.
print("Loading model... (This may take a while)")
try:
    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
    model = AutoModelForImageTextToText.from_pretrained(
        MODEL_ID,
        torch_dtype=dtype,
        trust_remote_code=True,
        low_cpu_mem_usage=True,  # optimized loading
    ).to(device)
    model.eval()  # inference only — disable dropout etc.
    print("✅ Model loaded successfully.")
except ValueError:
    # An old transformers release raises ValueError for unknown architectures.
    print("\n🛑 CRITICAL ERROR: Transformers version is too old.")
    print("You MUST install from source: pip install git+https://github.com/huggingface/transformers.git\n")
    raise  # bare raise keeps the original traceback intact
def process_textbook(pdf_file, start_page, end_page):
    """OCR a page range of an uploaded PDF into Markdown.

    Args:
        pdf_file: Gradio file object (exposes a ``.name`` filesystem path),
            or None if nothing was uploaded.
        start_page: First page to process (1-based, inclusive).
        end_page: Last page to process (1-based, inclusive).

    Returns:
        A single Markdown string with one "### Page N" section per page,
        separated by horizontal rules — or a human-readable error string
        on failure (the Gradio UI renders whatever is returned).
    """
    if pdf_file is None:
        return "Error: Please upload a PDF file."
    try:
        first, last = int(start_page), int(end_page)
        # Guard against an empty/inverted range before spawning pdf2image.
        if first < 1 or last < first:
            return "Error: Invalid page range. Start must be >= 1 and <= End."

        # Convert PDF pages to images. 150 dpi is a speed/quality trade-off.
        images = convert_from_path(
            pdf_file.name,
            first_page=first,
            last_page=last,
            dpi=150,
        )

        # 3. Construct the Qwen2.5-VL / NuMarkdown prompt. It is identical
        # for every page, so build and template it once, outside the loop.
        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": "Convert this image to markdown."},
                ],
            }
        ]
        text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

        extracted_text = []
        for i, page_image in enumerate(images):
            page_num = first + i
            print(f"Processing page {page_num}...")

            # Prepare inputs for this page.
            inputs = processor(
                text=[text_prompt],
                images=[page_image],
                return_tensors="pt",
            ).to(device)
            # Cast pixel values to the model's dtype. On CPU `dtype` is
            # float32, so a single cast covers both branches of the original.
            inputs["pixel_values"] = inputs["pixel_values"].to(dtype)

            # 4. Generate (greedy decoding; no gradients needed for inference).
            with torch.no_grad():
                generated_ids = model.generate(
                    **inputs,
                    max_new_tokens=4096,
                    do_sample=False,
                )

            # 5. Decode — slice off the prompt tokens so only the new
            # completion is returned.
            input_len = inputs.input_ids.shape[1]
            response = processor.batch_decode(
                generated_ids[:, input_len:],
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )[0]
            extracted_text.append(f"### Page {page_num}\n\n{response}")

        return "\n\n---\n\n".join(extracted_text)
    except Exception as e:
        import traceback
        traceback.print_exc()
        return f"Error processing file: {str(e)}\n\n(If on CPU, check if you ran out of RAM. You need ~32GB for this model.)"
# 3. UI Layout
with gr.Blocks() as demo:
    gr.Markdown("## 📚 NuMarkdown-8B OCR (Qwen2.5-VL Architecture)")
    # Warn up-front when inference will be CPU-bound.
    if device == "cpu":
        gr.Markdown("⚠️ **WARNING: Running on CPU.** Expect very slow performance (minutes per page).")
    with gr.Row():
        with gr.Column():
            file_input = gr.File(label="Upload Textbook PDF")
            with gr.Row():
                # precision=0 forces integer page numbers.
                s = gr.Number(value=1, label="Start Page", precision=0)
                e = gr.Number(value=1, label="End Page", precision=0)
            btn = gr.Button("Extract Text", variant="primary")
        with gr.Column():
            output = gr.Markdown()
    btn.click(fn=process_textbook, inputs=[file_input, s, e], outputs=output)

if __name__ == "__main__":
    demo.launch()