import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
from pdf2image import convert_from_path
import os
# --- CONFIGURATION ---
MODEL_ID = "numind/NuMarkdown-8B-Thinking"
# 1. Hardware Detection
# We explicitly check for CUDA. If not found, we default to CPU with float32.
if torch.cuda.is_available():
    device = "cuda"
    # bf16 is preferred where supported (Ampere+); otherwise fall back to fp16.
    dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    print(f"🚀 Running on GPU ({torch.cuda.get_device_name(0)}) with {dtype}")
else:
    device = "cpu"
    # CPU kernels for half precision are incomplete in many ops;
    # float32 avoids "Layer not implemented" errors.
    dtype = torch.float32
    print("⚠️ Running on CPU. This will be SLOW (2-5 mins per page). Requires ~32GB RAM.")
# 2. Load Model & Processor
print("Loading model... (This may take a while)")
try:
    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
    model = AutoModelForImageTextToText.from_pretrained(
        MODEL_ID,
        torch_dtype=dtype,
        trust_remote_code=True,
        low_cpu_mem_usage=True,  # stream weights during load to reduce peak RAM
    ).to(device)
    model.eval()  # inference only — disable dropout/training behavior
    print("✅ Model loaded successfully.")
except ValueError as e:
    # transformers raises ValueError when the installed release does not
    # recognize this model's architecture; a source install is required.
    print("\n🚨 CRITICAL ERROR: Transformers version is too old.")
    print("You MUST install from source: pip install git+https://github.com/huggingface/transformers.git\n")
    raise  # bare re-raise preserves the original traceback
def process_textbook(pdf_file, start_page, end_page):
    """Extract markdown text from a page range of an uploaded PDF.

    Args:
        pdf_file: Gradio file object exposing ``.name`` (a filesystem path),
            or None if nothing was uploaded.
        start_page: First page to process (1-based, inclusive).
        end_page: Last page to process (inclusive).

    Returns:
        A markdown string with one "### Page N" section per page, separated
        by horizontal rules — or a human-readable error message string.
    """
    if pdf_file is None:
        return "Error: Please upload a PDF file."
    first, last = int(start_page), int(end_page)
    # Validate before handing the range to pdf2image, which would otherwise
    # silently return an empty/odd result for nonsensical ranges.
    if first < 1 or last < first:
        return "Error: Invalid page range. Start must be >= 1 and <= End."
    try:
        # Rasterize only the requested pages; 150 dpi balances OCR quality vs. speed/RAM.
        images = convert_from_path(
            pdf_file.name,
            first_page=first,
            last_page=last,
            dpi=150
        )
        extracted_text = []
        for i, page_image in enumerate(images):
            page_num = first + i
            print(f"Processing page {page_num}...")
            # 3. Construct Qwen2.5-VL / NuMarkdown Prompt
            conversation = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image"},
                        {"type": "text", "text": "Convert this image to markdown."}
                    ],
                }
            ]
            # Apply chat template, then bundle text + image tensors together.
            text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
            inputs = processor(
                text=[text_prompt],
                images=[page_image],
                return_tensors="pt"
            ).to(device)
            # Match the vision tensor to the model dtype
            # (float32 on CPU, bf16/fp16 on GPU — see hardware detection above).
            inputs["pixel_values"] = inputs["pixel_values"].to(
                torch.float32 if device == "cpu" else dtype
            )
            # 4. Generate deterministically (greedy) — OCR output should not be sampled.
            with torch.no_grad():
                generated_ids = model.generate(
                    **inputs,
                    max_new_tokens=4096,
                    do_sample=False
                )
            # 5. Decode only the newly generated tokens (slice off the prompt).
            input_len = inputs.input_ids.shape[1]
            response = processor.batch_decode(
                generated_ids[:, input_len:],
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False
            )[0]
            extracted_text.append(f"### Page {page_num}\n\n{response}")
        return "\n\n---\n\n".join(extracted_text)
    except Exception as e:
        # UI boundary: surface the failure to the user instead of crashing the app.
        import traceback
        traceback.print_exc()
        return f"Error processing file: {str(e)}\n\n(If on CPU, check if you ran out of RAM. You need ~32GB for this model.)"
# 3. UI Layout
with gr.Blocks() as demo:
    gr.Markdown("## 📚 NuMarkdown-8B OCR (Qwen2.5-VL Architecture)")
    if device == "cpu":
        gr.Markdown("⚠️ **WARNING: Running on CPU.** Expect very slow performance (minutes per page).")
    with gr.Row():
        with gr.Column():
            file_input = gr.File(label="Upload Textbook PDF")
            with gr.Row():
                # precision=0 forces integer page numbers in the UI.
                s = gr.Number(value=1, label="Start Page", precision=0)
                e = gr.Number(value=1, label="End Page", precision=0)
            btn = gr.Button("Extract Text", variant="primary")
        with gr.Column():
            output = gr.Markdown()
    btn.click(fn=process_textbook, inputs=[file_input, s, e], outputs=output)

if __name__ == "__main__":
    demo.launch()