import os

import gradio as gr
import torch
from pdf2image import convert_from_path
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor

# --- CONFIGURATION ---
MODEL_ID = "numind/NuMarkdown-8B-Thinking"

# 1. Hardware Detection
# CUDA gets bf16 when the GPU supports it (fp16 otherwise); CPU must stay on
# float32 to avoid "Layer not implemented" errors in half-precision CPU kernels.
if torch.cuda.is_available():
    device = "cuda"
    dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    print(f"šŸš€ Running on GPU ({torch.cuda.get_device_name(0)}) with {dtype}")
else:
    device = "cpu"
    dtype = torch.float32  # CPU must use float32 to avoid "Layer not implemented" errors
    print("āš ļø Running on CPU. This will be SLOW (2-5 mins per page). Requires ~32GB RAM.")

# 2. Load Model & Processor (at import time so the UI starts ready to serve).
print("Loading model... (This may take a while)")
try:
    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
    model = AutoModelForImageTextToText.from_pretrained(
        MODEL_ID,
        torch_dtype=dtype,
        trust_remote_code=True,
        low_cpu_mem_usage=True,  # optimized loading
    ).to(device)
    model.eval()
    print("āœ… Model loaded successfully.")
except ValueError:
    print("\nšŸ›‘ CRITICAL ERROR: Transformers version is too old.")
    print("You MUST install from source: pip install git+https://github.com/huggingface/transformers.git\n")
    raise  # bare re-raise keeps the original traceback intact (was `raise e`)


def _ocr_page(page_image):
    """Run the NuMarkdown model on one PIL page image; return the markdown string."""
    # 3. Construct Qwen2.5-VL / NuMarkdown chat prompt with one image placeholder.
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "Convert this image to markdown."},
            ],
        }
    ]
    text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    # Prepare inputs and move them to the model's device.
    inputs = processor(
        text=[text_prompt],
        images=[page_image],
        return_tensors="pt",
    ).to(device)

    # Keep pixel values in the model's compute dtype
    # (float32 on CPU, bf16/fp16 on GPU) so generate() doesn't hit a dtype mismatch.
    target_dtype = torch.float32 if device == "cpu" else dtype
    inputs["pixel_values"] = inputs["pixel_values"].to(target_dtype)

    # 4. Generate (greedy decoding, no gradients needed at inference time).
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=4096,
            do_sample=False,
        )

    # 5. Decode — slice off the prompt tokens so only new text remains.
    input_len = inputs.input_ids.shape[1]
    return processor.batch_decode(
        generated_ids[:, input_len:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]


def process_textbook(pdf_file, start_page, end_page):
    """Convert a page range of an uploaded PDF into markdown text.

    Args:
        pdf_file: Gradio file payload — either a tempfile-like object with a
            ``.name`` attribute or (newer Gradio) a plain path string.
        start_page: First page to process (1-based, inclusive).
        end_page: Last page to process (inclusive).

    Returns:
        A markdown string with one ``### Page N`` section per page, or a
        human-readable error message on failure.
    """
    if pdf_file is None:
        return "Error: Please upload a PDF file."

    start, end = int(start_page), int(end_page)
    # Guard against inverted or out-of-range page selections before rendering.
    if start < 1 or end < start:
        return "Error: Invalid page range. Start must be >= 1 and End must be >= Start."

    # gr.File may deliver an object or a bare path depending on Gradio version.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    try:
        # Rasterize only the requested pages; 150 dpi balances quality vs. memory.
        images = convert_from_path(
            pdf_path,
            first_page=start,
            last_page=end,
            dpi=150,
        )

        extracted_text = []
        for i, page_image in enumerate(images):
            page_num = start + i
            print(f"Processing page {page_num}...")
            response = _ocr_page(page_image)
            extracted_text.append(f"### Page {page_num}\n\n{response}")

        return "\n\n---\n\n".join(extracted_text)

    except Exception as e:
        import traceback
        traceback.print_exc()
        return f"Error processing file: {str(e)}\n\n(If on CPU, check if you ran out of RAM. You need ~32GB for this model.)"


# 3. UI Layout
with gr.Blocks() as demo:
    gr.Markdown("## šŸ“š NuMarkdown-8B OCR (Qwen2.5-VL Architecture)")
    if device == "cpu":
        gr.Markdown("āš ļø **WARNING: Running on CPU.** Expect very slow performance (minutes per page).")

    with gr.Row():
        with gr.Column():
            file_input = gr.File(label="Upload Textbook PDF")
            with gr.Row():
                s = gr.Number(value=1, label="Start Page", precision=0)
                e = gr.Number(value=1, label="End Page", precision=0)
            btn = gr.Button("Extract Text", variant="primary")
        with gr.Column():
            output = gr.Markdown()

    btn.click(fn=process_textbook, inputs=[file_input, s, e], outputs=output)

if __name__ == "__main__":
    demo.launch()