File size: 4,726 Bytes
be5bedb
 
cc2cf00
be5bedb
 
 
 
3f44afb
 
be5bedb
3f44afb
 
 
 
 
 
 
 
 
 
cc2cf00
3f44afb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0854b22
 
be5bedb
0854b22
be5bedb
29c9c09
b300836
0854b22
 
827133f
 
3f44afb
0854b22
 
09f7609
be5bedb
29c9c09
09f7609
3f44afb
0854b22
3f44afb
cc2cf00
 
 
 
 
 
 
 
 
 
3f44afb
cc2cf00
 
3f44afb
cc2cf00
 
 
 
3f44afb
 
 
 
 
 
 
cc2cf00
3f44afb
29c9c09
cc2cf00
 
3f44afb
 
29c9c09
3f44afb
 
cc2cf00
 
 
 
 
 
 
 
29c9c09
b300836
29c9c09
09f7609
29c9c09
 
3f44afb
 
 
be5bedb
3f44afb
faf7bcb
3f44afb
 
 
 
 
be5bedb
faf7bcb
b300836
e51788e
2238111
 
b300836
faf7bcb
 
29c9c09
e51788e
be5bedb
29c9c09
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
from pdf2image import convert_from_path
import os

# --- CONFIGURATION ---
MODEL_ID = "numind/NuMarkdown-8B-Thinking"

# 1. Hardware Detection
# Select the compute device and a matching precision up front; every later
# tensor cast keys off `dtype`, so CPU must stay in float32.
if not torch.cuda.is_available():
    # float32 avoids "Layer not implemented" errors on CPU backends.
    device, dtype = "cpu", torch.float32
    print("โš ๏ธ Running on CPU. This will be SLOW (2-5 mins per page). Requires ~32GB RAM.")
else:
    device = "cuda"
    # Prefer bfloat16 where the GPU supports it; otherwise fall back to float16.
    dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    print(f"๐Ÿš€ Running on GPU ({torch.cuda.get_device_name(0)}) with {dtype}")

# 2. Load Model & Processor
# NOTE: this runs at import time and downloads/loads an 8B-parameter checkpoint,
# so startup may take minutes and needs substantial RAM/VRAM.
print("Loading model... (This may take a while)")
try:
    # trust_remote_code is required: the repo ships custom modeling code.
    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
    model = AutoModelForImageTextToText.from_pretrained(
        MODEL_ID, 
        torch_dtype=dtype,
        trust_remote_code=True,
        low_cpu_mem_usage=True # optimized loading
    ).to(device)
    model.eval()  # inference mode: disables dropout / training-only behavior
    print("โœ… Model loaded successfully.")
except ValueError as e:
    # NOTE(review): assumes an unsupported-architecture ValueError means the
    # installed transformers is too old; other ValueErrors would print this
    # same message — confirm this is the only realistic failure here.
    print("\n๐Ÿ›‘ CRITICAL ERROR: Transformers version is too old.")
    print("You MUST install from source: pip install git+https://github.com/huggingface/transformers.git\n")
    raise e

def process_textbook(pdf_file, start_page, end_page):
    """OCR a page range of an uploaded PDF into Markdown via NuMarkdown-8B.

    Args:
        pdf_file: Gradio file object (exposes a ``.name`` filesystem path),
            or None when nothing was uploaded.
        start_page: First page to process (1-based, inclusive).
        end_page: Last page to process (inclusive).

    Returns:
        A Markdown string with one "### Page N" section per page, separated
        by horizontal rules — or a human-readable error message string.
    """
    if pdf_file is None:
        return "Error: Please upload a PDF file."

    # Validate the page range before doing any expensive work; the original
    # code silently returned an empty string for a reversed range.
    first, last = int(start_page), int(end_page)
    if first < 1 or last < first:
        return "Error: Invalid page range. Start must be >= 1 and End must be >= Start."

    try:
        # Rasterize only the requested pages (150 dpi balances OCR quality
        # against memory use per page).
        images = convert_from_path(
            pdf_file.name,
            first_page=first,
            last_page=last,
            dpi=150
        )
        if not images:
            return "Error: No pages found in the requested range."

        extracted_text = []

        for i, page_image in enumerate(images):
            page_num = first + i
            print(f"Processing page {page_num}...")

            # 3. Construct Qwen2.5-VL / NuMarkdown chat prompt:
            # one image placeholder plus the conversion instruction.
            conversation = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image"},
                        {"type": "text", "text": "Convert this image to markdown."}
                    ],
                }
            ]

            # Apply chat template to get the model-specific prompt string.
            text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

            # Prepare inputs and move them to the target device.
            inputs = processor(
                text=[text_prompt],
                images=[page_image],
                return_tensors="pt"
            ).to(device)

            # Match the model's precision. On CPU `dtype` is already float32,
            # so a single cast covers both the CPU and GPU paths (the old
            # if/else branches were redundant).
            inputs["pixel_values"] = inputs["pixel_values"].to(dtype)

            # 4. Generate deterministically (greedy decoding, no sampling).
            with torch.no_grad():
                generated_ids = model.generate(
                    **inputs,
                    max_new_tokens=4096,
                    do_sample=False
                )

            # 5. Decode, slicing off the echoed prompt tokens first.
            input_len = inputs.input_ids.shape[1]
            response = processor.batch_decode(
                generated_ids[:, input_len:],
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False
            )[0]

            extracted_text.append(f"### Page {page_num}\n\n{response}")

        return "\n\n---\n\n".join(extracted_text)

    except Exception as e:
        # UI boundary: report the failure to the user rather than crashing
        # the Gradio callback; full traceback still goes to the console.
        import traceback
        traceback.print_exc()
        return f"Error processing file: {str(e)}\n\n(If on CPU, check if you ran out of RAM. You need ~32GB for this model.)"

# 3. UI Layout
with gr.Blocks() as demo:
    gr.Markdown("## ๐Ÿ“š NuMarkdown-8B OCR (Qwen2.5-VL Architecture)")

    # Surface the slow-CPU caveat directly in the UI when no GPU was found.
    if device == "cpu":
        gr.Markdown("โš ๏ธ **WARNING: Running on CPU.** Expect very slow performance (minutes per page).")

    with gr.Row():
        # Left column: inputs (PDF + page range) and the trigger button.
        with gr.Column():
            pdf_upload = gr.File(label="Upload Textbook PDF")
            with gr.Row():
                page_start = gr.Number(value=1, label="Start Page", precision=0)
                page_end = gr.Number(value=1, label="End Page", precision=0)
            extract_btn = gr.Button("Extract Text", variant="primary")
        # Right column: rendered Markdown output.
        with gr.Column():
            result_md = gr.Markdown()

    # Wire the button to the OCR pipeline.
    extract_btn.click(
        fn=process_textbook,
        inputs=[pdf_upload, page_start, page_end],
        outputs=result_md,
    )

if __name__ == "__main__":
    demo.launch()