import os

import gradio as gr
import torch
from pdf2image import convert_from_path
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor

# --- CONFIGURATION ---
MODEL_ID = "numind/NuMarkdown-8B-Thinking"

# 1. Hardware Detection
# CUDA gets bf16 when the GPU supports it (fp16 otherwise); CPU must stay on
# float32 to avoid "Layer not implemented" errors in half-precision CPU kernels.
if torch.cuda.is_available():
    device = "cuda"
    dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    print(f"šŸš€ Running on GPU ({torch.cuda.get_device_name(0)}) with {dtype}")
else:
    device = "cpu"
    dtype = torch.float32  # CPU must use float32 to avoid "Layer not implemented" errors
    print("āš ļø Running on CPU. This will be SLOW (2-5 mins per page). Requires ~32GB RAM.")

# 2. Load Model & Processor (at import time so the UI starts ready to serve).
print("Loading model... (This may take a while)")
try:
    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
    model = AutoModelForImageTextToText.from_pretrained(
        MODEL_ID,
        torch_dtype=dtype,
        trust_remote_code=True,
        low_cpu_mem_usage=True,  # optimized loading
    ).to(device)
    model.eval()
    print("āœ… Model loaded successfully.")
except ValueError:
    print("\nšŸ›‘ CRITICAL ERROR: Transformers version is too old.")
    print("You MUST install from source: pip install git+https://github.com/huggingface/transformers.git\n")
    raise  # bare re-raise keeps the original traceback intact (was `raise e`)


def _ocr_page(page_image):
    """Run the NuMarkdown model on one PIL page image; return the markdown string."""
    # 3. Construct Qwen2.5-VL / NuMarkdown chat prompt with one image placeholder.
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "Convert this image to markdown."},
            ],
        }
    ]
    text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    # Prepare inputs and move them to the model's device.
    inputs = processor(
        text=[text_prompt],
        images=[page_image],
        return_tensors="pt",
    ).to(device)

    # Keep pixel values in the model's compute dtype
    # (float32 on CPU, bf16/fp16 on GPU) so generate() doesn't hit a dtype mismatch.
    target_dtype = torch.float32 if device == "cpu" else dtype
    inputs["pixel_values"] = inputs["pixel_values"].to(target_dtype)

    # 4. Generate (greedy decoding, no gradients needed at inference time).
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=4096,
            do_sample=False,
        )

    # 5. Decode — slice off the prompt tokens so only new text remains.
    input_len = inputs.input_ids.shape[1]
    return processor.batch_decode(
        generated_ids[:, input_len:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]


def process_textbook(pdf_file, start_page, end_page):
    """Convert a page range of an uploaded PDF into markdown text.

    Args:
        pdf_file: Gradio file payload — either a tempfile-like object with a
            ``.name`` attribute or (newer Gradio) a plain path string.
        start_page: First page to process (1-based, inclusive).
        end_page: Last page to process (inclusive).

    Returns:
        A markdown string with one ``### Page N`` section per page, or a
        human-readable error message on failure.
    """
    if pdf_file is None:
        return "Error: Please upload a PDF file."

    start, end = int(start_page), int(end_page)
    # Guard against inverted or out-of-range page selections before rendering.
    if start < 1 or end < start:
        return "Error: Invalid page range. Start must be >= 1 and End must be >= Start."

    # gr.File may deliver an object or a bare path depending on Gradio version.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    try:
        # Rasterize only the requested pages; 150 dpi balances quality vs. memory.
        images = convert_from_path(
            pdf_path,
            first_page=start,
            last_page=end,
            dpi=150,
        )

        extracted_text = []
        for i, page_image in enumerate(images):
            page_num = start + i
            print(f"Processing page {page_num}...")
            response = _ocr_page(page_image)
            extracted_text.append(f"### Page {page_num}\n\n{response}")

        return "\n\n---\n\n".join(extracted_text)

    except Exception as e:
        import traceback
        traceback.print_exc()
        return f"Error processing file: {str(e)}\n\n(If on CPU, check if you ran out of RAM. You need ~32GB for this model.)"


# 3. UI Layout
with gr.Blocks() as demo:
    gr.Markdown("## šŸ“š NuMarkdown-8B OCR (Qwen2.5-VL Architecture)")
    if device == "cpu":
        gr.Markdown("āš ļø **WARNING: Running on CPU.** Expect very slow performance (minutes per page).")

    with gr.Row():
        with gr.Column():
            file_input = gr.File(label="Upload Textbook PDF")
            with gr.Row():
                s = gr.Number(value=1, label="Start Page", precision=0)
                e = gr.Number(value=1, label="End Page", precision=0)
            btn = gr.Button("Extract Text", variant="primary")
        with gr.Column():
            output = gr.Markdown()

    btn.click(fn=process_textbook, inputs=[file_input, s, e], outputs=output)

if __name__ == "__main__":
    demo.launch()