import os
import time
from threading import Thread

import spaces  # Hugging Face Spaces helper that provides the @spaces.GPU decorator
import torch
from PIL import Image
from transformers import (
    AutoProcessor,
    AutoModelForCausalLM,
    Qwen2_5_VLForConditionalGeneration,
    TextIteratorStreamer,
)
from qwen_vl_utils import process_vision_info

# Try importing Qwen3VL if available (requires a recent transformers release)
try:
    from transformers import Qwen3VLForConditionalGeneration
except ImportError:
    Qwen3VLForConditionalGeneration = None

MAX_MAX_NEW_TOKENS = 4096
DEFAULT_MAX_NEW_TOKENS = 2048
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load Chandra-OCR (Qwen3-VL based; skipped when the class is unavailable)
MODEL_ID_V = "datalab-to/chandra"
processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
if Qwen3VLForConditionalGeneration:
    model_v = Qwen3VLForConditionalGeneration.from_pretrained(
        MODEL_ID_V,
        trust_remote_code=True,
        torch_dtype=torch.float16,
    ).to(device).eval()
else:
    model_v = None

# Load Nanonets-OCR2-3B
MODEL_ID_X = "nanonets/Nanonets-OCR2-3B"
processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_X,
    trust_remote_code=True,
    torch_dtype=torch.float16,
).to(device).eval()

# Load Dots.OCR from the patched repository
MODEL_PATH_D = "strangervisionhf/dots.ocr-base-fix"
processor_d = AutoProcessor.from_pretrained(MODEL_PATH_D, trust_remote_code=True)
model_d = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH_D,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
).eval()

# Load olmOCR-2-7B-1025
MODEL_ID_M = "allenai/olmOCR-2-7B-1025"
processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_M,
    trust_remote_code=True,
    torch_dtype=torch.float16,
).to(device).eval()

# Load DeepSeek-OCR
MODEL_ID_DS = "deepseek-ai/deepseek-ocr"
processor_ds = AutoProcessor.from_pretrained(MODEL_ID_DS, trust_remote_code=True)
model_ds = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_DS,
    trust_remote_code=True,
    torch_dtype=torch.float16,
).to(device).eval()
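# NOTE: all five checkpoints are loaded eagerly at import time, so this setup
# assumes a GPU with enough memory to hold the four fp16 models plus Dots.OCR
# in bf16 simultaneously. Dots.OCR additionally requires the flash-attn
# package to be installed for attn_implementation="flash_attention_2".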
@spaces.GPU
def generate_image(
    model_name: str,
    text: str,
    image: Image.Image,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
    top_k: int,
    repetition_penalty: float,
):
    """
    Generate a response with the selected OCR model for an image input,
    streaming both raw text and Markdown-formatted text.

    Args:
        model_name: Name of the OCR model to use.
        text: Prompt text for the model.
        image: PIL Image object to process.
        max_new_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature.
        top_p: Nucleus sampling parameter.
        top_k: Top-k sampling parameter.
        repetition_penalty: Penalty for repeating tokens.

    Yields:
        tuple: (raw_text, markdown_text)
    """
    # Select model and processor based on model_name
    if model_name == "olmOCR-2-7B-1025":
        processor = processor_m
        model = model_m
    elif model_name == "Nanonets-OCR2-3B":
        processor = processor_x
        model = model_x
    elif model_name == "Chandra-OCR":
        if model_v is None:
            yield "Chandra-OCR model not available.", "Chandra-OCR model not available."
            return
        processor = processor_v
        model = model_v
    elif model_name == "Dots.OCR":
        processor = processor_d
        model = model_d
    elif model_name == "DeepSeek-OCR":
        processor = processor_ds
        model = model_ds
    else:
        yield "Invalid model selected.", "Invalid model selected."
        return

    if image is None:
        yield "Please upload an image.", "Please upload an image."
        return

    # Prepare messages in chat format
    messages = [{
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": text},
        ],
    }]

    # Apply chat template
    prompt_full = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Process inputs
    inputs = processor(
        text=[prompt_full],
        images=[image],
        return_tensors="pt",
        padding=True,
    ).to(device)

    # Set up streaming generation
    streamer = TextIteratorStreamer(
        processor, skip_prompt=True, skip_special_tokens=True
    )
    generation_kwargs = {
        **inputs,
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "repetition_penalty": repetition_penalty,
    }

    # Run generation in a separate thread so the streamer can be consumed here
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Stream partial results as they arrive
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        buffer = buffer.replace("<|im_end|>", "")
        time.sleep(0.01)
        yield buffer, buffer

    # Ensure the generation thread completes
    thread.join()


# Gradio interface
if __name__ == "__main__":
    import gradio as gr

    with gr.Blocks() as demo:
        gr.Markdown("# Multi-Model OCR Application")
        gr.Markdown("Upload an image and select a model to extract text.")

        with gr.Row():
            with gr.Column():
                model_selector = gr.Dropdown(
                    choices=[
                        "olmOCR-2-7B-1025",
                        "Nanonets-OCR2-3B",
                        "Chandra-OCR",
                        "Dots.OCR",
                        "DeepSeek-OCR",
                    ],
                    value="DeepSeek-OCR",
                    label="Select OCR Model",
                )
                image_input = gr.Image(type="pil", label="Upload Image")
                text_input = gr.Textbox(
                    value="Extract all text from this image.",
                    label="Prompt",
                )

                with gr.Accordion("Advanced Settings", open=False):
                    max_tokens = gr.Slider(
                        minimum=1,
                        maximum=MAX_MAX_NEW_TOKENS,
                        value=DEFAULT_MAX_NEW_TOKENS,
                        step=1,
                        label="Max New Tokens",
                    )
                    temperature = gr.Slider(
                        minimum=0.1, maximum=2.0, value=0.7, step=0.1,
                        label="Temperature",
                    )
                    top_p = gr.Slider(
                        minimum=0.0, maximum=1.0, value=0.9, step=0.05,
                        label="Top P",
                    )
                    top_k = gr.Slider(
                        minimum=1, maximum=100, value=50, step=1,
                        label="Top K",
                    )
                    repetition_penalty = gr.Slider(
                        minimum=1.0, maximum=2.0, value=1.1, step=0.1,
                        label="Repetition Penalty",
                    )

                submit_btn = gr.Button("Extract Text", variant="primary")

            with gr.Column():
                output_text = gr.Textbox(label="Extracted Text", lines=20)
                output_markdown = gr.Markdown(label="Formatted Output")

        submit_btn.click(
            fn=generate_image,
            inputs=[
                model_selector,
                text_input,
                image_input,
                max_tokens,
                temperature,
                top_p,
                top_k,
                repetition_penalty,
            ],
            outputs=[output_text, output_markdown],
        )

    demo.launch()
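# A minimal sketch of driving generate_image() outside the Gradio UI
# (commented out; "sample_page.png" is a hypothetical placeholder file).
# Because the function is a generator, the last yielded value holds the
# complete transcription:
#
#   img = Image.open("sample_page.png")
#   result = ""
#   for raw_text, _ in generate_image(
#       "Nanonets-OCR2-3B", "Extract all text from this image.",
#       img, DEFAULT_MAX_NEW_TOKENS, 0.7, 0.9, 50, 1.1,
#   ):
#       result = raw_text
#   print(result)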