import os
import gradio as gr
import torch
import spaces
import cv2
import numpy as np
from PIL import Image
from typing import List
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# ========== Configuration ==========
MODEL_ID = "WoWolf/Qwen2_5vl-7b-fm-tuned"
MAX_FRAMES = 48
MAX_NEW_TOKENS = 128
TEMPERATURE = 1.0

# ========== Video Examples Configuration ==========
VIDEO_EXAMPLES = {
    "1_raw.mp4": {
        "path": "1_raw.mp4",
        "questions": ["What's happening in this video?", "Which hand holds the pen?"],
    },
    "4_raw.mp4": {
        "path": "4_raw.mp4",
        "questions": ["What's happening in this video?", "What is the main action in the video?"],
    },
    "6_raw.mp4": {
        "path": "6_raw.mp4",
        "questions": ["What's happening in this video?", "What's the right hand doing?"],
    },
}

# ========== Load Model & Processor ==========
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
)

# ========== Video Frame Extraction ==========
def extract_video_frames(video_path: str, max_frames: int = 8) -> List[Image.Image]:
    """Extract key frames from video using OpenCV"""
    cap = cv2.VideoCapture(video_path)
    frames = []

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if total_frames == 0:
        cap.release()
        return frames

    # Select frames evenly
    frame_indices = np.linspace(0, total_frames - 1, min(max_frames, total_frames), dtype=int)

    for frame_idx in frame_indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
        ret, frame = cap.read()
        if ret:
            # Convert BGR to RGB
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(Image.fromarray(frame_rgb))

    cap.release()
    return frames

# ========== Message Builder ==========
SYSTEM_PROMPT = (
    "You are a helpful assistant that watches a user-provided video and answers "
    "questions about it concisely and accurately."
)

def build_messages(frames: List[Image.Image], question: str, fps: float = 1.0):
    """Build messages in Qwen-VL format"""
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": SYSTEM_PROMPT}],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": frames,
                    "fps": fps,
                },
                {"type": "text", "text": question},
            ],
        },
    ]
    return messages

# ========== Helper Functions ==========
def update_video_display(video_name):
    """Update video display and example questions when video is selected"""
    if video_name is None:
        return None, ""
    video_info = VIDEO_EXAMPLES[video_name]
    video_path = video_info["path"]
    example_questions = "\n".join([f"• {q}" for q in video_info["questions"]])
    return video_path, example_questions

def fill_question(video_name, question_idx):
    """Fill the question textbox with selected example question"""
    if video_name is None:
        return ""
    questions = VIDEO_EXAMPLES[video_name]["questions"]
    if 0 <= question_idx < len(questions):
        return questions[question_idx]
    return ""

# ========== Inference ==========
@spaces.GPU
@torch.inference_mode()
def answer(video_name, question):
    if video_name is None:
        return "Please select a video first."

    if not question or question.strip() == "":
        question = "Describe this video in detail."

    video_path = VIDEO_EXAMPLES[video_name]["path"]

    # Extract frames from video
    frames = extract_video_frames(video_path, max_frames=MAX_FRAMES)
    if not frames:
        return "Error: Unable to extract frames from video."
    # Build messages
    messages = build_messages(frames, question, fps=1.0)

    # Apply chat template
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Process vision info
    image_inputs, video_inputs = process_vision_info(messages)

    # Prepare inputs
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to(model.device)

    # Generation settings
    gen_kwargs = dict(
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=(TEMPERATURE > 0.0),
        temperature=TEMPERATURE if TEMPERATURE > 0 else None,
        pad_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
    )

    # Generate
    generated_ids = model.generate(**inputs, **gen_kwargs)

    # Decode output
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]

    return output_text.strip()

# ========== Gradio UI ==========
with gr.Blocks(title="Video Q&A with Qwen2.5-VL-7B") as demo:
    gr.Markdown(
        """
        # FoundationMotion: Auto-Labeling and Reasoning about Spatial Movement in Videos

        Select a video, ask a question, and get an answer!
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            # Video selector dropdown
            video_selector = gr.Dropdown(
                choices=list(VIDEO_EXAMPLES.keys()),
                label="Select a Video",
                value=None,
                interactive=True,
            )

            # Video display (read-only)
            video_display = gr.Video(
                label="Video Preview",
                height=400,
                interactive=False,
            )

        with gr.Column(scale=1):
            # Example questions display
            example_questions_display = gr.Textbox(
                label="Example Questions (click buttons below to use)",
                lines=3,
                interactive=False,
            )

            # Buttons for quick question selection
            with gr.Row():
                q1_btn = gr.Button("Use Question 1", size="sm")
                q2_btn = gr.Button("Use Question 2", size="sm")

            question = gr.Textbox(
                label="Your Question",
                placeholder="Type your question or click an example button above",
                lines=2,
            )

            ask_btn = gr.Button("Ask", variant="primary")
            output = gr.Textbox(label="Answer", lines=10, show_copy_button=True)

    # Event handlers
    video_selector.change(
        fn=update_video_display,
        inputs=[video_selector],
        outputs=[video_display, example_questions_display],
    )

    q1_btn.click(
        fn=lambda v: fill_question(v, 0),
        inputs=[video_selector],
        outputs=[question],
    )

    q2_btn.click(
        fn=lambda v: fill_question(v, 1),
        inputs=[video_selector],
        outputs=[question],
    )

    ask_btn.click(
        fn=answer,
        inputs=[video_selector, question],
        outputs=[output],
    )

# ========== Launch ==========
if __name__ == "__main__":
    demo.launch()