# app.py
#
# Live webcam captioning with LLaVA (llava-1.5-7b) behind a Gradio UI.
# Everything runs on CPU, so each caption takes several seconds; the
# capture loop sleeps between frames to keep the machine responsive.

import time

import cv2
import gradio as gr
import torch
from PIL import Image
from transformers import LlavaProcessor, LlavaForConditionalGeneration

# --- Load LLaVA model (MiniGPT-4 style), CPU only ---
model_id = "llava-hf/llava-1.5-7b-hf"
processor = LlavaProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(model_id)
device = torch.device("cpu")
model.to(device)
model.eval()  # inference only; disables dropout etc.

# Seconds between captioned frames — CPU inference is slow.
FRAME_INTERVAL_S = 10.0


def webcam_llava():
    """Generator: grab webcam frames and yield (RGB frame, LLaVA caption) pairs.

    Yields:
        tuple[numpy.ndarray, str]: the RGB frame and the generated caption.

    Raises:
        RuntimeError: if the default webcam (device 0) cannot be opened.
    """
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        raise RuntimeError("Webcam could not be opened.")
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break  # camera disconnected or stream ended

            # OpenCV delivers BGR; PIL / transformers expect RGB.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(rgb_frame)

            # --- Compose prompt for LLaVA ---
            # LLaVA-1.5 requires the <image> placeholder so the processor
            # knows where to splice the vision tokens into the prompt.
            prompt = "USER: <image>\nDescribe this scene in detail.\nASSISTANT:"
            # Keyword args: the positional order of (text, images) has
            # changed across transformers versions.
            inputs = processor(
                text=prompt, images=pil_image, return_tensors="pt"
            ).to(device)

            # No gradients needed at inference time; saves memory on CPU.
            with torch.inference_mode():
                output = model.generate(**inputs, max_new_tokens=200)

            # Decode only the newly generated tokens — decoding output[0]
            # whole would echo the prompt back into the caption.
            prompt_len = inputs["input_ids"].shape[1]
            caption = processor.decode(
                output[0][prompt_len:], skip_special_tokens=True
            ).strip()

            # Yield current frame + caption to the Gradio streaming outputs.
            yield rgb_frame, caption

            # NOTE: cv2.waitKey only pumps events for a HighGUI window and
            # returns immediately when none exists (Gradio is the UI here),
            # so time.sleep is the correct way to pace the loop.
            time.sleep(FRAME_INTERVAL_S)
    finally:
        # Always release the camera, even if capture/generation raises.
        cap.release()


# --- Gradio app ---
with gr.Blocks() as demo:
    gr.Markdown("# 🎥 LLaVA MiniGPT-4 Webcam Captioning\n_(CPU, slow but descriptive)_")
    webcam_display = gr.Image(label="Live Webcam")
    description = gr.Textbox(label="LLaVA Caption")
    demo.load(
        fn=webcam_llava,
        inputs=None,
        outputs=[webcam_display, description],
        every=1,
    )

demo.launch()