import gradio as gr
import torch
import cv2
import numpy as np
from PIL import Image
from transformers import LlavaNextVideoForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
from gtts import gTTS
import tempfile

print("Loading model from Hugging Face (4-bit mode)...")

# ====================== 4-BIT CONFIG ======================
# Hub repository id shared by the checkpoint and its processor, kept in one
# place so the two loads below cannot drift apart.
_MODEL_REPO = "Remostart/sign-language"

# 4-bit NF4 weights with double quantization; compute runs in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Device placement is delegated to the loader through device_map="auto".
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    _MODEL_REPO,
    device_map="auto",
    quantization_config=bnb_config,
)
processor = AutoProcessor.from_pretrained(_MODEL_REPO)

print("✅ Model loaded successfully in 4-bit!")

def _sample_frames(video_path, num_frames=8):
    """Return evenly spaced RGB frames from the video at *video_path*.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        num_frames: How many frame positions to sample across the clip.

    Returns:
        A list of ``PIL.Image`` frames in RGB order. The list is empty when
        the video cannot be opened, reports no frames, or no sampled frame
        could be decoded.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            return []
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total <= 0:
            return []

        frames = []
        # Positions may repeat when the clip has fewer frames than requested;
        # repeats are kept so the number of sampled positions stays the same.
        for position in np.linspace(0, total - 1, num_frames, dtype=int):
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(position))
            ok, frame = cap.read()
            if ok:
                # OpenCV decodes to BGR; convert before building the PIL image.
                frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
        return frames
    finally:
        # Release the capture even if seeking or colour conversion raises.
        cap.release()


def _synthesize_speech(text):
    """Write *text* as spoken English to a temporary MP3 file.

    Args:
        text: The sentence(s) to speak.

    Returns:
        The path of the MP3 file, or ``None`` when *text* is blank or the
        gTTS request fails.
    """
    import os

    from gtts import gTTSError

    # gTTS refuses empty input, so there is nothing to synthesize.
    if not text.strip():
        return None

    # Reserve a unique path and close the handle right away: gTTS reopens the
    # path itself, and the original code left this descriptor open.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as handle:
        audio_path = handle.name

    try:
        gTTS(text=text, lang='en').save(audio_path)
    except gTTSError as err:
        # Speech is best-effort: keep the translation and drop the audio.
        print(f"Text-to-speech failed: {err}")
        os.remove(audio_path)
        return None
    return audio_path


def translate_and_speak(video_path):
    """Translate a sign-language video to English text and spoken audio.

    Args:
        video_path: Path of the uploaded video, or ``None``/empty when
            nothing was uploaded.

    Returns:
        A ``(translation, audio_path)`` tuple. ``audio_path`` is ``None`` when
        no video was supplied, the video could not be read, the model
        produced no text, or speech synthesis failed; in the first two cases
        ``translation`` carries a short message for the user instead.
    """
    if not video_path:
        return "Please upload a video.", None

    frames = _sample_frames(video_path)
    if not frames:
        return "Could not read video.", None

    # Single-turn prompt: one video placeholder followed by the instruction.
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "video"},
                {"type": "text", "text": "Translate the sign language in this video to English."}
            ]
        }
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    inputs = processor(text=prompt, videos=frames, return_tensors="pt").to(model.device)

    try:
        # Deterministic two-beam search, without tracking gradients.
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=50,
                do_sample=False,
                num_beams=2,
                repetition_penalty=1.2
            )
        decoded = processor.decode(output[0], skip_special_tokens=True)
    finally:
        # Clear the CUDA cache whether or not generation succeeded.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # The decoded text still contains the prompt; keep only what follows the
    # last "ASSISTANT:" marker (the whole string when the marker is absent).
    translation = decoded.split("ASSISTANT:")[-1].strip()

    return translation, _synthesize_speech(translation)

# ====================== GRADIO INTERFACE ======================
# One-page UI: a video input and a button, with the translation shown as text
# and returned as an audio file.
with gr.Blocks(title="Sign Language to Speech") as demo:
    gr.Markdown("# 🖐️ Sign Language Translator (Text + Voice)")
    gr.Markdown("Upload a sign language video and get the translation as **text and voice**.")

    video = gr.Video(label="Upload Video")
    btn = gr.Button("Translate & Speak", variant="primary")

    text_out = gr.Textbox(label="Translation (Text)", lines=3)
    audio_out = gr.Audio(label="Voice Output", type="filepath")

    # The handler returns (text, audio path), matching the two outputs in order.
    btn.click(translate_and_speak, inputs=video, outputs=[text_out, audio_out])

# Start the web server only when this file is run as a script, so importing
# the module (for example to reuse `demo`) does not block inside launch().
if __name__ == "__main__":
    demo.launch()