import os

import gradio as gr
import torch
from moviepy.editor import VideoFileClip
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Pick dtype/device once: fp16 on GPU for speed, fp32 on CPU for compatibility.
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model_id = "openai/whisper-large-v3"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, use_safetensors=True
)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=400,  # Adjusted to a lower value
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)


def transcribe_video(video_path):
    """Transcribe the audio track of a video file using Whisper.

    Args:
        video_path: Path to the uploaded video file (any container format
            moviepy/ffmpeg can read, not only MP4).

    Returns:
        The transcribed text as a string, or an error message string if
        extraction or transcription fails.
    """
    audio_path = None
    try:
        # Derive the temp audio path from the real extension instead of
        # str.replace(".mp4", ...), which was a silent no-op for any other
        # container and left ffmpeg writing MP3 data under a video suffix.
        base, _ext = os.path.splitext(video_path)
        audio_path = base + ".mp3"

        # Context manager ensures the clip's ffmpeg readers are closed
        # even if audio extraction fails.
        with VideoFileClip(video_path) as video:
            if video.audio is None:
                return "The video has no audio track."
            video.audio.write_audiofile(audio_path, logger=None)

        result = pipe(audio_path)
        # The pipeline returns a dict (text + chunk timestamps); the Gradio
        # "text" output expects a plain string, so unwrap the transcription.
        return result["text"]
    except Exception as e:  # surface failures in the UI instead of crashing
        return f"An error occurred: {e}"
    finally:
        # Remove the extracted audio so repeated uploads don't accumulate
        # files on disk.
        if audio_path and os.path.exists(audio_path):
            os.remove(audio_path)


# Gradio UI: one video input, one text output.
iface = gr.Interface(
    fn=transcribe_video,
    inputs=gr.Video(label="Upload Video"),
    outputs="text",
    title="Video Transcription with Whisper",
    description="Upload a video to transcribe its audio content.",
)

# Guard the launch so the module can be imported (e.g. by tests or a WSGI
# wrapper) without starting a server.
if __name__ == "__main__":
    iface.launch()