# import os # import gradio as gr # import torch # import pytube # import tempfile # from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline # from pytube import YouTube # # Set environment variables for Hugging Face Space # os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128" # # Model configuration # MODEL_NAME = "openai/whisper-large-v3-turbo" # device = "cuda:0" if torch.cuda.is_available() else "cpu" # torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 # # Setup model and processor # def load_model(): # model = AutoModelForSpeechSeq2Seq.from_pretrained( # MODEL_NAME, # torch_dtype=torch_dtype, # low_cpu_mem_usage=True, # use_safetensors=True # ) # model.to(device) # processor = AutoProcessor.from_pretrained(MODEL_NAME) # pipe = pipeline( # "automatic-speech-recognition", # model=model, # tokenizer=processor.tokenizer, # feature_extractor=processor.feature_extractor, # torch_dtype=torch_dtype, # device=device, # ) # return pipe # # Load model globally - will be cached # pipe = load_model() # # Transcription function for audio files and microphone # def transcribe(audio_path, task="transcribe"): # if audio_path is None: # return "Please provide an audio input." # # Set task-specific generation parameters # generate_kwargs = {} # if task == "translate": # generate_kwargs["task"] = "translate" # # Process the audio # try: # result = pipe(audio_path, generate_kwargs=generate_kwargs) # return result["text"] # except Exception as e: # return f"Error during transcription: {str(e)}" # # YouTube video transcription function # def yt_transcribe(youtube_url, task="transcribe"): # if not youtube_url or not youtube_url.strip(): # return "Please enter a YouTube URL", "No transcription available." 
# try: # # Download audio from YouTube # yt = YouTube(youtube_url) # video = yt.streams.filter(only_audio=True).first() # # Create a temporary file to store the audio # with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as temp_file: # temp_path = temp_file.name # # Download the audio to the temporary file # video.download(filename=temp_path) # # Set task-specific generation parameters # generate_kwargs = {} # if task == "translate": # generate_kwargs["task"] = "translate" # # Process the audio # result = pipe(temp_path, generate_kwargs=generate_kwargs) # # Create an HTML element to display video thumbnail # video_id = youtube_url.split("v=")[-1].split("&")[0] # thumbnail_url = f"https://img.youtube.com/vi/{video_id}/0.jpg" # html_output = f'

# <h3>{yt.title}</h3> <p>Channel: {yt.author}</p>'
# # Clean up the temporary file # os.unlink(temp_path) # return html_output, result["text"] # except Exception as e: # return f"Error processing YouTube video: {str(e)}", "Transcription failed." # # Create Gradio interfaces # mic_transcribe = gr.Interface( # fn=transcribe, # inputs=[ # gr.Audio(source="microphone", type="filepath", optional=True), # gr.Radio(["transcribe", "translate"], label="Task", default="transcribe"), # ], # outputs="text", # layout="horizontal", # theme="huggingface", # title="Whisper Large V3 Turbo: Transcribe Audio", # description=( # "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the OpenAI Whisper" # f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files" # " of arbitrary length." # ), # allow_flagging="never", # ) # file_transcribe = gr.Interface( # fn=transcribe, # inputs=[ # gr.Audio(source="upload", type="filepath", optional=True, label="Audio file"), # gr.Radio(["transcribe", "translate"], label="Task", default="transcribe"), # ], # outputs="text", # layout="horizontal", # theme="huggingface", # title="Whisper Large V3 Turbo: Transcribe Audio", # description=( # "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the OpenAI Whisper" # f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files" # " of arbitrary length." # ), # allow_flagging="never", # ) # yt_interface = gr.Interface( # fn=yt_transcribe, # inputs=[ # gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"), # gr.Radio(["transcribe", "translate"], label="Task", default="transcribe") # ], # outputs=["html", "text"], # layout="horizontal", # theme="huggingface", # title="Whisper Large V3 Turbo: Transcribe YouTube", # description=( # "Transcribe long-form YouTube videos with the click of a button!
# Demo uses the OpenAI Whisper checkpoint" # f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of" # " arbitrary length." # ), # allow_flagging="never", # ) # # Create the tabbed interface # demo = gr.Blocks() # with demo: # gr.TabbedInterface([mic_transcribe, file_transcribe, yt_interface], ["Microphone", "Audio file", "YouTube"]) # # Launch the app # if __name__ == "__main__": # demo.launch(enable_queue=True)

"""Gradio demo: transcribe or translate speech with Whisper large-v3-turbo.

Three tabs are exposed via a ``gr.TabbedInterface``:

* Microphone input
* Uploaded audio files
* YouTube URLs (audio fetched with ``pytube``)

Intended to run inside a Hugging Face Space, hence the cache redirection
to ``/tmp`` below (the Space's home directory may be read-only).
"""

import os
import tempfile

import gradio as gr
import pytube  # noqa: F401 -- kept from original; only the YouTube class below is used here
import shutil  # noqa: F401 -- kept from original; unused in the code shown in this file
import torch
from pytube import YouTube
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Redirect every framework cache into /tmp so nothing tries to write to a
# read-only home directory inside the Space container.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache"
os.environ["HF_HOME"] = "/tmp/hf_home"
os.environ["HUGGINGFACE_HUB_CACHE"] = "/tmp/hf_hub_cache"
os.environ["MPLCONFIGDIR"] = "/tmp/matplotlib_config"

# Create the cache directories up front (idempotent).
for cache_dir in (
    "/tmp/transformers_cache",
    "/tmp/hf_home",
    "/tmp/hf_hub_cache",
    "/tmp/matplotlib_config",
):
    os.makedirs(cache_dir, exist_ok=True)

# Model configuration: fp16 on GPU, fp32 on CPU.
MODEL_NAME = "openai/whisper-large-v3-turbo"
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Print environment info for debugging in the Space logs.
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device name: {torch.cuda.get_device_name(0)}")
print(f"Using device: {device}")
print(f"Using dtype: {torch_dtype}")


def load_model():
    """Load the Whisper model + processor and return an ASR pipeline.

    Progress is printed to stdout for Space-log debugging. Any failure is
    logged and re-raised so the app fails fast at startup instead of
    serving a broken pipeline.
    """
    try:
        print("Loading model...")
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
            cache_dir="/tmp/transformers_cache",
        )
        model.to(device)
        print("Model loaded successfully!")

        print("Loading processor...")
        processor = AutoProcessor.from_pretrained(
            MODEL_NAME,
            cache_dir="/tmp/transformers_cache",
        )
        print("Processor loaded successfully!")

        print("Creating pipeline...")
        asr_pipe = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            torch_dtype=torch_dtype,
            device=device,
        )
        print("Pipeline created successfully!")
        return asr_pipe
    except Exception as e:
        print(f"Error loading model: {e}")
        raise


# Load model globally once at import time - reused by every request.
pipe = load_model()


def transcribe(audio_path, task="transcribe"):
    """Transcribe (or translate to English) a single audio file.

    Args:
        audio_path: Filesystem path to the audio clip, or ``None`` when the
            user submitted nothing (Gradio passes ``None`` for empty inputs).
        task: Either ``"transcribe"`` or ``"translate"``.

    Returns:
        The recognized text, or a human-readable error message string
        (errors are returned, not raised, so the UI always shows something).
    """
    if audio_path is None:
        return "Please provide an audio input."

    # Only pass the task through when translating; plain transcription is
    # the pipeline default.
    generate_kwargs = {}
    if task == "translate":
        generate_kwargs["task"] = "translate"

    # NOTE(review): audio longer than 30 s may need chunk_length_s/
    # return_timestamps for long-form decoding -- confirm against the
    # pinned transformers version.
    try:
        result = pipe(audio_path, generate_kwargs=generate_kwargs)
        return result["text"]
    except Exception as e:
        return f"Error during transcription: {str(e)}"


def yt_transcribe(youtube_url, task="transcribe"):
    """Download a YouTube video's audio track and transcribe/translate it.

    Args:
        youtube_url: Full YouTube watch URL.
        task: Either ``"transcribe"`` or ``"translate"``.

    Returns:
        A 2-tuple ``(html, text)``: an HTML snippet showing the video
        thumbnail/title/channel, and the transcription. On failure both
        elements carry error messages instead.
    """
    if not youtube_url or not youtube_url.strip():
        return "Please enter a YouTube URL", "No transcription available."

    temp_path = None
    try:
        # Grab the first audio-only stream for the video.
        yt = YouTube(youtube_url)
        stream = yt.streams.filter(only_audio=True).first()
        if stream is None:
            # Guard: some videos expose no audio-only stream at all.
            return "No audio stream found for this video.", "Transcription failed."

        # Reserve a temporary file name, then let pytube write into it.
        with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as temp_file:
            temp_path = temp_file.name
        stream.download(filename=temp_path)

        generate_kwargs = {}
        if task == "translate":
            generate_kwargs["task"] = "translate"

        result = pipe(temp_path, generate_kwargs=generate_kwargs)

        # Build the thumbnail card. NOTE(review): this URL parsing assumes a
        # "watch?v=..." style link; youtu.be short links will produce a bad
        # video id -- confirm whether those need support.
        video_id = youtube_url.split("v=")[-1].split("&")[0]
        thumbnail_url = f"https://img.youtube.com/vi/{video_id}/0.jpg"
        # HTML reconstructed from garbled source markup: thumbnail image,
        # video title, channel name.
        html_output = (
            '<div style="text-align: center;">'
            f'<img src="{thumbnail_url}" style="max-width: 500px;">'
            f"<h3>{yt.title}</h3>"
            f"<p>Channel: {yt.author}</p>"
            "</div>"
        )
        return html_output, result["text"]
    except Exception as e:
        return f"Error processing YouTube video: {str(e)}", "Transcription failed."
    finally:
        # FIX: the temp file was previously deleted only on the success path
        # and leaked whenever download/transcription raised.
        if temp_path is not None and os.path.exists(temp_path):
            os.unlink(temp_path)


# Create Gradio interfaces.
# NOTE(review): theme="huggingface" is a legacy theme name; recent Gradio
# releases fall back to the default theme with a warning -- confirm against
# the pinned gradio version.
mic_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Microphone(sources=["microphone"], type="filepath"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
    ],
    outputs="text",
    theme="huggingface",
    title="Whisper Large V3 Turbo: Transcribe Audio",
    description=(
        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the OpenAI Whisper"
        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
        " of arbitrary length."
    ),
    flagging_mode="never",
)

file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["upload"], type="filepath", label="Audio file"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
    ],
    outputs="text",
    theme="huggingface",
    title="Whisper Large V3 Turbo: Transcribe Audio",
    description=(
        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the OpenAI Whisper"
        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
        " of arbitrary length."
    ),
    flagging_mode="never",
)

yt_interface = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
    ],
    outputs=["html", "text"],
    theme="huggingface",
    title="Whisper Large V3 Turbo: Transcribe YouTube",
    description=(
        "Transcribe long-form YouTube videos with the click of a button! Demo uses the OpenAI Whisper checkpoint"
        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
        " arbitrary length."
    ),
    flagging_mode="never",
)

# Create the tabbed interface wrapping the three single-task interfaces.
demo = gr.Blocks()
with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe, yt_interface],
        ["Microphone", "Audio file", "YouTube"],
    )

# Launch the app; 0.0.0.0 so the server is reachable from outside the container.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0")