# app.py
import gradio as gr
from transformers import pipeline, AutoModel
import torch
import ffmpeg
import nltk
import re
from deep_translator import MyMemoryTranslator
import num2words
import soundfile as sf
from gradio_client import Client, handle_file
from openvoice_cli.__main__ import tune_one
import pyrubberband as rb
import librosa
import os
import numpy as np

# You only need to run this download command once
nltk.download('punkt_tab')

# --- Model Loading ---
# We load the model once when the app starts, not on every function call.
# This makes the app much more efficient.
# We also check for GPU availability to speed things up if possible.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

print(f"Using device: {device}")

# --- Transcription Function ---
def extract_audio_from_video(video_path, output_audio_path="temp_extracted_audio.wav"):
    """
    Extracts audio from a video file using python-ffmpeg.
    """
    print(f"\n[STEP 1/9] Extracting audio from video: {video_path}")
    
    try:
        (
            ffmpeg
            .input(video_path)
            .output(
                output_audio_path,
                vn=None,           # Disable video
                acodec='mp3',      # Audio codec
                ab='192k',         # Audio bitrate
                ar='44100',        # Sample rate
                ac=2,              # Audio channels
                f='wav'            # Output format
            )
            .run(overwrite_output=True, quiet=True)
        )
        print(f"✅ Audio extracted successfully to: {output_audio_path}")
        return output_audio_path
    except ffmpeg.Error as e:
        print(f"Error: Failed to extract audio from video. stderr: {e.stderr.decode('utf8')}")
        return None

def transcribe_audio(audio_path):
    """
    This function takes an audio file path, transcribes it using the Whisper model,
    and returns the transcribed text.
    """
    # Initialize the ASR pipeline from Hugging Face Transformers
    
    THRESHOLDS = {
    "very_slow":  80,
    "slow":       110,
    "normal":     150,
    "fast":       200,
    "very_fast":  float("inf")
    }

    def get_audio_duration(path: str) -> float:
        """Return duration of audio file in seconds."""
        with sf.SoundFile(path) as f:
            return len(f) / f.samplerate

    def compute_wpm(transcript: str, duration_s: float) -> float:
        """Compute words per minute."""
        if not transcript or duration_s == 0:
            return 0.0
        words = transcript.strip().split()
        return len(words) / (duration_s / 60.0)

    def categorize_wpm(wpm: float) -> str:
        """Map a WPM value to one of the pace categories."""
        for label, threshold in THRESHOLDS.items():
            if wpm < threshold:
                return label
        return "unknown"

    
    transcriber = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-large-v3-turbo",
        torch_dtype=torch_dtype,
        device=device,
        generate_kwargs={"language": "english"},
    )

    if audio_path is None:
        return "No audio file provided. Please upload or record an audio file."
    
    print(f"Transcribing audio file: {audio_path}")
    
    # The pipeline handles all the complex steps of loading and processing the audio
    result = transcriber(audio_path)
    # The result is a dictionary, and we need the 'text' key
    transcription = result["text"]
    
    print(f"✅ Transcription successful: {transcription}")

    duration_s = get_audio_duration(audio_path)
    wpm = compute_wpm(transcription, duration_s)
    pace = categorize_wpm(wpm)
    print(f"✅ > Pace detected: {pace.upper()} ({wpm:.1f} WPM)")
    
    return transcription, pace

def lang_select(target_lang):
    LANGUAGE_NAME_TO_CODE = {
        "Bengali": "bn-IN", "English": "en-IN", "Gujarati": "gu-IN",
        "Hindi": "hi-IN", "Kannada": "kn-IN", "Malayalam": "ml-IN",
        "Marathi": "mr-IN", "Odia": "or-IN", "Punjabi": "pa-IN",
        "Tamil": "ta-IN", "Telugu": "te-IN"
    }
    return LANGUAGE_NAME_TO_CODE[target_lang]
    
def translate_local(text_to_translate, target_lang='ta-IN', device=None):
    """
    Translates text from English to a target language, handling texts longer
    than 500 characters by splitting them into sentence-based chunks.
    """
    # 1. Pre-process the text (same as your original code)
    text_to_translate = re.sub(r'\d+', lambda match: num2words(int(match.group(0))), text_to_translate)
    target_lang=lang_select(target_lang.capitalize())
    
    # 2. Split the entire text into individual sentences
    sentences = nltk.sent_tokenize(text_to_translate)

    # 3. Group sentences into chunks under 500 characters
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        # Check if adding the next sentence exceeds the limit
        if len(current_chunk) + len(sentence) + 1 < 500:
            current_chunk += sentence + " "
        else:
            # If it exceeds, add the current chunk to the list and start a new one
            chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
    
    # Add the last remaining chunk to the list
    if current_chunk:
        chunks.append(current_chunk.strip())

    # 4. Translate each chunk and combine the results
    translator = MyMemoryTranslator(source='en-GB', target=target_lang)
    translated_chunks = []
    for chunk in chunks:
        try:
            translated_chunks.append(translator.translate(chunk))
        except Exception as e:
            print(f"Could not translate chunk: {chunk}\nError: {e}")
            translated_chunks.append("") # Add an empty string on error

    translated_text = " ".join(translated_chunks)
    
    print(f"✅ Translated Text to {target_lang} Successfully")
    
    return translated_text

def synthesize_speech(synth_text, target_lang, pace="normal", output_path="temp_audio_synthesized.wav", device="cpu"):
    
    ref_audio_path = str('reference/TAMIL/MALE_'+pace.upper()+'.wav')
    ref_text_path = str('reference/TAMIL/MALE_'+pace.upper()+'.txt')

    ref_audio_path = ref_audio_path
    with open(ref_text_path, encoding='utf-8') as f:
        ref_text = f.read()

    print("> Loading IndicF5 TTS model (ai4bharat/IndicF5)...")
    indicf5_repo_id = "ai4bharat/IndicF5"
    token = os.environ.get("HF_TOKEN")
    tts_model = AutoModel.from_pretrained(indicf5_repo_id, trust_remote_code=True).to(device)
    
    audio = tts_model(synth_text, ref_audio_path=ref_audio_path, ref_text=ref_text)

    if audio.dtype == np.int16:
        audio = audio.astype(np.float32) / 32768.0
    
    sf.write(output_path, np.array(audio, dtype=np.float32), samplerate=24000)
    print(f"✅ Speech synthesis complete.")
    print(f"> Final audio saved to: {output_path}")
    
    return output_path

def match_audio_duration(original_path, translated_path, output_path="temp_audio_synced.wav"):
    """
    Matches Synthesized Audio duration to Original Audio duration
    """
    print("\n[STEP 7/9] Syncing Audio durations")
    # Load original audio
    original_audio, original_sr = librosa.load(original_path, sr=None)
    original_duration = librosa.get_duration(y=original_audio, sr=original_sr)
    print(f"Original audio duration: {original_duration:.2f} seconds")

    # Load translated audio
    translated_audio, translated_sr = librosa.load(translated_path, sr=None)
    translated_duration = librosa.get_duration(y=translated_audio, sr=translated_sr)
    print(f"Translated audio duration: {translated_duration:.2f} seconds")

    # Compute the speed-up/slow-down rate
    # If rate > 1.0, audio is sped up. If rate < 1.0, audio is slowed down.
    rate = translated_duration / original_duration
    print(f"Stretch rate: {rate:.4f}")

    # Apply time-stretch using the high-quality rubberband library
    # The parameters are: audio_data, sample_rate, and the desired rate
    adjusted_audio = rb.time_stretch(translated_audio, translated_sr, rate=rate)

    # Save output
    # The sample rate remains the same as the translated audio's original rate
    sf.write(output_path, adjusted_audio, translated_sr)
    print(f"✅ Duration Adjusted audio saved as: {output_path}")
    return output_path

def clone_voice(translated_audio_path, original_audio_path, output_path="temp_audio_cloned.wav", device="cpu"):
    print("Cloning Voice")
    # Convert the tone color of a single audio file
    tune_one(input_file=translated_audio_path, ref_file=original_audio_path, output_file=output_path, device=device)
    print(f"✅ Voice cloned audio saved to {output_path}")
    return output_path

def merge_audio_video(video_path, audio_path, output_path="temp_merged.mp4"):
    """
    Merges an audio file with a video file into a single output video.
    """
    print("\n[STEP] Merging audio and video...")
    video_input = ffmpeg.input(video_path)
    audio_input = ffmpeg.input(audio_path)
    (
        ffmpeg.output(video_input.video, audio_input.audio, output_path, vcodec='copy', acodec='aac', shortest=None)
        .run(overwrite_output=True, quiet=True)
    )
    print(f"✅ Merged video saved to {output_path}")
    return output_path

def main_run(video_path,target_lang,user_transcript=None, user_translation=None):
    original_audio_file = extract_audio_from_video(video_path)
    if user_transcript:
        original_text , pace = transcribe_audio(original_audio_file)
        original_text = user_transcript
        print(f"Using provided transcript: {original_text}")
    else:
        original_text , pace = transcribe_audio(original_audio_file)
    if user_translation:
        translated_text = user_translation
        print(f"Using provided translation: {translated_text}")
    else:
        translated_text = translate_local(original_text,target_lang)
        print(f"Translated Text: {translated_text}")
    translated_audio = synthesize_speech(translated_text, target_lang, pace)
    synced_translated_audio = match_audio_duration(original_audio_file, translated_audio) 
    cloned_synced_translated_audio = clone_voice(synced_translated_audio, original_audio_file)
    final_video_nobgm = merge_audio_video(video_path, cloned_synced_translated_audio)
    print(f"✅ Pipeline finished")
    return final_video_nobgm

def audio_pipeline_run(audio_path,target_lang,user_transcript=None, user_translation=None):
    if user_transcript:
        original_text , pace = transcribe_audio(audio_path)
        original_text = user_transcript
        print(f"Using provided transcript: {original_text}")
    else:
        original_text , pace = transcribe_audio(audio_path)
    if user_translation:
        translated_text = user_translation
        print(f"Using provided translation: {translated_text}")
    else:
        translated_text = translate_local(original_text,target_lang)
        print(f"Translated Text: {translated_text}")
    translated_audio = synthesize_speech(translated_text, target_lang, pace)
    synced_translated_audio = match_audio_duration(audio_path, translated_audio) 
    cloned_synced_translated_audio = clone_voice(synced_translated_audio, audio_path)
    print(f"✅ Pipeline finished")
    return cloned_synced_translated_audio


# --- Gradio Interface Definition ---
# Title and description for the new Space
title = "Custom Whisper Transcription App"
description = """
This is a custom Gradio app that uses the <b>openai/whisper-large-v2</b> model 
from the Hugging Face Hub for transcription. Upload an audio file or record 
directly from your microphone to get the transcript.
"""
article = "<p style='text-align: center'><a href='https://huggingface.co/openai/whisper-large-v3-turbo' target='_blank'>Model Card</a></p>"


# Create the Gradio interface with our custom function
# We define the input as an Audio component and the output as a Textbox
'''
app_interface = gr.Interface(
    fn=main_run,
    inputs=gr.Video(label="Upload Video"),gr.
    outputs=gr.Textbox(label="Translation Result"),gr.Radio(choices=["Tamil", "Telugu", "Hindi"], label="Target Language", value="Tamil")
    title=title,
    description=description,
    article=article,
    allow_flagging="never"
)
'''

with gr.Blocks(title="Audio/Video Translation Toolkit") as app_interface:
    gr.Markdown("# 🚀 Audio/Video Translation Toolkit")
    gr.Markdown("This might take a while to generate as it's running on the free tier.")
    gr.Markdown("Please input only English Audio/Video under 30secs.")
    gr.Markdown("Time taken for 10secs of audio/video is 5-10 mins.")
    with gr.Tabs():
        with gr.Tab("🎬 Translate Video"):
            with gr.Column():
                with gr.Row():
                    video_in = gr.Video(label="Input Video", height=500)
                    video_out = gr.Video(label="Output Video", interactive=False, height=500)
                with gr.Row():
                    # Radio buttons for selecting target language
                    # This allows users to choose one of the mutually exclusive options
                    lang_radio_vid = gr.Radio(choices=["Tamil", "Telugu", "Hindi"], label="Target Language", value="Tamil")
                    # Single-select option for mutually exclusive choices
                    option_select = gr.Radio(choices=["Use my Transcript", "Use my Translation"], label="Optional Input")
                    # Textboxes for user input, initially hidden
                user_transcript_vid = gr.Textbox(label="Your English Transcript", lines=3, visible=False)
                user_translation_vid = gr.Textbox(label="Your Translated Text", lines=3, visible=False)
                submit_btn_vid = gr.Button("Translate Video", variant="primary")
                    
                # Toggle visibility based on selected option (only one can be active)
                option_select.change(
                    fn=lambda choice: (
                        gr.update(visible=(choice == "Use my Transcript")),
                        gr.update(visible=(choice == "Use my Translation")),
                    ),
                    inputs=option_select,
                    outputs=[user_transcript_vid, user_translation_vid],
                )

            # Include the optional transcript/translation textboxes as inputs (they may be hidden)
            submit_btn_vid.click(fn=main_run, inputs=[video_in, lang_radio_vid, user_transcript_vid, user_translation_vid], outputs=[video_out])

        with gr.Tab("🎵 Translate Audio"):
            with gr.Column():
                with gr.Row():
                    audio_in = gr.Audio(label="Input Audio")
                    audio_out = gr.Audio(label="Output Audio", interactive=False)
                with gr.Row():
                    # Radio buttons for selecting target language
                    # This allows users to choose one of the mutually exclusive options
                    lang_radio_aud = gr.Radio(choices=["Tamil", "Telugu", "Hindi"], label="Target Language", value="Tamil")
                    # Single-select option for mutually exclusive choices
                    option_select = gr.Radio(choices=["Use my Transcript", "Use my Translation"], label="Optional Input")
                    # Textboxes for user input, initially hidden
                user_transcript_aud = gr.Textbox(label="Your English Transcript", lines=3, visible=False)
                user_translation_aud = gr.Textbox(label="Your Translated Text", lines=3, visible=False)
                submit_btn_aud = gr.Button("Translate Audio", variant="primary")
                    
                # Toggle visibility based on selected option (only one can be active)
                option_select.change(
                    fn=lambda choice: (
                        gr.update(visible=(choice == "Use my Transcript")),
                        gr.update(visible=(choice == "Use my Translation")),
                    ),
                    inputs=option_select,
                    outputs=[user_transcript_aud, user_translation_aud],
                )                                
            submit_btn_aud.click(fn=audio_pipeline_run, inputs=[audio_in, lang_radio_aud, user_transcript_aud, user_translation_aud], outputs=[audio_out])
            
        with gr.Tab("✂️ Extract Audio"):
            with gr.Row():
                video_in_ext = gr.Video(label="Input Video", height=500)
                audio_out_ext = gr.Audio(label="Extracted Audio")
            btn_ext = gr.Button("Extract", variant="secondary")
            btn_ext.click(fn=extract_audio_from_video, inputs=video_in_ext, outputs=audio_out_ext)

        with gr.Tab("✍️ Transcribe"):
            with gr.Row():
                audio_in_trans = gr.Audio(type="filepath", label="Input Audio")
                with gr.Column():
                    text_out_trans = gr.Textbox(label="Transcription")
                    text_out_pace = gr.Textbox(label="Detected Pace")
            btn_trans = gr.Button("Transcribe", variant="secondary")
            btn_trans.click(lambda aud: transcribe_audio(aud), inputs=audio_in_trans, outputs=[text_out_trans, text_out_pace])

        with gr.Tab("🌐 Translate Text"):
            with gr.Row():
                with gr.Column():
                    text_in_tran = gr.Textbox(label="Text to Translate", lines=5)
                    lang_radio_tran = gr.Radio(choices=["Tamil", "Telugu", "Hindi"], label="Target Language", value="Tamil")
                    btn_tran = gr.Button("Translate", variant="secondary")
                text_out_tran = gr.Textbox(label="Translated Text", lines=5, interactive=False)
            btn_tran.click(fn=translate_local, inputs=[text_in_tran, lang_radio_tran], outputs=text_out_tran)

        with gr.Tab("🔊 Synthesize Speech"):
            with gr.Column():
                with gr.Row():
                    text_in_synth = gr.Textbox(label="Text to Synthesize", lines=5)
                    audio_out_synth = gr.Audio(label="Synthesized Speech")
                with gr.Row():
                    lang_radio_tran = gr.Radio(choices=["Tamil", "Telugu", "Hindi"], label="Target Language", value="Tamil")
                    gender_radio_tran = gr.Radio(choices=["Male", "Female"], label="Speaker Gender", value="Male")
                    pace_radio_tran = gr.Radio(choices=["Very_Slow", "Slow", "Normal", "Fast", "Very_Fast"], label="Target Language", value="Normal")
            btn_synth = gr.Button("Synthesize", variant="secondary")
            btn_synth.click(fn=synthesize_speech, inputs=[text_in_synth,lang_radio_tran,pace_radio_tran], outputs=audio_out_synth)

        with gr.Tab("⏱️ Sync Duration"):
            with gr.Row():
                audio_in_sync1 = gr.Audio(type="filepath", label="Original Audio (for duration reference)")
                audio_in_sync2 = gr.Audio(type="filepath", label="Translated Audio (to be resized)")
                audio_out_sync = gr.Audio(label="Duration-Synced Audio")
            btn_sync = gr.Button("Sync Duration", variant="secondary")
            btn_sync.click(fn=match_audio_duration, inputs=[audio_in_sync1, audio_in_sync2], outputs=audio_out_sync)

        with gr.Tab("🧬 Clone Voice"):
            with gr.Row():
                audio_in_clone1 = gr.Audio(type="filepath", label="Target Audio (e.g., Synthesized Speech)")
                audio_in_clone2 = gr.Audio(type="filepath", label="Reference Audio (Original Speaker's Voice)")
                audio_out_clone = gr.Audio(label="Cloned Voice Audio")
            btn_clone = gr.Button("Clone Voice", variant="secondary")
            btn_clone.click(fn=clone_voice, inputs=[audio_in_clone1, audio_in_clone2], outputs=audio_out_clone)

        with gr.Tab("🎞️ Replace Audio"):
            with gr.Row():
                video_in_rep = gr.Video(label="Input Video", height=500)
                audio_in_rep = gr.Audio(type="filepath", label="New Audio")
                video_out_rep = gr.Video(label="Video with Replaced Audio", height=500)
            btn_rep = gr.Button("Replace Audio", variant="secondary")
            btn_rep.click(fn=merge_audio_video, inputs=[video_in_rep, audio_in_rep], outputs=video_out_rep)

            # --- Launch the App ---
if __name__ == "__main__":
    # The launch() method creates a web server and makes the interface accessible.
    app_interface.launch()