"""Edge TTS text-to-speech demo with synchronized subtitle generation.

Synthesizes speech with Microsoft Edge TTS, then derives a sentence-level
.srt subtitle file (aligned against silence gaps detected in the audio)
plus a word-by-word timing .json for animated captions.
"""

import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
import librosa
import numpy as np
import srt
import datetime
import re
import json


# Get all available voices
async def get_voices():
    """Return {display label: short voice name} for every Edge TTS voice."""
    voices = await edge_tts.list_voices()
    return {
        f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName']
        for v in voices
    }


# Text-to-speech function
async def text_to_speech(text, voice, rate, pitch):
    """Synthesize *text* to a temporary MP3 file.

    Returns (audio_path, text, warning); audio_path is None when input
    validation fails, in which case *warning* carries a gr.Warning.
    """
    if not text.strip():
        return None, text, gr.Warning("Please enter text to convert.")
    if not voice:
        return None, text, gr.Warning("Please select a voice.")

    voice_short_name = voice.split(" - ")[0]
    # Gradio sliders can deliver floats (e.g. 0.0), and the ':+d' format
    # spec raises ValueError for floats; edge-tts needs signed integer
    # strings such as "+0%" / "-5Hz", so coerce to int first.
    rate_str = f"{int(rate):+d}%"
    pitch_str = f"{int(pitch):+d}Hz"
    communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)

    # Reserve a temp path, then close the handle BEFORE edge-tts writes to
    # it: writing to a still-open NamedTemporaryFile fails on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    return tmp_path, text, None


# Subtitle segmentation
def split_text_by_punctuation(text):
    """Split *text* into caption-sized segments.

    Splits on sentence-ending punctuation (., ?, !) or newlines, then
    chops any segment longer than 8 words into 8-word chunks.
    """
    raw_segments = re.split(r'(?<=[.?!])\s+|\n+', text.strip())
    segments = []
    for segment in raw_segments:
        words = segment.strip().split()
        while len(words) > 8:
            segments.append(" ".join(words[:8]))
            words = words[8:]
        if words:
            segments.append(" ".join(words))
    return segments


# Main .srt generation
def generate_srt(audio_path, input_text):
    """Build .srt subtitle text aligning *input_text* with *audio_path*.

    Non-silent intervals closer than 0.5 s are merged and paired with the
    text segments; if the counts disagree, the waveform is divided evenly
    instead. Each subtitle is shown at least *min_duration* seconds,
    clamped to the total audio length.
    """
    y, sr = librosa.load(audio_path)
    raw_intervals = librosa.effects.split(y, top_db=25)

    # Merge intervals separated by less than half a second of silence.
    merged_intervals = []
    min_duration = 1.2  # minimum on-screen time per subtitle, seconds
    buffer = []
    for start, end in raw_intervals:
        if not buffer:
            buffer = [start, end]
        elif (start - buffer[1]) / sr < 0.5:
            buffer[1] = end
        else:
            merged_intervals.append(tuple(buffer))
            buffer = [start, end]
    if buffer:
        merged_intervals.append(tuple(buffer))

    segments = split_text_by_punctuation(input_text)
    if not segments:
        # Degenerate input (nothing survived splitting): fall back to a
        # single segment so the integer division below cannot crash.
        segments = [input_text.strip() or " "]
    num_segments = len(segments)
    num_intervals = len(merged_intervals)

    # Fallback: when silence detection disagrees with the text split,
    # divide the waveform into equal-length slices instead.
    if num_intervals != num_segments:
        total_len = len(y)
        step = total_len // num_segments
        merged_intervals = [
            (i * step, min((i + 1) * step, total_len))
            for i in range(num_segments)
        ]

    # Hoisted out of the loop: the duration is invariant per call.
    audio_duration = librosa.get_duration(y=y, sr=sr)

    subs = []
    for idx, (seg_text, (start_sample, end_sample)) in enumerate(zip(segments, merged_intervals)):
        start_sec = start_sample / sr
        end_sec = end_sample / sr
        # Enforce a minimum display duration, clamped to the audio length.
        if end_sec - start_sec < min_duration:
            end_sec = start_sec + min_duration
        if end_sec > audio_duration:
            end_sec = audio_duration
        subs.append(srt.Subtitle(
            index=idx + 1,
            start=datetime.timedelta(seconds=start_sec),
            end=datetime.timedelta(seconds=end_sec),
            content=seg_text.strip()
        ))
    return srt.compose(subs)


# Save .srt to file
def save_srt_file(srt_text):
    """Write *srt_text* to a temporary .srt file and return its path."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=".srt", mode='w', encoding='utf-8') as f:
        f.write(srt_text)
    return f.name


# Generate animated subtitle timing
def generate_word_animation_json(audio_path, base_srt):
    """Derive per-word timings from *base_srt* for animated captions.

    Each subtitle line's duration is spread evenly over its words.
    Returns a list of {"word", "start", "end", "line"} dicts with times
    in seconds rounded to milliseconds.
    """
    subtitles = list(srt.parse(base_srt))
    all_words = []
    for sub in subtitles:
        words = sub.content.strip().split()
        start_sec = sub.start.total_seconds()
        end_sec = sub.end.total_seconds()
        total_duration = end_sec - start_sec
        word_duration = total_duration / len(words) if words else 0
        for i, word in enumerate(words):
            w_start = start_sec + i * word_duration
            w_end = w_start + word_duration
            all_words.append({
                "word": word,
                "start": round(w_start, 3),
                "end": round(w_end, 3),
                "line": sub.index
            })
    return all_words


# Interface logic
def tts_interface(text, voice, rate, pitch):
    """Gradio handler: text -> (audio path, .srt file, word-timing .json, warning)."""
    audio, input_text, warning = asyncio.run(text_to_speech(text, voice, rate, pitch))
    if not audio:
        return None, None, None, warning

    srt_data = generate_srt(audio, input_text)
    srt_file = save_srt_file(srt_data)

    # Word-by-word timing; context manager guarantees the handle is
    # closed (and the file flushed) even if json.dump raises.
    word_json = generate_word_animation_json(audio, srt_data)
    with tempfile.NamedTemporaryFile(delete=False, suffix='.json', mode='w',
                                     encoding='utf-8') as word_json_file:
        json.dump(word_json, word_json_file, ensure_ascii=False)

    return audio, srt_file, word_json_file.name, warning


# Gradio App
async def create_demo():
    """Build the Gradio Interface (async because voice listing is async)."""
    voices = await get_voices()
    description = """
    🎙️ Convert text to realistic voice using Microsoft Edge TTS
    📜 Download full subtitle (.srt) + animated word timing (.json)
    🪄 Perfect for content creators, educators, and storytelling
    """
    demo = gr.Interface(
        fn=tts_interface,
        inputs=[
            gr.Textbox(label="Input Text", lines=5),
            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=""),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
            gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1)
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.File(label="Download Subtitle (.srt)"),
            gr.File(label="Word-by-Word JSON Timing"),
            gr.Markdown(label="Warning", visible=False)
        ],
        title="Edge TTS with Dual Subtitles",
        description=description,
        # gr.Interface expects the string "never" to disable flagging; the
        # boolean False is rejected by recent Gradio releases.
        allow_flagging="never"
    )
    return demo


# Run app
if __name__ == "__main__":
    demo = asyncio.run(create_demo())
    demo.launch()