import gradio as gr
import edge_tts
import os
import random
import json
from pydub import AudioSegment
from pydub.effects import normalize, compress_dynamic_range, low_pass_filter, high_pass_filter
import asyncio
from datetime import datetime
import zipfile
import natsort
import time
import webvtt
import re
from typing import Dict, List, Tuple, Optional
from datetime import timedelta

import numpy as np
import wave
import time


# Khởi tạo môi trường - Ưu tiên GPU

class TTSModel:
    def __init__(self):
        self.models = {}
        self.tokenizer = Tokenizer()
        self.voice_cache = {}
        self.voice_files = self._discover_voices()
        
        try:
            if self.use_cuda:
                self.models['cuda'] = torch.compile(KModel().to('cuda').eval(), mode='max-autotune')
                with torch.no_grad():
                    _ = self.models['cuda'](torch.randn(1, 64).cuda(), torch.randn(1, 80, 100).cuda(), 1.0)
            
            self.models['cpu'] = KModel().to('cpu').eval()
        except Exception as e:
            print(f"Error loading model: {e}")
            self.models = {'cpu': KModel().to('cpu').eval()}
        
        self.pipelines = {
            'a': KPipeline(lang_code='a', model=False),
            'b': KPipeline(lang_code='b', model=False)
        }
    
    def _discover_voices(self):
        """Discover available voice files in the voices folder"""
        voice_files = {}
        voices_dir = "voices"
        
        if not os.path.exists(voices_dir):
            os.makedirs(voices_dir)
            print(f"Created voices directory at {os.path.abspath(voices_dir)}")
            return voice_files
            
        for file in os.listdir(voices_dir):
            if file.endswith(".pt"):
                voice_name = os.path.splitext(file)[0]
                voice_files[voice_name] = os.path.join(voices_dir, file)
                print(f"Found voice: {voice_name}")
                
        return voice_files

    def get_voice_list(self):
        """Get list of available voices for the UI"""
        voices = list(self.voice_files.keys())
        if not voices:
            print("Warning: No voice files found in voices folder")
        return voices

class TextProcessor:
    @staticmethod
    def clean_text(text: str) -> str:
        text = TextProcessor._process_special_cases(text)
        
        re_tab = re.compile(r'[\r\t]')
        re_spaces = re.compile(r' +')
        re_punctuation = re.compile(r'(\s)([,.!?])')
        
        text = re_tab.sub(' ', text)
        text = re_spaces.sub(' ', text)
        text = re_punctuation.sub(r'\2', text)
        return text.strip()

    @staticmethod
    def _process_special_cases(text: str) -> str:
        """Pipeline xử lý đặc biệt với thứ tự tối ưu"""
        text = TextProcessor._process_emails(text)
        text = TextProcessor._process_websites(text)
        text = TextProcessor._process_phone_numbers(text)
        text = TextProcessor._process_temperatures(text)
        text = TextProcessor._process_measurements(text)
        text = TextProcessor._process_currency(text)
        text = TextProcessor._process_percentages(text)
        text = TextProcessor._process_math_operations(text)
        text = TextProcessor._process_times(text)
        text = TextProcessor._process_years(text)
        text = TextProcessor._process_special_symbols(text)
        
        return text
    
    @staticmethod
    def _process_emails(text: str) -> str:
        """Process emails with correct English pronunciation for all special characters"""
        def convert_email(match):
            full_email = match.group(0)
            # Replace each special character with its English pronunciation
            processed = (full_email
                        .replace('@', ' at ')
                        .replace('.', ' dot ')
                        .replace('-', ' dash ')
                        .replace('_', ' underscore ')
                        .replace('+', ' plus ')
                        .replace('/', ' slash ')
                        .replace('=', ' equals '))
            return processed

        # Regex to match all email formats
        email_pattern = r'\b[\w.+-]+@[\w.-]+\.[a-zA-Z]{2,}\b'
        return re.sub(email_pattern, convert_email, text)

    @staticmethod
    def _process_websites(text: str) -> str:
        """Process websites with correct English pronunciation for special characters"""
        def convert_website(match):
            url = match.group(1)
            # Replace each special character with its English pronunciation
            return (url.replace('.', ' dot ')
                     .replace('-', ' dash ')
                     .replace('_', ' underscore ')
                     .replace('/', ' slash ')
                     .replace('?', ' question mark ')
                     .replace('=', ' equals ')
                     .replace('&', ' ampersand '))

        # Only process websites that don't contain @ (to avoid conflict with emails)
        website_pattern = r'\b(?![\w.-]*@)((?:https?://)?(?:www\.)?[\w.-]+\.[a-z]{2,}(?:[/?=&#][\w.-]*)*)\b'
        return re.sub(website_pattern, convert_website, text, flags=re.IGNORECASE)

    @staticmethod
    def _process_temperatures(text: str) -> str:
        """Process temperatures and cardinal directions with degree symbols"""
        def temp_to_words(temp, unit):
            temp_text = TextProcessor._number_to_words(temp)
            unit = unit.upper() if unit else ''
            
            unit_map = {
                'C': 'degrees Celsius',
                'F': 'degrees Fahrenheit',
                'N': 'degrees north',
                'S': 'degrees south',
                'E': 'degrees east', 
                'W': 'degrees west',
                '': 'degrees'  # Default case for just number with degree symbol
            }
            unit_text = unit_map.get(unit, f'degrees {unit}')
            
            return f"{temp_text} {unit_text}"
        
        # Process formats like 75°F, 100°C, 15°N, 120°E
        text = re.sub(
            r'(-?\d+)°([NSEWCFnsewcf]?)',
            lambda m: temp_to_words(m.group(1), m.group(2)),
            text,
            flags=re.IGNORECASE
        )
        
        # Add degree symbol pronunciation when standalone
        text = re.sub(r'°', ' degrees ', text)
        
        return text

    @staticmethod
    def _process_measurements(text: str) -> str:
        """Xử lý đơn vị đo lường, đọc chuẩn số thập phân (1.65m → 'one point six five meters')"""
        units_map = {
            'km/h': 'kilometers per hour',
            'mph': 'miles per hour',
            'kg': 'kilograms',
            'g': 'grams',
            'cm': 'centimeters',
            'm': 'meter',  # Sửa thành singular để xử lý số nhiều sau
            'mm': 'millimeters',
            'L': 'liter',
            'l': 'liter',
            'ml': 'milliliter',
            'mL': 'milliliter',
            'h': 'hour',
            'min': 'minute',
            's': 'second'
        }
    
        plural_units = {'L', 'l', 'mL', 'ml'}  # Đơn vị không thêm 's' dù số nhiều
    
        def measurement_to_words(value, unit):
            try:
                unit_lower = unit.lower()
                unit_text = units_map.get(unit, units_map.get(unit_lower, unit))
    
                # Đọc số thập phân: one point six five
                if '.' in value:
                    integer, decimal = value.split('.')
                    value_text = (
                        f"{TextProcessor._number_to_words(integer)} "
                        f"point {' '.join(TextProcessor._digit_to_word(d) for d in decimal)}"
                    )
                else:
                    value_text = TextProcessor._number_to_words(value)
    
                # Xử lý số nhiều (thêm 's' nếu value != 1 và đơn vị không nằm trong plural_units)
                if float(value) != 1 and unit in units_map and unit not in plural_units:
                    unit_text += 's'
    
                return f"{value_text} {unit_text}"
            except:
                return f"{value}{unit}"  # Giữ nguyên nếu có lỗi
    
        # Regex bắt các số + đơn vị (kể cả viết liền như 1.65m)
        text = re.sub(
            r'(-?\d+\.?\d*)\s*({})s?\b'.format('|'.join(re.escape(key) for key in units_map.keys())),
            lambda m: measurement_to_words(m.group(1), m.group(2)),
            text,
            flags=re.IGNORECASE
        )
        return text
    
    @staticmethod
    def _process_currency(text: str) -> str:
        """Xử lý tiền tệ (hỗ trợ số nguyên, thập phân, và dấu chấm cuối câu)"""
        currency_map = {
            '$': 'dollars',
            '€': 'euros',
            '£': 'pounds',
            '¥': 'yen',
            '₩': 'won',
            '₽': 'rubles'
        }
    
        def currency_to_words(value, symbol):
            # Xử lý dấu chấm kết thúc câu (ví dụ: $20.)
            if value.endswith('.'):
                value = value[:-1]
                return f"{TextProcessor._number_to_words(value)} {currency_map.get(symbol, '')}."
    
            # Xử lý số thập phân (ví dụ: $20.5 → "twenty dollars and fifty cents")
            if '.' in value:
                integer_part, decimal_part = value.split('.')
                decimal_part = decimal_part.ljust(2, '0')  # Đảm bảo 2 chữ số
                return (
                    f"{TextProcessor._number_to_words(integer_part)} {currency_map.get(symbol, '')} "
                    f"and {TextProcessor._number_to_words(decimal_part)} cents"
                )
    
            # Số nguyên (ví dụ: $20 → "twenty dollars")
            return f"{TextProcessor._number_to_words(value)} {currency_map.get(symbol, '')}"
    
        # Regex bắt tiền tệ (số nguyên hoặc thập phân, không bắt dấu chấm cuối nếu không có số)
        text = re.sub(
            r'([$€£¥₩₽])(\d+(?:\.\d+)?)(?=\s|$|\.|,|;)',  # Chỉ khớp nếu sau số là ký tự kết thúc
            lambda m: currency_to_words(m.group(2), m.group(1)),
            text
        )
    
        return text

    @staticmethod
    def _process_percentages(text: str) -> str:
        """Xử lý phần trăm"""
        text = re.sub(
            r'(\d+\.?\d*)%',
            lambda m: f"{TextProcessor._number_to_words(m.group(1))} percent",
            text
        )
        return text

    @staticmethod
    def _process_math_operations(text: str) -> str:
        """Xử lý các phép toán và khoảng số"""
        math_map = {
            '+': 'plus',
            '-': 'minus',  # Mặc định là "minus", sẽ xử lý riêng cho khoảng số
            '×': 'times',
            '*': 'times',
            '÷': 'divided by',
            '/': 'divided by',
            '=': 'equals',
            '>': 'is greater than',
            '<': 'is less than'
        }
    
        # Xử lý KHOẢNG SỐ (3-4 → "three to four") khi KHÔNG có dấu = hoặc phép toán sau -
        text = re.sub(
            r'(\d+)\s*-\s*(\d+)(?!\s*[=+×*÷/><])',  # Chỉ áp dụng khi KHÔNG có dấu =/+/*... sau -
            lambda m: f"{TextProcessor._number_to_words(m.group(1))} to {TextProcessor._number_to_words(m.group(2))}",
            text
        )
    
        # Xử lý PHÉP TRỪ (chỉ khi có dấu = hoặc phép toán sau -)
        text = re.sub(
            r'(\d+)\s*-\s*(\d+)(?=\s*[=+×*÷/><])',  # Chỉ áp dụng khi CÓ dấu =/+/*... sau -
            lambda m: f"{TextProcessor._number_to_words(m.group(1))} minus {TextProcessor._number_to_words(m.group(2))}",
            text
        )
    
        # Xử lý các PHÉP TOÁN KHÁC (+, *, /, ...)
        text = re.sub(
            r'(\d+)\s*([+×*÷/=><])\s*(\d+)',
            lambda m: (f"{TextProcessor._number_to_words(m.group(1))} "
                      f"{math_map.get(m.group(2), m.group(2))} "
                      f"{TextProcessor._number_to_words(m.group(3))}"),
            text
        )
    
        # Xử lý phân số 4/5
        text = re.sub(
            r'(\d+)/(\d+)',
            lambda m: (f"{TextProcessor._number_to_words(m.group(1))} "
                      f"divided by {TextProcessor._number_to_words(m.group(2))}"),
            text
        )
    
        return text

    @staticmethod
    def _process_special_symbols(text: str) -> str:
        """Xử lý các ký hiệu đặc biệt"""
        symbol_map = {
            '@': 'at',
            '#': 'number',
            '&': 'and',
            '_': 'underscore'
        }

        # Xử lý @home → at home
        text = re.sub(
            r'@(\w+)',
            lambda m: f"at {m.group(1)}",
            text
        )

        # Xử lý #1 → number one
        text = re.sub(
            r'#(\d+)',
            lambda m: f"number {TextProcessor._number_to_words(m.group(1))}",
            text
        )

        # Xử lý các ký hiệu đơn lẻ
        for symbol, replacement in symbol_map.items():
            text = text.replace(symbol, f' {replacement} ')

        return text

    @staticmethod
    def _process_times(text: str) -> str:
        """Xử lý MỌI định dạng thời gian (giờ:phút:giây, có/không AM/PM)"""
        text = re.sub(
            r'\b(\d{1,2}):(\d{2})(?::(\d{2}))?\s*(AM|PM|am|pm)?\b',
            lambda m: TextProcessor._time_to_words(m.group(1), m.group(2), m.group(3), m.group(4)),
            text
        )
        return text
    
    @staticmethod
    def _time_to_words(hour: str, minute: str, second: str = None, period: str = None) -> str:
        """Chuyển thời gian thành giọng nói tự nhiên (bao gồm giây nếu có)"""
        hour_int = int(hour)
        minute_int = int(minute)
        
        # 1. Xử lý AM/PM (viết hoa chuẩn)
        period_text = f" {period.upper()}" if period else ""
        
        # 2. Chuyển đổi giờ 24h → 12h
        hour_12 = hour_int % 12
        hour_text = "twelve" if hour_12 == 0 else TextProcessor._number_to_words(str(hour_12))
        
        # 3. Xử lý phút
        minute_text = " \u200Bo'clock\u200B " if minute_int == 0 else \
                     f"oh {TextProcessor._number_to_words(minute)}" if minute_int < 10 else \
                     TextProcessor._number_to_words(minute)
        
        # 4. Xử lý giây (nếu có)
        second_text = ""
        if second and int(second) > 0:
            second_text = f" and {TextProcessor._number_to_words(second)} seconds"
        
        # 5. Ghép câu logic
        if minute_int == 0 and not second_text:
            return f"{hour_text}{minute_text}{period_text}"  # 3:00 → "three o'clock"
        else:
            return f"{hour_text} {minute_text}{second_text}{period_text}"  # 3:05:30 → "three oh five and thirty seconds"

    @staticmethod
    def _process_years(text: str) -> str:
        """Xử lý các năm trong văn bản"""
        # Xử lý năm 4 chữ số từ 1000-2999 (phổ biến nhất)
        text = re.sub(
            r'\b(1[0-9]{3}|2[0-9]{3})\b',
            lambda m: TextProcessor._year_to_words(m.group(1)),
            text
        )
    
        # Xử lý năm 2 chữ số (nếu cần)
        text = re.sub(
            r'\b([0-9]{2})\b',
            lambda m: TextProcessor._two_digit_year_to_words(m.group(1)),
            text
        )
    
        return text

    @staticmethod
    def _year_to_words(year: str) -> str:
        """Chuyển năm 4 chữ số thành chữ"""
        if len(year) != 4:
            return year
    
        # Năm từ 2000-2099 có thể đọc là "two thousand twenty-one" hoặc "twenty twenty-one"
        if year.startswith('20'):
            # Lựa chọn cách đọc phổ biến hơn
            return f"twenty {TextProcessor._two_digit_year_to_words(year[2:])}"
    
        # Các năm khác đọc bình thường
        return TextProcessor._number_to_words(year)

    @staticmethod
    def _two_digit_year_to_words(num: str) -> str:
        """Chuyển số 2 chữ số thành chữ (cho năm)"""
        if len(num) != 2:
            return num
    
        num_int = int(num)
        if num_int == 0:
            return "zero zero"
        if num_int < 10:
            return f"oh {TextProcessor._digit_to_word(num[1])}"
    
        ones = ['', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine',
                'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen',
                'seventeen', 'eighteen', 'nineteen']
        tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 
               'eighty', 'ninety']
    
        if num_int < 20:
            return ones[num_int]
    
        ten, one = divmod(num_int, 10)
        if one == 0:
            return tens[ten]
        return f"{tens[ten]} {ones[one]}"        

    @staticmethod
    def _process_phone_numbers(text: str) -> str:
        """Xử lý số điện thoại với regex chính xác hơn"""
        # Pattern mới tránh xung đột với số La Mã
        phone_pattern = r'\b(\d{3})[-. ]?(\d{3})[-. ]?(\d{4})\b'
    
        def phone_to_words(match):
            groups = match.groups()
            # Đọc từng số trong từng nhóm và thêm dấu phẩy (,) để tạo ngắt nghỉ
            parts = []
            for part in groups:
                digits = ' '.join([TextProcessor._digit_to_word(d) for d in part])
                parts.append(digits)
            return ', '.join(parts)  # Thêm dấu phẩy để tạo ngắt nghỉ khi đọc
    
        return re.sub(phone_pattern, phone_to_words, text)    
        @staticmethod
        def _process_currency_numbers(text: str) -> str:
            return re.sub(
                r'\$?(\d{1,3}(?:,\d{3})*(?:\.\d+)?)\b',
                lambda m: f"{TextProcessor._number_to_words(m.group(1))} dollars" if '$' in m.group(0) 
                         else TextProcessor._number_to_words(m.group(1)),
                text
            )

    @staticmethod
    def _digit_to_word(digit: str) -> str:
        digit_map = {
            '0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four',
            '5': 'five', '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine'
        }
        return digit_map.get(digit, digit)

    @staticmethod
    def _number_to_words(number: str) -> str:
        num_str = number.replace(',', '')
    
        try:
            if '.' in num_str:
                integer_part, decimal_part = num_str.split('.')
                integer_text = TextProcessor._int_to_words(integer_part)
                decimal_text = ' '.join([TextProcessor._digit_to_word(d) for d in decimal_part])
                return f"{integer_text} point {decimal_text}"
            return TextProcessor._int_to_words(num_str)
        except:
            return number

    @staticmethod
    def _digits_to_words(digits: str) -> str:
        return ' '.join([TextProcessor._digit_to_word(d) for d in digits])

    @staticmethod
    def _int_to_words(num_str: str) -> str:
        num = int(num_str)
        if num == 0:
            return 'zero'
        
        units = ['', 'thousand', 'million', 'billion', 'trillion']
        words = []
        level = 0
        
        while num > 0:
            chunk = num % 1000
            if chunk != 0:
                words.append(TextProcessor._convert_less_than_thousand(chunk) + ' ' + units[level])
            num = num // 1000
            level += 1
        
        return ' '.join(reversed(words)).strip()

    @staticmethod
    def _convert_less_than_thousand(num: int) -> str:
        ones = ['', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine',
                'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen',
                'seventeen', 'eighteen', 'nineteen']
        tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 
               'eighty', 'ninety']
        
        if num == 0:
            return ''
        if num < 20:
            return ones[num]
        if num < 100:
            return tens[num // 10] + (' ' + ones[num % 10] if num % 10 != 0 else '')
        return ones[num // 100] + ' hundred' + (' ' + TextProcessor._convert_less_than_thousand(num % 100) if num % 100 != 0 else '')

    @staticmethod
    def split_sentences(text: str) -> List[str]:
        re_special_cases = re.compile(r'(?<!\w)([A-Z][a-z]*\.)(?=\s)')
        re_sentence_split = re.compile(r'(?<=[.!?])\s+')
        
        sentences = []
        for line in text.split('\n'):
            stripped = line.strip()
            if stripped:
                stripped = re_special_cases.sub(r'\1Ⓝ', stripped)
                parts = re_sentence_split.split(stripped)
                for part in parts:
                    part = part.replace('Ⓝ', '')
                    if part:
                        sentences.append(part)
        return sentences

    @staticmethod
    def parse_dialogues(text: str, prefixes: List[str]) -> List[Tuple[str, str]]:
        """Phân tích nội dung hội thoại với các prefix chỉ định"""
        dialogues = []
        current = None
        
        for line in text.split('\n'):
            line = line.strip()
            if not line:
                continue
                
            # Kiểm tra xem dòng có bắt đầu bằng bất kỳ prefix nào không
            found_prefix = None
            for prefix in prefixes:
                if line.lower().startswith(prefix.lower() + ':'):
                    found_prefix = prefix
                    break
                    
            if found_prefix:
                if current:
                    # Xử lý các trường hợp đặc biệt trước khi thêm vào dialogues
                    processed_content = TextProcessor._process_special_cases(current[1])
                    dialogues.append((current[0], processed_content))
                
                speaker = found_prefix
                content = line[len(found_prefix)+1:].strip()
                current = (speaker, content)
            elif current:
                current = (current[0], current[1] + ' ' + line)
                
        if current:
            # Xử lý các trường hợp đặc biệt cho dòng cuối cùng
            processed_content = TextProcessor._process_special_cases(current[1])
            dialogues.append((current[0], processed_content))
            
        return dialogues


class AudioProcessor:
    @staticmethod
    def enhance_audio(audio: np.ndarray, volume: float = 1.0, pitch: float = 1.0) -> np.ndarray:
        # 1. Chuẩn hóa và bảo vệ chống clipping
        max_sample = np.max(np.abs(audio)) + 1e-8
        audio = (audio / max_sample) * 0.9 * volume  # Giữ headroom 10%
        
        # 2. Soft clipping để tránh distortion
        audio = np.tanh(audio * 1.5) / 1.5  # Hàm tanh cho soft clipping mượt
        
        # 3. Chuyển sang AudioSegment với xử lý pitch
        audio_seg = AudioSegment(
            (audio * 32767).astype(np.int16).tobytes(),
            frame_rate=24000,
            sample_width=2,
            channels=1
        )
        
        # 4. Xử lý pitch với crossfade
        if pitch != 1.0:
            audio_seg = audio_seg._spawn(
                audio_seg.raw_data,
                overrides={"frame_rate": int(audio_seg.frame_rate * pitch)}
            ).set_frame_rate(24000).fade_in(10).fade_out(10)
        
        # 5. Xử lý động và lọc tần
        audio_seg = compress_dynamic_range(
            audio_seg,
            threshold=-12.0,
            ratio=3.5,
            attack=5,
            release=50
        )
        audio_seg = audio_seg.low_pass_filter(11000).high_pass_filter(200)
        
        # 6. Chuẩn hóa an toàn
        if audio_seg.max_dBFS > -1.0:
            audio_seg = audio_seg.apply_gain(-audio_seg.max_dBFS * 0.8)
        
        return np.array(audio_seg.get_array_of_samples()) / 32768.0

    @staticmethod
    def calculate_pause(text: str, pause_settings: Dict[str, int]) -> int:
        """Calculate pause duration with more precise rules"""
        text = text.strip()
        if not text:
            return 0
            
        # Special cases that should have no pause
        if re.search(r'(?:^|\s)(?:Mr|Mrs|Ms|Dr|Prof|St|A\.M|P\.M|etc|e\.g|i\.e)\.$', text, re.IGNORECASE):
            return 0
            
        # Time formats (12:30) - minimal pause
        if re.search(r'\b\d{1,2}:\d{2}\b', text):
            return pause_settings.get('time_colon_pause', 50)  # Default 50ms for times
            
        # Determine pause based on last character
        last_char = text[-1]
        return pause_settings.get(last_char, pause_settings['default_pause'])

    @staticmethod
    def combine_segments(segments: List[AudioSegment], pauses: List[int]) -> AudioSegment:
        """Combine audio segments with frame-accurate timing"""
        combined = AudioSegment.silent(duration=0)  # Start with 0 silence
        
        for i, (seg, pause) in enumerate(zip(segments, pauses)):
            # Apply fades without affecting duration
            seg = seg.fade_in(10).fade_out(10)
            
            # Add segment
            combined += seg
            
            # Add pause if not the last segment
            if i < len(segments) - 1:
                combined += AudioSegment.silent(duration=max(50, pause))
        
        return combined
        
    @staticmethod
    def combine_with_pauses(segments: List[AudioSegment], pauses: List[int]) -> AudioSegment:
        combined = AudioSegment.empty()
        for i, (seg, pause) in enumerate(zip(segments, pauses)):
            seg = seg.fade_in(50).fade_out(50)
            combined += seg
            if i < len(segments) - 1:
                combined += AudioSegment.silent(duration=pause)
        return combined


# ==================== SYSTEM CONFIGURATION ====================
class TTSConfig:
    SETTINGS_FILE = "edge_tts_settings.json"

    LANGUAGES = {
        "Tiếng Việt": [
            {"name": "vi-VN-HoaiMyNeural", "gender": "Nữ"},
            {"name": "vi-VN-NamMinhNeural", "gender": "Nam"}
        ],
        "English (US)": [
            {"name": "en-US-GuyNeural", "gender": "Nam"},
            {"name": "en-US-JennyNeural", "gender": "Nữ"},
            {"name": "en-US-AvaNeural", "gender": "Nữ"},
            {"name": "en-US-AndrewNeural", "gender": "Nam"},
            {"name": "en-US-EmmaNeural", "gender": "Nữ"},
            {"name": "en-US-BrianNeural", "gender": "Nam"},
            {"name": "en-US-AnaNeural", "gender": "Nữ"},
            {"name": "en-US-AndrewMultilingualNeural", "gender": "Nam"},
            {"name": "en-US-AriaNeural", "gender": "Nữ"},
            {"name": "en-US-AvaMultilingualNeural", "gender": "Nữ"},
            {"name": "en-US-BrianMultilingualNeural", "gender": "Nam"},
            {"name": "en-US-ChristopherNeural", "gender": "Nam"},
            {"name": "en-US-EmmaMultilingualNeural", "gender": "Nữ"},
            {"name": "en-US-EricNeural", "gender": "Nam"},
            {"name": "en-US-MichelleNeural", "gender": "Nữ"},
            {"name": "en-US-RogerNeural", "gender": "Nam"},
            {"name": "en-US-SteffanNeural", "gender": "Nam"}
        ],
        "English (UK)": [
            {"name": "en-GB-LibbyNeural", "gender": "Nữ"},
            {"name": "en-GB-MiaNeural", "gender": "Nữ"},
            {"name": "en-GB-RyanNeural", "gender": "Nam"},
            {"name": "en-GB-MaisieNeural", "gender": "Nữ"},
            {"name": "en-GB-SoniaNeural", "gender": "Nữ"},
            {"name": "en-GB-ThomasNeural", "gender": "Nam"}
        ]
    }

# ==================== AUDIO PROCESSOR ====================
class AudioProcessor:
    @staticmethod
    def calculate_pause(text: str, pause_settings: Dict[str, int]) -> int:
        """Calculate pause duration with more precise rules"""
        text = text.strip()
        if not text:
            return 0
            
        # Special cases that should have no pause
        if re.search(r'(?:^|\s)(?:Mr|Mrs|Ms|Dr|Prof|St|A\.M|P\.M|etc|e\.g|i\.e)\.$', text, re.IGNORECASE):
            return 0
            
        # Time formats (12:30) - minimal pause
        if re.search(r'\b\d{1,2}:\d{2}\b', text):
            return pause_settings.get('time_colon_pause', 50)  # Default 50ms for times
            
        # Determine pause based on last character
        last_char = text[-1]
        return pause_settings.get(last_char, pause_settings['default_pause'])

    @staticmethod
    def combine_with_pauses(segments: List[AudioSegment], pauses: List[int]) -> AudioSegment:
        combined = AudioSegment.empty()
        for i, (seg, pause) in enumerate(zip(segments, pauses)):
            seg = seg.fade_in(50).fade_out(50)
            combined += seg
            if i < len(segments) - 1:
                combined += AudioSegment.silent(duration=pause)
        return combined

# ==================== SUBTITLE GENERATOR ====================
class SubtitleGenerator:
    @staticmethod
    def clean_subtitle_text(text: str) -> str:
        """Remove Q:/A:/CHARx: prefixes from subtitle text"""
        cleaned = re.sub(r'^(Q|A|CHAR\d+):\s*', '', text.strip())
        return cleaned
        
    @staticmethod
    def split_long_sentences(text: str, max_length: int = 120) -> List[str]:
        """Split long sentences at punctuation marks while preserving meaning"""
        sentences = []
        current = ""
        
        # Split at punctuation first
        parts = re.split(r'([.!?])', text)
        
        # Recombine with punctuation but check length
        for i in range(0, len(parts)-1, 2):
            part = parts[i] + (parts[i+1] if i+1 < len(parts) else "")
            if len(current + part) <= max_length:
                current += part
            else:
                if current:
                    sentences.append(current)
                current = part
        
        if current:
            sentences.append(current)
            
        return sentences

    @staticmethod
    def generate_srt(audio_segments: List[AudioSegment], sentences: List[str], pause_settings: Dict[str, int]) -> str:
        """Generate SRT format subtitles with precise timing information"""
        subtitles = []
        current_time = 150  # Start with initial silence (150ms)
        max_subtitle_length = 120  # Maximum characters per subtitle line
        
        for i, (seg, sentence) in enumerate(zip(audio_segments, sentences)):
            # Remove Q: and A: prefixes if present
            cleaned_sentence = re.sub(r'^(Q|A|CHAR\d+):\s*', '', sentence.strip())
            
            # Split long sentences into smaller chunks at punctuation
            sentence_chunks = SubtitleGenerator.split_long_sentences(cleaned_sentence, max_subtitle_length)
            
            # Calculate duration per chunk (equal division for simplicity)
            chunk_duration = len(seg) / max(1, len(sentence_chunks))
            
            for j, chunk in enumerate(sentence_chunks):
                start_time = current_time + (j * chunk_duration)
                end_time = start_time + chunk_duration
                
                # Add subtitle entry
                subtitles.append({
                    'start': int(start_time),
                    'end': int(end_time),
                    'text': chunk.strip()
                })
            
            # Update current time with segment duration
            current_time += len(seg)
            
            # Add pause if not the last segment
            if i < len(audio_segments) - 1:
                pause = AudioProcessor.calculate_pause(sentence, pause_settings)
                current_time += max(100, pause)
        
        # Convert to SRT format with precise timing
        srt_content = []
        for idx, sub in enumerate(subtitles, 1):
            start_time = timedelta(milliseconds=sub['start'])
            end_time = timedelta(milliseconds=sub['end'])
            
            # Format: 00:00:01,040 --> 00:00:09,760
            start_str = f"{start_time.total_seconds() // 3600:02.0f}:{(start_time.total_seconds() % 3600) // 60:02.0f}:{start_time.total_seconds() % 60:06.3f}".replace('.', ',')
            end_str = f"{end_time.total_seconds() // 3600:02.0f}:{(end_time.total_seconds() % 3600) // 60:02.0f}:{end_time.total_seconds() % 60:06.3f}".replace('.', ',')
            
            srt_content.append(
                f"{idx}\n"
                f"{start_str} --> {end_str}\n"
                f"{sub['text']}\n"
            )
        
        return "\n".join(srt_content)

# ==================== BASE PROCESSOR CLASS ====================
class BaseTTSProcessor:
    def __init__(self):
        self.voice_map = {}
        self.initialize_voices()
        self.load_settings()
        self.audio_processor = AudioProcessor()
        self.subtitle_generator = SubtitleGenerator()
        
    def initialize_voices(self):
        for lang, voices in TTSConfig.LANGUAGES.items():
            for voice in voices:
                voice_name = voice['name'].split('-')[-1].replace('Neural', '')
                display_name = f"{lang} - {voice_name} ({voice['gender']})"
                self.voice_map[display_name] = voice['name']
    
    def load_settings(self):
        if os.path.exists(TTSConfig.SETTINGS_FILE):
            with open(TTSConfig.SETTINGS_FILE, 'r') as f:
                self.settings = json.load(f)
        else:
            self.settings = {}
    
    def save_settings(self):
        with open(TTSConfig.SETTINGS_FILE, 'w') as f:
            json.dump(self.settings, f)

    async def generate_speech(self, text, voice_id, rate, pitch, volume):
        try:
            # Add random delay between requests to prevent server overload
            await asyncio.sleep(random.uniform(0.1, 0.5))
            
            rate_str = f"{rate}%" if rate != 0 else "+0%"
            pitch_str = f"+{pitch}Hz" if pitch >=0 else f"{pitch}Hz"
            
            communicate = edge_tts.Communicate(text, voice_id, rate=rate_str, pitch=pitch_str)
            temp_file = f"temp_{random.randint(1000,9999)}.mp3"
            
            # Generate audio and subtitles
            subs = []
            start_time = 0
            async for chunk in communicate.stream():
                if chunk["type"] == "audio":
                    with open(temp_file, "ab") as audio_file:
                        audio_file.write(chunk["data"])
                elif chunk["type"] == "WordBoundary":
                    subs.append({
                        "text": chunk["text"],
                        "start": chunk["offset"],
                        "end": chunk["offset"] + chunk["duration"]
                    })
                    start_time = end_time
            
            # Audio processing pipeline
            audio = AudioSegment.from_file(temp_file)
            
            # Apply volume adjustment (limit to +10dB max)
            volume_adjustment = min(max(volume - 100, -50), 10)  # Limit to +10dB max
            audio = audio + volume_adjustment
            
            # Apply audio processing effects
            audio = normalize(audio)
            audio = compress_dynamic_range(audio, threshold=-20.0, ratio=4.0)
            audio = low_pass_filter(audio, 14000)  # Reduce high-frequency hiss
            audio = high_pass_filter(audio, 100)  # Remove ultra-low frequencies
            
            # Export with higher bitrate
            audio.export(temp_file, format="mp3", bitrate="256k")
            
            return temp_file, subs
        except Exception as e:
            print(f"Error generating speech: {str(e)}")
            return None, []

    def generate_srt(self, subtitles, output_path):
        """Generate SRT file from subtitles data"""
        if not subtitles:
            return None
            
        srt_path = output_path.replace('.mp3', '.srt')
        try:
            with open(srt_path, 'w', encoding='utf-8') as f:
                for i, sub in enumerate(subtitles, start=1):
                    start = timedelta(milliseconds=sub["start"])
                    end = timedelta(milliseconds=sub["end"])
                    
                    # Format: 00:00:01,040 --> 00:00:09,760
                    start_str = f"{start.total_seconds() // 3600:02.0f}:{(start.total_seconds() % 3600) // 60:02.0f}:{start.total_seconds() % 60:06.3f}".replace('.', ',')
                    end_str = f"{end.total_seconds() // 3600:02.0f}:{(end.total_seconds() % 3600) // 60:02.0f}:{end.total_seconds() % 60:06.3f}".replace('.', ',')
                    
                    f.write(f"{i}\n{start_str} --> {end_str}\n{sub['text']}\n\n")
            return srt_path
        except Exception as e:
            print(f"Error generating SRT: {e}")
            return None

    def _format_time(self, milliseconds):
        """Convert milliseconds to SRT time format"""
        seconds, milliseconds = divmod(milliseconds, 1000)
        minutes, seconds = divmod(seconds, 60)
        hours, minutes = divmod(minutes, 60)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
		
    def check_srt_generated(self, audio_path):
        if not audio_path:
            return False
        srt_path = audio_path.replace('.mp3', '.srt')
        return os.path.exists(srt_path)		

# ==================== TAB 1: SINGLE CHARACTER ====================
class StoryTTSProcessor(BaseTTSProcessor):
    def __init__(self):
        super().__init__()
        if not self.settings.get("single_char"):
            self.settings["single_char"] = {
                "language": "Tiếng Việt",
                "voice": "Tiếng Việt - HoaiMy (Nữ)",
                "rate": 0,
                "pitch": 0,
                "volume": 100,
                "pause": 500
            }
    
    async def process_story(self, content, voice, rate, pitch, volume, pause, save_settings):
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        voice_dir = f"story_{timestamp}"
        os.makedirs(voice_dir, exist_ok=True)
        
        lines = [line.strip() for line in content.splitlines() if line.strip()]
        all_subs = []
        audio_files = []
        
        for idx, text in enumerate(lines):
            try:
                temp_file, subs = await self.generate_speech(
                    text, 
                    self.voice_map[voice], 
                    rate, 
                    pitch, 
                    volume
                )
                if temp_file:
                    new_name = f"{voice_dir}/line_{idx+1:03d}.mp3"
                    os.rename(temp_file, new_name)
                    audio_files.append(new_name)
                    
                    # Process subtitles
                    if subs:
                        line_subs = []
                        for sub in subs:
                            line_subs.append({
                                "text": sub["text"],
                                "start": sub["start"],
                                "end": sub["end"]
                            })
                        all_subs.append(line_subs)
            except Exception as e:
                print(f"❌ Lỗi dòng {idx+1}: {str(e)}")
        
        if not audio_files:
            return None, None, "❌ Không tạo được file âm thanh"
        
        merged_path = self.merge_audio(voice_dir, pause)
        srt_path = self.generate_full_srt(all_subs, pause, merged_path)
        
        if save_settings:
            self.settings["single_char"] = {
                "language": next(k for k in TTSConfig.LANGUAGES.keys() if voice.startswith(k)),
                "voice": voice,
                "rate": rate,
                "pitch": pitch,
                "volume": volume,
                "pause": pause
            }
            self.save_settings()
        
        return merged_path, srt_path, "✅ Hoàn thành! Bấm vào nút phát để nghe"

    def merge_audio(self, voice_dir, pause_duration):
        files = natsort.natsorted([f for f in os.listdir(voice_dir) if f.startswith("line_")])
        merged = AudioSegment.empty()
        pause = AudioSegment.silent(duration=pause_duration)
        
        for file in files:
            try:
                audio = AudioSegment.from_file(os.path.join(voice_dir, file))
                audio = audio.fade_in(50).fade_out(50)
                audio = normalize(audio)
                merged += audio + pause
            except Exception as e:
                print(f"❌ Lỗi file {file}: {str(e)}")
        
        merged = merged.low_pass_filter(15000)
        merged = compress_dynamic_range(merged)
        
        output_path = os.path.join(voice_dir, "merged_story.mp3")
        merged.export(output_path, format="mp3", bitrate="256k")
        return output_path

    def generate_full_srt(self, all_subs, pause_duration, audio_path):
        """Generate SRT for the full merged audio"""
        if not any(all_subs):
            return None
            
        vtt = webvtt.WebVTT()
        current_time = 0
        
        for line_subs in all_subs:
            for sub in line_subs:
                start = current_time + sub["start"]
                end = current_time + sub["end"]
                vtt.captions.append(webvtt.Caption(
                    self._format_time(start),
                    self._format_time(end),
                    sub["text"]
                ))
            
            # Add pause time after each line
            current_time += line_subs[-1]["end"] + pause_duration if line_subs else 0
        
        srt_path = audio_path.replace('.mp3', '.srt')
        vtt.save(srt_path)
        return srt_path

def generate_story_audio(self, text: str, voice: str, speed: float, device: str,
                       pause_settings: Dict[str, int], volume: float = 1.0, pitch: float = 1.0) -> Tuple[Tuple[int, np.ndarray], str, str]:
    start_time = time.time()
    clean_text = self.text_processor.clean_text(text)
    sentences = self.text_processor.split_sentences(clean_text)
    
    if not sentences:
        return None, "No content to read", ""
    
    audio_segments = []
    pause_durations = []
    
    # Adjust pause settings based on speed
    speed_factor = max(0.5, min(2.0, speed))
    adjusted_pause_settings = {
        k: int(v / speed_factor) for k, v in pause_settings.items()
    }
    
    # Generate each audio segment
    for sentence in sentences:
        result = self.generate_sentence_audio(sentence, voice, speed, device, volume, pitch)
        if not result:
            continue
            
        sample_rate, audio_data = result
        audio_seg = AudioSegment(
            (audio_data * 32767).astype(np.int16).tobytes(),
            frame_rate=sample_rate,
            sample_width=2,
            channels=1
        )
        audio_segments.append(audio_seg)
        
        # Calculate precise pause duration
        pause = self.audio_processor.calculate_pause(sentence, adjusted_pause_settings)
        pause_durations.append(pause)
    
    if not audio_segments:
        return None, "Failed to generate audio", ""
    
    # Combine with frame-accurate timing
    combined_audio = self.audio_processor.combine_segments(audio_segments, pause_durations)
    
    # Export with precise timing
    with io.BytesIO() as buffer:
        combined_audio.export(buffer, format="mp3", bitrate="256k", parameters=["-ar", str(combined_audio.frame_rate)])
        buffer.seek(0)
        audio_data = np.frombuffer(buffer.read(), dtype=np.uint8)
    
    # Generate subtitles with the same timing used for audio
    subtitles = self.subtitle_generator.generate_srt(audio_segments, sentences, adjusted_pause_settings)
    
    stats = (f"Processed {len(clean_text)} chars, {len(clean_text.split())} words\n"
            f"Audio duration: {len(combined_audio)/1000:.2f}s\n"
            f"Time: {time.time() - start_time:.2f}s\n"
            f"Device: {device.upper()}")
    
    return (combined_audio.frame_rate, audio_data), stats, subtitles	

# ==================== TAB 2: MULTI CHARACTER ====================
class MultiCharacterTTSProcessor(BaseTTSProcessor):
    def __init__(self):
        super().__init__()
        if not self.settings.get("multi_char"):
            self.settings["multi_char"] = {
                "language_char1": "Tiếng Việt",
                "voice_char1": "Tiếng Việt - HoaiMy (Nữ)",
                "language_char2": "Tiếng Việt",
                "voice_char2": "Tiếng Việt - NamMinh (Nam)",
                "language_char3": "Tiếng Việt",
                "voice_char3": "Tiếng Việt - HoaiMy (Nữ)",
                "rate_char1": -20,
                "pitch_char1": 0,
                "volume_char1": 100,
                "rate_char2": -25,
                "pitch_char2": 0,
                "volume_char2": 100,
                "rate_char3": -15,
                "pitch_char3": 0,
                "volume_char3": 100,
                "repeat_times": 1,
                "pause_between": 500
            }
    
    def parse_story(self, content):
        dialogues = []
        
        for line in content.splitlines():
            line = line.strip()
            if not line:
                continue
                
            if line.upper().startswith("CHAR1:"):
                dialogues.append(("CHAR1", line[6:].strip()))
            elif line.upper().startswith("CHAR2:"):
                dialogues.append(("CHAR2", line[6:].strip()))
            elif line.upper().startswith("CHAR3:"):
                dialogues.append(("CHAR3", line[6:].strip()))
            elif line.upper().startswith("NARRATOR:"):
                dialogues.append(("NARRATOR", line[9:].strip()))
            else:
                if dialogues:
                    last_char, last_text = dialogues[-1]
                    dialogues[-1] = (last_char, f"{last_text} {line}")
        
        return dialogues

    async def process_story(self, content, output_format, 
                          char1_voice, char2_voice, char3_voice,
                          char1_rate, char2_rate, char3_rate,
                          char1_pitch, char2_pitch, char3_pitch,
                          char1_volume, char2_volume, char3_volume,
                          repeat_times, pause_between, save_settings):
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        voice_dir = f"story_{timestamp}"
        os.makedirs(voice_dir, exist_ok=True)

        dialogues = self.parse_story(content)
        all_subs = []
        audio_files = []
        
        for idx, (character, text) in enumerate(dialogues):
            file_prefix = f"{idx+1:03d}"
            
            if character == "CHAR1":
                voice_id = self.voice_map[char1_voice]
                rate = char1_rate
                pitch = char1_pitch
                volume = char1_volume
                file_name = f"{file_prefix}_CHAR1.{output_format.lower()}"
            elif character == "CHAR2":
                voice_id = self.voice_map[char2_voice]
                rate = char2_rate
                pitch = char2_pitch
                volume = char2_volume
                file_name = f"{file_prefix}_CHAR2.{output_format.lower()}"
            elif character == "CHAR3":
                voice_id = self.voice_map[char3_voice]
                rate = char3_rate
                pitch = char3_pitch
                volume = char3_volume
                file_name = f"{file_prefix}_CHAR3.{output_format.lower()}"
            else:  # NARRATOR
                voice_id = self.voice_map[char1_voice]
                rate = char1_rate
                pitch = char1_pitch
                volume = char1_volume
                file_name = f"{file_prefix}_NARRATOR.{output_format.lower()}"

            try:
                temp_file, subs = await self.generate_speech(text, voice_id, rate, pitch, volume)
                if temp_file:
                    new_path = os.path.join(voice_dir, file_name)
                    os.rename(temp_file, new_path)
                    audio_files.append(new_path)
                    
                    if subs:
                        char_subs = []
                        for sub in subs:
                            char_subs.append({
                                "text": f"{character}: {sub['text']}",
                                "start": sub["start"],
                                "end": sub["end"]
                            })
                        all_subs.append(char_subs)
            except Exception as e:
                print(f"❌ Lỗi khi tạo giọng nói cho đoạn {idx+1}: {str(e)}")

        if not audio_files:
            return None, None, "❌ Không tạo được file âm thanh"

        merged_path = self.merge_story(voice_dir, output_format, repeat_times, pause_between)
        srt_path = self.generate_full_srt(all_subs, pause_between, merged_path, repeat_times)
        
        if save_settings:
            self.settings["multi_char"] = {
                "language_char1": next(k for k in TTSConfig.LANGUAGES.keys() if char1_voice.startswith(k)),
                "voice_char1": char1_voice,
                "language_char2": next(k for k in TTSConfig.LANGUAGES.keys() if char2_voice.startswith(k)),
                "voice_char2": char2_voice,
                "language_char3": next(k for k in TTSConfig.LANGUAGES.keys() if char3_voice.startswith(k)),
                "voice_char3": char3_voice,
                "rate_char1": char1_rate,
                "pitch_char1": char1_pitch,
                "volume_char1": char1_volume,
                "rate_char2": char2_rate,
                "pitch_char2": char2_pitch,
                "volume_char2": char2_volume,
                "rate_char3": char3_rate,
                "pitch_char3": char3_pitch,
                "volume_char3": char3_volume,
                "repeat_times": repeat_times,
                "pause_between": pause_between
            }
            self.save_settings()

        return merged_path, srt_path, "✅ Hoàn thành! Bấm vào nút phát để nghe"

    def merge_story(self, voice_dir, fmt, repeat_count, pause_between):
        all_files = sorted(
            [f for f in os.listdir(voice_dir) if f.endswith(f".{fmt.lower()}")],
            key=lambda x: int(x.split('_')[0])
        )
        
        merged = AudioSegment.empty()
        pause = AudioSegment.silent(duration=pause_between)

        for file in all_files:
            try:
                audio = AudioSegment.from_file(os.path.join(voice_dir, file))
                audio = audio.fade_in(50).fade_out(50)
                for _ in range(repeat_count):
                    merged += normalize(audio)
                    merged += pause
            except Exception as e:
                print(f"❌ Lỗi khi xử lý file {file}: {str(e)}")
                return None

        merged = merged.low_pass_filter(15000)
        merged = compress_dynamic_range(merged)
        
        output_path = os.path.join(voice_dir, f"story_merged.{fmt.lower()}")
        merged.export(output_path, format=fmt.lower(), bitrate="256k")
        return output_path

    def generate_full_srt(self, all_subs, pause_between, audio_path, repeat_times):
        """Generate SRT for the full merged audio with character markers"""
        if not any(all_subs):
            return None
            
        vtt = webvtt.WebVTT()
        current_time = 0
        
        for _ in range(repeat_times):
            for line_subs in all_subs:
                for sub in line_subs:
                    start = current_time + sub["start"]
                    end = current_time + sub["end"]
                    vtt.captions.append(webvtt.Caption(
                        self._format_time(start),
                        self._format_time(end),
                        sub["text"]
                    ))
                
                current_time += (line_subs[-1]["end"] if line_subs else 0) + pause_between
        
        srt_path = audio_path.replace('.mp3', '.srt')
        vtt.save(srt_path)
        return srt_path

# ==================== TAB 3: Q&A DIALOGUE ====================
class DialogueTTSProcessor(BaseTTSProcessor):
    def __init__(self):
        super().__init__()
        if not self.settings.get("dialogue"):
            self.settings["dialogue"] = {
                "language_q": "Tiếng Việt",
                "voice_q": "Tiếng Việt - HoaiMy (Nữ)",
                "language_a": "Tiếng Việt",
                "voice_a": "Tiếng Việt - NamMinh (Nam)",
                "rate_q": -20,
                "pitch_q": 0,
                "volume_q": 100,
                "rate_a": -25,
                "pitch_a": 0,
                "volume_a": 100,
                "repeat_times": 2,
                "pause_q": 200,
                "pause_a": 500
            }
    
    def parse_dialogues(self, content):
        dialogues = []
        current_speaker = None
        current_text = []

        for line in content.splitlines():
            line = line.strip()
            if not line: continue

            if line.upper().startswith(("Q:", "A:")):
                if current_speaker is not None:
                    dialogues.append((current_speaker, " ".join(current_text)))
                
                parts = line.split(":", 1)
                current_speaker = parts[0].upper()
                current_text = [parts[1].strip()] if len(parts) > 1 else [""]
            else:
                current_text.append(line)

        if current_speaker is not None:
            dialogues.append((current_speaker, " ".join(current_text)))

        return dialogues

    async def process_dialogues(self, content, output_format, 
                             language_q, voice_q, rate_q, pitch_q, volume_q,
                             language_a, voice_a, rate_a, pitch_a, volume_a,
                             repeat_times, pause_q, pause_a, save_settings):
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        voice_dir = f"dialogues_{timestamp}"
        os.makedirs(voice_dir, exist_ok=True)

        dialogues = self.parse_dialogues(content)
        all_subs = []
        audio_files = []
        
        for idx, (speaker, text) in enumerate(dialogues):
            voice_id = self.voice_map[voice_q if speaker == "Q" else voice_a]
            rate = rate_q if speaker == "Q" else rate_a
            pitch = pitch_q if speaker == "Q" else pitch_a
            volume = volume_q if speaker == "Q" else volume_a

            try:
                temp_file, subs = await self.generate_speech(text, voice_id, rate, pitch, volume)
                if temp_file:
                    prefix = speaker
                    new_name = f"{voice_dir}/{prefix}_{idx+1:03d}.{output_format.lower()}"
                    os.rename(temp_file, new_name)
                    audio_files.append(new_name)
                    
                    if subs:
                        speaker_subs = []
                        for sub in subs:
                            speaker_subs.append({
                                "text": f"{speaker}: {sub['text']}",
                                "start": sub["start"],
                                "end": sub["end"]
                            })
                        all_subs.append(speaker_subs)
            except Exception as e:
                print(f"❌ Error generating speech for line {idx+1}: {str(e)}")

        if not audio_files:
            return None, None, "❌ Failed to generate audio files"

        merged_path = self.merge_with_exact_repetition(voice_dir, output_format, repeat_times, pause_q, pause_a)
        srt_path = self.generate_full_srt(all_subs, pause_q, pause_a, merged_path, repeat_times)
        
        if save_settings:
            self.settings["dialogue"] = {
                "language_q": language_q,
                "voice_q": voice_q,
                "language_a": language_a,
                "voice_a": voice_a,
                "rate_q": rate_q,
                "pitch_q": pitch_q,
                "volume_q": volume_q,
                "rate_a": rate_a,
                "pitch_a": pitch_a,
                "volume_a": volume_a,
                "repeat_times": repeat_times,
                "pause_q": pause_q,
                "pause_a": pause_a
            }
            self.save_settings()

        return merged_path, srt_path, "✅ Done! Click play to listen"

    def merge_with_exact_repetition(self, voice_dir, fmt, repeat_count, pause_q, pause_a):
        q_files = natsort.natsorted([f for f in os.listdir(voice_dir) if f.startswith("Q_") and f.endswith(f".{fmt.lower()}")])
        a_files = natsort.natsorted([f for f in os.listdir(voice_dir) if f.startswith("A_") and f.endswith(f".{fmt.lower()}")])

        if len(q_files) != len(a_files):
            print(f"❌ Mismatched Q ({len(q_files)}) and A ({len(a_files)}) files")
            return None

        merged = AudioSegment.empty()
        short_pause = AudioSegment.silent(duration=pause_q)
        long_pause = AudioSegment.silent(duration=pause_a)

        for q_file, a_file in zip(q_files, a_files):
            try:
                q_audio = AudioSegment.from_file(os.path.join(voice_dir, q_file))
                a_audio = AudioSegment.from_file(os.path.join(voice_dir, a_file))

                q_audio = q_audio.fade_in(50).fade_out(50)
                a_audio = a_audio.fade_in(50).fade_out(50)
                
                q_audio = normalize(q_audio)
                a_audio = normalize(a_audio)
                
                for _ in range(repeat_count):
                    merged += q_audio
                    merged += short_pause
                    merged += a_audio
                    merged += long_pause
            except Exception as e:
                print(f"❌ Error processing {q_file} or {a_file}: {str(e)}")
                return None

        merged = normalize(merged)
        merged = compress_dynamic_range(merged, threshold=-20.0, ratio=4.0)
        
        output_path = os.path.join(voice_dir, f"merged_repeat_{repeat_count}x.{fmt.lower()}")
        merged.export(output_path, format=fmt.lower(), bitrate="256k")
        return output_path

    def generate_full_srt(self, all_subs, pause_q, pause_a, audio_path, repeat_times):
        """Generate SRT for Q&A with exact repetition"""
        if not any(all_subs):
            return None
            
        vtt = webvtt.WebVTT()
        current_time = 0
        
        for _ in range(repeat_times):
            for i in range(0, len(all_subs), 2):
                # Process Q
                q_subs = all_subs[i] if i < len(all_subs) else []
                for sub in q_subs:
                    start = current_time + sub["start"]
                    end = current_time + sub["end"]
                    vtt.captions.append(webvtt.Caption(
                        self._format_time(start),
                        self._format_time(end),
                        sub["text"]
                    ))
                current_time += (q_subs[-1]["end"] if q_subs else 0) + pause_q
                
                # Process A
                a_subs = all_subs[i+1] if i+1 < len(all_subs) else []
                for sub in a_subs:
                    start = current_time + sub["start"]
                    end = current_time + sub["end"]
                    vtt.captions.append(webvtt.Caption(
                        self._format_time(start),
                        self._format_time(end),
                        sub["text"]
                    ))
                current_time += (a_subs[-1]["end"] if a_subs else 0) + pause_a
        
        srt_path = audio_path.replace('.mp3', '.srt')
        vtt.save(srt_path)
        return srt_path

# ==================== GRADIO INTERFACE ====================
def update_voice_dropdown(language, tab_name, char_num=None):
    processor = BaseTTSProcessor()
    voice_options = [v for v in processor.voice_map.keys() if v.startswith(language)]
    default_voice = voice_options[0] if voice_options else None
    
    if tab_name == "single":
        return gr.Dropdown(choices=voice_options, value=default_voice)
    elif tab_name == "multi":
        if char_num == 1:
            return gr.Dropdown(choices=voice_options, value=default_voice)
        elif char_num == 2:
            return gr.Dropdown(choices=voice_options, value=default_voice)
        elif char_num == 3:
            return gr.Dropdown(choices=voice_options, value=default_voice)
    elif tab_name == "dialogue":
        if char_num == "q":
            return gr.Dropdown(choices=voice_options, value=default_voice)
        elif char_num == "a":
            return gr.Dropdown(choices=voice_options, value=default_voice)

def toggle_srt_download(audio_path, message):
    if audio_path and os.path.exists(audio_path.replace('.mp3', '.srt')):
        return gr.Button(visible=True), gr.Button(visible=True)
    return gr.Button(visible=False), gr.Button(visible=False)

def show_subtitles(audio_output):
    """Xử lý mọi trường hợp đầu vào không hợp lệ"""
    # Nếu là số nguyên (sample rate), bỏ qua
    if isinstance(audio_output, (int, float)):
        return "⏳ Đang xử lý audio..."
        
    # Xử lý các trường hợp còn lại như trước
    if audio_output is None:
        return "⏳ Chưa có audio được tạo"
        
    if isinstance(audio_output, (tuple, list)) and len(audio_output) > 0:
        audio_path = audio_output[0]
    elif isinstance(audio_output, str):
        audio_path = audio_output
    else:
        return "⚠️ Định dạng đầu vào không hỗ trợ"

    if not isinstance(audio_path, str) or not audio_path.endswith('.mp3'):
        return f"⚠️ Đường dẫn audio không hợp lệ: {audio_path}"

    srt_path = audio_path.replace('.mp3', '.srt')
    if not os.path.exists(srt_path):
        return "⚠️ Không tìm thấy file phụ đề"

    try:
        with open(srt_path, 'r', encoding='utf-8') as f:
            return f.read()
    except Exception as e:
        return f"❌ Lỗi đọc phụ đề: {str(e)}"
	
def toggle_srt_display(audio_path):
    if not audio_path:
        return gr.Button(visible=False), gr.Textbox(visible=False)
    
    srt_path = audio_path.replace('.mp3', '.srt')
    if os.path.exists(srt_path):
        return gr.Button(visible=True), gr.Textbox(visible=True)
    return gr.Button(visible=False), gr.Textbox(visible=False)

def load_subtitles(audio_path):
    if not audio_path:
        return ""
    
    srt_path = audio_path.replace('.mp3', '.srt')
    try:
        with open(srt_path, 'r', encoding='utf-8') as f:
            return f.read()
    except:
        return "Không thể đọc file phụ đề"	

with gr.Blocks(title="TTS Story Generator") as app:
    gr.Markdown("<h1 style='text-align: center'>📖 TTS Story Generator</h1>")
    
    with gr.Tabs() as tabs:
        # ========== TAB 1: SINGLE CHARACTER ==========
        with gr.Tab("1 Nhân vật"):
            single_processor = StoryTTSProcessor()
            settings = single_processor.settings.get("single_char", {})
            
            with gr.Row():
                with gr.Column():
                    content = gr.Textbox(label="Nội dung truyện", lines=10, placeholder="Nhập nội dung truyện (mỗi dòng là một đoạn)...")
                    language = gr.Dropdown(
                        label="Ngôn ngữ",
                        choices=list(TTSConfig.LANGUAGES.keys()),
                        value=settings.get("language", "Tiếng Việt")
                    )
                    voice = gr.Dropdown(
                        label="Giọng đọc",
                        choices=[v for v in single_processor.voice_map.keys() if v.startswith(settings.get("language", "Tiếng Việt"))],
                        value=settings.get("voice", "Tiếng Việt - HoaiMy (Nữ)")
                    )
                    
                    rate = gr.Slider(label="Tốc độ (%)", minimum=-30, maximum=30, step=1, value=settings.get("rate", 0))
                    pitch = gr.Slider(label="Cao độ (Hz)", minimum=-30, maximum=30, step=1, value=settings.get("pitch", 0))
                    volume = gr.Slider(label="Âm lượng (%)", minimum=50, maximum=150, step=1, value=settings.get("volume", 100))
                    pause = gr.Slider(label="Khoảng nghỉ (ms)", minimum=100, maximum=2000, step=50, value=settings.get("pause", 500))
                    save_settings = gr.Checkbox(label="Lưu cài đặt", value=False)
                    submit_btn = gr.Button("🎤 Tạo truyện audio", variant="primary")
                
                with gr.Column():
                    output_audio = gr.Audio(label="Audio đã tạo", interactive=False)
                    output_text = gr.Textbox(label="Trạng thái", interactive=False)
                    
                    with gr.Row():
                        download_srt = gr.Button("📥 Tải phụ đề (.srt)", visible=False)
                        clear_btn = gr.Button("🧹 Xóa phụ đề", visible=False)
                    
                    subtitles_display = gr.Textbox(
                        label="Nội dung phụ đề",
                        interactive=False,
                        visible=True,
                        lines=10,
                        max_lines=20,
                        elem_classes=["subtitle-box"]
                    )				
            
            language.change(
                lambda lang: update_voice_dropdown(lang, "single"),
                inputs=language,
                outputs=voice
            )
            
            submit_btn.click(
                single_processor.process_story,
                inputs=[content, voice, rate, pitch, volume, pause, save_settings],
                outputs=[output_audio, download_srt, output_text]
            )
            
            output_audio.change(
                lambda audio_output: (
                    gr.Button(visible=is_valid_audio_path(audio_output)),
                    gr.Button(visible=is_valid_audio_path(audio_output))
                ),
                inputs=output_audio,
                outputs=[download_srt, clear_btn]
            ).then(
                show_subtitles,
                inputs=output_audio,
                outputs=subtitles_display
            )
            
            clear_btn.click(
                lambda: ("", False, False),
                outputs=[subtitles_display, download_srt, clear_btn]
            )

        # ========== TAB 2: MULTI CHARACTER ==========
        with gr.Tab("Đa nhân vật"):
            multi_processor = MultiCharacterTTSProcessor()
            settings = multi_processor.settings.get("multi_char", {})
            
            with gr.Row():
                with gr.Column():
                    content = gr.Textbox(label="Nội dung câu chuyện", lines=10, 
                                        placeholder="CHAR1: Lời thoại nhân vật 1\nCHAR2: Lời thoại nhân vật 2\nCHAR3: Lời thoại nhân vật 3\nNARRATOR: Lời dẫn truyện")
                    
                    with gr.Accordion("⚙️ Cài đặt giọng nói nhân vật", open=True):
                        with gr.Row():
                            char1_language = gr.Dropdown(
                                label="Ngôn ngữ NV1",
                                choices=sorted(list(TTSConfig.LANGUAGES.keys())),
                                value=settings.get("language_char1", "Tiếng Việt")
                            )
                            char1_voice = gr.Dropdown(
                                label="Giọng NV1",
                                choices=[v for v in multi_processor.voice_map.keys() if v.startswith(settings.get("language_char1", "Tiếng Việt"))],
                                value=settings.get("voice_char1", "Tiếng Việt - HoaiMy (Nữ)")
                            )
                        
                        with gr.Row():
                            char2_language = gr.Dropdown(
                                label="Ngôn ngữ NV2",
                                choices=sorted(list(TTSConfig.LANGUAGES.keys())),
                                value=settings.get("language_char2", "Tiếng Việt")
                            )
                            char2_voice = gr.Dropdown(
                                label="Giọng NV2",
                                choices=[v for v in multi_processor.voice_map.keys() if v.startswith(settings.get("language_char2", "Tiếng Việt"))],
                                value=settings.get("voice_char2", "Tiếng Việt - NamMinh (Nam)")
                            )
                        
                        with gr.Row():
                            char3_language = gr.Dropdown(
                                label="Ngôn ngữ NV3",
                                choices=sorted(list(TTSConfig.LANGUAGES.keys())),
                                value=settings.get("language_char3", "Tiếng Việt")
                            )
                            char3_voice = gr.Dropdown(
                                label="Giọng NV3",
                                choices=[v for v in multi_processor.voice_map.keys() if v.startswith(settings.get("language_char3", "Tiếng Việt"))],
                                value=settings.get("voice_char3", "Tiếng Việt - HoaiMy (Nữ)")
                            )
                    
                    with gr.Accordion("🔧 Điều chỉnh nhân vật 1", open=False):
                        char1_rate = gr.Slider(label="Tốc độ (%)", minimum=-30, maximum=30, step=1, value=settings.get("rate_char1", -20))
                        char1_pitch = gr.Slider(label="Cao độ (Hz)", minimum=-30, maximum=30, step=1, value=settings.get("pitch_char1", 0))
                        char1_volume = gr.Slider(label="Âm lượng (%)", minimum=50, maximum=150, step=1, value=settings.get("volume_char1", 100))
                    
                    with gr.Accordion("🔧 Điều chỉnh nhân vật 2", open=False):
                        char2_rate = gr.Slider(label="Tốc độ (%)", minimum=-30, maximum=30, step=1, value=settings.get("rate_char2", -25))
                        char2_pitch = gr.Slider(label="Cao độ (Hz)", minimum=-30, maximum=30, step=1, value=settings.get("pitch_char2", 0))
                        char2_volume = gr.Slider(label="Âm lượng (%)", minimum=50, maximum=150, step=1, value=settings.get("volume_char2", 100))
                    
                    with gr.Accordion("🔧 Điều chỉnh nhân vật 3", open=False):
                        char3_rate = gr.Slider(label="Tốc độ (%)", minimum=-30, maximum=30, step=1, value=settings.get("rate_char3", -15))
                        char3_pitch = gr.Slider(label="Cao độ (Hz)", minimum=-30, maximum=30, step=1, value=settings.get("pitch_char3", 0))
                        char3_volume = gr.Slider(label="Âm lượng (%)", minimum=50, maximum=150, step=1, value=settings.get("volume_char3", 100))
                    
                    with gr.Accordion("🔄 Cài đặt chung", open=False):
                        repeat_times = gr.Slider(label="Số lần lặp", minimum=1, maximum=5, step=1, value=settings.get("repeat_times", 1))
                        pause_between = gr.Slider(label="Khoảng nghỉ (ms)", minimum=100, maximum=2000, step=50, value=settings.get("pause_between", 500))
                        output_format = gr.Dropdown(label="Định dạng đầu ra", choices=["MP3", "WAV"], value="MP3")
                        save_settings = gr.Checkbox(label="Lưu cài đặt", value=False)
                    
                    submit_btn = gr.Button("🎧 Tạo câu chuyện audio", variant="primary")
                
                with gr.Column():
                    output_audio = gr.Audio(label="Audio đã tạo", interactive=False)
                    output_text = gr.Textbox(label="Trạng thái", interactive=False)
                    
                    with gr.Row():
                        download_srt = gr.Button("📥 Tải phụ đề (.srt)", visible=False)
                        clear_btn = gr.Button("🧹 Xóa phụ đề", visible=False)
                    
                    subtitles_display = gr.Textbox(
                        label="Nội dung phụ đề",
                        interactive=False,
                        visible=True,
                        lines=10,
                        max_lines=20,
                        elem_classes=["subtitle-box"]
                    )
            
            # Update voice dropdowns
            char1_language.change(
                lambda lang: update_voice_dropdown(lang, "multi", 1),
                inputs=char1_language,
                outputs=char1_voice
            )
            
            char2_language.change(
                lambda lang: update_voice_dropdown(lang, "multi", 2),
                inputs=char2_language,
                outputs=char2_voice
            )
            
            char3_language.change(
                lambda lang: update_voice_dropdown(lang, "multi", 3),
                inputs=char3_language,
                outputs=char3_voice
            )
            
            submit_btn.click(
                multi_processor.process_story,
                inputs=[content, output_format,
                       char1_voice, char2_voice, char3_voice,
                       char1_rate, char2_rate, char3_rate,
                       char1_pitch, char2_pitch, char3_pitch,
                       char1_volume, char2_volume, char3_volume,
                       repeat_times, pause_between, save_settings],
                outputs=[output_audio, download_srt, output_text]
            )
            
            output_audio.change(
                lambda audio_output: (
                    gr.Button(visible=is_valid_audio_path(audio_output)),
                    gr.Button(visible=is_valid_audio_path(audio_output))
                ),
                inputs=output_audio,
                outputs=[download_srt, clear_btn]
            ).then(
                show_subtitles,
                inputs=output_audio,
                outputs=subtitles_display
            )
            
            download_srt.click(
                lambda audio_path: audio_path.replace('.mp3', '.srt') if audio_path else None,
                inputs=output_audio,
                outputs=gr.File(label="Tải phụ đề")
            )

        # ========== TAB 3: Q&A DIALOGUE ==========
        with gr.Tab("Hỏi & Đáp"):
            dialogue_processor = DialogueTTSProcessor()
            settings = dialogue_processor.settings.get("dialogue", {})
            
            with gr.Row():
                with gr.Column():
                    content = gr.Textbox(label="Nội dung hội thoại", lines=10, 
                                       placeholder="Q: Câu hỏi\nA: Câu trả lời\nQ: Câu hỏi tiếp theo\nA: Câu trả lời tiếp theo")
                    
                    with gr.Accordion("⚙️ Cài đặt giọng nói", open=True):
                        with gr.Row():
                            language_q = gr.Dropdown(
                                label="Ngôn ngữ câu hỏi",
                                choices=sorted(list(TTSConfig.LANGUAGES.keys())),
                                value=settings.get("language_q", "Tiếng Việt")
                            )
                            voice_q = gr.Dropdown(
                                label="Giọng câu hỏi",
                                choices=[v for v in dialogue_processor.voice_map.keys() if v.startswith(settings.get("language_q", "Tiếng Việt"))],
                                value=settings.get("voice_q", "Tiếng Việt - HoaiMy (Nữ)")
                            )
                        
                        with gr.Row():
                            language_a = gr.Dropdown(
                                label="Ngôn ngữ câu trả lời",
                                choices=sorted(list(TTSConfig.LANGUAGES.keys())),
                                value=settings.get("language_a", "Tiếng Việt")
                            )
                            voice_a = gr.Dropdown(
                                label="Giọng câu trả lời",
                                choices=[v for v in dialogue_processor.voice_map.keys() if v.startswith(settings.get("language_a", "Tiếng Việt"))],
                                value=settings.get("voice_a", "Tiếng Việt - NamMinh (Nam)")
                            )
                    
                    with gr.Accordion("🔧 Điều chỉnh giọng câu hỏi", open=False):
                        rate_q = gr.Slider(label="Tốc độ (%)", minimum=-30, maximum=30, step=1, value=settings.get("rate_q", -20))
                        pitch_q = gr.Slider(label="Cao độ (Hz)", minimum=-30, maximum=30, step=1, value=settings.get("pitch_q", 0))
                        volume_q = gr.Slider(label="Âm lượng (%)", minimum=80, maximum=110, step=1, value=settings.get("volume_q", 100))
                    
                    with gr.Accordion("🔧 Điều chỉnh giọng câu trả lời", open=False):
                        rate_a = gr.Slider(label="Tốc độ (%)", minimum=-30, maximum=30, step=1, value=settings.get("rate_a", -25))
                        pitch_a = gr.Slider(label="Cao độ (Hz)", minimum=-30, maximum=30, step=1, value=settings.get("pitch_a", 0))
                        volume_a = gr.Slider(label="Âm lượng (%)", minimum=80, maximum=110, step=1, value=settings.get("volume_a", 100))
                    
                    with gr.Accordion("🔄 Cài đặt lặp lại", open=False):
                        repeat_times = gr.Slider(label="Số lần lặp", minimum=1, maximum=5, step=1, value=settings.get("repeat_times", 2))
                        pause_q = gr.Slider(label="Khoảng nghỉ câu hỏi (ms)", minimum=100, maximum=1000, step=50, value=settings.get("pause_q", 200))
                        pause_a = gr.Slider(label="Khoảng nghỉ câu trả lời (ms)", minimum=100, maximum=2000, step=50, value=settings.get("pause_a", 500))
                        output_format = gr.Dropdown(label="Định dạng đầu ra", choices=["MP3", "WAV"], value="MP3")
                        save_settings = gr.Checkbox(label="Lưu cài đặt", value=False)
                    
                    submit_btn = gr.Button("🎧 Tạo audio hội thoại", variant="primary")
                
                with gr.Column():
                    output_audio = gr.Audio(label="Audio đã tạo", interactive=False)
                    output_text = gr.Textbox(label="Trạng thái", interactive=False)
                    
                    with gr.Row():
                        download_srt = gr.Button("📥 Tải phụ đề (.srt)", visible=False)
                        clear_btn = gr.Button("🧹 Xóa phụ đề", visible=False)
                    
                    subtitles_display = gr.Textbox(
                        label="Nội dung phụ đề",
                        interactive=False,
                        visible=True,
                        lines=10,
                        max_lines=20,
                        elem_classes=["subtitle-box"]
                    )
            
            # Update voice dropdowns
            language_q.change(
                lambda lang: update_voice_dropdown(lang, "dialogue", "q"),
                inputs=language_q,
                outputs=voice_q
            )
            
            language_a.change(
                lambda lang: update_voice_dropdown(lang, "dialogue", "a"),
                inputs=language_a,
                outputs=voice_a
            )
            
            submit_btn.click(
                dialogue_processor.process_dialogues,
                inputs=[content, output_format,
                       language_q, voice_q, rate_q, pitch_q, volume_q,
                       language_a, voice_a, rate_a, pitch_a, volume_a,
                       repeat_times, pause_q, pause_a, save_settings],
                outputs=[output_audio, download_srt, output_text]
            )
            
            output_audio.change(
                lambda audio_output: (
                    gr.Button(visible=is_valid_audio_path(audio_output)),
                    gr.Button(visible=is_valid_audio_path(audio_output))
                ),
                inputs=output_audio,
                outputs=[download_srt, clear_btn]
            ).then(
                show_subtitles,
                inputs=output_audio,
                outputs=subtitles_display
            )
            
            download_srt.click(
                lambda audio_path: audio_path.replace('.mp3', '.srt') if audio_path else None,
                inputs=output_audio,
                outputs=gr.File(label="Tải phụ đề")
            )

if __name__ == "__main__":
    app.launch()