import numpy as np
import os
import re
import tempfile
import logging
import torch
from pydub import AudioSegment
from pathlib import Path
import subprocess
import librosa
import soundfile as sf


# Set up basic logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Create directory structure
def ensure_directories():
    """Ensure the required directories exist"""
    directories = ["audio", "audio2", "reference_audio"]
    for directory in directories:
        os.makedirs(directory, exist_ok=True)
    
ensure_directories()  # Call immediately to ensure directories exist

# Setup audio effects for pydub
def setup_audio_effects():
    """Setup custom audio effects"""
    from pydub import effects
    
    # Add speedup if it's missing
    if not hasattr(AudioSegment, "speedup"):
        def speedup(audio_segment, playback_speed=1.5):
            if playback_speed <= 0 or playback_speed == 1.0:
                return audio_segment
            new_frame_rate = int(audio_segment.frame_rate * playback_speed)
            adjusted = audio_segment._spawn(audio_segment.raw_data, 
                                          overrides={'frame_rate': new_frame_rate})
            return adjusted.set_frame_rate(audio_segment.frame_rate)
        AudioSegment.speedup = speedup
    
    # Add time_stretch if it's missing
    if not hasattr(effects, "time_stretch"):
        def time_stretch(audio_segment, stretch_factor):
            if stretch_factor <= 0 or stretch_factor == 1.0:
                return audio_segment
            original_frame_rate = audio_segment.frame_rate
            new_frame_rate = int(original_frame_rate / stretch_factor)
            stretched = audio_segment._spawn(
                audio_segment.raw_data,
                overrides={'frame_rate': new_frame_rate}
            )
            return stretched.set_frame_rate(original_frame_rate)
        effects.time_stretch = time_stretch
    
    return effects

effects = setup_audio_effects()

def adjust_audio_duration(audio_segment, target_duration):
    """Adjust audio to target duration by adding silence or trimming"""
    current_duration = len(audio_segment) / 1000  # ms to seconds
    
    if current_duration < target_duration:
        silence_duration_ms = int((target_duration - current_duration) * 1000)
        silence = AudioSegment.silent(duration=silence_duration_ms)
        return audio_segment + silence
    else:
        return audio_segment[:int(target_duration * 1000)]

# XTTS Model Loader (Singleton pattern)
class XTTSModelLoader:
    _instance = None
    model = None
    
    @classmethod
    def get_model(cls):
        """Get or initialize the XTTS model"""
        if cls.model is None:
            try:
                from TTS.api import TTS
                
                # Determine device
                device = "cuda" if torch.cuda.is_available() else "cpu"
                logger.info(f"Loading XTTS model on {device}...")
                
                # Load the model
                cls.model = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
                logger.info("XTTS model loaded successfully")
            except Exception as e:
                logger.error(f"Error loading XTTS model: {e}")
                return None
                
        return cls.model

def smooth_speed_change(audio_path, target_duration):
    """
    Adjust audio speed with instantaneous time stretching to match target duration
    
    Args:
        audio_path: Path to audio file to adjust
        target_duration: Target duration in seconds
        
    Returns:
        Path to adjusted audio file (temporary file)
    """
    try:
        # Debug start
        print(f"\n[DEBUG] Starting audio speed adjustment:")
        print(f"[DEBUG] Input file: {audio_path}")
        print(f"[DEBUG] Target duration: {target_duration:.2f}s")
        
        # Load audio with librosa
        y, sr = librosa.load(audio_path, sr=None)
        
        # Calculate current duration and speed factor
        current_duration = librosa.get_duration(y=y, sr=sr)
        speed_factor = current_duration / target_duration
        
        print(f"[DEBUG] Current duration: {current_duration:.2f}s")
        print(f"[DEBUG] Calculated speed factor: {speed_factor:.3f}")
        
        # If the difference is minimal, return original path
        if abs(speed_factor - 1) < 0.05:
            print(f"[DEBUG] Speed factor {speed_factor:.3f} is within 5% threshold, skipping adjustment")
            return audio_path
        
        # Dynamic speed factor limits based on audio duration
        # Allow more aggressive speed factors for short audio
        if current_duration < 10.0:  # Short audio under 10 seconds
            max_speed = 3.0  # More aggressive for short segments
        else:
            max_speed = 2.7  # Standard limit for longer audio
            
        min_speed = 0.5  # Allow more slowdown when needed
        
        # Check if extreme speed change is needed
        extreme_adjustment = (speed_factor > max_speed)
        
        # Limit speed factor to reasonable range
        original_speed_factor = speed_factor
        speed_factor = min(max(speed_factor, min_speed), max_speed)
        
        if original_speed_factor != speed_factor:
            print(f"[DEBUG] Speed factor clamped from {original_speed_factor:.3f} to {speed_factor:.3f}")
            if extreme_adjustment:
                print(f"[DEBUG] Extreme adjustment needed - will apply max speed and then trim")
        
        # Track processing time
        import time
        start_time = time.time()
        
        # SIMPLIFIED: Apply direct time stretching to the entire audio at once
        print(f"[DEBUG] Applying instantaneous time stretching with factor {speed_factor:.3f}")
        stretched_audio = librosa.effects.time_stretch(y=y, rate=speed_factor)
        
        # Calculate new duration
        expected_duration = len(stretched_audio) / sr
        
        # Save to temporary file
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
        sf.write(temp_file.name, stretched_audio, sr)
        
        # Calculate processing time
        process_time = time.time() - start_time
        
        # Verify the actual duration after processing
        y_check, sr_check = librosa.load(temp_file.name, sr=None)
        actual_duration = librosa.get_duration(y=y_check, sr=sr_check)
        
        method = "direct"
        
        # For extreme cases, perform additional trimming
        if extreme_adjustment and actual_duration > target_duration:
            print(f"[DEBUG] Performing additional trim for extreme case")
            # Calculate how many samples to keep
            samples_to_keep = int(target_duration * sr_check)
            
            # Apply a small fade out to avoid clicks
            fade_samples = min(int(0.1 * sr_check), samples_to_keep // 4)  # 100ms fade or less
            
            # Keep only the needed samples
            trimmed_audio = y_check[:samples_to_keep]
            
            # Apply fade out to avoid clicks
            if fade_samples > 0:
                fade_env = np.linspace(1.0, 0.0, fade_samples)
                trimmed_audio[-fade_samples:] *= fade_env
            
            # Save the trimmed version
            sf.write(temp_file.name, trimmed_audio, sr_check)
            
            # Update actual duration
            actual_duration = librosa.get_duration(y=trimmed_audio, sr=sr_check)
            method += "+trim"
        
        print(f"[DEBUG] Method used: {method}")
        print(f"[DEBUG] Processing completed in {process_time:.2f} seconds")
        print(f"[DEBUG] Expected new duration: {expected_duration:.2f}s")
        print(f"[DEBUG] Actual new duration: {actual_duration:.2f}s")
        print(f"[DEBUG] Target was: {target_duration:.2f}s")
        print(f"[DEBUG] Difference from target: {abs(actual_duration - target_duration):.3f}s")
        print(f"[DEBUG] Output file: {temp_file.name}")
        
        return temp_file.name
        
    except Exception as e:
        import traceback
        print(f"[DEBUG ERROR] Audio speed adjustment failed: {e}")
        print(traceback.format_exc())
        logger.warning(f"Audio speed adjustment failed: {e}")
        return audio_path

def create_segmented_edge_tts(text, pitch, voice, output_path, target_duration=None):
    """Create voice clone with specific characteristics and timing using Edge TTS"""
    # Create a temporary file
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
    temp_filename = temp_file.name  # Store filename before closing
    temp_file.close()
    
    # Fix pitch formatting
    pitch_param = f"+{pitch}Hz" if pitch >= 0 else f"{pitch}Hz"
    
    command = [
        "edge-tts",
        f"--pitch={pitch_param}",
        "--voice", voice,
        "--text", text,
        "--write-media", temp_filename
    ]
    subprocess.run(command, check=True)
    # Load audio
    audio = AudioSegment.from_file(temp_filename, format="mp3")
    
    # Time constraint adjustment
    if target_duration is not None:
        current_duration = len(audio) / 1000  # ms to seconds
        
        if abs(current_duration - target_duration) > 0.1:  # 100ms threshold
            speed_factor = current_duration / target_duration
            speed_factor = min(max(speed_factor, 0.7), 3)  # Keep within bounds
            
            logger.info(f"  Adjusting timing: {current_duration:.2f}s → {target_duration:.2f}s (factor: {speed_factor:.2f})")
            
            # Apply time adjustment
            # Instead of speed adjustments after generation, use Edge TTS rate parameter
            if speed_factor < 1:
                rate_adjustment = f"-{int((1 - speed_factor) * 100)}%"
            else:
                rate_adjustment = f"+{int((speed_factor - 1) * 100)}%"
            
            # Regenerate with adjusted rate
            os.unlink(temp_file.name)  # Remove the previous temp file
            
            # Create new command with rate parameter and fixed pitch formatting
            command = [
                "edge-tts",
                f"--pitch={pitch_param}",
                f"--rate={rate_adjustment}",
                "--voice", voice,
                "--text", text,
                "--write-media", temp_filename
            ]
            subprocess.run(command, check=True)
            
            # Reload audio with rate adjustment
            audio = AudioSegment.from_file(temp_filename, format="mp3")
            
            # Fine-tune if needed
            new_duration = len(audio) / 1000
            if abs(new_duration - target_duration) > 0.1:
                audio = adjust_audio_duration(audio, target_duration)
    
    # Save the modified audio
    audio.export(output_path, format="wav")
    
    # Clean up temporary file
    os.unlink(temp_file.name)
    
    # Log final duration
    final_audio = AudioSegment.from_file(output_path)
    final_duration = len(final_audio) / 1000
    logger.info(f"  Final duration: {final_duration:.2f}s (target: {target_duration if target_duration else 'None'}s)")
    
    return output_path

def create_segmented_xtts(text, reference_audio, language, output_path, target_duration=None):
    """Create voice-cloned speech using XTTS with speaker's reference audio and duration control"""
    # Get the model (will be loaded on first call)
    tts_model = XTTSModelLoader.get_model()
    
    if tts_model is None:
        raise RuntimeError("XTTS model could not be loaded. Ensure TTS is installed.")
    
    # Verify reference audio exists
    if not os.path.exists(reference_audio):
        raise FileNotFoundError(f"Reference audio file not found: {reference_audio}")
    
    # Generate speech
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
    temp_filename = temp_file.name
    temp_file.close()
    
    logger.info(f"Generating XTTS speech using reference: {os.path.basename(reference_audio)}")

    # Step 1: Try to optimize the generation parameters based on text length and target duration
    # Short text might need special handling to avoid excessive padding
    is_short_text = len(text.strip()) < 10
    
    # XTTS generation options
    generation_kwargs = {}
    
    # Add text length information for very short text to help the model
    # Note: These are example parameters - actual parameter support depends on the XTTS version
    if is_short_text and target_duration is not None and target_duration < 2.0:
        logger.info(f"  Short text detected, attempting to minimize padding")
        # These parameters may or may not be supported by the TTS model being used
        generation_kwargs = {
            'enable_text_splitting': False,  # Avoid splitting short text
            'no_silence_end': True,          # Reduce trailing silence 
        }
        # Some models may support 'speed' parameter
        if hasattr(tts_model, 'tts_with_speed'):
            generation_kwargs['speed'] = 1.2  # Slightly faster for short text
    
    try:
        # Try generating with optional parameters if supported
        if generation_kwargs:
            try:
                tts_model.tts_to_file(
                    text=text,
                    speaker_wav=reference_audio,
                    language=language,
                    file_path=temp_filename,
                    **generation_kwargs
                )
            except (TypeError, ValueError):
                # If parameters aren't supported, fall back to standard call
                logger.info("  Advanced parameters not supported, using standard generation")
                tts_model.tts_to_file(
                    text=text,
                    speaker_wav=reference_audio,
                    language=language,
                    file_path=temp_filename
                )
        else:
            # Standard generation
            tts_model.tts_to_file(
                text=text,
                speaker_wav=reference_audio,
                language=language,
                file_path=temp_filename
            )
        
        # Load generated audio
        audio = AudioSegment.from_file(temp_filename)
        
        # Step 2: Apply duration adjustment if needed
        if target_duration is not None:
            current_duration = len(audio) / 1000  # ms to seconds
            
            if abs(current_duration - target_duration) > 0.1:  # 100ms threshold
                # Calculate speed factor - inverse of duration ratio
                speed_factor = current_duration / target_duration
                speed_factor = min(max(speed_factor, 0.7), 3)  # Allow wider range for better adjustment
                
                logger.info(f"  Adjusting timing: {current_duration:.2f}s → {target_duration:.2f}s (speed factor: {speed_factor:.2f})")
                
                try:
                    # Always attempt smooth speed change since regeneration doesn't work
                    logger.info("  Applying smooth speed adjustment...")
                    adjusted_path = smooth_speed_change(temp_filename, target_duration)
                    
                    if adjusted_path != temp_filename:  # If path is different, adjustment was done
                        # Load the adjusted audio
                        audio = AudioSegment.from_file(adjusted_path)
                        
                        # Check if adjustment was successful
                        new_duration = len(audio) / 1000
                        if abs(new_duration - target_duration) <= 0.15:  # 150ms tolerance
                            logger.info(f"  Smooth adjustment successful: {new_duration:.2f}s")
                            
                            # Clean up original file and use the adjusted one
                            os.unlink(temp_filename)
                            temp_filename = adjusted_path
                        else:
                            # Clean up adjusted file and just use duration adjustment
                            logger.info(f"  Smooth adjustment not precise enough ({new_duration:.2f}s), will fine-tune with duration adjustment")
                            os.unlink(adjusted_path)
                            # We'll fall through to the final duration adjustment step
                except Exception as e:
                    logger.warning(f"  Smooth speed adjustment failed: {str(e)}")
                    # We'll fall through to the final duration adjustment step
                
                # Always perform final duration adjustment to ensure exact timing
                new_duration = len(audio) / 1000
                if abs(new_duration - target_duration) > 0.1:
                    logger.info(f"  Fine-tuning with duration adjustment: {new_duration:.2f}s → {target_duration:.2f}s")
                    audio = adjust_audio_duration(audio, target_duration)
        
        # Save the final audio
        audio.export(output_path, format="wav")
        
        # Clean up
        os.unlink(temp_filename)
        
        # Log final duration
        final_audio = AudioSegment.from_file(output_path)
        final_duration = len(final_audio) / 1000
        logger.info(f"  Final duration: {final_duration:.2f}s (target: {target_duration if target_duration else 'None'}s)")
        
        return output_path
        
    except Exception as e:
        logger.error(f"XTTS generation failed: {e}")
        if os.path.exists(temp_filename):
            os.unlink(temp_filename)
        raise

def process_voice_config(voice_config):
    """
    Process voice configuration to support both Edge TTS and XTTS
    
    Args:
        voice_config: Dict with speaker_id keys and configuration values
            For Edge TTS: {'engine': 'edge_tts', 'gender': 'male'/'female'} or simply 'male'/'female'
            For XTTS: {'engine': 'xtts', 'reference_audio': '/path/to/audio.wav', 'language': 'hi'}
    
    Returns:
        Processed configuration dictionary
    """
    processed_config = {}
    
    # Handle empty config
    if not voice_config:
        return {0: {'engine': 'edge_tts', 'voice': "hi-IN-MadhurNeural", 'pitch': 0}}
    
    # Track Edge TTS speaker counts for pitch variations
    edge_male_count = 0
    edge_female_count = 0
    
    # Pitch variations for multiple Edge TTS speakers of same gender
    male_pitches = [0, -50, 50]  # Default, deeper, higher
    female_pitches = [0, 45, -45]  # Default, higher, deeper
    
    for speaker_id, config in voice_config.items():
        # Convert string speaker_id to int if needed
        if isinstance(speaker_id, str) and speaker_id.isdigit():
            speaker_id = int(speaker_id)
        
        # Determine which engine to use (default is edge_tts)
        if isinstance(config, dict):
            engine = config.get('engine', 'edge_tts')
        else:
            # Handle simple gender strings for backwards compatibility
            engine = 'edge_tts'
            config = {'gender': config} if config in ['male', 'female'] else {'gender': 'male'}
        
        if engine == 'xtts':
            # XTTS configuration - each speaker needs their own reference audio
            if 'reference_audio' not in config:
                logger.warning(f"No reference audio provided for XTTS speaker {speaker_id}, falling back to Edge TTS")
                # Fall back to Edge TTS if no reference audio
                engine = 'edge_tts'
                gender = config.get('gender', 'male')
            else:
                # Valid XTTS configuration
                processed_config[speaker_id] = {
                    'engine': 'xtts',
                    'reference_audio': config['reference_audio'],
                    'language': config.get('language', 'hi')  # Default to Hindi
                }
                continue  # Skip the Edge TTS processing below
        
        # Edge TTS configuration (if engine is edge_tts or XTTS fallback)
        gender = config.get('gender', 'male')
        
        if gender == 'male':
            # Assign male voice and pitch
            pitch = male_pitches[edge_male_count % len(male_pitches)]
            processed_config[speaker_id] = {
                'engine': 'edge_tts',
                'voice': "hi-IN-MadhurNeural",
                'pitch': pitch
            }
            edge_male_count += 1
        else:
            # Assign female voice and pitch
            pitch = female_pitches[edge_female_count % len(female_pitches)]
            processed_config[speaker_id] = {
                'engine': 'edge_tts',
                'voice': "hi-IN-SwaraNeural", 
                'pitch': pitch
            }
            edge_female_count += 1
    
    return processed_config

def generate_tts(segments, target_language, voice_config=None, output_dir="audio2"):
    """
    Generate speech for all segments using appropriate TTS engine per speaker
    
    Args:
        segments: List of segments with text, speaker, start and end times
        target_language: Language code for TTS
        voice_config: Dictionary with speaker configurations
                     - For Edge TTS: {'gender': 'male'/'female'} or just 'male'/'female'
                     - For XTTS: {'engine': 'xtts', 'reference_audio': '/path/to/audio.wav'}
        output_dir: Directory to save the final audio
        
    Returns:
        Path to the final combined audio file
    """
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)
    
    # Generate the full audio
    output_path = os.path.join(output_dir, "dubbed_conversation.wav")
    max_end_time = max(segment['end'] for segment in segments)
    
    # Create a silent audio of the total duration
    combined = AudioSegment.silent(duration=int(max_end_time * 1000) + 100) 
    ensure_directories()
    audio_files = []
    
    # Process voice configuration
    processed_config = process_voice_config(voice_config or {})
    print(processed_config)
    
    # Process each segment
    for i, segment in enumerate(segments):
        # Extract speaker ID
        speaker = segment.get('speaker', 'SPEAKER_00')
        match = re.search(r'SPEAKER_(\d+)', speaker)
        speaker_id = int(match.group(1)) if match else 0
        
        # Get speaker configuration
        speaker_config = processed_config.get(speaker_id, 
                                             {'engine': 'edge_tts', 'voice': "hi-IN-SwaraNeural", 'pitch': 0})
        
        # Get text and timing information
        text = segment['text']
        start = segment['start']
        end = segment['end']
        duration = end - start
        
        # Create output filename
        output_file = f"audio/{start}.wav"
        
        logger.info(f"Processing segment {i+1} (Speaker {speaker_id}, Engine: {speaker_config['engine']}):")
        logger.info(f"  Text: {text[:50]}{'...' if len(text) > 50 else ''}")
        logger.info(f"  Duration: {duration:.2f}s")
        
        # Choose appropriate TTS engine
        if speaker_config['engine'] == 'xtts':
            # XTTS generation with speaker's reference audio
            try:
                create_segmented_xtts(
                    text=text,
                    reference_audio=speaker_config['reference_audio'],
                    language=speaker_config.get('language', target_language),
                    output_path=output_file,
                    target_duration=duration,
                )
            except Exception as e:
                logger.error(f"Error using XTTS for speaker {speaker_id}: {e}")
                logger.warning(f"Falling back to Edge TTS for this segment")
                # Fallback to Edge TTS
                create_segmented_edge_tts(
                    text=text,
                    pitch=0, 
                    voice="hi-IN-SwaraNeural",
                    output_path=output_file,
                    target_duration=duration,
                )
        else:
            # Edge TTS generation
            create_segmented_edge_tts(
                text=text,
                pitch=speaker_config.get('pitch', 0),
                voice=speaker_config.get('voice', "hi-IN-SwaraNeural"),
                output_path=output_file,
                target_duration=duration,
            )
        
        audio_files.append(output_file)

        # Add segment to combined audio at the exact timestamp
        segment_audio = AudioSegment.from_file(output_file)
        position_ms = int(segment['start'] * 1000)
        combined = combined.overlay(segment_audio, position=position_ms)
    
    # Export the final combined audio
    combined.export(output_path, format="wav")
    logger.info(f"  Final combined duration: {len(combined) / 1000:.2f}s")
    
    # Clean up segment files
    for file in audio_files:
        try:
            os.remove(file)
        except:
            pass
    
    # Verify the final duration
    final_audio = AudioSegment.from_file(output_path)
    final_duration_sec = len(final_audio) / 1000
    
    print(f"\nTarget duration: {max_end_time:.2f} seconds")
    print(f"Actual duration: {final_duration_sec:.2f} seconds")
    
    # If the final audio is still too long, trim it
    if final_duration_sec > max_end_time + 0.1:  # Allow 100ms grace
        trimmed = final_audio[:int(max_end_time * 1000)]
        trimmed.export(output_path, format="wav")
        print(f"Trimmed to exactly {max_end_time:.2f} seconds")
    
    return output_path