import matplotlib.pyplot as plt
import numpy as np
import torch
import base64
import io
from io import BytesIO
from hparams import create_hparams
from model import Tacotron2
from layers import TacotronSTFT
from train import load_model
from text import text_to_sequence
from utils import load_wav_to_torch
import os
import random
import librosa
import librosa.display

use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')

hparams = create_hparams()
hparams.sampling_rate = 22050

stft = TacotronSTFT(
    hparams.filter_length, hparams.hop_length, hparams.win_length,
    hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
    hparams.mel_fmax).to(device)


# Plot the mel spectrograms and alignment side by side and return the
# figure as a base64-encoded PNG
def plot_data(data, figsize=(16, 4),
              titles=['Mel Spectrogram (Original)', 'Mel Spectrogram (Postnet)', 'Alignment'],
              xlabel=['Time Steps', 'Time Steps', 'Decoder Time Steps'],
              ylabel=['Mel Channels', 'Mel Channels', 'Encoder Time Steps'],
              colorbar_labels=None):
    fig, axes = plt.subplots(1, len(data), figsize=figsize)
    for i in range(len(data)):
        im = axes[i].imshow(data[i], aspect='auto', origin='lower',
                            interpolation='none', cmap='viridis')
        if titles:
            axes[i].set_title(titles[i])
        if xlabel:
            axes[i].set_xlabel(xlabel[i])
        if ylabel:
            axes[i].set_ylabel(ylabel[i])
        # Add a color bar for each panel
        cbar = fig.colorbar(im, ax=axes[i])
        if colorbar_labels:
            cbar.set_label(colorbar_labels[i])
    plt.tight_layout()
    img_buffer = io.BytesIO()
    plt.savefig(img_buffer, format='png', bbox_inches='tight', pad_inches=0)
    plt.close()
    img_base64 = base64.b64encode(img_buffer.getvalue()).decode('utf-8')
    return img_base64


# Plot the time-domain waveform of raw audio bytes and return it as a
# base64-encoded PNG
def plot_waveforms(audio_data):
    # Load the audio from a BytesIO buffer
    buffer = BytesIO(audio_data)
    y, sr = librosa.load(buffer, sr=None)

    # Create the waveform plot
    plt.figure(figsize=(10, 4))
    librosa.display.waveshow(y, sr=sr)
    plt.xlabel("Time (s)")
    plt.ylabel("Amplitude")
    plt.title("Waveform")

    # Save the plot to a BytesIO object
    wave_buffer = BytesIO()
    plt.savefig(wave_buffer, format="png")
    wave_buffer.seek(0)
    plt.close()

    # Encode the plot as base64
    wave_base64 = base64.b64encode(wave_buffer.read()).decode('utf-8')
    return wave_base64


# Load the pretrained speaker encoder and freeze its weights
def load_speaker_model(speaker_model_path):
    from speaker.model import SpeakerEncoder
    loss_device = torch.device("cpu")
    model = SpeakerEncoder(device, loss_device)
    speaker_dict = torch.load(speaker_model_path, map_location='cpu')
    model.load_state_dict(speaker_dict)
    # Freeze the weights of the speaker model
    for param in model.parameters():
        param.requires_grad = False
    return model


speaker_model = load_speaker_model(
    'speaker/saved_models/saved_model_e273_LargeBatch.pt').to(device).eval().float()


# Compute a speaker embedding from a reference WAV file
def extract_speech_embedding(audio_path: str):
    audio, sampling_rate = load_wav_to_torch(audio_path)
    if sampling_rate != stft.sampling_rate:
        raise ValueError("{} SR doesn't match target {} SR".format(
            sampling_rate, stft.sampling_rate))
    # Normalize 16-bit PCM to [-1, 1] and add a batch dimension
    audio_norm = audio / 32768.0
    audio_norm = audio_norm.unsqueeze(0).to(device)
    melspec = stft.mel_spectrogram(audio_norm).transpose(1, 2).float()
    # The speaker encoder expects a 128-frame mel slice: take a random
    # 128-frame window, or the whole spectrogram if it is shorter
    if melspec.shape[1] <= 128:
        mel_slice = melspec
    else:
        slice_start = random.randint(0, melspec.shape[1] - 128)
        mel_slice = melspec[:, slice_start:slice_start + 128]
    speaker_embedding = speaker_model(mel_slice)
    return speaker_embedding
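
# A minimal usage sketch for the embedding extractor: the path below is
# hypothetical; any WAV whose sample rate matches stft.sampling_rate
# (22050 Hz here) should work. Left commented out so importing this
# module has no extra side effects.
#
# embedding = extract_speech_embedding('speaker_audio/sample.wav')  # hypothetical file
# print(embedding.shape)  # embedding dimensionality depends on SpeakerEncoder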

def synthesize_voice(text_input, checkpoint_path):
    # Load Tacotron2 model from checkpoint
    model = load_model(hparams)
    checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    model.load_state_dict(checkpoint['state_dict'])
    model = model.to(device).eval().float()

    # Reference audio used to condition the model on a target speaker
    speaker_audio_path = 'speaker_audio/ariana.wav'

    # Convert the (Nepali) input text to a sequence of symbol IDs
    sequence = np.array(text_to_sequence(text_input, ['transliteration_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).to(device).long()

    speaker_embedding = extract_speech_embedding(speaker_audio_path)

    # Run inference to get the mel spectrograms and the alignment matrix
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence, speaker_embedding)
    mel_output_data = mel_outputs.data.cpu().numpy()[0]
    mel_output_postnet_data = mel_outputs_postnet.data.cpu().numpy()[0]
    alignments_data = alignments.data.cpu().numpy()[0].T
    return mel_output_data, mel_output_postnet_data, alignments_data
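
# Minimal end-to-end sketch: synthesize mel spectrograms for a text input and
# render them with plot_data. The text and checkpoint path are hypothetical
# placeholders; substitute a real trained Tacotron2 checkpoint.
if __name__ == '__main__':
    mel, mel_postnet, alignment = synthesize_voice(
        'namaste', 'checkpoints/checkpoint_50000')  # hypothetical text and checkpoint
    # Three panels: original mel, postnet-refined mel, and the
    # decoder/encoder alignment, returned as a base64-encoded PNG
    img_base64 = plot_data([mel, mel_postnet, alignment])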