import os
import time
from functools import lru_cache
from subprocess import CalledProcessError, run
from typing import Optional, Union

import wave
import torch
import numpy as np
import soundfile as sf
import torch.nn.functional as F

from .utils import exact_div

# hard-coded audio hyperparameters
SAMPLE_RATE = 16000
N_FFT = 400
HOP_LENGTH = 160
CHUNK_LENGTH = 30
N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE  # 480000 samples in a 30-second chunk
N_FRAMES = exact_div(N_SAMPLES, HOP_LENGTH)  # 3000 frames in a mel spectrogram input

N_SAMPLES_PER_TOKEN = HOP_LENGTH * 2  # the initial convolutions has stride 2
FRAMES_PER_SECOND = exact_div(SAMPLE_RATE, HOP_LENGTH)  # 10ms per audio frame
TOKENS_PER_SECOND = exact_div(SAMPLE_RATE, N_SAMPLES_PER_TOKEN)  # 20ms per audio token


class MyStream:
    def __init__(self, 
                 ms_gran: int = 200, 
                 sample_rate: int = 16000, 
                 channels: int = 2, 
                 filename: str = None, 
                 inp_dtype: any = torch.int16, 
                 simulate_stream: bool = False,
                 wav_file: str = None,
                 relay: bool = False,
                 use_latency: bool = False,
                 pad_trim: bool = True,
                 use_remote_machine: bool = False):

        assert ms_gran % 20 == 0, "ms_gran must be a multiple of 20"

        self.ms_gran = ms_gran
        self.sample_rate = sample_rate
        self.channels = channels
        self.inp_dtype = inp_dtype
        self.relay = relay
        self.use_latency = use_latency
        self.use_remote_machine = use_remote_machine

        rate_fraction = ms_gran / 1000
        self.chunk_size = int(rate_fraction * sample_rate)
        self.filename = filename
        self.streamed_wav_file = wav_file

        self.simulate_stream = simulate_stream
        if self.simulate_stream:
            assert wav_file is not None, "when simulating stream a wav file must be provided."
            if pad_trim:
                self.wav_array = pad_or_trim(load_audio(wav_file, sample_rate), length=N_SAMPLES+180) # wav array
            else:
                audio = load_audio(wav_file, sample_rate)
                self.wav_array = pad_or_trim(audio, length=audio.shape[-1]+180)
                print(f"{self.wav_array.shape=}")
    
    def _simulate_stream_using_wav(self):
        print("Streaming simulation of a wav started...")

        for i in range(self.wav_array.shape[-1] // self.chunk_size):
            if i == 0:
                yield self.wav_array[..., :(((i + 1) * self.chunk_size) + 40 + 320)] # 320 is extra 20 msec buffer we need!  
            else:
                yield self.wav_array[..., ((i * self.chunk_size) + 40 + 320):(((i + 1) * self.chunk_size) + 40 + 320)]
            
            if self.use_latency: time.sleep(self.ms_gran / 1000) # simulating the latency between audio chunks

    def open_stream(self):
        if self.simulate_stream or self.relay or self.use_remote_machine: return

        # self.audio = pyaudio.PyAudio()
        # self.stream = self.audio.open(input=True, format=self.inp_dtype, channels=self.channels, rate=self.sample_rate, frames_per_buffer=self.chunk_size)
    
    def _read_from_stream(self):
        print("Streaming instance recording started...")
        
        while True:
            yield self.stream.read(self.chunk_size)

    def _follow_growing_wav(self):
        while not os.path.exists(self.streamed_wav_file):
            time.sleep(0.1)

        with sf.SoundFile(self.streamed_wav_file, mode='r') as f:
            while True:
                block = f.read(self.chunk_size)
                if len(block) == 0:
                    time.sleep(self.ms_gran / 1000)  # Wait for more data
                    continue
                yield block

    def _read_raw_pcm(self):
        samples_per_chunk = int(self.sample_rate * (self.ms_gran / 1000))
        bytes_per_sample = 2  # s16le = 16 bits = 2 bytes
        chunk_size = samples_per_chunk * bytes_per_sample

        while not os.path.exists(self.streamed_wav_file):
            time.sleep(0.1)

        with open(self.streamed_wav_file, 'rb') as f:
            while True:
                chunk = f.read(chunk_size)
                if not chunk:
                    time.sleep((self.ms_gran / 1000))
                    continue
                yield np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32768.0

    def read(self):
        if self.simulate_stream:
            return self._simulate_stream_using_wav()
        
        if self.use_remote_machine:
            return self._read_raw_pcm()

        return self._read_from_stream()

    def _save_recording_file(self, frames: list):
        print(f"Saving recorded audio file on path {self.filename}")
        
        waveFile = wave.open(self.filename, 'wb')
        waveFile.setnchannels(self.channels)
        waveFile.setsampwidth(self.audio.get_sample_size(self.inp_dtype))
        waveFile.setframerate(self.sample_rate)
        waveFile.writeframes(b''.join(frames))
        waveFile.close()

    def close_stream(self, frames: list):
        if self.simulate_stream: return
        
        # Stop Recording
        self.stream.stop_stream()
        self.stream.close()
        self.audio.terminate()
        
        print("Finished recording, stream and audio terminated.")

        if self.filename: self._save_recording_file(frames)


def load_audio(file: str, sr: int = SAMPLE_RATE):
    """
    Open an audio file and read as mono waveform, resampling as necessary

    Parameters
    ----------
    file: str
        The audio file to open

    sr: int
        The sample rate to resample the audio if necessary

    Returns
    -------
    A NumPy array containing the audio waveform, in float32 dtype.
    """

    # This launches a subprocess to decode audio while down-mixing
    # and resampling as necessary.  Requires the ffmpeg CLI in PATH.
    # fmt: off
    cmd = [
        "ffmpeg",
        "-nostdin",
        "-threads", "0",
        "-i", file,
        "-f", "s16le",
        "-ac", "1",
        "-acodec", "pcm_s16le",
        "-ar", str(sr),
        "-"
    ]
    # fmt: on
    try:
        out = run(cmd, capture_output=True, check=True).stdout
    except CalledProcessError as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e

    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0


def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
    """
    Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
    """
    if torch.is_tensor(array):
        if array.shape[axis] > length:
            array = array.index_select(
                dim=axis, index=torch.arange(length, device=array.device)
            )

        if array.shape[axis] < length:
            pad_widths = [(0, 0)] * array.ndim
            pad_widths[axis] = (0, length - array.shape[axis])
            array = F.pad(array, [pad for sizes in pad_widths[::-1] for pad in sizes])
    else:
        if array.shape[axis] > length:
            array = array.take(indices=range(length), axis=axis)

        if array.shape[axis] < length:
            pad_widths = [(0, 0)] * array.ndim
            pad_widths[axis] = (0, length - array.shape[axis])
            array = np.pad(array, pad_widths)

    return array


@lru_cache(maxsize=None)
def mel_filters(device, n_mels: int) -> torch.Tensor:
    """
    load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
    Allows decoupling librosa dependency; saved using:

        np.savez_compressed(
            "mel_filters.npz",
            mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
            mel_128=librosa.filters.mel(sr=16000, n_fft=400, n_mels=128),
        )
    """
    assert n_mels in {80, 128}, f"Unsupported n_mels: {n_mels}"

    filters_path = os.path.join(os.path.dirname(__file__), "assets", "mel_filters.npz")
    with np.load(filters_path, allow_pickle=False) as f:
        return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)


def log_mel_spectrogram(
    audio: Union[str, np.ndarray, torch.Tensor],
    n_mels: int = 80,
    padding: int = 0,
    device: Optional[Union[str, torch.device]] = None,
):
    """
    Compute the log-Mel spectrogram of

    Parameters
    ----------
    audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
        The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz

    n_mels: int
        The number of Mel-frequency filters, only 80 is supported

    padding: int
        Number of zero samples to pad to the right

    device: Optional[Union[str, torch.device]]
        If given, the audio tensor is moved to this device before STFT

    Returns
    -------
    torch.Tensor, shape = (80, n_frames)
        A Tensor that contains the Mel spectrogram
    """
    if not torch.is_tensor(audio):
        if isinstance(audio, str):
            audio = load_audio(audio)
        audio = torch.from_numpy(audio)

    if device is not None:
        audio = audio.to(device)
    if padding > 0:
        audio = F.pad(audio, (0, padding))
        
    window = torch.hann_window(N_FFT).to(audio.device)
    stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True)
    magnitudes = stft[..., :-1].abs() ** 2

    filters = mel_filters(audio.device, n_mels)
    mel_spec = filters @ magnitudes

    log_spec = torch.clamp(mel_spec, min=1e-10).log10()
    log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
    log_spec = (log_spec + 4.0) / 4.0
    return log_spec


class SpectrogramStream:
    def __init__(self, n_fft: int = N_FFT, hop_length: int = HOP_LENGTH, n_mels: int = 80, window: Optional[str] = "hann", pad_mode: str = "reflect"):

        self.n_fft = n_fft
        self.hop_length = hop_length
        self.pad_mode = pad_mode
        self.n_mels = n_mels

        self.window = torch.hann_window(n_fft)
        self.window_type = window

        self.ctx_samples = self.n_fft - self.hop_length

        self.reset()

    def reset(self):
        self.is_first = True
        self.audio_ctx = torch.tensor([])
        self.log_spec_max = -torch.inf

    def calc_mel_with_new_frame(self, audio_frame: torch.Tensor, is_last: bool = False):
        
        self.window = self.window.to(audio_frame.device)
        
        if len(audio_frame.shape) == 1:
            audio_frame = audio_frame.unsqueeze(0)

        n_batch = audio_frame.shape[0]
        
        if isinstance(self.log_spec_max, float):
            self.log_spec_max = torch.ones((n_batch)).to(audio_frame.device) * -torch.inf

        # check if we are on first frame, if so, pad using reflection
        if self.is_first:
            pad = int(self.n_fft // 2) + 1
            audio_input = F.pad(audio_frame, [pad, 0], self.pad_mode)
            self.is_first = False
        else: # pad with previous context
            audio_input = torch.cat([self.audio_ctx[..., -self.ctx_samples:], audio_frame], dim=-1)
        
        if is_last: # pad reflect last frame
            pad = int(self.n_fft // 4) + 1
            audio_input = F.pad(audio_input, [pad, 0], self.pad_mode)

        self.audio_ctx = audio_frame # now audio ctx is the last frame

        stft = torch.stft(audio_input, self.n_fft, self.hop_length, window=self.window, return_complex=True, center=False)
        magnitudes = stft.abs() ** 2
        filters = mel_filters(audio_frame.device, self.n_mels)
        mel_spec = filters @ magnitudes

        log_spec = torch.clamp(mel_spec, min=1e-10).log10() # from shape (b, n_mels, audio_frames)
        self.log_spec_max = torch.maximum(log_spec.view(n_batch, -1).max(dim=-1).values, self.log_spec_max).to(log_spec.device)
        
        log_spec = torch.maximum(log_spec.view(n_batch, -1).permute(1, 0), self.log_spec_max - 8.0).permute(1, 0).view(n_batch, self.n_mels, -1)
        log_spec = (log_spec + 4.0) / 4.0
        return log_spec

    def _simulate_streaming_log_spec(self, audio: torch.Tensor, ms_gran: int = 300, total_frames: int = 3000, get_gt: bool = False):
        self.reset()

        samples_gran = HOP_LENGTH * (ms_gran // 10)
        sub_mel_frames = int(total_frames / ms_gran) * 10
        # print(samples_gran, sub_mel_frames)
        pred_mel = torch.cat([self.calc_mel_with_new_frame(audio[..., (i * samples_gran) + (40 * int(i != 0)): ((i + 1) * samples_gran) + 40], is_last=(i == sub_mel_frames - 1)) for i in range(sub_mel_frames)], dim=-1)
        
        if get_gt: 
            gt_mel = log_mel_spectrogram(audio)
            return pred_mel, gt_mel
        
        return pred_mel