"""
Pipeline for enrolling:
1. Provide Recording
2. Convert to 16 kHz
3. Divide into recordings
4. Get embeddings for each recording
5. Find centroid
6. Save the conditioning embedding and the recorded audio to disk.
"""
import os
import argbind
import sounddevice as sd
import soundfile
import torch
import numpy as np

import sys

sys.path.append('.')

from src.constants import CONDITIONING_FILENAME, CONDITIONING_FOLDER
from src.data import DataProperties
from src.models import ResNetSE34V2


# NOTE(review): MIN_WINDOWS and WINDOW_SIZE are not referenced by any function
# visible in this file; presumably they belong to the "divide into recordings"
# step described in the module docstring -- confirm before removing.
MIN_WINDOWS = 10
# Presumably a window length in samples (64_000 samples would be 4 s at
# 16 kHz) -- TODO confirm units.
WINDOW_SIZE = 64_000
# Frames per block: used both as the stream's ``blocksize`` and as the number
# of frames requested by each ``read`` call while recording.
BLOCK_SIZE = 256

# Prompt shown to the user (via ``input``) before recording starts.
RECORDING_TEXT = """
This script will record you speaking, and will create an embedding
to be used for conditioning Voicebox. This will overwrite any previous
embeddings. We recommend at least 10 seconds of non-stop voice recording.
Press enter to begin recording. To stop recording, press ctrl-C.
"""


def get_streams(input_name: str, block_size: int) -> sd.InputStream:
    """
    Create a mono input stream on the requested audio device.

    The stream is only constructed here; the caller is responsible for
    starting and closing it.

    :param input_name: Index or name of the input device. A numeric string is
        converted to an integer device index, any other string is used as a
        device name, and ``None`` is passed through unchanged so that
        sounddevice selects its default input device.
    :param block_size: Number of frames per block, forwarded as ``blocksize``.
    :return: The configured ``sd.InputStream``, using the sample rate from
        ``DataProperties``.
    """
    device = input_name
    if device is not None:
        # ``int(None)`` raises TypeError (not ValueError), so None must be
        # handled separately to keep the "default device" case working.
        try:
            device = int(device)
        except ValueError:
            # Not a numeric index: keep it as a device name.
            pass
    return sd.InputStream(
        device=device,
        samplerate=DataProperties.get('sample_rate'),
        channels=1,
        blocksize=block_size,
    )


def record_from_user(input_name: str) -> torch.Tensor:
    """
    Record audio from an input device until the user presses ctrl-C.

    The prompt in ``RECORDING_TEXT`` is shown first and recording begins once
    the user presses enter. Audio is read in blocks of ``BLOCK_SIZE`` frames.

    :param input_name: Index or name of the input device, forwarded to
        ``get_streams``.
    :return: 1-D tensor holding every recorded sample in capture order.
    """
    input_stream = get_streams(input_name, BLOCK_SIZE)
    input(RECORDING_TEXT)
    all_frames = []
    # Entering the stream starts it; leaving the ``with`` block stops and
    # closes it, so the audio device is released even if reading fails.
    with input_stream:
        try:
            print("Recording...")
            while True:
                # The second return value (overflow flag) is ignored.
                frames, _ = input_stream.read(BLOCK_SIZE)
                all_frames.append(frames)
        except KeyboardInterrupt:
            # ctrl-C is the documented way to end the recording.
            print("Stopped Recording.")
    # Stack the blocks, then flatten blocks/frames/channel into one axis.
    return torch.Tensor(np.array(all_frames)).reshape(-1)


def get_embedding(recording) -> torch.Tensor:
    """
    Run a recording through a ``ResNetSE34V2`` model and return its output.

    :param recording: Tensor of audio samples; it is viewed as a single row
        of shape ``(1, N)`` before being passed to the model.
    :return: The model's output for that input.
    """
    # NOTE(review): a new model is constructed on every call and this function
    # neither calls ``eval()`` nor disables gradient tracking -- confirm that
    # ResNetSE34V2 loads its weights / sets inference mode internally.
    encoder = ResNetSE34V2(nOut=512, encoder_type='ASP')
    batched = recording.view(1, -1)
    return encoder(batched)


def save(embedding, audio) -> None:
    """
    Write the embedding and the recorded audio to disk.

    The conditioning folder is created if it does not exist. The embedding is
    stored with ``torch.save`` at ``CONDITIONING_FILENAME`` and the audio is
    written as ``conditioning_audio.wav`` inside ``CONDITIONING_FOLDER``.

    :param embedding: Object to serialize with ``torch.save``.
    :param audio: Tensor of audio samples; detached and moved to the CPU
        before being written.
    """
    os.makedirs(CONDITIONING_FOLDER, exist_ok=True)
    torch.save(embedding, CONDITIONING_FILENAME)

    wav_path = CONDITIONING_FOLDER / 'conditioning_audio.wav'
    samples = audio.detach().cpu()
    sample_rate = DataProperties.get('sample_rate')
    soundfile.write(wav_path, samples, sample_rate)


@argbind.bind(positional=True, without_prefix=True)
def main(input: str = None):
    """
    Creating a conditioning vector for VoiceBox from your voice

    :param input: Index or name of input audio interface. Defaults to current device
    :type input: str, optional
    """
    # NOTE: argbind uses the docstring above for the command-line help text,
    # so its wording is user-facing output rather than an internal comment.
    # Record once, then store both the embedding and the audio it came from.
    audio = record_from_user(input)
    save(get_embedding(audio), audio)


if __name__ == "__main__":
    # Parse the command line and run main() with those values bound through
    # argbind's scope.
    with argbind.scope(argbind.parse_args()):
        main()