"""Gradio app for voice conversion with soft speech units.

Pipeline (CPU only): HuBERT-Soft content encoder -> acoustic model ->
HiFi-GAN vocoder, following https://github.com/bshall/soft-vc.
"""

import os
import sys

import gradio as gr
import IPython.display as display  # NOTE(review): unused outside a notebook
import requests  # NOTE(review): unused in this script
import torch
import torchaudio

# Architecture-only models from torch.hub (pretrained=False, so weights are
# random).  NOTE(review): these are never used for inference below -- the
# fully pickled models loaded with torch.load are.  They look like leftovers
# from an abandoned state_dict-loading approach (the original file carried
# commented-out load_state_dict calls); consider removing them to avoid the
# hub downloads at startup.
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True, pretrained=False)
acoustic = torch.hub.load("bshall/acoustic-model:main", "hubert_soft", trust_repo=True, pretrained=False)
hifigan = torch.hub.load("bshall/hifigan:main", "hifigan_hubert_soft", trust_repo=True, pretrained=False)

# Fully serialized CPU checkpoints actually used for inference.  These are
# whole pickled model objects (not state_dicts), so they expose .units(),
# .generate(), and __call__ directly.
hubert_loaded = torch.load("hubert_cpu.pt", map_location=torch.device('cpu'))
acoustic_loaded = torch.load("acoustic_cpu.pt", map_location=torch.device('cpu'))
hifigan_loaded = torch.load("hifigan_cpu.pt", map_location=torch.device('cpu'))


def convert_speech(filename, progress=gr.Progress()):
    """Run the full voice-conversion pipeline on one audio file.

    Args:
        filename: Path to the input audio file (Gradio ``type="filepath"``).
        progress: Gradio progress tracker (``gr.Progress()`` as the default
            is the Gradio idiom; Gradio injects the live tracker at call time).

    Returns:
        Tuple ``(sample_rate, waveform)`` -- 16000 and a NumPy array -- in
        the format expected by a ``gr.Audio(type="numpy")`` output.

    Raises:
        ValueError: If ``filename`` is empty or None.
    """
    if not filename:
        raise ValueError("Please provide an audio")

    progress(0, desc="Starting conversion")
    progress(0.1, desc="Loading audio")
    source, sr = torchaudio.load(filename)

    progress(0.3, desc="Preprocessing audio")
    # Use only the first channel if the audio is stereo.
    if source.shape[0] > 1:
        source = source[0, :].unsqueeze(0)
    # The models expect 16 kHz audio with a leading batch dimension.
    source = torchaudio.functional.resample(source, sr, 16000)
    source = source.unsqueeze(0).to('cpu')

    progress(0.6, desc="Converting speech")
    # Convert to the target speaker:
    with torch.inference_mode():
        # Content encoder: extract soft speech units.
        units = hubert_loaded.units(source)
        progress(0.7, desc="Generating target spectrogram")
        # Acoustic model: units -> mel spectrogram (transposed for HiFi-GAN).
        mel = acoustic_loaded.generate(units).transpose(1, 2)
        progress(0.8, desc="Generating audio waveform")
        # Vocoder: mel spectrogram -> waveform.
        target = hifigan_loaded(mel)

    progress(0.9, desc="Postprocessing audio")
    # Move the tensor to CPU and convert to NumPy for the Gradio audio widget.
    target = target.squeeze().cpu().numpy()
    progress(1.0, desc="Conversion complete")
    return 16000, target


def enable_convert_button(audio):
    """Enable the Convert button once audio is present; hide the info banner."""
    if audio is not None:
        return gr.update(interactive=True), gr.update(value="", visible=False)
    return gr.update(interactive=False), None


def clear_components():
    """Reset both the input and the converted-output audio widgets."""
    return None, None


def stop_recording_info(audio):
    """Show a 'please wait' banner while a fresh recording is uploading."""
    if audio is None:
        return gr.update(value="### Recording and uploading, please wait ...", visible=True)
    return gr.update(value="", visible=False)


def stop():
    """Debug stub (not wired to any event in the interface)."""
    print("this is working")


def gui():
    """Build and return the Gradio Blocks interface."""
    with gr.Blocks() as interface:
        gr.Markdown("# Soft Speech Units for Improved Voice Conversion")
        gr.Markdown("Upload an audio file to convert it to the target speaker's voice using soft speech units. Or use your microphone.")
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Upload Audio", editable=False)
                convert_button = gr.Button("Convert Speech", interactive=False)
                info = gr.Markdown("", visible=False)
            with gr.Column():
                converted_audio = gr.Audio(type="numpy", label="Converted Speech", show_share_button=False)

        # Banner while a microphone recording is uploading; cleared by the
        # subsequent .change event once the file is available.
        audio_input.start_recording(stop_recording_info, inputs=[audio_input], outputs=[info])
        audio_input.change(enable_convert_button, inputs=[audio_input], outputs=[convert_button, info])
        convert_button.click(convert_speech, inputs=[audio_input], outputs=[converted_audio])
        audio_input.clear(clear_components, inputs=None, outputs=[audio_input, converted_audio])
    return interface


if __name__ == "__main__":
    app = gui()
    app.queue(default_concurrency_limit=40)
    app.launch(
        max_threads=40,
        share=True,
        show_error=True,
        quiet=False,
        debug=False,
    )