File size: 4,620 Bytes
92dd9b7
 
 
 
7d807f6
288d695
6bb5d28
fa2468d
efabd33
677e8db
 
 
231bc11
 
 
 
7585608
83b4e28
ae95dde
a7af99a
 
 
49bb6e1
ae95dde
 
 
49bb6e1
10d2c76
 
 
3c80605
490051a
9417555
92dd9b7
 
833dcce
 
92dd9b7
 
 
 
 
 
 
 
 
 
 
 
0288ea7
92dd9b7
 
1840c64
92dd9b7
 
 
231bc11
7ec5d0b
0e32fbb
92dd9b7
231bc11
7ec5d0b
0e32fbb
92dd9b7
231bc11
92dd9b7
 
 
 
 
 
 
 
d099b64
 
 
 
92dd9b7
4327eb9
 
92dd9b7
4ff9373
d2fc5e2
53c09fb
d099b64
d500a47
5e3224c
55eefe7
 
793d67e
 
 
 
 
92dd9b7
793d67e
 
 
 
 
e244b7c
793d67e
 
92dd9b7
1af470c
d6f741b
4327eb9
793d67e
4327eb9
9952831
5e3224c
55eefe7
d099b64
793d67e
d099b64
4327eb9
ad61f31
f68e440
793d67e
f68e440
793d67e
f68e440
 
793d67e
f68e440
793d67e
f68e440
793d67e
f68e440
793d67e
 
d9fb395
793d67e
 
 
 
92dd9b7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import torch, torchaudio
import requests
import IPython.display as display
import gradio as gr
import os
import sys



# Instantiate the three soft-VC model architectures from torch.hub
# (pretrained=False: weights are loaded separately from local checkpoints below).
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True, pretrained=False)
acoustic = torch.hub.load("bshall/acoustic-model:main", "hubert_soft", trust_repo=True, pretrained=False)
hifigan = torch.hub.load("bshall/hifigan:main", "hifigan_hubert_soft", trust_repo=True, pretrained=False)

# Load local CPU checkpoints. NOTE(review): torch.load unpickles arbitrary
# objects — only safe because these are trusted local files; these appear to be
# whole serialized models (they are called directly below), not state dicts.
hubert_loaded = torch.load("hubert_cpu.pt", map_location=torch.device('cpu'))
acoustic_loaded = torch.load("acoustic_cpu.pt", map_location=torch.device('cpu'))
hifigan_loaded = torch.load("hifigan_cpu.pt", map_location=torch.device('cpu'))

# Set the state dictionaries to the models
# model.load_state_dict(hubert_loaded.state_dict(),  strict=False)
# acoustic.load_state_dict(acoustic_loaded.state_dict(),  strict=False)
# hifigan.load_state_dict(hifigan_loaded.state_dict(),  strict=False)


# print(hubert_loaded)
# print(model)
# sys.exit()
# Move models to CPU (if not already on CPU)
# hubert = hubert.to('cpu')
# acoustic = acoustic.to('cpu')
# hifigan = hifigan.to('cpu')



# Conversion function
def convert_speech(filename, progress=gr.Progress()):
    """Convert the speech in an audio file to the target speaker's voice.

    Args:
        filename: Path to the input audio file (any torchaudio-readable format).
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        A ``(sample_rate, waveform)`` tuple: 16000 Hz and a NumPy array,
        suitable for a ``gr.Audio(type="numpy")`` output.

    Raises:
        ValueError: If no audio file was provided.
    """
    if not filename:
        raise ValueError("Please provide an audio")
    progress(0, desc="Starting conversion")

    progress(0.1, desc="Loading audio")
    waveform, sample_rate = torchaudio.load(filename)

    progress(0.3, desc="Preprocessing audio")
    # Use the first channel if the audio is stereo
    if waveform.shape[0] > 1:
        waveform = waveform[0, :].unsqueeze(0)
    # The models expect 16 kHz audio with a leading batch dimension.
    waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
    waveform = waveform.unsqueeze(0).to('cpu')

    progress(0.6, desc="Converting speech")
    # Convert to the target speaker:
    with torch.inference_mode():
        # Extract soft speech units from the source audio.
        units = hubert_loaded.units(waveform)
        progress(0.7, desc="Generating target spectrogram")

        # Predict the target speaker's mel spectrogram from the units.
        mel = acoustic_loaded.generate(units).transpose(1, 2)
        progress(0.8, desc="Generating audio waveform")

        # Vocode the spectrogram back into a waveform.
        target = hifigan_loaded(mel)
        progress(0.9, desc="Postprocessing audio")

    # Move the tensor to CPU and convert to NumPy
    target = target.squeeze().cpu().numpy()
    progress(1.0, desc="Conversion complete")
    return 16000, target

"""Convert to the target speaker:"""

def enable_convert_button(audio):
    """Toggle the convert button based on whether audio is present.

    Returns updates for (convert_button, info): the button becomes
    interactive only when audio exists, and the info banner is hidden.
    """
    if audio is None:
        return gr.update(interactive=False), None
    return gr.update(interactive=True), gr.update(value="", visible=False)

def clear_components():
    """Reset the input and output audio components to empty."""
    return None, None

def stop_recording_info(audio):
    """Show a 'please wait' banner while a recording is still uploading.

    While the audio value is None (upload in flight) the banner is shown;
    once audio arrives it is hidden again.
    """
    if audio is not None:
        return gr.update(value="", visible=False)
    busy_note = "### <i style='color:yellow'>Recording and uploading, please wait ...</i>"
    return gr.update(value=busy_note, visible=True)

def stop():
    """Debug hook: print a marker to confirm the callback fired.

    NOTE(review): not wired to any event in gui() — appears to be leftover
    debugging code.
    """
    print("this is working")

# Gradio interface
def gui():
    with gr.Blocks() as interface:
        gr.Markdown("# Soft Speech Units for Improved Voice Conversion")
        gr.Markdown("Upload an audio file to convert it to the target speaker's voice using soft speech units. Or use your microphone.")

        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Upload Audio", editable=False)
                convert_button = gr.Button("Convert Speech", interactive=False)
                info = gr.Markdown("", visible=False)

            with gr.Column():
                converted_audio = gr.Audio(type="numpy", label="Converted Speech", show_share_button=False)




        # Use audio_input.change to trigger stop_recording_info when audio changes

        audio_input.start_recording(stop_recording_info, inputs=[audio_input], outputs=[info])
        
        audio_input.change(enable_convert_button, inputs=[audio_input], outputs=[convert_button, info])

        convert_button.click(convert_speech, inputs=[audio_input], outputs=[converted_audio])


        audio_input.clear(clear_components, inputs=None, outputs=[audio_input, converted_audio])

        # audio_input.change(enable_convert_button, inputs=[audio_input], outputs=[convert_button, info])
    
    return interface


if __name__ == "__main__":

    # Build the Gradio interface defined above.
    app = gui()

    # Queue requests so long-running conversions don't block the UI;
    # up to 40 events may run concurrently.
    app.queue(default_concurrency_limit=40)

    # share=True creates a public gradio.live tunnel; show_error surfaces
    # exceptions (e.g. the ValueError from convert_speech) in the UI.
    app.launch(
        max_threads=40,
        share=True,
        show_error=True,
        quiet=False,
        debug=False,
    )