SefyanKehail committed on
Commit ·
0288ea7
1
Parent(s): 6d79719
GPU
Browse files
app.py
CHANGED
|
@@ -5,20 +5,48 @@ import gradio as gr
|
|
| 5 |
import multiprocessing
|
| 6 |
import os
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
-
|
| 10 |
|
| 11 |
|
| 12 |
-
hubert, acoustic, hifigan = None, None, None
|
| 13 |
|
| 14 |
# Function to initialize models with CUDA
|
| 15 |
def initialize_models():
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
hifigan = torch.hub.load("bshall/hifigan:main", "hifigan_hubert_soft", trust_repo=True).cuda()
|
| 21 |
-
print("Models initialized.")
|
| 22 |
|
| 23 |
|
| 24 |
|
|
@@ -36,15 +64,17 @@ def convert_speech(filename, progress=gr.Progress()):
|
|
| 36 |
if source.shape[0] > 1:
|
| 37 |
source = source[0, :].unsqueeze(0)
|
| 38 |
source = torchaudio.functional.resample(source, sr, 16000)
|
| 39 |
-
source = source.unsqueeze(0).
|
| 40 |
|
| 41 |
progress(0.6, desc="Converting speech")
|
| 42 |
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
# Convert to the target speaker:
|
| 45 |
with torch.inference_mode():
|
| 46 |
-
initialize_models()
|
| 47 |
-
|
| 48 |
# Extract speech units
|
| 49 |
units = hubert.units(source)
|
| 50 |
# Generate target spectrogram
|
|
@@ -78,9 +108,6 @@ def get_audio_input(mic_input, audio_input):
|
|
| 78 |
def clear_components():
|
| 79 |
return None, None
|
| 80 |
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
# Gradio interface
|
| 85 |
with gr.Blocks() as interface:
|
| 86 |
gr.Markdown("# Soft Speech Units for Improved Voice Conversion")
|
|
|
|
| 5 |
import multiprocessing
|
| 6 |
import os
|
| 7 |
|
| 8 |
+
def download_file(url, filename):
    """Download *url* to *filename*, skipping the download if the file already exists.

    The response is streamed in 8 KiB chunks so large checkpoints are never
    held fully in memory.

    Raises:
        requests.HTTPError: if the server answers with an error status —
            previously the error page body would have been silently written
            into the checkpoint file, corrupting it.
    """
    if not os.path.exists(filename):
        print(f"{filename} missing. Downloading ...")
        # timeout guards against a hung connection stalling app startup forever
        response = requests.get(url, stream=True, timeout=60)
        response.raise_for_status()  # fail fast instead of saving an HTML error page
        with open(filename, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:  # filter out keep-alive chunks
                    file.write(chunk)
    else:
        print(f"{filename} exists. Skipping download")
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# Pretrained checkpoints, keyed by the local filename each one is saved
# under. All URLs are Dropbox direct-download links (dl=1).
model_urls = dict(
    [
        ("hubert.pt",
         "https://www.dropbox.com/scl/fi/99ww7w3z1gwiqfuvq85ju/hubert_cpu.pt?rlkey=5wiqve2kdzn7aw7bth3wz1lqu&st=ec63yc3v&dl=1"),
        ("acoustic.pt",
         "https://www.dropbox.com/scl/fi/hi8o0kjr6rhwpjb4jj56w/acoustic_cpu.pt?rlkey=0x62tmwbnjpmmvs9u1kdfb1bk&st=n8u2kzuw&dl=1"),
        ("hifigan.pt",
         "https://www.dropbox.com/scl/fi/oi6642xskncc2fhwubawj/hifigan_cpu.pt?rlkey=amljq6kih4vuocj6335wa1hxa&st=9midccb2&dl=1"),
    ]
)
|
| 25 |
+
|
| 26 |
+
def verify_file(filename):
    """Verify that *filename* is a loadable torch checkpoint.

    Attempts a CPU-side ``torch.load``; any failure (missing file, truncated
    download, non-checkpoint content) is reported rather than raised.

    Returns:
        bool: True if the file loaded cleanly, False otherwise.
    """
    try:
        # Attempt to load the file to verify its integrity.
        # NOTE: the original also called `torch.load_state_dict()`, which does
        # not exist on the torch module — it always raised AttributeError and
        # made every file look corrupt. Removed.
        torch.load(filename, map_location='cpu')
        print(f"{filename} is valid.")
        return True
    except Exception as e:
        print(f"Error with {filename}: {e}")
        return False
|
| 34 |
+
|
| 35 |
+
# Fetch and sanity-check every checkpoint before the app starts serving.
# The f-strings below had no placeholder (they printed a literal), so the
# messages never said which file was being handled — fixed to interpolate
# the filename actually in scope.
for filename, url in model_urls.items():
    print(f"Downloading {filename}...")
    download_file(url, filename)
    print(f"{filename} downloaded successfully.")
    verify_file(filename)
|
| 41 |
|
| 42 |
|
|
|
|
| 43 |
|
| 44 |
# Function to initialize the models (checkpoints are loaded onto the CPU).
def initialize_models():
    """Load the HuBERT, acoustic, and HiFi-GAN checkpoints from disk.

    Returns:
        tuple: (hubert, acoustic, hifigan) as deserialized by ``torch.load``,
        all mapped to the CPU.
    """
    device = torch.device('cpu')
    checkpoints = ("hubert.pt", "acoustic.pt", "hifigan.pt")
    hubert, acoustic, hifigan = (
        torch.load(name, map_location=device) for name in checkpoints
    )
    return hubert, acoustic, hifigan
|
|
|
|
|
|
|
| 50 |
|
| 51 |
|
| 52 |
|
|
|
|
| 64 |
if source.shape[0] > 1:
|
| 65 |
source = source[0, :].unsqueeze(0)
|
| 66 |
source = torchaudio.functional.resample(source, sr, 16000)
|
| 67 |
+
source = source.unsqueeze(0).to('cpu')
|
| 68 |
|
| 69 |
progress(0.6, desc="Converting speech")
|
| 70 |
|
| 71 |
+
# # Initialize models
|
| 72 |
+
# hubert, acoustic, hifigan = initialize_models()
|
| 73 |
+
|
| 74 |
|
| 75 |
# Convert to the target speaker:
|
| 76 |
with torch.inference_mode():
|
| 77 |
+
hubert, acoustic, hifigan = initialize_models()
|
|
|
|
| 78 |
# Extract speech units
|
| 79 |
units = hubert.units(source)
|
| 80 |
# Generate target spectrogram
|
|
|
|
| 108 |
def clear_components():
|
| 109 |
return None, None
|
| 110 |
|
|
|
|
|
|
|
|
|
|
| 111 |
# Gradio interface
|
| 112 |
with gr.Blocks() as interface:
|
| 113 |
gr.Markdown("# Soft Speech Units for Improved Voice Conversion")
|