SefyanKehail committed on
Commit
0288ea7
·
1 Parent(s): 6d79719
Files changed (1) hide show
  1. app.py +41 -14
app.py CHANGED
@@ -5,20 +5,48 @@ import gradio as gr
5
  import multiprocessing
6
  import os
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
- print(torch.cuda.is_available())
10
 
11
 
12
- hubert, acoustic, hifigan = None, None, None
13
 
14
  # Function to initialize models with CUDA
15
  def initialize_models():
16
- global hubert, acoustic, hifigan
17
- print("Initializing models...")
18
- hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True).cuda()
19
- acoustic = torch.hub.load("bshall/acoustic-model:main", "hubert_soft", trust_repo=True).cuda()
20
- hifigan = torch.hub.load("bshall/hifigan:main", "hifigan_hubert_soft", trust_repo=True).cuda()
21
- print("Models initialized.")
22
 
23
 
24
 
@@ -36,15 +64,17 @@ def convert_speech(filename, progress=gr.Progress()):
36
  if source.shape[0] > 1:
37
  source = source[0, :].unsqueeze(0)
38
  source = torchaudio.functional.resample(source, sr, 16000)
39
- source = source.unsqueeze(0).cuda()
40
 
41
  progress(0.6, desc="Converting speech")
42
 
 
 
 
43
 
44
  # Convert to the target speaker:
45
  with torch.inference_mode():
46
- initialize_models()
47
-
48
  # Extract speech units
49
  units = hubert.units(source)
50
  # Generate target spectrogram
@@ -78,9 +108,6 @@ def get_audio_input(mic_input, audio_input):
78
  def clear_components():
79
  return None, None
80
 
81
-
82
-
83
-
84
  # Gradio interface
85
  with gr.Blocks() as interface:
86
  gr.Markdown("# Soft Speech Units for Improved Voice Conversion")
 
5
  import multiprocessing
6
  import os
7
 
8
def download_file(url, filename):
    """Download ``url`` to ``filename`` unless that file already exists.

    The response body is streamed to a temporary ``<filename>.part`` file and
    only moved into place once the transfer has finished, so an interrupted
    or failed download never leaves a truncated file that a later run would
    skip over as "already downloaded".

    Args:
        url: HTTP(S) address to fetch.
        filename: Destination path on the local disk.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
        OSError: If the destination cannot be written.
    """
    if os.path.exists(filename):
        print(f"{filename} exists. Skipping download")
        return

    print(f"{filename} files missing. Downloading ...")
    partial_path = f"{filename}.part"
    try:
        # (connect, read) timeouts keep a stalled server from hanging startup;
        # the context manager releases the connection even on failure.
        with requests.get(url, stream=True, timeout=(10, 60)) as response:
            # Fail loudly instead of saving an HTML error page as a checkpoint.
            response.raise_for_status()
            with open(partial_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        file.write(chunk)
        os.replace(partial_path, filename)
    finally:
        # After a successful os.replace the partial file is gone; this only
        # cleans up when something above raised.
        if os.path.exists(partial_path):
            os.remove(partial_path)
18
+
19
+
20
# Local checkpoint filename -> remote location it is fetched from.
# The remote files are the "*_cpu.pt" variants of each model.
model_urls = {
    # Speech-unit extractor
    "hubert.pt": "https://www.dropbox.com/scl/fi/99ww7w3z1gwiqfuvq85ju/hubert_cpu.pt?rlkey=5wiqve2kdzn7aw7bth3wz1lqu&st=ec63yc3v&dl=1",
    # Acoustic model
    "acoustic.pt": "https://www.dropbox.com/scl/fi/hi8o0kjr6rhwpjb4jj56w/acoustic_cpu.pt?rlkey=0x62tmwbnjpmmvs9u1kdfb1bk&st=n8u2kzuw&dl=1",
    # Vocoder
    "hifigan.pt": "https://www.dropbox.com/scl/fi/oi6642xskncc2fhwubawj/hifigan_cpu.pt?rlkey=amljq6kih4vuocj6335wa1hxa&st=9midccb2&dl=1",
}
25
+
26
def verify_file(filename):
    """Check that ``filename`` can be deserialized by ``torch.load``.

    The outcome is printed and also returned so callers can react to it.
    Failures are reported rather than raised: this is a best-effort sanity
    check, not a hard gate.

    Args:
        filename: Path of the checkpoint file to check.

    Returns:
        True if the file loaded without error, False otherwise.
    """
    try:
        # NOTE(security): torch.load unpickles the file, and this script
        # fetches these files from remote URLs; only point it at trusted
        # sources.
        torch.load(filename, map_location='cpu')
    except Exception as e:
        # Deliberately broad: a corrupt or missing file can surface as several
        # unrelated exception types, and all of them mean "not valid".
        print(f"Error with {filename}: {e}")
        return False
    print(f"{filename} is valid.")
    return True
34
+
35
# Runs at import time: make sure every checkpoint listed in model_urls is on
# disk, then try loading it once as a sanity check. Both progress messages are
# printed for every entry, whether or not download_file actually fetched it.
for filename in model_urls:
    url = model_urls[filename]
    print(f"Downloading {filename}...")
    download_file(url, filename)
    print(f"{filename} downloaded successfully.")
    verify_file(filename)
41
 
42
 
 
43
 
44
  # Function to initialize models on the CPU
def initialize_models():
    """Load the HuBERT, acoustic and HiFi-GAN checkpoints onto the CPU.

    The three files are read from the current working directory on the first
    call only. Later calls return the same objects, so a caller that invokes
    this once per request does not re-read every checkpoint from disk each
    time.

    Returns:
        A ``(hubert, acoustic, hifigan)`` tuple of the loaded objects.

    Raises:
        FileNotFoundError: If any of the checkpoint files is missing.
    """
    # NOTE(review): the results are used as whole model objects rather than
    # state dicts, so this relies on torch.load performing full unpickling --
    # confirm the behaviour of the installed PyTorch version.
    models = getattr(initialize_models, "_models", None)
    if models is None:
        cpu = torch.device('cpu')
        models = tuple(
            torch.load(path, map_location=cpu)
            for path in ("hubert.pt", "acoustic.pt", "hifigan.pt")
        )
        # Stored on the function itself so no module-level state is needed.
        initialize_models._models = models
    return models
 
 
50
 
51
 
52
 
 
64
  if source.shape[0] > 1:
65
  source = source[0, :].unsqueeze(0)
66
  source = torchaudio.functional.resample(source, sr, 16000)
67
+ source = source.unsqueeze(0).to('cpu')
68
 
69
  progress(0.6, desc="Converting speech")
70
 
71
+ # # Initialize models
72
+ # hubert, acoustic, hifigan = initialize_models()
73
+
74
 
75
  # Convert to the target speaker:
76
  with torch.inference_mode():
77
+ hubert, acoustic, hifigan = initialize_models()
 
78
  # Extract speech units
79
  units = hubert.units(source)
80
  # Generate target spectrogram
 
def clear_components():
    """Return a ``(None, None)`` pair.

    NOTE(review): presumably bound to a "clear" action that resets two Gradio
    components -- confirm against the interface wiring.
    """
    cleared = (None, None)
    return cleared
110
 
 
 
 
111
  # Gradio interface
112
  with gr.Blocks() as interface:
113
  gr.Markdown("# Soft Speech Units for Improved Voice Conversion")