Vaishnavi0404 committed on
Commit
475a0f9
·
verified ·
1 Parent(s): 1f65ac7

Update voice_synthesizer.py

Browse files
Files changed (1) hide show
  1. voice_synthesizer.py +186 -67
voice_synthesizer.py CHANGED
@@ -1,4 +1,7 @@
 
 
1
  import torch
 
2
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
3
  import scipy
4
  import numpy as np
@@ -6,27 +9,122 @@ import soundfile as sf
6
 
7
  class VoiceSynthesizer:
8
  def __init__(self):
9
- """Initialize the voice synthesizer with the SpeechT5 model"""
10
- # Load models
11
- self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
12
- self.model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
13
- self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
 
 
14
 
15
  # Load speaker embeddings
16
- self.speaker_embeddings = {
17
- "neutral": torch.load("speaker_embeddings/neutral.pt") if torch.cuda.is_available() else
18
- torch.load("speaker_embeddings/neutral.pt", map_location=torch.device('cpu')),
19
- "feminine": torch.load("speaker_embeddings/feminine.pt") if torch.cuda.is_available() else
20
- torch.load("speaker_embeddings/feminine.pt", map_location=torch.device('cpu')),
21
- "masculine": torch.load("speaker_embeddings/masculine.pt") if torch.cuda.is_available() else
22
- torch.load("speaker_embeddings/masculine.pt", map_location=torch.device('cpu'))
23
- }
24
 
25
  # Set device
26
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
27
  self.model.to(self.device)
28
  self.vocoder.to(self.device)
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  def synthesize(self, text, output_path, voice_type="neutral", speed=1.0):
31
  """
32
  Synthesize speech from text
@@ -42,62 +140,76 @@ class VoiceSynthesizer:
42
  voice_type = "neutral"
43
  print(f"Invalid voice type. Using default 'neutral' voice.")
44
 
45
- # Process input text
46
- inputs = self.processor(text=text, return_tensors="pt").to(self.device)
47
-
48
- # Get speaker embeddings and ensure proper shape
49
- speaker_embeddings = self.speaker_embeddings[voice_type].to(self.device)
50
-
51
- # Print shape for debugging
52
- print(f"Speaker embeddings shape before: {speaker_embeddings.shape}")
53
-
54
- # Fix the dimension issue - ensure it's a 2D tensor with shape [1, embedding_dim]
55
- if len(speaker_embeddings.shape) == 1:
56
- speaker_embeddings = speaker_embeddings.unsqueeze(0) # Add batch dimension
57
-
58
- print(f"Speaker embeddings shape after: {speaker_embeddings.shape}")
59
-
60
  try:
61
- # Generate speech
62
- speech = self.model.generate_speech(
63
- inputs["input_ids"],
64
- speaker_embeddings,
65
- vocoder=self.vocoder
66
- )
67
- except IndexError as e:
68
- # Alternative approach if the above fails
69
- print(f"Error in generate_speech: {e}")
70
- print("Trying alternative approach...")
71
 
72
- # If the first approach fails, try reshaping the embeddings differently
73
- if len(speaker_embeddings.shape) == 2:
74
- if speaker_embeddings.shape[0] > 1 and speaker_embeddings.shape[0] > speaker_embeddings.shape[1]:
75
- speaker_embeddings = speaker_embeddings.mean(dim=0, keepdim=True)
76
- elif speaker_embeddings.shape[0] == 1:
77
- # Ensure correct normalization dimension
78
- speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=1)
79
 
80
- # Try generation again
81
- speech = self.model.generate_speech(
82
- inputs["input_ids"],
83
- speaker_embeddings,
84
- vocoder=self.vocoder
85
- )
86
-
87
- # Convert to numpy array
88
- speech = speech.cpu().numpy()
89
-
90
- # Adjust speed if needed
91
- if speed != 1.0:
92
- import librosa
93
- speech = librosa.effects.time_stretch(speech, rate=speed)
94
-
95
- # Save audio file
96
- sf.write(output_path, speech, samplerate=16000)
97
-
98
- print(f"Speech synthesized and saved to {output_path}")
99
-
100
- return output_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
  def create_speaker_embedding(self, reference_file, output_path):
103
  """
@@ -112,7 +224,14 @@ class VoiceSynthesizer:
112
  print("Creating speaker embeddings requires a speaker encoder model")
113
  print("Using default embeddings instead")
114
 
 
 
 
115
  # For now, we'll just copy one of the existing embeddings
116
- torch.save(self.speaker_embeddings["neutral"], output_path)
 
 
 
 
117
 
118
  return output_path
 
1
+ import os
2
+ import time
3
  import torch
4
+ import requests
5
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
6
  import scipy
7
  import numpy as np
 
9
 
10
  class VoiceSynthesizer:
11
  def __init__(self):
12
+ """Initialize the voice synthesizer with the SpeechT5 model with local caching"""
13
+ # Create cache directory
14
+ model_cache_dir = os.path.join(os.path.dirname(__file__), "model_cache")
15
+ os.makedirs(model_cache_dir, exist_ok=True)
16
+
17
+ # Initialize models with retry mechanism
18
+ self.processor, self.model, self.vocoder = self._initialize_models(model_cache_dir)
19
 
20
  # Load speaker embeddings
21
+ self.speaker_embeddings = self._load_speaker_embeddings()
 
 
 
 
 
 
 
22
 
23
  # Set device
24
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
25
  self.model.to(self.device)
26
  self.vocoder.to(self.device)
27
 
28
+ def _initialize_models(self, cache_dir, max_retries=3):
29
+ """Initialize models with retry mechanism"""
30
+ for attempt in range(max_retries):
31
+ try:
32
+ print(f"Loading models (attempt {attempt+1}/{max_retries})...")
33
+
34
+ # Try to load from local cache first
35
+ try:
36
+ processor = SpeechT5Processor.from_pretrained(
37
+ "microsoft/speecht5_tts",
38
+ local_files_only=True,
39
+ cache_dir=cache_dir
40
+ )
41
+ model = SpeechT5ForTextToSpeech.from_pretrained(
42
+ "microsoft/speecht5_tts",
43
+ local_files_only=True,
44
+ cache_dir=cache_dir
45
+ )
46
+ vocoder = SpeechT5HifiGan.from_pretrained(
47
+ "microsoft/speecht5_hifigan",
48
+ local_files_only=True,
49
+ cache_dir=cache_dir
50
+ )
51
+ print("Successfully loaded models from local cache.")
52
+ return processor, model, vocoder
53
+ except Exception as local_err:
54
+ print(f"Could not load models from local cache: {local_err}")
55
+ # If loading from cache fails, try downloading
56
+ print("Downloading models from Hugging Face Hub...")
57
+
58
+ # Increase timeout for downloads
59
+ import huggingface_hub
60
+ huggingface_hub.constants.HF_HUB_DOWNLOAD_TIMEOUT = 30 # Increase timeout to 30 seconds
61
+
62
+ processor = SpeechT5Processor.from_pretrained(
63
+ "microsoft/speecht5_tts",
64
+ cache_dir=cache_dir
65
+ )
66
+ model = SpeechT5ForTextToSpeech.from_pretrained(
67
+ "microsoft/speecht5_tts",
68
+ cache_dir=cache_dir
69
+ )
70
+ vocoder = SpeechT5HifiGan.from_pretrained(
71
+ "microsoft/speecht5_hifigan",
72
+ cache_dir=cache_dir
73
+ )
74
+ print("Successfully downloaded and cached models.")
75
+ return processor, model, vocoder
76
+
77
+ except (OSError, requests.exceptions.ReadTimeout) as e:
78
+ if attempt < max_retries - 1:
79
+ wait_time = 5 * (attempt + 1) # Linear backoff: 5s, 10s, 15s
80
+ print(f"Attempt {attempt+1} failed: {e}")
81
+ print(f"Retrying in {wait_time} seconds...")
82
+ time.sleep(wait_time)
83
+ else:
84
+ print(f"Failed to load models after {max_retries} attempts.")
85
+ raise e
86
+
87
+ def _load_speaker_embeddings(self):
88
+ """Load speaker embeddings with error handling"""
89
+ embeddings_dir = os.path.join(os.path.dirname(__file__), "speaker_embeddings")
90
+ os.makedirs(embeddings_dir, exist_ok=True)
91
+
92
+ # Create mapping for speaker embeddings
93
+ embedding_files = {
94
+ "neutral": os.path.join(embeddings_dir, "neutral.pt"),
95
+ "feminine": os.path.join(embeddings_dir, "feminine.pt"),
96
+ "masculine": os.path.join(embeddings_dir, "masculine.pt")
97
+ }
98
+
99
+ # Load embeddings with proper error handling
100
+ speaker_embeddings = {}
101
+ for voice_type, file_path in embedding_files.items():
102
+ try:
103
+ if os.path.exists(file_path):
104
+ if torch.cuda.is_available():
105
+ speaker_embeddings[voice_type] = torch.load(file_path)
106
+ else:
107
+ speaker_embeddings[voice_type] = torch.load(file_path, map_location=torch.device('cpu'))
108
+ print(f"Loaded {voice_type} speaker embedding")
109
+ else:
110
+ print(f"Warning: Speaker embedding file {file_path} not found")
111
+ # Create a fallback embedding if file doesn't exist
112
+ # This is placeholder - in production you'd want real speaker embeddings
113
+ if not speaker_embeddings: # Only create placeholder for first missing file
114
+ print("Creating placeholder speaker embedding")
115
+ placeholder = torch.ones(1, 512) / 512 # Typical embedding dimension is 512
116
+ speaker_embeddings[voice_type] = placeholder
117
+ else:
118
+ # Reuse existing embedding for missing voices
119
+ speaker_embeddings[voice_type] = next(iter(speaker_embeddings.values()))
120
+ except Exception as e:
121
+ print(f"Error loading {voice_type} speaker embedding: {e}")
122
+ # Create fallback embedding on error
123
+ placeholder = torch.ones(1, 512) / 512
124
+ speaker_embeddings[voice_type] = placeholder
125
+
126
+ return speaker_embeddings
127
+
128
  def synthesize(self, text, output_path, voice_type="neutral", speed=1.0):
129
  """
130
  Synthesize speech from text
 
140
  voice_type = "neutral"
141
  print(f"Invalid voice type. Using default 'neutral' voice.")
142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  try:
144
+ # Process input text
145
+ inputs = self.processor(text=text, return_tensors="pt").to(self.device)
 
 
 
 
 
 
 
 
146
 
147
+ # Get speaker embeddings and ensure proper shape
148
+ speaker_embeddings = self.speaker_embeddings[voice_type].to(self.device)
 
 
 
 
 
149
 
150
+ # Print shape for debugging
151
+ print(f"Speaker embeddings shape before: {speaker_embeddings.shape}")
152
+
153
+ # Fix the dimension issue - ensure it's a 2D tensor with shape [1, embedding_dim]
154
+ if len(speaker_embeddings.shape) == 1:
155
+ speaker_embeddings = speaker_embeddings.unsqueeze(0) # Add batch dimension
156
+
157
+ print(f"Speaker embeddings shape after: {speaker_embeddings.shape}")
158
+
159
+ try:
160
+ # Generate speech
161
+ speech = self.model.generate_speech(
162
+ inputs["input_ids"],
163
+ speaker_embeddings,
164
+ vocoder=self.vocoder
165
+ )
166
+ except IndexError as e:
167
+ # Alternative approach if the above fails
168
+ print(f"Error in generate_speech: {e}")
169
+ print("Trying alternative approach...")
170
+
171
+ # If the first approach fails, try reshaping the embeddings differently
172
+ if len(speaker_embeddings.shape) == 2:
173
+ if speaker_embeddings.shape[0] > 1 and speaker_embeddings.shape[0] > speaker_embeddings.shape[1]:
174
+ speaker_embeddings = speaker_embeddings.mean(dim=0, keepdim=True)
175
+ elif speaker_embeddings.shape[0] == 1:
176
+ # Ensure correct normalization dimension
177
+ speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=1)
178
+
179
+ # Try generation again
180
+ speech = self.model.generate_speech(
181
+ inputs["input_ids"],
182
+ speaker_embeddings,
183
+ vocoder=self.vocoder
184
+ )
185
+
186
+ # Convert to numpy array
187
+ speech = speech.cpu().numpy()
188
+
189
+ # Adjust speed if needed
190
+ if speed != 1.0:
191
+ try:
192
+ import librosa
193
+ speech = librosa.effects.time_stretch(speech, rate=speed)
194
+ except Exception as e:
195
+ print(f"Error adjusting speed: {e}")
196
+ # Continue with original speed
197
+ pass
198
+
199
+ # Create output directory if it doesn't exist
200
+ os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)
201
+
202
+ # Save audio file
203
+ sf.write(output_path, speech, samplerate=16000)
204
+
205
+ print(f"Speech synthesized and saved to {output_path}")
206
+
207
+ return output_path
208
+
209
+ except Exception as e:
210
+ print(f"Error in speech synthesis: {e}")
211
+ # Return error placeholder or raise exception based on your error handling strategy
212
+ raise
213
 
214
  def create_speaker_embedding(self, reference_file, output_path):
215
  """
 
224
  print("Creating speaker embeddings requires a speaker encoder model")
225
  print("Using default embeddings instead")
226
 
227
+ # Create output directory if it doesn't exist
228
+ os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)
229
+
230
  # For now, we'll just copy one of the existing embeddings
231
+ try:
232
+ torch.save(self.speaker_embeddings["neutral"], output_path)
233
+ print(f"Saved placeholder speaker embedding to {output_path}")
234
+ except Exception as e:
235
+ print(f"Error saving speaker embedding: {e}")
236
 
237
  return output_path