Anjan9320 committed on
Commit
106b563
·
verified ·
1 Parent(s): c5614c9

Update model.py

Browse files
Files changed (1) hide show
  1. model.py +52 -211
model.py CHANGED
@@ -66,92 +66,48 @@ class INF5Model(PreTrainedModel):
66
  # # Load state dict into model
67
  self.ema_model.load_state_dict(state_dict, strict=False)
68
 
69
- def _extract_embedding_from_audio_and_text(self, audio_path: str, text: str) -> torch.Tensor:
70
- device = next(self.parameters()).device # model device
71
-
72
- # Load audio waveform on CPU first
73
- waveform, sample_rate = torchaudio.load(audio_path)
74
- target_sample_rate = 24000
75
- if sample_rate != target_sample_rate:
76
- # Move waveform to device before resampling to avoid device mismatch
77
- waveform = waveform.to(device)
78
- resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate).to(device)
79
- waveform = resampler(waveform)
80
- else:
81
- # If no resampling, still move waveform to device for model
82
- waveform = waveform.to(device)
83
-
84
- # Forward pass - pass waveform and text directly to ema_model
85
- with torch.no_grad():
86
- outputs = self.ema_model(waveform, text)
87
-
88
- # Extract speaker embedding from outputs
89
- speaker_embedding = getattr(outputs, "speaker_embedding", None)
90
- if speaker_embedding is None:
91
- if isinstance(outputs, dict) and "speaker_embedding" in outputs:
92
- speaker_embedding = outputs["speaker_embedding"]
93
- else:
94
- raise RuntimeError("Speaker embedding not found in model output")
95
-
96
- return speaker_embedding.squeeze()
97
 
98
-
99
- def extract_speaker_embedding(self, ref_audio_path: str, ref_text: str):
100
  """
101
- Extract speaker embedding or reference features from audio and text.
102
- Converts audio to WAV if needed. Returns numpy array for saving/reuse.
 
 
 
 
 
 
 
103
  """
 
104
  if not os.path.exists(ref_audio_path):
105
- raise FileNotFoundError(f"Reference audio file '{ref_audio_path}' not found.")
106
-
107
- # Step 1: Preprocess audio + text (clip silence, convert etc)
108
- processed_audio_path, processed_text = preprocess_ref_audio_text(ref_audio_path, ref_text)
109
-
110
- # Step 2: Use model’s internal method to extract embedding from processed audio + text
111
- # IMPORTANT: Replace `self._extract_embedding_from_audio_and_text` with your actual method!
112
- speaker_embedding = self._extract_embedding_from_audio_and_text(processed_audio_path, processed_text)
113
-
114
- # Clean up temporary processed file if created
115
- if processed_audio_path != ref_audio_path and os.path.exists(processed_audio_path):
116
- os.remove(processed_audio_path)
117
-
118
- # Convert to numpy if it’s a tensor
119
- if isinstance(speaker_embedding, torch.Tensor):
120
- speaker_embedding = speaker_embedding.detach().cpu().numpy()
121
-
122
- return speaker_embedding
123
-
124
- def forward(self, text: str, speaker_embedding=None, ref_audio_path=None, ref_text=None):
125
- if speaker_embedding is None:
126
- if not ref_audio_path or not ref_text:
127
- raise ValueError("You must provide either a speaker_embedding or both ref_audio_path and ref_text.")
128
- # Extract speaker embedding correctly
129
- speaker_embedding = self.extract_speaker_embedding(ref_audio_path, ref_text)
130
- speaker_embedding = torch.tensor(speaker_embedding, dtype=torch.float32).to(self.device)
131
- else:
132
- if isinstance(speaker_embedding, np.ndarray):
133
- speaker_embedding = torch.tensor(speaker_embedding, dtype=torch.float32)
134
- speaker_embedding = speaker_embedding.to(self.device)
135
-
136
  self.ema_model.to(self.device)
137
  self.vocoder.to(self.device)
138
-
139
- audio, final_sample_rate, _ = infer_from_embedding(
140
- speaker_embedding=speaker_embedding,
141
- text=text,
142
- model=self.ema_model,
143
- vocoder=self.vocoder,
 
 
 
144
  speed=self.config.speed,
145
  device=self.device,
146
  )
147
-
148
- # Convert to pydub.AudioSegment for post-processing
149
  buffer = io.BytesIO()
150
- sf.write(buffer, audio, samplerate=final_sample_rate, format="WAV")
151
  buffer.seek(0)
152
  audio_segment = AudioSegment.from_file(buffer, format="wav")
153
-
154
- # Optional: Remove silence
155
  if self.config.remove_sil:
156
  non_silent_segs = silence.split_on_silence(
157
  audio_segment,
@@ -160,59 +116,44 @@ class INF5Model(PreTrainedModel):
160
  keep_silence=500,
161
  seek_step=10,
162
  )
163
- audio_segment = sum(non_silent_segs, AudioSegment.silent(duration=0))
164
-
165
- # Normalize to target loudness
 
166
  target_dBFS = -20.0
167
  change_in_dBFS = target_dBFS - audio_segment.dBFS
168
  audio_segment = audio_segment.apply_gain(change_in_dBFS)
169
-
170
  return np.array(audio_segment.get_array_of_samples())
171
 
172
 
 
173
  if __name__ == '__main__':
174
- import os
 
 
 
175
  import numpy as np
176
  import soundfile as sf
177
  from transformers import AutoConfig, AutoModel
178
- from f5_tts.infer.utils_infer import preprocess_ref_audio_text
179
-
180
- # Register your custom config and model
181
  AutoConfig.register("inf5", INF5Config)
182
  AutoModel.register(INF5Config, INF5Model)
183
 
184
- # Instantiate your model with config
185
- model = INF5Model(INF5Config(ckpt_path="checkpoints/model_best.pt", vocab_path="checkpoints/vocab.txt"))
186
- model.save_pretrained("INF5")
187
- model.config.save_pretrained("INF5")
188
-
189
- # Load model via HF AutoModel interface for proper loading from the saved folder
190
  model = AutoModel.from_pretrained("INF5")
191
-
192
- # Step 1: Extract speaker embedding from reference audio + text
193
- speaker_embedding = model.extract_speaker_embedding(
194
- "prompts/PAN_F_HAPPY_00001.wav",
195
- "ਭਹੰਪੀ ਵਿੱਚ ਸਮਾਰਕਾਂ ਦੇ ਭਵਨ ਨਿਰਮਾਣ ਕਲਾ ਦੇ ਵੇਰਵੇ ਗੁੰਝਲਦਾਰ ਅਤੇ ਹੈਰਾਨ ਕਰਨ ਵਾਲੇ ਹਨ, ਜੋ ਮੈਨੂੰ ਖੁਸ਼ ਕਰਦੇ ਹਨ।"
196
- )
197
- np.save("speaker_embedding.npy", speaker_embedding)
198
-
199
- # Step 2: Load saved embedding (simulate reuse)
200
- loaded_embedding = np.load("speaker_embedding.npy")
201
-
202
- # Step 3: Generate audio using precomputed embedding + new text
203
- audio = model(
204
- "नमस्ते! संगीत की तरह जीवन भी खूबसूरत होता है, बस इसे सही ताल में जीना आना चाहिए.",
205
- speaker_embedding=loaded_embedding
206
- )
207
-
208
- # Normalize audio dtype if needed before saving
209
  if audio.dtype == np.int16:
210
- audio = audio.astype(np.float32) / 32768.0
211
- sf.write("samples/namaste.wav", audio.astype(np.float32), samplerate=24000)
212
 
213
- # Upload model directory to Hugging Face Hub
214
  from huggingface_hub import HfApi
 
215
  repo_id = "svp19/INF5" # Change to your HF repo
 
 
216
  api = HfApi()
217
  api.upload_folder(
218
  folder_path="INF5",
@@ -221,108 +162,8 @@ if __name__ == '__main__':
221
  )
222
  print(f"Model pushed to https://huggingface.co/{repo_id} 🚀")
223
 
224
- # Verify upload by reloading
 
225
  model = AutoModel.from_pretrained(repo_id)
226
  print("Success")
227
 
228
-
229
- # def forward(self, text: str, ref_audio_path: str, ref_text: str):
230
- # """
231
- # Generate speech given a reference audio & text input.
232
-
233
- # Args:
234
- # text (str): The text to be synthesized.
235
- # ref_audio_path (str): Path to the reference audio file.
236
- # ref_text (str): The reference text.
237
-
238
- # Returns:
239
- # np.array: Generated waveform.
240
- # """
241
-
242
- # if not os.path.exists(ref_audio_path):
243
- # raise FileNotFoundError(f"Reference audio file {ref_audio_path} not found.")
244
-
245
- # # Load reference audio & text
246
- # ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_path, ref_text)
247
-
248
-
249
- # self.ema_model.to(self.device)
250
- # self.vocoder.to(self.device)
251
-
252
- # # Perform inference
253
- # audio, final_sample_rate, _ = infer_process(
254
- # ref_audio,
255
- # ref_text,
256
- # text,
257
- # self.ema_model,
258
- # self.vocoder,
259
- # mel_spec_type="vocos",
260
- # speed=self.config.speed,
261
- # device=self.device,
262
- # )
263
-
264
- # # Convert to pydub format and remove silence if needed
265
- # buffer = io.BytesIO()
266
- # sf.write(buffer, audio, samplerate=24000, format="WAV")
267
- # buffer.seek(0)
268
- # audio_segment = AudioSegment.from_file(buffer, format="wav")
269
-
270
- # if self.config.remove_sil:
271
- # non_silent_segs = silence.split_on_silence(
272
- # audio_segment,
273
- # min_silence_len=1000,
274
- # silence_thresh=-50,
275
- # keep_silence=500,
276
- # seek_step=10,
277
- # )
278
- # non_silent_wave = sum(non_silent_segs, AudioSegment.silent(duration=0))
279
- # audio_segment = non_silent_wave
280
-
281
- # # Normalize loudness
282
- # target_dBFS = -20.0
283
- # change_in_dBFS = target_dBFS - audio_segment.dBFS
284
- # audio_segment = audio_segment.apply_gain(change_in_dBFS)
285
-
286
- # return np.array(audio_segment.get_array_of_samples())
287
-
288
-
289
-
290
- # if __name__ == '__main__':
291
- # model = INF5Model(INF5Config(ckpt_path="checkpoints/model_best.pt", vocab_path="checkpoints/vocab.txt"))
292
- # model.save_pretrained("INF5")
293
- # model.config.save_pretrained("INF5")
294
-
295
- # import numpy as np
296
- # import soundfile as sf
297
- # from transformers import AutoConfig, AutoModel
298
-
299
- # AutoConfig.register("inf5", INF5Config)
300
- # AutoModel.register(INF5Config, INF5Model)
301
-
302
- # model = AutoModel.from_pretrained("INF5")
303
- # audio = model("नमस्ते! संगीत की तरह जीवन भी खूबसूरत होता है, बस इसे सही ताल में जीना आना चाहिए.",
304
- # ref_audio_path="prompts/PAN_F_HAPPY_00001.wav",
305
- # ref_text="भਹੰਪੀ ਵਿੱਚ ਸਮਾਰਕਾਂ ਦੇ ਭਵਨ ਨਿਰਮਾਣ ਕਲਾ ਦੇ ਵੇਰਵੇ ਗੁੰਝਲਦਾਰ ਅਤੇ ਹੈਰਾਨ ਕਰਨ ਵਾਲੇ ਹਨ, ਜੋ ਮੈਨੂੰ ਖੁਸ਼ ਕਰਦੇ ਹਨ।")
306
-
307
- # if audio.dtype == np.int16:
308
- # audio = audio.astype(np.float32) / 32768.0
309
- # sf.write("samples/namaste.wav", np.array(audio, dtype=np.float32), samplerate=24000)
310
-
311
- # from huggingface_hub import HfApi
312
-
313
- # repo_id = "svp19/INF5" # Change to your HF repo
314
-
315
- # # Upload model directory to HF
316
- # api = HfApi()
317
- # api.upload_folder(
318
- # folder_path="INF5",
319
- # repo_id=repo_id,
320
- # repo_type="model"
321
- # )
322
- # print(f"Model pushed to https://huggingface.co/{repo_id} 🚀")
323
-
324
- # print("Verify Upload")
325
- # from transformers import AutoModel
326
- # model = AutoModel.from_pretrained(repo_id)
327
- # print("Success")
328
-
 
66
  # # Load state dict into model
67
  self.ema_model.load_state_dict(state_dict, strict=False)
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
+ def forward(self, text: str, ref_audio_path: str, ref_text: str):
 
71
  """
72
+ Generate speech given a reference audio & text input.
73
+
74
+ Args:
75
+ text (str): The text to be synthesized.
76
+ ref_audio_path (str): Path to the reference audio file.
77
+ ref_text (str): The reference text.
78
+
79
+ Returns:
80
+ np.array: Generated waveform.
81
  """
82
+
83
  if not os.path.exists(ref_audio_path):
84
+ raise FileNotFoundError(f"Reference audio file {ref_audio_path} not found.")
85
+
86
+ # Load reference audio & text
87
+ ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_path, ref_text)
88
+
89
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  self.ema_model.to(self.device)
91
  self.vocoder.to(self.device)
92
+
93
+ # Perform inference
94
+ audio, final_sample_rate, _ = infer_process(
95
+ ref_audio,
96
+ ref_text,
97
+ text,
98
+ self.ema_model,
99
+ self.vocoder,
100
+ mel_spec_type="vocos",
101
  speed=self.config.speed,
102
  device=self.device,
103
  )
104
+
105
+ # Convert to pydub format and remove silence if needed
106
  buffer = io.BytesIO()
107
+ sf.write(buffer, audio, samplerate=24000, format="WAV")
108
  buffer.seek(0)
109
  audio_segment = AudioSegment.from_file(buffer, format="wav")
110
+
 
111
  if self.config.remove_sil:
112
  non_silent_segs = silence.split_on_silence(
113
  audio_segment,
 
116
  keep_silence=500,
117
  seek_step=10,
118
  )
119
+ non_silent_wave = sum(non_silent_segs, AudioSegment.silent(duration=0))
120
+ audio_segment = non_silent_wave
121
+
122
+ # Normalize loudness
123
  target_dBFS = -20.0
124
  change_in_dBFS = target_dBFS - audio_segment.dBFS
125
  audio_segment = audio_segment.apply_gain(change_in_dBFS)
126
+
127
  return np.array(audio_segment.get_array_of_samples())
128
 
129
 
130
+
131
  if __name__ == '__main__':
132
+ model = INF5Model(INF5Config(ckpt_path="checkpoints/model_best.pt", vocab_path="checkpoints/vocab.txt"))
133
+ model.save_pretrained("INF5")
134
+ model.config.save_pretrained("INF5")
135
+
136
  import numpy as np
137
  import soundfile as sf
138
  from transformers import AutoConfig, AutoModel
139
+
 
 
140
  AutoConfig.register("inf5", INF5Config)
141
  AutoModel.register(INF5Config, INF5Model)
142
 
 
 
 
 
 
 
143
  model = AutoModel.from_pretrained("INF5")
144
+ audio = model("नमस्ते! संगीत की तरह जीवन भी खूबसूरत होता है, बस इसे सही ताल में जीना आना चाहिए.",
145
+ ref_audio_path="prompts/PAN_F_HAPPY_00001.wav",
146
+ ref_text="भਹੰਪੀ ਵਿੱਚ ਸਮਾਰਕਾਂ ਦੇ ਭਵਨ ਨਿਰਮਾਣ ਕਲਾ ਦੇ ਵੇਰਵੇ ਗੁੰਝਲਦਾਰ ਅਤੇ ਹੈਰਾਨ ਕਰਨ ਵਾਲੇ ਹਨ, ਜੋ ਮੈਨੂੰ ਖੁਸ਼ ਕਰਦੇ ਹਨ।")
147
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  if audio.dtype == np.int16:
149
+ audio = audio.astype(np.float32) / 32768.0
150
+ sf.write("samples/namaste.wav", np.array(audio, dtype=np.float32), samplerate=24000)
151
 
 
152
  from huggingface_hub import HfApi
153
+
154
  repo_id = "svp19/INF5" # Change to your HF repo
155
+
156
+ # Upload model directory to HF
157
  api = HfApi()
158
  api.upload_folder(
159
  folder_path="INF5",
 
162
  )
163
  print(f"Model pushed to https://huggingface.co/{repo_id} 🚀")
164
 
165
+ print("Verify Upload")
166
+ from transformers import AutoModel
167
  model = AutoModel.from_pretrained(repo_id)
168
  print("Success")
169