ashishkblink commited on
Commit
9898207
·
verified ·
1 Parent(s): 6b69eec

Upload f5_tts/infer/utils_infer.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. f5_tts/infer/utils_infer.py +549 -0
f5_tts/infer/utils_infer.py ADDED
@@ -0,0 +1,549 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # A unified script for inference process
2
+ # Make adjustments inside the functions below; if a function's output format has to change, update both the Gradio and the CLI scripts accordingly
3
+ import os
4
+ import sys
5
+
6
# Ask PyTorch to fall back to CPU for operators that the MPS backend lacks.
# The variable name must be spelled PYTORCH_ENABLE_MPS_FALLBACK (the previous
# spelling "PYTOCH_..." was silently ignored). It is set here, ahead of the
# `import torch` further down in this file.
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"  # for MPS device compatibility
# Make the vendored BigVGAN sources importable. The relative hop (../../) has
# to come AFTER this file's absolute directory; prefixing it to an absolute
# path, as before, produced a path that does not point at third_party/.
sys.path.append(f"{os.path.dirname(os.path.abspath(__file__))}/../../third_party/BigVGAN/")
8
+
9
+ import hashlib
10
+ import re
11
+ import tempfile
12
+ from importlib.resources import files
13
+
14
+ import matplotlib
15
+
16
+ matplotlib.use("Agg")
17
+
18
+ import matplotlib.pylab as plt
19
+ import numpy as np
20
+ import torch
21
+ import torchaudio
22
+ import tqdm
23
+ from huggingface_hub import snapshot_download, hf_hub_download
24
+ from pydub import AudioSegment, silence
25
+ from transformers import pipeline
26
+ from vocos import Vocos
27
+
28
+ from f5_tts.model import CFM
29
+ from f5_tts.model.utils import (
30
+ get_tokenizer,
31
+ convert_char_to_pinyin,
32
+ )
33
+
34
# Cache of ASR transcriptions, keyed by the MD5 hex digest of the processed
# reference audio file (filled in by preprocess_ref_audio_text).
_ref_audio_cache = {}

# Default compute device: CUDA when available, otherwise MPS, otherwise CPU.
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

# -----------------------------------------
# Default audio / mel settings and sampling hyper-parameters.
# The functions below use these as default argument values.

target_sample_rate = 24000  # reference audio is resampled to this rate before sampling
n_mel_channels = 100  # passed to the model as mel_dim / n_mel_channels
hop_length = 256  # used to convert a sample count into a mel-frame count
win_length = 1024
n_fft = 1024
mel_spec_type = "vocos"  # selects how the vocoder is called: "vocos" or "bigvgan"
target_rms = 0.1  # reference audio quieter than this RMS is scaled up to it
cross_fade_duration = 0.15  # seconds of overlap when joining generated chunks
ode_method = "euler"
nfe_step = 32  # 16, 32
cfg_strength = 2.0
sway_sampling_coef = -1.0
speed = 1.0
fix_duration = None  # when set (seconds), replaces the duration estimated from text length
54
+
55
+ # -----------------------------------------
56
+
57
+
58
+ # chunk text into smaller pieces
59
+
60
+
61
# Sentence boundaries are either ASCII punctuation followed by whitespace, or
# full-width (CJK) punctuation, which is normally not followed by a space.
# The full-width characters are written as \u escapes so they cannot be
# silently normalised into their ASCII look-alikes; with ASCII characters in
# the second alternative the text would be split inside tokens such as "3.14".
_SENTENCE_BOUNDARY = re.compile(r"(?<=[;:,.!?])\s+|(?<=[\uFF1B\uFF1A\uFF0C\u3002\uFF01\uFF1F])")


def chunk_text(text, max_chars=135):
    """
    Splits the input text into chunks, each with a maximum size.

    The text is first split into sentences at punctuation boundaries and the
    sentences are then greedily packed into chunks. Sizes are measured in
    UTF-8 bytes, so multi-byte characters count for more than one. A single
    sentence longer than ``max_chars`` is kept whole as its own chunk.

    Args:
        text (str): The text to be split.
        max_chars (int): The maximum size (in UTF-8 bytes) per chunk.

    Returns:
        List[str]: A list of non-empty, stripped text chunks.
    """
    chunks = []
    current_chunk = ""

    for sentence in _SENTENCE_BOUNDARY.split(text):
        # The whitespace separator was consumed by the split: put one space
        # back after sentences that end in a single-byte character. Sentences
        # ending in a multi-byte character are joined without a space.
        if sentence and len(sentence[-1].encode("utf-8")) == 1:
            piece = sentence + " "
        else:
            piece = sentence

        if len(current_chunk.encode("utf-8")) + len(sentence.encode("utf-8")) <= max_chars:
            current_chunk += piece
        else:
            stripped = current_chunk.strip()
            if stripped:
                chunks.append(stripped)
            current_chunk = piece

    # Flush the last chunk, skipping whitespace-only leftovers.
    stripped = current_chunk.strip()
    if stripped:
        chunks.append(stripped)

    return chunks
89
+
90
+
91
+ # load vocoder
92
def load_vocoder(vocoder_name="vocos", is_local=False, local_path="", device=device, hf_cache_dir=None):
    """
    Load a vocoder, put it in eval mode and move it to ``device``.

    Args:
        vocoder_name (str): "vocos" or "bigvgan".
        is_local (bool): If True, load from ``local_path`` instead of
            downloading from the Hugging Face Hub.
        local_path (str): Directory with the vocoder files (used when
            ``is_local`` is True). For BigVGAN the files can be downloaded from
            https://huggingface.co/nvidia/bigvgan_v2_24khz_100band_256x/tree/main
        device: Device the vocoder is moved to.
        hf_cache_dir: Optional cache directory passed to the Hub download helpers.

    Returns:
        The loaded vocoder module.

    Raises:
        ImportError: If "bigvgan" is requested but the BigVGAN submodule
            cannot be imported.
        ValueError: If ``vocoder_name`` is not a supported vocoder.
    """
    if vocoder_name == "vocos":
        # vocoder = Vocos.from_pretrained("charactr/vocos-mel-24khz").to(device)
        if is_local:
            print(f"Load vocos from local path {local_path}")
            config_path = f"{local_path}/config.yaml"
            model_path = f"{local_path}/pytorch_model.bin"
        else:
            print("Download Vocos from huggingface charactr/vocos-mel-24khz")
            repo_id = "charactr/vocos-mel-24khz"
            config_path = hf_hub_download(repo_id=repo_id, cache_dir=hf_cache_dir, filename="config.yaml")
            model_path = hf_hub_download(repo_id=repo_id, cache_dir=hf_cache_dir, filename="pytorch_model.bin")
        vocoder = Vocos.from_hparams(config_path)
        state_dict = torch.load(model_path, map_location="cpu", weights_only=True)
        from vocos.feature_extractors import EncodecFeatures

        if isinstance(vocoder.feature_extractor, EncodecFeatures):
            # Carry over the encodec weights already held by the freshly built
            # vocoder so that load_state_dict() finds these keys.
            encodec_parameters = {
                "feature_extractor.encodec." + key: value
                for key, value in vocoder.feature_extractor.encodec.state_dict().items()
            }
            state_dict.update(encodec_parameters)
        vocoder.load_state_dict(state_dict)
        vocoder = vocoder.eval().to(device)
    elif vocoder_name == "bigvgan":
        try:
            from third_party.BigVGAN import bigvgan
        except ImportError as err:
            # Fail here with an actionable message instead of continuing and
            # hitting a NameError on `bigvgan` below.
            raise ImportError(
                "You need to follow the README to init submodule and change the BigVGAN source code."
            ) from err
        if not is_local:
            local_path = snapshot_download(repo_id="nvidia/bigvgan_v2_24khz_100band_256x", cache_dir=hf_cache_dir)
        vocoder = bigvgan.BigVGAN.from_pretrained(local_path, use_cuda_kernel=False)

        vocoder.remove_weight_norm()
        vocoder = vocoder.eval().to(device)
    else:
        raise ValueError(f"Unsupported vocoder_name {vocoder_name!r}; expected 'vocos' or 'bigvgan'.")
    return vocoder
131
+
132
+ # load asr pipeline
133
+
134
asr_pipe = None


def initialize_asr_pipeline(device: str = device, dtype=None):
    """
    Create the Whisper speech-recognition pipeline and store it in the
    module-level ``asr_pipe``.

    Args:
        device (str): Device string handed to the pipeline.
        dtype: Torch dtype for the model. When None, float16 is chosen for a
            CUDA device with compute capability major >= 6 whose device name
            does not end with "[ZLUDA]"; float32 otherwise.
    """
    global asr_pipe

    if dtype is None:
        half_precision_ok = (
            "cuda" in device
            and torch.cuda.get_device_properties(device).major >= 6
            and not torch.cuda.get_device_name().endswith("[ZLUDA]")
        )
        dtype = torch.float16 if half_precision_ok else torch.float32

    asr_pipe = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-large-v3-turbo",
        torch_dtype=dtype,
        device=device,
    )
153
+
154
+
155
+ # transcribe
156
+
157
+
158
def transcribe(ref_audio, language=None):
    """
    Transcribe ``ref_audio`` with the shared ASR pipeline.

    The pipeline is created on first use. ``language`` is forwarded to the
    generator only when it is truthy.

    Returns:
        str: The recognised text with surrounding whitespace stripped.
    """
    global asr_pipe
    if asr_pipe is None:
        initialize_asr_pipeline(device=device)

    generate_kwargs = {"task": "transcribe"}
    if language:
        generate_kwargs["language"] = language

    result = asr_pipe(
        ref_audio,
        chunk_length_s=30,
        batch_size=128,
        generate_kwargs=generate_kwargs,
        return_timestamps=False,
    )
    return result["text"].strip()
169
+
170
+
171
+ # load model checkpoint for inference
172
+
173
+
174
def load_checkpoint(model, ckpt_path, device: str, dtype=None, use_ema=True):
    """
    Load weights from ``ckpt_path`` into ``model`` and return it on ``device``.

    Args:
        model: Module to receive the weights; it is first cast to ``dtype``.
        ckpt_path (str): Checkpoint file. A ".safetensors" extension is read
            with safetensors, anything else with ``torch.load``.
        device (str): Device the checkpoint is mapped to and the model is
            moved to.
        dtype: Target dtype for the model; float32 when None.
        use_ema (bool): If True, read the EMA weights (key
            "ema_model_state_dict" for non-safetensors files), drop the
            "initted"/"step" entries and remove the "ema_model." text from
            parameter names. If False, read "model_state_dict".

    Returns:
        The model with weights loaded, moved to ``device``.
    """
    model = model.to(torch.float32 if dtype is None else dtype)

    is_safetensors = ckpt_path.split(".")[-1] == "safetensors"
    if is_safetensors:
        from safetensors.torch import load_file

        checkpoint = load_file(ckpt_path, device=device)
    else:
        checkpoint = torch.load(ckpt_path, map_location=device, weights_only=True)

    # A safetensors file is itself the flat state dict; other checkpoints
    # nest it under a key.
    nested_key = "ema_model_state_dict" if use_ema else "model_state_dict"
    raw_state = checkpoint if is_safetensors else checkpoint[nested_key]

    if use_ema:
        state_dict = {
            name.replace("ema_model.", ""): tensor
            for name, tensor in raw_state.items()
            if name not in ("initted", "step")
        }
        # patch for backward compatibility, 305e3ea
        for stale_key in ("mel_spec.mel_stft.mel_scale.fb", "mel_spec.mel_stft.spectrogram.window"):
            state_dict.pop(stale_key, None)
    else:
        state_dict = raw_state

    model.load_state_dict(state_dict)

    # Drop every reference to the loaded tensors before emptying the CUDA cache.
    del checkpoint, raw_state, state_dict
    torch.cuda.empty_cache()

    return model.to(device)
218
+
219
+
220
+ # load model for inference
221
+
222
+
223
def load_model(
    model_cls,
    model_cfg,
    mel_spec_type=mel_spec_type,
    vocab_file="",
    ode_method=ode_method,
    use_ema=True,
    device=device,
):
    """
    Build a CFM model around ``model_cls`` and move it to ``device``.

    NOTE: this function only constructs the model. No checkpoint is loaded
    here, so the caller has to load weights separately (see load_checkpoint).

    Args:
        model_cls: Transformer backbone class; instantiated with ``model_cfg``
            plus ``text_num_embeds`` and ``mel_dim``.
        model_cfg (dict): Keyword arguments for ``model_cls``.
        mel_spec_type (str): Mel-spectrogram flavour forwarded to CFM.
        vocab_file (str): Path to a vocab file; an empty string selects the
            vocab.txt bundled under f5_tts/infer/examples.
        ode_method (str): ODE solver name forwarded via ``odeint_kwargs``.
        use_ema (bool): Unused here because no checkpoint is loaded; kept so
            existing callers keep working.
        device: Device the model is moved to.

    Returns:
        The constructed CFM model (weights not loaded).
    """
    if vocab_file == "":
        vocab_file = str(files("f5_tts").joinpath("infer/examples/vocab.txt"))
    # Both the bundled and a user-supplied vocab file go through the "custom"
    # tokenizer; assigning it unconditionally keeps `tokenizer` defined when
    # the caller passes their own vocab_file.
    tokenizer = "custom"

    print("\nvocab : ", vocab_file)
    print("token : ", tokenizer)

    vocab_char_map, vocab_size = get_tokenizer(vocab_file, tokenizer)
    model = CFM(
        transformer=model_cls(**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels),
        mel_spec_kwargs=dict(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            n_mel_channels=n_mel_channels,
            target_sample_rate=target_sample_rate,
            mel_spec_type=mel_spec_type,
        ),
        odeint_kwargs=dict(
            method=ode_method,
        ),
        vocab_char_map=vocab_char_map,
    ).to(device)

    return model
260
+
261
+
262
def remove_silence_edges(audio, silence_threshold=-42):
    """
    Trim leading and trailing silence from a pydub audio segment.

    Args:
        audio: Segment to trim; it is sliced and iterated in 1 ms steps.
        silence_threshold: Level in dBFS; a 1 ms slice counts as silent when
            its dBFS is not above this value.

    Returns:
        The segment with silent edges removed (empty if everything is silent).
    """
    # Remove silence from the start
    leading_ms = silence.detect_leading_silence(audio, silence_threshold=silence_threshold)
    audio = audio[leading_ms:]

    # Remove silence from the end: count trailing silent milliseconds as an
    # integer. Repeatedly subtracting 0.001 from a float accumulates rounding
    # error and can truncate to one millisecond too few.
    trailing_ms = 0
    for ms_slice in reversed(audio):
        if ms_slice.dBFS > silence_threshold:
            break
        trailing_ms += 1

    # Clamp at 0: a negative end index would slice relative to the end and
    # keep audio that should have been removed.
    end_ms = max(int(audio.duration_seconds * 1000) - trailing_ms, 0)
    return audio[:end_ms]
276
+
277
+
278
+ # preprocess reference audio and text
279
+
280
+
281
def _join_segments_up_to_limit(aseg, min_silence_len, silence_thresh, message, show_info):
    """Split `aseg` on silence and re-join segments, stopping (and reporting `message`) before exceeding 15 s."""
    segments = silence.split_on_silence(
        aseg, min_silence_len=min_silence_len, silence_thresh=silence_thresh, keep_silence=1000, seek_step=10
    )
    joined = AudioSegment.silent(duration=0)
    for segment in segments:
        # Stop once more than 6 s are collected and the next segment would
        # push the total beyond 15 s.
        if len(joined) > 6000 and len(joined + segment) > 15000:
            show_info(message)
            break
        joined += segment
    return joined


def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_info=print, device=device):
    """
    Prepare the reference audio and reference text for inference.

    The audio is optionally clipped to at most 15 s (preferring cuts at
    silences), has its silent edges trimmed, gets 50 ms of silence appended
    and is written to a temporary .wav file that is NOT deleted here. When
    ``ref_text`` is blank, the audio is transcribed (transcriptions are cached
    by the MD5 of the written file). The text is finally made to end with
    ". " unless it already ends with the CJK full stop.

    Args:
        ref_audio_orig: Audio source readable by ``AudioSegment.from_file``.
        ref_text (str): Reference transcript; blank triggers transcription.
        clip_short (bool): Whether to clip long audio.
        show_info (callable): Receives progress messages.
        device: Not used by this function.

    Returns:
        tuple: (path of the temporary wav file, processed reference text).
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_wav:
        aseg = AudioSegment.from_file(ref_audio_orig)

        if clip_short:
            # 1. try to find long silence for clipping
            clipped = _join_segments_up_to_limit(
                aseg, 1000, -50, "Audio is over 15s, clipping short. (1)", show_info
            )

            # 2. try to find short silence for clipping if 1. failed
            if len(clipped) > 15000:
                clipped = _join_segments_up_to_limit(
                    aseg, 100, -40, "Audio is over 15s, clipping short. (2)", show_info
                )

            aseg = clipped

            # 3. if no proper silence found for clipping
            if len(aseg) > 15000:
                aseg = aseg[:15000]
                show_info("Audio is over 15s, clipping short. (3)")

        aseg = remove_silence_edges(aseg) + AudioSegment.silent(duration=50)
        aseg.export(tmp_wav.name, format="wav")
        ref_audio = tmp_wav.name

    # Compute a hash of the reference audio file
    with open(ref_audio, "rb") as audio_file:
        audio_hash = hashlib.md5(audio_file.read()).hexdigest()

    if not ref_text.strip():
        cached_text = _ref_audio_cache.get(audio_hash)
        if audio_hash in _ref_audio_cache:
            # Use cached asr transcription
            show_info("Using cached reference text...")
            ref_text = cached_text
        else:
            show_info("No reference text provided, transcribing reference audio...")
            ref_text = transcribe(ref_audio)
            # Cache the transcribed text (not caching custom ref_text, enabling users to do manual tweak)
            _ref_audio_cache[audio_hash] = ref_text

    # Ensure ref_text ends with a proper sentence-ending punctuation
    if not ref_text.endswith((". ", "。")):
        ref_text += " " if ref_text.endswith(".") else ". "

    return ref_audio, ref_text
351
+
352
+
353
+ # infer process: chunk text -> infer batches [i.e. infer_batch_process()]
354
+
355
+
356
def infer_process(
    ref_audio,
    ref_text,
    gen_text,
    model_obj,
    vocoder,
    mel_spec_type=mel_spec_type,
    show_info=print,
    progress=tqdm,
    target_rms=target_rms,
    cross_fade_duration=cross_fade_duration,
    nfe_step=nfe_step,
    cfg_strength=cfg_strength,
    sway_sampling_coef=sway_sampling_coef,
    speed=speed,
    fix_duration=fix_duration,
    device=device,
):
    """
    Chunk ``gen_text`` and synthesise it via infer_batch_process().

    The chunk size is derived from the reference: the reference text's UTF-8
    byte rate per second of reference audio, multiplied by the seconds left
    in a 25 s budget after the reference audio.

    Args:
        ref_audio: Path of the reference audio, loaded with torchaudio.
        ref_text (str): Transcript of the reference audio.
        gen_text (str): Text to synthesise.
        show_info: Accepted but not used in this function.
        Remaining arguments are forwarded unchanged to infer_batch_process().

    Returns:
        Whatever infer_batch_process() returns.
    """
    audio, sr = torchaudio.load(ref_audio)

    # Split the input text into batches
    ref_seconds = audio.shape[-1] / sr
    ref_bytes_per_second = len(ref_text.encode("utf-8")) / ref_seconds
    max_chars = int(ref_bytes_per_second * (25 - ref_seconds))
    gen_text_batches = chunk_text(gen_text, max_chars=max_chars)

    forwarded = dict(
        mel_spec_type=mel_spec_type,
        progress=progress,
        target_rms=target_rms,
        cross_fade_duration=cross_fade_duration,
        nfe_step=nfe_step,
        cfg_strength=cfg_strength,
        sway_sampling_coef=sway_sampling_coef,
        speed=speed,
        fix_duration=fix_duration,
        device=device,
    )
    return infer_batch_process((audio, sr), ref_text, gen_text_batches, model_obj, vocoder, **forwarded)
400
+
401
+
402
+ # infer batches
403
+
404
+
405
def _cross_fade_join(waves, fade_samples):
    """Join 1-D waveforms in order, linearly cross-fading up to `fade_samples` samples at every seam."""
    joined = waves[0]
    for next_wave in waves[1:]:
        # The overlap can never be longer than either side of the seam.
        overlap = min(fade_samples, len(joined), len(next_wave))
        if overlap <= 0:
            # No overlap possible, concatenate
            joined = np.concatenate([joined, next_wave])
            continue

        fade_out = np.linspace(1, 0, overlap)
        fade_in = np.linspace(0, 1, overlap)
        blended = joined[-overlap:] * fade_out + next_wave[:overlap] * fade_in
        joined = np.concatenate([joined[:-overlap], blended, next_wave[overlap:]])
    return joined


def infer_batch_process(
    ref_audio,
    ref_text,
    gen_text_batches,
    model_obj,
    vocoder,
    mel_spec_type="vocos",
    progress=tqdm,
    target_rms=0.1,
    cross_fade_duration=0.15,
    nfe_step=32,
    cfg_strength=2.0,
    sway_sampling_coef=-1,
    speed=1,
    fix_duration=None,
    device=None,
):
    """
    Synthesise every text chunk conditioned on the reference and join the results.

    Args:
        ref_audio: Tuple ``(audio, sr)``; ``audio`` is a tensor whose first
            dimension is channels (multi-channel input is averaged to mono).
        ref_text (str): Transcript of the reference audio.
        gen_text_batches (list[str]): Non-empty list of text chunks.
        model_obj: Model exposing ``sample(cond=, text=, duration=, steps=,
            cfg_strength=, sway_sampling_coef=)``.
        vocoder: Vocoder; called via ``.decode`` for "vocos", directly for "bigvgan".
        mel_spec_type (str): "vocos" or "bigvgan".
        progress: Currently unused (the progress-bar wrapper is disabled).
        target_rms (float): Quieter reference audio is scaled up to this RMS,
            and the generated audio is scaled back down accordingly.
        cross_fade_duration (float): Seconds of cross-fade between chunks;
            ``<= 0`` means plain concatenation.
        nfe_step, cfg_strength, sway_sampling_coef: Forwarded to ``model_obj.sample``.
        speed: Divides the estimated duration of the generated part.
        fix_duration: If not None, duration in seconds used instead of the estimate.
        device: Device the reference audio is moved to.

    Returns:
        tuple: ``(final_wave, target_sample_rate, combined_spectrogram)`` where
        the wave and the spectrogram are numpy arrays.

    Raises:
        ValueError: If ``mel_spec_type`` is unsupported or ``gen_text_batches``
            is empty.
    """
    # Validate up front so bad input fails with a clear message instead of an
    # UnboundLocalError / IndexError deep inside the loop.
    if mel_spec_type not in ("vocos", "bigvgan"):
        raise ValueError(f"Unsupported mel_spec_type {mel_spec_type!r}; expected 'vocos' or 'bigvgan'.")
    if not gen_text_batches:
        raise ValueError("gen_text_batches is empty; there is no text to synthesize.")

    audio, sr = ref_audio
    if audio.shape[0] > 1:
        audio = torch.mean(audio, dim=0, keepdim=True)

    rms = torch.sqrt(torch.mean(torch.square(audio)))
    reference_is_quiet = bool(rms < target_rms)
    if reference_is_quiet:
        audio = audio * target_rms / rms
    if sr != target_sample_rate:
        resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
        audio = resampler(audio)
    audio = audio.to(device)

    generated_waves = []
    spectrograms = []

    # Separate reference and generated text with a space when the reference
    # ends in a single-byte character.
    if len(ref_text[-1].encode("utf-8")) == 1:
        ref_text = ref_text + " "

    # These do not change between chunks.
    ref_audio_len = audio.shape[-1] // hop_length
    ref_text_len = len(ref_text.encode("utf-8"))

    for gen_text in gen_text_batches:
        # Prepare the text
        final_text_list = convert_char_to_pinyin([ref_text + gen_text])

        if fix_duration is not None:
            duration = int(fix_duration * target_sample_rate / hop_length)
        else:
            # Estimate the generated length from the reference's
            # frames-per-text-byte ratio.
            gen_text_len = len(gen_text.encode("utf-8"))
            duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len / speed)

        # inference
        with torch.inference_mode():
            generated, _ = model_obj.sample(
                cond=audio,
                text=final_text_list,
                duration=duration,
                steps=nfe_step,
                cfg_strength=cfg_strength,
                sway_sampling_coef=sway_sampling_coef,
            )

            generated = generated.to(torch.float32)
            # Drop the frames that correspond to the reference audio.
            generated = generated[:, ref_audio_len:, :]
            generated_mel_spec = generated.permute(0, 2, 1)
            if mel_spec_type == "vocos":
                generated_wave = vocoder.decode(generated_mel_spec)
            else:  # "bigvgan" (validated above)
                generated_wave = vocoder(generated_mel_spec)
            if reference_is_quiet:
                # Undo the loudness boost applied to the reference.
                generated_wave = generated_wave * rms / target_rms

            # wav -> numpy
            generated_wave = generated_wave.squeeze().cpu().numpy()

            generated_waves.append(generated_wave)
            spectrograms.append(generated_mel_spec[0].cpu().numpy())

    # Combine all generated waves with cross-fading
    if cross_fade_duration <= 0:
        # Simply concatenate
        final_wave = np.concatenate(generated_waves)
    else:
        final_wave = _cross_fade_join(generated_waves, int(cross_fade_duration * target_sample_rate))

    # Create a combined spectrogram
    combined_spectrogram = np.concatenate(spectrograms, axis=1)

    return final_wave, target_sample_rate, combined_spectrogram
524
+
525
+
526
+ # remove silence from generated wav
527
+
528
+
529
def remove_silence_for_generated_wav(filename):
    """
    Rewrite the audio file ``filename`` in place without its long silences.

    The file is split wherever silence below -50 dBFS lasts at least 1000 ms
    (keeping 500 ms of padding); the remaining segments are concatenated and
    exported back to the same path as wav.
    """
    source = AudioSegment.from_file(filename)
    voiced_segments = silence.split_on_silence(
        source, min_silence_len=1000, silence_thresh=-50, keep_silence=500, seek_step=10
    )
    # Start from an empty segment so the result is valid even with no segments.
    stitched = sum(voiced_segments, AudioSegment.silent(duration=0))
    stitched.export(filename, format="wav")
539
+
540
+
541
+ # save spectrogram
542
+
543
+
544
def save_spectrogram(spectrogram, path):
    """
    Render ``spectrogram`` as an image with a colorbar and save it to ``path``.

    Args:
        spectrogram: 2-D array-like accepted by ``plt.imshow``; drawn with the
            origin at the bottom and automatic aspect ratio.
        path: Destination passed to ``plt.savefig``.
    """
    fig = plt.figure(figsize=(12, 4))
    try:
        plt.imshow(spectrogram, origin="lower", aspect="auto")
        plt.colorbar()
        plt.savefig(path)
    finally:
        # Always release the figure, even if drawing or saving fails;
        # otherwise open figures pile up across calls.
        plt.close(fig)