Spaces:
Running
on
Zero
Sync from GitHub repo
Browse files
This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the GitHub repo there.
- app.py +1 -1
- src/f5_tts/infer/utils_infer.py +26 -15
app.py
CHANGED
|
@@ -189,7 +189,7 @@ def infer(
|
|
| 189 |
|
| 190 |
# Remove silence
|
| 191 |
if remove_silence:
|
| 192 |
-
with tempfile.NamedTemporaryFile(
|
| 193 |
sf.write(f.name, final_wave, final_sample_rate)
|
| 194 |
remove_silence_for_generated_wav(f.name)
|
| 195 |
final_wave, _ = torchaudio.load(f.name)
|
|
|
|
| 189 |
|
| 190 |
# Remove silence
|
| 191 |
if remove_silence:
|
| 192 |
+
with tempfile.NamedTemporaryFile(suffix=".wav") as f:
|
| 193 |
sf.write(f.name, final_wave, final_sample_rate)
|
| 194 |
remove_silence_for_generated_wav(f.name)
|
| 195 |
final_wave, _ = torchaudio.load(f.name)
|
src/f5_tts/infer/utils_infer.py
CHANGED
|
@@ -33,6 +33,7 @@ from f5_tts.model.utils import convert_char_to_pinyin, get_tokenizer
|
|
| 33 |
|
| 34 |
|
| 35 |
_ref_audio_cache = {}
|
|
|
|
| 36 |
|
| 37 |
device = (
|
| 38 |
"cuda"
|
|
@@ -290,12 +291,24 @@ def remove_silence_edges(audio, silence_threshold=-42):
|
|
| 290 |
# preprocess reference audio and text
|
| 291 |
|
| 292 |
|
| 293 |
-
def preprocess_ref_audio_text(ref_audio_orig, ref_text,
|
| 294 |
show_info("Converting audio...")
|
| 295 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
|
| 296 |
-
aseg = AudioSegment.from_file(ref_audio_orig)
|
| 297 |
|
| 298 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
# 1. try to find long silence for clipping
|
| 300 |
non_silent_segs = silence.split_on_silence(
|
| 301 |
aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000, seek_step=10
|
|
@@ -326,26 +339,24 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_in
|
|
| 326 |
aseg = aseg[:12000]
|
| 327 |
show_info("Audio is over 12s, clipping short. (3)")
|
| 328 |
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
audio_data = audio_file.read()
|
| 336 |
-
audio_hash = hashlib.md5(audio_data).hexdigest()
|
| 337 |
|
| 338 |
if not ref_text.strip():
|
| 339 |
-
global
|
| 340 |
-
if audio_hash in
|
| 341 |
# Use cached asr transcription
|
| 342 |
show_info("Using cached reference text...")
|
| 343 |
-
ref_text =
|
| 344 |
else:
|
| 345 |
show_info("No reference text provided, transcribing reference audio...")
|
| 346 |
ref_text = transcribe(ref_audio)
|
| 347 |
# Cache the transcribed text (not caching custom ref_text, enabling users to do manual tweak)
|
| 348 |
-
|
| 349 |
else:
|
| 350 |
show_info("Using custom reference text...")
|
| 351 |
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
_ref_audio_cache = {}
|
| 36 |
+
_ref_text_cache = {}
|
| 37 |
|
| 38 |
device = (
|
| 39 |
"cuda"
|
|
|
|
| 291 |
# preprocess reference audio and text
|
| 292 |
|
| 293 |
|
| 294 |
+
def preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=print):
|
| 295 |
show_info("Converting audio...")
|
|
|
|
|
|
|
| 296 |
|
| 297 |
+
# Compute a hash of the reference audio file
|
| 298 |
+
with open(ref_audio_orig, "rb") as audio_file:
|
| 299 |
+
audio_data = audio_file.read()
|
| 300 |
+
audio_hash = hashlib.md5(audio_data).hexdigest()
|
| 301 |
+
|
| 302 |
+
global _ref_audio_cache
|
| 303 |
+
|
| 304 |
+
if audio_hash in _ref_audio_cache:
|
| 305 |
+
show_info("Using cached preprocessed reference audio...")
|
| 306 |
+
ref_audio = _ref_audio_cache[audio_hash]
|
| 307 |
+
|
| 308 |
+
else: # first pass, do preprocess
|
| 309 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
|
| 310 |
+
aseg = AudioSegment.from_file(ref_audio_orig)
|
| 311 |
+
|
| 312 |
# 1. try to find long silence for clipping
|
| 313 |
non_silent_segs = silence.split_on_silence(
|
| 314 |
aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000, seek_step=10
|
|
|
|
| 339 |
aseg = aseg[:12000]
|
| 340 |
show_info("Audio is over 12s, clipping short. (3)")
|
| 341 |
|
| 342 |
+
aseg = remove_silence_edges(aseg) + AudioSegment.silent(duration=50)
|
| 343 |
+
aseg.export(f.name, format="wav")
|
| 344 |
+
ref_audio = f.name
|
| 345 |
|
| 346 |
+
# Cache the processed reference audio
|
| 347 |
+
_ref_audio_cache[audio_hash] = ref_audio
|
|
|
|
|
|
|
| 348 |
|
| 349 |
if not ref_text.strip():
|
| 350 |
+
global _ref_text_cache
|
| 351 |
+
if audio_hash in _ref_text_cache:
|
| 352 |
# Use cached asr transcription
|
| 353 |
show_info("Using cached reference text...")
|
| 354 |
+
ref_text = _ref_text_cache[audio_hash]
|
| 355 |
else:
|
| 356 |
show_info("No reference text provided, transcribing reference audio...")
|
| 357 |
ref_text = transcribe(ref_audio)
|
| 358 |
# Cache the transcribed text (not caching custom ref_text, enabling users to do manual tweak)
|
| 359 |
+
_ref_text_cache[audio_hash] = ref_text
|
| 360 |
else:
|
| 361 |
show_info("Using custom reference text...")
|
| 362 |
|