Anjan9320 committed on
Commit
879ec61
·
verified ·
1 Parent(s): 106b563

Update f5_tts/infer/utils_infer.py

Browse files
Files changed (1) hide show
  1. f5_tts/infer/utils_infer.py +0 -50
f5_tts/infer/utils_infer.py CHANGED
@@ -404,56 +404,6 @@ def infer_process(
404
  device=device,
405
  )
406
 
407
- def infer_from_embedding(
408
- speaker_embedding,
409
- gen_text,
410
- model_obj,
411
- vocoder,
412
- mel_spec_type="vocos",
413
- speed=1.0,
414
- nfe_step=32,
415
- cfg_strength=2.0,
416
- sway_sampling_coef=-1,
417
- fix_duration=None,
418
- device="cuda"
419
- ):
420
- generated_waves = []
421
- spectrograms = []
422
-
423
- # Estimate duration if fix_duration is not provided
424
- # These are just heuristics since no ref_text is used
425
- est_duration = fix_duration
426
- if est_duration is None:
427
- avg_duration_sec = max(len(gen_text) * 0.06, 1.0) # crude approximation
428
- est_duration = int(avg_duration_sec * target_sample_rate / hop_length)
429
-
430
- # Tokenize gen_text
431
- final_text_list = convert_char_to_pinyin([gen_text])
432
-
433
- with torch.inference_mode():
434
- generated, _ = model_obj.sample(
435
- cond=speaker_embedding.to(device),
436
- text=final_text_list,
437
- duration=est_duration,
438
- steps=nfe_step,
439
- cfg_strength=cfg_strength,
440
- sway_sampling_coef=sway_sampling_coef,
441
- )
442
-
443
- generated = generated.to(torch.float32)
444
- generated_mel_spec = generated.permute(0, 2, 1)
445
-
446
- if mel_spec_type == "vocos":
447
- generated_wave = vocoder.decode(generated_mel_spec)
448
- elif mel_spec_type == "bigvgan":
449
- generated_wave = vocoder(generated_mel_spec)
450
-
451
- final_wave = generated_wave.squeeze().cpu().numpy()
452
- final_spec = generated_mel_spec[0].cpu().numpy()
453
-
454
- return final_wave, target_sample_rate, final_spec
455
-
456
-
457
  # infer batches
458
 
459
 
 
404
  device=device,
405
  )
406
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
407
  # infer batches
408
 
409