eloicito333 committed on
Commit
81e51dc
·
verified ·
1 Parent(s): 7104330

Custom endpoint handler added + requirements.txt added

Browse files
Files changed (2) hide show
  1. handler.py +91 -0
  2. requirements.txt +9 -0
handler.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Any
2
+ import tempfile
3
+ import torchaudio
4
+ import soundfile as sf
5
+ import re
6
+ from num2words import num2words
7
+ from f5_tts.model import DiT
8
+ from f5_tts.infer.utils_infer import (
9
+ load_vocoder,
10
+ load_model,
11
+ preprocess_ref_audio_text,
12
+ infer_process,
13
+ remove_silence_for_generated_wav,
14
+ )
15
+ import base64
16
+ import io
17
+ import numpy as np
18
+
19
+
20
class EndpointHandler:
    """Custom inference-endpoint handler for the Spanish F5-TTS model.

    The vocoder and the DiT-based F5 model are loaded once at construction
    time; each request is then served through ``__call__``.
    """

    def __init__(self, path=""):
        # Load heavy assets exactly once; inference calls reuse them.
        self.vocoder = load_vocoder()
        model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
        model_path = "hf://jpgallegoar/F5-Spanish/model_1200000.safetensors"
        self.ema_model = load_model(DiT, model_cfg, model_path)

    def traducir_numero_a_texto(self, texto):
        """Spell out every standalone integer in *texto* as Spanish words.

        Digits glued to letters (e.g. ``"abc123"``) are first separated with
        a space so each number becomes a standalone ``\\b\\d+\\b`` token that
        ``num2words`` can convert.
        """
        texto_separado = re.sub(r'([A-Za-z])(\d)', r'\1 \2', texto)
        texto_separado = re.sub(r'(\d)([A-Za-z])', r'\1 \2', texto_separado)

        def reemplazar_numero(match):
            numero = match.group()
            return num2words(int(numero), lang='es')

        return re.sub(r'\b\d+\b', reemplazar_numero, texto_separado)

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Handle one text-to-speech request.

        Expected fields in *data*:
            ref_audio (str, required): path/URL of the reference audio clip.
            ref_text (str): transcript of the reference audio ("" lets the
                pipeline transcribe it).
            gen_text (str, required): text to synthesize.
            remove_silence (bool): strip long silences from the output
                (default False).
            cross_fade_duration (float): cross-fade between generated chunks,
                in seconds (default 0.15).
            speed (float): speech-speed multiplier (default 1.0).

        Returns:
            ``{"success": True, "audio_base64": <base64 WAV>}`` on success;
            ``{"error": ...}`` for missing required fields; and
            ``{"success": False, "error": ...}`` for any runtime failure.
        """
        try:
            ref_audio_path = data.get("ref_audio")
            if not ref_audio_path:
                return {"error": "Missing required field: 'ref_audio'"}

            ref_text = data.get("ref_text", "")
            gen_text = data.get("gen_text", "")
            if not gen_text:
                return {"error": "Missing required field: 'gen_text'"}

            remove_silence = data.get("remove_silence", False)
            cross_fade_duration = data.get("cross_fade_duration", 0.15)
            speed = data.get("speed", 1.0)

            ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_path, ref_text, show_info=print)

            # Pad the generation text the way the model expects and spell
            # out digits in Spanish (the model was trained on lowercase,
            # number-free text — presumably; verify against training setup).
            if not gen_text.startswith(" "):
                gen_text = " " + gen_text
            if not gen_text.endswith(". "):
                gen_text += ". "
            gen_text = self.traducir_numero_a_texto(gen_text.lower())

            final_wave, final_sample_rate, _ = infer_process(
                ref_audio,
                ref_text,
                gen_text,
                self.ema_model,
                self.vocoder,
                cross_fade_duration=cross_fade_duration,
                speed=speed,
                show_info=print,
                progress=None,
            )

            if remove_silence:
                import os  # local import: only needed on this branch

                # delete=False keeps the file on disk after the `with`
                # (required on Windows, where an open NamedTemporaryFile
                # cannot be reopened by name); we remove it ourselves.
                with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
                    tmp_path = f.name
                try:
                    sf.write(tmp_path, final_wave, final_sample_rate)
                    remove_silence_for_generated_wav(tmp_path)
                    final_wave, _ = torchaudio.load(tmp_path)
                    final_wave = final_wave.squeeze().cpu().numpy()
                finally:
                    os.remove(tmp_path)  # fix: the temp file previously leaked

            # Serialize the waveform to an in-memory WAV and base64-encode it.
            with io.BytesIO() as buffer:
                sf.write(buffer, final_wave, final_sample_rate, format="WAV")
                buffer.seek(0)
                encoded_audio = base64.b64encode(buffer.read()).decode("utf-8")

            return {
                "success": True,  # fix: key was misspelled "sucess"
                "audio_base64": encoded_audio
            }

        except Exception as e:
            # Endpoint boundary: surface any failure as a structured error
            # payload instead of letting the serving stack 500.
            return {
                "success": False,
                "error": f"{type(e).__name__}: {str(e)}"
            }
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ torch
2
+ torchaudio
3
+ transformers
4
+ numpy
5
+ soundfile
6
+ num2words
7
+ cached_path
8
+ openai-whisper
9
+ git+https://github.com/jpgallegoar/Spanish-F5.git