baenacoco committed on
Commit
781f017
·
verified ·
1 Parent(s): cf2df63

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +18 -3
app.py CHANGED
@@ -39,9 +39,10 @@ os.environ["TRANSFORMERS_CACHE"] = str(HF_CACHE_DIR)
39
 
40
  F5_SPANISH_MODEL_ID = "jpgallegoar/F5-Spanish"
41
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
42
- APP_VERSION = "1.0.1"
43
 
44
  _f5_model = None
 
45
 
46
 
47
  def _clear_cache():
@@ -89,6 +90,18 @@ def _get_reference_audio():
89
  raise FileNotFoundError("No hay reference.wav. Descarga el modelo primero.")
90
 
91
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  # ── Gradio handlers ──
93
 
94
  def download_model(project_name, progress=gr.Progress()):
@@ -135,6 +148,7 @@ def generate_speech(project_name, text, speed, progress=gr.Progress()):
135
  _load_tts()
136
 
137
  ref_audio = _get_reference_audio()
 
138
  output_path = str(TEMP_DIR / "tts_output.wav")
139
 
140
  progress(0.3, desc="Generando voz...")
@@ -142,7 +156,7 @@ def generate_speech(project_name, text, speed, progress=gr.Progress()):
142
 
143
  audio, sr, _spec = _f5_model.infer(
144
  ref_file=ref_audio,
145
- ref_text="",
146
  gen_text=text,
147
  speed=speed,
148
  )
@@ -174,9 +188,10 @@ def generate_with_custom_ref(project_name, text, ref_audio_path, speed, progress
174
  progress(0.3, desc="Generando voz...")
175
  logger.info(f"Generating with custom ref: '{text[:80]}...'")
176
 
 
177
  audio, sr, _spec = _f5_model.infer(
178
  ref_file=ref_audio_path,
179
- ref_text="",
180
  gen_text=text,
181
  speed=speed,
182
  )
 
39
 
40
  F5_SPANISH_MODEL_ID = "jpgallegoar/F5-Spanish"
41
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
42
+ APP_VERSION = "1.1.0"
43
 
44
  _f5_model = None
45
+ _ref_text_cache = {} # {audio_path: transcribed_text}
46
 
47
 
48
  def _clear_cache():
 
90
  raise FileNotFoundError("No hay reference.wav. Descarga el modelo primero.")
91
 
92
 
93
+ def _get_ref_text(audio_path):
94
+ """Pre-transcribe reference audio in Spanish to avoid Whisper auto-detecting wrong language."""
95
+ if audio_path in _ref_text_cache:
96
+ return _ref_text_cache[audio_path]
97
+ _load_tts()
98
+ logger.info(f"Transcribing reference audio as Spanish: {audio_path}")
99
+ ref_text = _f5_model.transcribe(audio_path, language="spanish")
100
+ logger.info(f"Reference transcription: {ref_text}")
101
+ _ref_text_cache[audio_path] = ref_text
102
+ return ref_text
103
+
104
+
105
  # ── Gradio handlers ──
106
 
107
  def download_model(project_name, progress=gr.Progress()):
 
148
  _load_tts()
149
 
150
  ref_audio = _get_reference_audio()
151
+ ref_text = _get_ref_text(ref_audio)
152
  output_path = str(TEMP_DIR / "tts_output.wav")
153
 
154
  progress(0.3, desc="Generando voz...")
 
156
 
157
  audio, sr, _spec = _f5_model.infer(
158
  ref_file=ref_audio,
159
+ ref_text=ref_text,
160
  gen_text=text,
161
  speed=speed,
162
  )
 
188
  progress(0.3, desc="Generando voz...")
189
  logger.info(f"Generating with custom ref: '{text[:80]}...'")
190
 
191
+ ref_text = _get_ref_text(ref_audio_path)
192
  audio, sr, _spec = _f5_model.infer(
193
  ref_file=ref_audio_path,
194
+ ref_text=ref_text,
195
  gen_text=text,
196
  speed=speed,
197
  )