TGPro1 committed on
Commit
6ce2f3c
·
verified ·
1 Parent(s): 86effca

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +60 -28
app.py CHANGED
@@ -10,21 +10,34 @@ from fastapi import FastAPI, Request
10
  from fastapi.middleware.cors import CORSMiddleware
11
  from fastapi.responses import HTMLResponse
12
 
13
- # --- [v158] πŸš€ ULTRA STABLE ENGINE ---
14
- # This version fixes the TorchCodec/torchaudio dependency hell on H200 ZeroGPU
15
- print(f"--- [v158] πŸ“‘ BOOTING ENGINE ---")
16
 
17
- # πŸ› οΈ MONKEYPATCH torchaudio BEFORE XTTS LOADING πŸ› οΈ
 
18
  import torchaudio
19
- import librosa
20
- def stable_load(filepath, **kwargs):
21
- # Redirect torchaudio.load to librosa to bypass torchcodec issues
22
- # Coqui XTTS usually passes sr as a keyword or positional argument
23
- target_sr = kwargs.get("sample_rate") or kwargs.get("sr") or None
24
- y, sr = librosa.load(filepath, sr=target_sr)
25
- return torch.from_numpy(y).unsqueeze(0), sr
26
- torchaudio.load = stable_load
27
- print("--- [v158] 🩹 TORCHAUDIO PATCH APPLIED ---")
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
30
  from TTS.api import TTS
@@ -59,15 +72,16 @@ MODELS = {"stt": None, "tts": None}
59
  def load_tts_cpu():
60
  global MODELS
61
  if MODELS.get("tts") is None:
62
- print("--- [v158] πŸ“₯ LOADING XTTS V2 (CPU) ---")
 
63
  MODELS["tts"] = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cpu")
64
- print("--- [v158] βœ… XTTS READY (CPU) ---")
65
 
66
  @spaces.GPU(duration=60)
67
  def gpu_stt_base(temp_path, lang):
68
  global MODELS
69
  if MODELS.get("stt") is None:
70
- print("--- [v158] πŸ“₯ LOADING WHISPER (Base) ON GPU ---")
71
  model_id = "openai/whisper-base"
72
  model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch.float32).to("cuda")
73
  processor = AutoProcessor.from_pretrained(model_id)
@@ -78,6 +92,7 @@ def gpu_stt_base(temp_path, lang):
78
  feature_extractor=processor.feature_extractor,
79
  device="cuda"
80
  )
 
81
  res = MODELS["stt"](temp_path, generate_kwargs={"language": lang if lang and len(lang) <= 3 else None})
82
  return res["text"].strip()
83
 
@@ -86,30 +101,41 @@ async def handle_process(request: Request):
86
  try:
87
  data = await request.json()
88
  action = data.get("action")
89
- if action == "health": return {"status": "awake", "v": "158"}
90
 
91
- print(f"--- [v158] πŸ› οΈ {action} ---")
92
 
 
93
  stt_text = ""
94
  if action in ["stt", "s2st"]:
95
- audio_bytes = base64.b64decode(data.get("file"))
 
 
 
96
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
97
- f.write(audio_bytes); temp_path = f.name
 
98
  try:
99
  stt_text = gpu_stt_base(temp_path, data.get("lang"))
 
100
  finally:
101
  if os.path.exists(temp_path): os.unlink(temp_path)
 
102
  if action == "stt": return {"text": stt_text}
103
 
 
104
  if action in ["tts", "s2st"]:
105
- load_tts_cpu()
106
  text = (data.get("text") if action == "tts" else stt_text).strip()
 
 
107
  trans_text = text
108
  target = data.get("target_lang") or data.get("lang") or "en"
109
 
110
  if action == "s2st":
 
111
  trans_text = GoogleTranslator(source='auto', target=target).translate(stt_text)
112
  text = trans_text
 
113
 
114
  XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
115
  clean_lang = target.split('-')[0].lower()
@@ -117,23 +143,29 @@ async def handle_process(request: Request):
117
 
118
  if not mapped_lang:
119
  if HAS_CHATTERBOX:
 
120
  audio_bytes = chatterbox_utils.run_chatterbox_inference(text, clean_lang)
121
  audio_b64 = base64.b64encode(audio_bytes).decode()
122
  else: return {"error": f"Lang {clean_lang} unsupported"}
123
  else:
 
124
  speaker_wav_b64 = data.get("speaker_wav")
125
  speaker_path = None
126
  if speaker_wav_b64:
127
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
128
- f.write(base64.b64decode(speaker_wav_b64)); speaker_path = f.name
 
129
  else:
130
  speaker_path = "default_speaker.wav"
131
  if not os.path.exists(speaker_path): speaker_path = None
132
 
133
  try:
134
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_f: out_p = out_f.name
 
 
135
  MODELS["tts"].tts_to_file(text=text, language=mapped_lang, file_path=out_p, speaker_wav=speaker_path)
136
- with open(out_p, "rb") as f: audio_b64 = base64.b64encode(f.read()).decode()
 
137
  finally:
138
  if speaker_wav_b64 and speaker_path and os.path.exists(speaker_path): os.unlink(speaker_path)
139
  if 'out_p' in locals() and os.path.exists(out_p): os.unlink(out_p)
@@ -142,20 +174,20 @@ async def handle_process(request: Request):
142
  return {"text": stt_text, "translated": trans_text, "audio": audio_b64}
143
 
144
  except Exception as e:
145
- print(f"❌ [v158] ERROR: {traceback.format_exc()}")
146
  return {"error": str(e)}
147
  finally:
148
- print(f"--- [v158] ✨ DONE ({time.time()-t1:.1f}s) ---")
149
 
150
  @app.post("/process")
151
  @app.post("/api/v1/process")
152
  async def api_process(request: Request): return await handle_process(request)
153
 
154
  @app.get("/health")
155
- def health(): return {"status": "ok", "v": "158", "gpu": torch.cuda.is_available()}
156
 
157
  @app.get("/", response_class=HTMLResponse)
158
- def root(): return "<h1>πŸš€ AI Engine v158 (ULTRA STABLE)</h1>"
159
 
160
  if __name__ == "__main__":
161
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
10
  from fastapi.middleware.cors import CORSMiddleware
11
  from fastapi.responses import HTMLResponse
12
 
13
+ # --- [v159] 🚀 HERO STABILITY ENGINE (FINAL MISSION) ---
14
+ print(f"--- [v159] πŸ“‘ BOOTING ENGINE ---")
 
15
 
16
+ # 🛠️ CRITICAL: TORCHAUDIO MONKEYPATCH 🛠️
17
+ # This bypasses the 'torchcodec' ModuleNotFoundError on Hugging Face Spaces.
18
  import torchaudio
19
+ import soundfile as sf
20
+ import numpy as np
21
+
def HeroLoad(filepath, *args, **kwargs):
    """Load audio with soundfile as a stand-in for ``torchaudio.load``.

    Args:
        filepath: Path (or file-like object) handed to ``sf.read``.
        *args: Optional positional options in ``torchaudio.load`` order:
            ``frame_offset``, ``num_frames``, ``normalize``,
            ``channels_first``.
        **kwargs: The same options by keyword. Only ``frame_offset``,
            ``num_frames`` and ``channels_first`` are interpreted on the
            soundfile path; everything else is forwarded untouched to the
            fallback loader only.

    Returns:
        A ``(waveform, samplerate)`` tuple. ``waveform`` is a float32 tensor
        shaped ``(channels, samples)``, or ``(samples, channels)`` when
        ``channels_first`` is false.

    If soundfile (or the tensor conversion) fails for any reason, the error
    is printed and the call is delegated to ``torchaudio.load_orig`` with
    the caller's original arguments.
    """
    # Map positional options onto their torchaudio.load names so positional
    # callers no longer hit a TypeError; explicit keywords take precedence.
    option_names = ("frame_offset", "num_frames", "normalize", "channels_first")
    options = dict(zip(option_names, args))
    options.update(kwargs)

    start = options.get("frame_offset") or 0
    frames = options.get("num_frames")
    if frames is None:
        frames = -1  # both torchaudio and soundfile use -1 for "read to end"
    channels_first = options.get("channels_first", True)

    try:
        # dtype="float32" avoids a float64 decode followed by a cast;
        # always_2d=True yields (samples, channels) even for mono input.
        data, samplerate = sf.read(
            filepath,
            frames=frames,
            start=start,
            dtype="float32",
            always_2d=True,
        )
        waveform = torch.from_numpy(data)
        if channels_first:
            # (samples, channels) -> (channels, samples), made contiguous so
            # downstream ops see an ordinary dense tensor.
            waveform = waveform.t().contiguous()
        return waveform, samplerate
    except Exception as e:
        print(f"--- [v159] ❌ PATCHED LOAD FAILED: {e} ---")
        # Deliberate best-effort: hand over to the stock loader saved as
        # torchaudio.load_orig when the patch was installed.
        return torchaudio.load_orig(filepath, *args, **kwargs)
36
+
# Save the stock loader only if no earlier run already saved it, so the
# saved reference is never replaced by the patched function; then route
# torchaudio.load through HeroLoad.
if not hasattr(torchaudio, "load_orig"):
    torchaudio.load_orig = torchaudio.load
torchaudio.load = HeroLoad
print("--- [v159] 🩹 TORCHAUDIO HERO PATCH APPLIED (soundfile) ---")
41
 
42
  from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
43
  from TTS.api import TTS
 
def load_tts_cpu():
    """Populate ``MODELS["tts"]`` with XTTS v2 on CPU if it is not set yet.

    Does nothing when a model is already cached; returns ``None`` either way.
    """
    global MODELS
    if MODELS.get("tts") is not None:
        return

    print("--- [v159] πŸ“₯ LOADING XTTS V2 (CPU MODE) ---")
    # Author's note: CPU loading is considered stable on ZeroGPU H200.
    xtts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
    MODELS["tts"] = xtts.to("cpu")
    print("--- [v159] βœ… XTTS READY (CPU) ---")
79
 
80
  @spaces.GPU(duration=60)
81
  def gpu_stt_base(temp_path, lang):
82
  global MODELS
83
  if MODELS.get("stt") is None:
84
+ print("--- [v159] πŸ“₯ LOADING WHISPER (Base) ON GPU ---")
85
  model_id = "openai/whisper-base"
86
  model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch.float32).to("cuda")
87
  processor = AutoProcessor.from_pretrained(model_id)
 
92
  feature_extractor=processor.feature_extractor,
93
  device="cuda"
94
  )
95
+ print(f"--- [v159] πŸŽ™οΈ RUNNING WHISPER INFERENCE ---")
96
  res = MODELS["stt"](temp_path, generate_kwargs={"language": lang if lang and len(lang) <= 3 else None})
97
  return res["text"].strip()
98
 
 
101
  try:
102
  data = await request.json()
103
  action = data.get("action")
104
+ if action == "health": return {"status": "awake", "v": "159"}
105
 
106
+ print(f"--- [v159] πŸ› οΈ {action.upper()} REQUESTED ---")
107
 
108
+ # 🟒 STT PATH
109
  stt_text = ""
110
  if action in ["stt", "s2st"]:
111
+ audio_b64 = data.get("file")
112
+ if not audio_b64: return {"error": "Missing audio file"}
113
+
114
+ audio_bytes = base64.b64decode(audio_b64)
115
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
116
+ f.write(audio_bytes)
117
+ temp_path = f.name
118
  try:
119
  stt_text = gpu_stt_base(temp_path, data.get("lang"))
120
+ print(f"--- [v159] πŸŽ™οΈ TRANSCRIPT: {stt_text} ---")
121
  finally:
122
  if os.path.exists(temp_path): os.unlink(temp_path)
123
+
124
  if action == "stt": return {"text": stt_text}
125
 
126
+ # πŸ”΅ TTS PATH
127
  if action in ["tts", "s2st"]:
 
128
  text = (data.get("text") if action == "tts" else stt_text).strip()
129
+ if not text: return {"error": "Input text is empty"}
130
+
131
  trans_text = text
132
  target = data.get("target_lang") or data.get("lang") or "en"
133
 
134
  if action == "s2st":
135
+ print(f"--- [v159] 🌏 TRANSLATING TO {target} ---")
136
  trans_text = GoogleTranslator(source='auto', target=target).translate(stt_text)
137
  text = trans_text
138
+ print(f"--- [v159] πŸ“ TRANSLATED: {text} ---")
139
 
140
  XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
141
  clean_lang = target.split('-')[0].lower()
 
143
 
144
  if not mapped_lang:
145
  if HAS_CHATTERBOX:
146
+ print(f"--- [v159] πŸ“¦ FALLBACK: CHATTERBOX FOR {clean_lang} ---")
147
  audio_bytes = chatterbox_utils.run_chatterbox_inference(text, clean_lang)
148
  audio_b64 = base64.b64encode(audio_bytes).decode()
149
  else: return {"error": f"Lang {clean_lang} unsupported"}
150
  else:
151
+ load_tts_cpu()
152
  speaker_wav_b64 = data.get("speaker_wav")
153
  speaker_path = None
154
  if speaker_wav_b64:
155
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
156
+ f.write(base64.b64decode(speaker_wav_b64))
157
+ speaker_path = f.name
158
  else:
159
  speaker_path = "default_speaker.wav"
160
  if not os.path.exists(speaker_path): speaker_path = None
161
 
162
  try:
163
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_f:
164
+ out_p = out_f.name
165
+ print(f"--- [v159] πŸ”Š XTTS INFERENCE (CPU) ---")
166
  MODELS["tts"].tts_to_file(text=text, language=mapped_lang, file_path=out_p, speaker_wav=speaker_path)
167
+ with open(out_p, "rb") as f:
168
+ audio_b64 = base64.b64encode(f.read()).decode()
169
  finally:
170
  if speaker_wav_b64 and speaker_path and os.path.exists(speaker_path): os.unlink(speaker_path)
171
  if 'out_p' in locals() and os.path.exists(out_p): os.unlink(out_p)
 
174
  return {"text": stt_text, "translated": trans_text, "audio": audio_b64}
175
 
176
  except Exception as e:
177
+ print(f"❌ [v159] HERO ERROR: {traceback.format_exc()}")
178
  return {"error": str(e)}
179
  finally:
180
+ print(f"--- [v159] ✨ DONE ({time.time()-t1:.1f}s) ---")
181
 
@app.post("/process")
@app.post("/api/v1/process")
async def api_process(request: Request):
    """Serve both POST routes by delegating to ``handle_process``."""
    outcome = await handle_process(request)
    return outcome
185
 
@app.get("/health")
def health():
    """Return a status payload with version, mode and CUDA availability."""
    cuda_available = torch.cuda.is_available()
    return {
        "status": "ok",
        "v": "159",
        "mode": "HERO_STABLE",
        "gpu": cuda_available,
    }
188
 
@app.get("/", response_class=HTMLResponse)
def root():
    """Return the landing-page banner as an HTML string."""
    banner = "<h1>πŸš€ AI Engine v159 (HERO STABLE)</h1>"
    return banner
191
 
if __name__ == "__main__":
    # Direct execution: serve the FastAPI app on all interfaces, port 7860.
    uvicorn.run(app, port=7860, host="0.0.0.0")