TGPro1 committed on
Commit
7bc2a36
·
verified ·
1 Parent(s): 8e2d3db

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +86 -56
app.py CHANGED
@@ -6,38 +6,33 @@ import torch
6
  import tempfile
7
  import traceback
8
  import uvicorn
 
9
  from fastapi import FastAPI, Request
10
  from fastapi.middleware.cors import CORSMiddleware
11
  from fastapi.responses import HTMLResponse
12
 
13
- # --- [v160] 🚀 HERO STABILITY ENGINE (FINAL MISSION) ---
14
- print(f"--- [v160] πŸ“‘ BOOTING ENGINE ---")
15
 
16
  # 🛠️ CRITICAL: TORCHAUDIO MONKEYPATCH 🛠️
17
- # This bypasses the 'torchcodec' ModuleNotFoundError on Hugging Face Spaces.
18
  import torchaudio
19
  import soundfile as sf
20
- import numpy as np
21
-
22
  def HeroLoad(filepath, **kwargs):
23
- """Robust alternative to torchaudio.load using soundfile."""
24
  try:
25
  data, samplerate = sf.read(filepath)
26
- # Convert to float32 and ensure shape is (channels, samples)
27
  if len(data.shape) == 1:
28
  data = data.reshape(1, -1)
29
  else:
30
- data = data.T # (samples, channels) -> (channels, samples)
31
  return torch.from_numpy(data).float(), samplerate
32
  except Exception as e:
33
- print(f"--- [v159] ❌ PATCHED LOAD FAILED: {e} ---")
34
- # Fallback to original if soundfile fails (unlikely)
35
  return torchaudio.load_orig(filepath, **kwargs)
36
 
37
  if not hasattr(torchaudio, 'load_orig'):
38
  torchaudio.load_orig = torchaudio.load
39
  torchaudio.load = HeroLoad
40
- print("--- [v160] 🩹 TORCHAUDIO HERO PATCH APPLIED (soundfile) ---")
41
 
42
  from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
43
  from TTS.api import TTS
@@ -67,75 +62,111 @@ os.environ["PYTHONWARNINGS"] = "ignore"
67
  app = FastAPI()
68
  app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
69
 
70
- MODELS = {"stt": None, "tts": None}
71
 
72
- def load_tts_cpu():
73
- global MODELS
74
- if MODELS.get("tts") is None:
75
- print("--- [v160] πŸ“₯ LOADING XTTS V2 (CPU MODE) ---")
76
- # CPU loading is 100% stable on ZeroGPU H200
77
- MODELS["tts"] = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cpu")
78
- print("--- [v160] βœ… XTTS READY (CPU) ---")
 
79
 
80
- @spaces.GPU(duration=60)
81
- def gpu_stt_base(temp_path, lang):
82
  global MODELS
 
 
83
  if MODELS.get("stt") is None:
84
- print("--- [v160] πŸ“₯ LOADING WHISPER (Base) ON GPU ---")
85
- model_id = "openai/whisper-base"
86
- model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch.float32).to("cuda")
87
  processor = AutoProcessor.from_pretrained(model_id)
88
  MODELS["stt"] = pipeline(
89
  "automatic-speech-recognition",
90
  model=model,
91
  tokenizer=processor.tokenizer,
92
  feature_extractor=processor.feature_extractor,
93
- device="cuda"
94
  )
95
- print(f"--- [v160] πŸŽ™οΈ RUNNING WHISPER INFERENCE ---")
96
- res = MODELS["stt"](temp_path, generate_kwargs={"language": lang if lang and len(lang) <= 3 else None})
 
 
 
 
 
 
 
 
 
97
  return res["text"].strip()
98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  async def handle_process(request: Request):
100
  t1 = time.time()
101
  try:
102
  data = await request.json()
103
  action = data.get("action")
104
- if action == "health": return {"status": "awake", "v": "160"}
105
 
106
- print(f"--- [v160] πŸ› οΈ {action.upper()} REQUESTED ---")
107
 
108
- # 🟒 STT PATH
109
  stt_text = ""
 
110
  if action in ["stt", "s2st"]:
111
  audio_b64 = data.get("file")
112
  if not audio_b64: return {"error": "Missing audio file"}
113
-
114
  audio_bytes = base64.b64decode(audio_b64)
115
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
116
- f.write(audio_bytes)
117
- temp_path = f.name
118
  try:
119
- stt_text = gpu_stt_base(temp_path, data.get("lang"))
120
- print(f"--- [v160] πŸŽ™οΈ TRANSCRIPT: {stt_text} ---")
121
  finally:
122
  if os.path.exists(temp_path): os.unlink(temp_path)
123
-
124
  if action == "stt": return {"text": stt_text}
125
 
126
- # πŸ”΅ TTS PATH
127
  if action in ["tts", "s2st"]:
128
  text = (data.get("text") if action == "tts" else stt_text).strip()
129
  if not text: return {"error": "Input text is empty"}
130
 
131
- trans_text = text
132
  target = data.get("target_lang") or data.get("lang") or "en"
133
-
134
  if action == "s2st":
135
- print(f"--- [v160] 🌏 TRANSLATING TO {target} ---")
136
  trans_text = GoogleTranslator(source='auto', target=target).translate(stt_text)
137
  text = trans_text
138
- print(f"--- [v160] πŸ“ TRANSLATED: {text} ---")
139
 
140
  XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
141
  clean_lang = target.split('-')[0].lower()
@@ -143,51 +174,50 @@ async def handle_process(request: Request):
143
 
144
  if not mapped_lang:
145
  if HAS_CHATTERBOX:
146
- print(f"--- [v160] πŸ“¦ FALLBACK: CHATTERBOX FOR {clean_lang} ---")
147
  audio_bytes = chatterbox_utils.run_chatterbox_inference(text, clean_lang)
148
  audio_b64 = base64.b64encode(audio_bytes).decode()
149
  else: return {"error": f"Lang {clean_lang} unsupported"}
150
  else:
151
- load_tts_cpu()
152
  speaker_wav_b64 = data.get("speaker_wav")
153
  speaker_path = None
154
  if speaker_wav_b64:
155
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
156
- f.write(base64.b64decode(speaker_wav_b64))
157
- speaker_path = f.name
158
  else:
159
  speaker_path = "default_speaker.wav"
160
  if not os.path.exists(speaker_path): speaker_path = None
161
 
162
  try:
163
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_f:
164
- out_p = out_f.name
165
- print(f"--- [v160] πŸ”Š XTTS INFERENCE (CPU) ---")
166
- MODELS["tts"].tts_to_file(text=text, language=mapped_lang, file_path=out_p, speaker_wav=speaker_path)
167
- with open(out_p, "rb") as f:
168
- audio_b64 = base64.b64encode(f.read()).decode()
169
  finally:
170
  if speaker_wav_b64 and speaker_path and os.path.exists(speaker_path): os.unlink(speaker_path)
171
- if 'out_p' in locals() and os.path.exists(out_p): os.unlink(out_p)
172
 
173
  if action == "tts": return {"audio": audio_b64}
174
  return {"text": stt_text, "translated": trans_text, "audio": audio_b64}
175
 
176
  except Exception as e:
177
- print(f"❌ [v160] HERO ERROR: {traceback.format_exc()}")
178
  return {"error": str(e)}
179
  finally:
180
- print(f"--- [v160] ✨ DONE ({time.time()-t1:.1f}s) ---")
181
 
182
  @app.post("/process")
183
  @app.post("/api/v1/process")
184
  async def api_process(request: Request): return await handle_process(request)
185
 
186
  @app.get("/health")
187
- def health(): return {"status": "ok", "v": "160", "mode": "HERO_STABLE", "gpu": torch.cuda.is_available()}
 
 
 
 
 
 
 
188
 
189
  @app.get("/", response_class=HTMLResponse)
190
- def root(): return "<h1>πŸš€ AI Engine v160 (HERO STABLE)</h1>"
191
 
192
  if __name__ == "__main__":
193
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
6
  import tempfile
7
  import traceback
8
  import uvicorn
9
+ import gc
10
  from fastapi import FastAPI, Request
11
  from fastapi.middleware.cors import CORSMiddleware
12
  from fastapi.responses import HTMLResponse
13
 
14
+ # --- [v161] 🚀 ULTIMATE GPU ENGINE (STABILITY MISSION) ---
15
+ print(f"--- [v161] πŸ“‘ BOOTING ULTIMATE ENGINE ---")
16
 
17
  # 🛠️ CRITICAL: TORCHAUDIO MONKEYPATCH 🛠️
 
18
  import torchaudio
19
  import soundfile as sf
 
 
20
  def HeroLoad(filepath, **kwargs):
 
21
  try:
22
  data, samplerate = sf.read(filepath)
 
23
  if len(data.shape) == 1:
24
  data = data.reshape(1, -1)
25
  else:
26
+ data = data.T
27
  return torch.from_numpy(data).float(), samplerate
28
  except Exception as e:
29
+ print(f"--- [v161] ❌ PATCHED LOAD FAILED: {e} ---")
 
30
  return torchaudio.load_orig(filepath, **kwargs)
31
 
32
  if not hasattr(torchaudio, 'load_orig'):
33
  torchaudio.load_orig = torchaudio.load
34
  torchaudio.load = HeroLoad
35
+ print("--- [v161] 🩹 TORCHAUDIO HERO PATCH APPLIED ---")
36
 
37
  from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
38
  from TTS.api import TTS
 
62
  app = FastAPI()
63
  app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
64
 
65
+ MODELS = {"stt": None, "tts": None, "current_gpu": "auto"}
66
 
67
+ def select_best_gpu():
68
+ """Logic to select the best available GPU (Switch)."""
69
+ if torch.cuda.is_available():
70
+ count = torch.cuda.device_count()
71
+ print(f"--- [v161] πŸ–₯️ DETECTED {count} GPUs ---")
72
+ # In ZeroGPU, we usually have 1 MIG instance assigned.
73
+ return "cuda:0"
74
+ return "cpu"
75
 
76
+ @spaces.GPU(duration=120)
77
+ def gpu_stt_large(temp_path, lang):
78
  global MODELS
79
+ device = select_best_gpu() if MODELS["current_gpu"] == "auto" else MODELS["current_gpu"]
80
+
81
  if MODELS.get("stt") is None:
82
+ print(f"--- [v161] πŸ“₯ LOADING WHISPER (Large-v3-Turbo) ON {device} ---")
83
+ model_id = "openai/whisper-large-v3-turbo"
84
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch.float16 if "cuda" in device else torch.float32).to(device)
85
  processor = AutoProcessor.from_pretrained(model_id)
86
  MODELS["stt"] = pipeline(
87
  "automatic-speech-recognition",
88
  model=model,
89
  tokenizer=processor.tokenizer,
90
  feature_extractor=processor.feature_extractor,
91
+ device=device
92
  )
93
+
94
+ print(f"--- [v161] πŸŽ™οΈ WHISPER INFERENCE (Temp=0, TS=True) ---")
95
+ # Added return_timestamps=True to handle > 30s audio
96
+ res = MODELS["stt"](
97
+ temp_path,
98
+ generate_kwargs={
99
+ "language": lang if lang and len(lang) <= 3 else None,
100
+ "temperature": 0.0,
101
+ "return_timestamps": True
102
+ }
103
+ )
104
  return res["text"].strip()
105
 
106
+ @spaces.GPU(duration=120)
107
+ def gpu_tts_inference(text, mapped_lang, speaker_path):
108
+ global MODELS
109
+ device = "cuda" # Always use CUDA inside the decorator
110
+
111
+ if MODELS.get("tts") is None:
112
+ print(f"--- [v161] πŸ“₯ LOADING XTTS V2 ON {device} ---")
113
+ # We load directly to GPU inside the decorator for max stability
114
+ MODELS["tts"] = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
115
+
116
+ # Ensure model is on GPU (sometimes ZeroGPU logic moves it back to CPU)
117
+ if hasattr(MODELS["tts"], "to"):
118
+ MODELS["tts"].to(device)
119
+
120
+ print(f"--- [v161] πŸ”Š XTTS INFERENCE ON GPU ---")
121
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_f:
122
+ out_p = out_f.name
123
+
124
+ MODELS["tts"].tts_to_file(text=text, language=mapped_lang, file_path=out_p, speaker_wav=speaker_path)
125
+
126
+ with open(out_p, "rb") as f:
127
+ audio_b64 = base64.b64encode(f.read()).decode()
128
+
129
+ if os.path.exists(out_p):
130
+ os.unlink(out_p)
131
+
132
+ return audio_b64
133
+
134
  async def handle_process(request: Request):
135
  t1 = time.time()
136
  try:
137
  data = await request.json()
138
  action = data.get("action")
139
+ if action == "health": return {"status": "awake", "v": "161"}
140
 
141
+ print(f"--- [v161] πŸ› οΈ {action.upper()} REQUESTED ---")
142
 
 
143
  stt_text = ""
144
+ # 🟒 STT
145
  if action in ["stt", "s2st"]:
146
  audio_b64 = data.get("file")
147
  if not audio_b64: return {"error": "Missing audio file"}
 
148
  audio_bytes = base64.b64decode(audio_b64)
149
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
150
+ f.write(audio_bytes); temp_path = f.name
 
151
  try:
152
+ stt_text = gpu_stt_large(temp_path, data.get("lang"))
153
+ print(f"--- [v161] πŸŽ™οΈ TRANSCRIPT: {stt_text[:100]}... ---")
154
  finally:
155
  if os.path.exists(temp_path): os.unlink(temp_path)
 
156
  if action == "stt": return {"text": stt_text}
157
 
158
+ # πŸ”΅ TTS
159
  if action in ["tts", "s2st"]:
160
  text = (data.get("text") if action == "tts" else stt_text).strip()
161
  if not text: return {"error": "Input text is empty"}
162
 
 
163
  target = data.get("target_lang") or data.get("lang") or "en"
164
+ trans_text = text
165
  if action == "s2st":
166
+ print(f"--- [v161] 🌏 TRANSLATING TO {target} ---")
167
  trans_text = GoogleTranslator(source='auto', target=target).translate(stt_text)
168
  text = trans_text
169
+ print(f"--- [v161] πŸ“ TRANSLATED: {text[:100]}... ---")
170
 
171
  XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
172
  clean_lang = target.split('-')[0].lower()
 
174
 
175
  if not mapped_lang:
176
  if HAS_CHATTERBOX:
 
177
  audio_bytes = chatterbox_utils.run_chatterbox_inference(text, clean_lang)
178
  audio_b64 = base64.b64encode(audio_bytes).decode()
179
  else: return {"error": f"Lang {clean_lang} unsupported"}
180
  else:
 
181
  speaker_wav_b64 = data.get("speaker_wav")
182
  speaker_path = None
183
  if speaker_wav_b64:
184
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
185
+ f.write(base64.b64decode(speaker_wav_b64)); speaker_path = f.name
 
186
  else:
187
  speaker_path = "default_speaker.wav"
188
  if not os.path.exists(speaker_path): speaker_path = None
189
 
190
  try:
191
+ # RUN ON GPU
192
+ audio_b64 = gpu_tts_inference(text, mapped_lang, speaker_path)
 
 
 
 
193
  finally:
194
  if speaker_wav_b64 and speaker_path and os.path.exists(speaker_path): os.unlink(speaker_path)
 
195
 
196
  if action == "tts": return {"audio": audio_b64}
197
  return {"text": stt_text, "translated": trans_text, "audio": audio_b64}
198
 
199
  except Exception as e:
200
+ print(f"❌ [v161] ULTIMATE ERROR: {traceback.format_exc()}")
201
  return {"error": str(e)}
202
  finally:
203
+ print(f"--- [v161] ✨ DONE ({time.time()-t1:.1f}s) ---")
204
 
205
  @app.post("/process")
206
  @app.post("/api/v1/process")
207
  async def api_process(request: Request): return await handle_process(request)
208
 
209
  @app.get("/health")
210
+ def health():
211
+ return {
212
+ "status": "ok",
213
+ "v": "161",
214
+ "mode": "ULTIMATE_GPU",
215
+ "gpu_available": torch.cuda.is_available(),
216
+ "gpu_count": torch.cuda.device_count() if torch.cuda.is_available() else 0
217
+ }
218
 
219
  @app.get("/", response_class=HTMLResponse)
220
+ def root(): return "<h1>πŸš€ AI Engine v161 (ULTIMATE GPU)</h1>"
221
 
222
  if __name__ == "__main__":
223
  uvicorn.run(app, host="0.0.0.0", port=7860)