TGPro1 committed on
Commit
4bfa772
·
verified ·
1 Parent(s): ef8b4e2

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +61 -56
app.py CHANGED
@@ -11,8 +11,9 @@ from fastapi import FastAPI, Request
11
  from fastapi.middleware.cors import CORSMiddleware
12
  from fastapi.responses import HTMLResponse
13
 
14
- # --- [v161] πŸš€ ULTIMATE GPU ENGINE (STABILITY MISSION) ---
15
- print(f"--- [v161] πŸ“‘ BOOTING ULTIMATE ENGINE ---")
 
16
 
17
  # πŸ› οΈ CRITICAL: TORCHAUDIO MONKEYPATCH πŸ› οΈ
18
  import torchaudio
@@ -26,13 +27,13 @@ def HeroLoad(filepath, **kwargs):
26
  data = data.T
27
  return torch.from_numpy(data).float(), samplerate
28
  except Exception as e:
29
- print(f"--- [v161] ❌ PATCHED LOAD FAILED: {e} ---")
30
  return torchaudio.load_orig(filepath, **kwargs)
31
 
32
  if not hasattr(torchaudio, 'load_orig'):
33
  torchaudio.load_orig = torchaudio.load
34
  torchaudio.load = HeroLoad
35
- print("--- [v161] 🩹 TORCHAUDIO HERO PATCH APPLIED ---")
36
 
37
  from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
38
  from TTS.api import TTS
@@ -62,62 +63,61 @@ os.environ["PYTHONWARNINGS"] = "ignore"
62
  app = FastAPI()
63
  app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
64
 
65
- MODELS = {"stt": None, "tts": None, "current_gpu": "auto"}
66
 
67
- def select_best_gpu():
68
- """Logic to select the best available GPU (Switch)."""
69
- if torch.cuda.is_available():
70
- count = torch.cuda.device_count()
71
- print(f"--- [v161] πŸ–₯️ DETECTED {count} GPUs ---")
72
- # In ZeroGPU, we usually have 1 MIG instance assigned.
73
- return "cuda:0"
74
- return "cpu"
75
 
76
  @spaces.GPU(duration=120)
77
- def gpu_stt_large(temp_path, lang):
78
  global MODELS
79
- device = select_best_gpu() if MODELS["current_gpu"] == "auto" else MODELS["current_gpu"]
80
 
81
  if MODELS.get("stt") is None:
82
- print(f"--- [v161] πŸ“₯ LOADING WHISPER (Large-v3-Turbo) ON {device} ---")
83
  model_id = "openai/whisper-large-v3-turbo"
84
- model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch.float16 if "cuda" in device else torch.float32).to(device)
85
  processor = AutoProcessor.from_pretrained(model_id)
86
  MODELS["stt"] = pipeline(
87
  "automatic-speech-recognition",
88
  model=model,
89
  tokenizer=processor.tokenizer,
90
  feature_extractor=processor.feature_extractor,
 
 
91
  device=device
92
  )
93
 
94
- print(f"--- [v161] πŸŽ™οΈ WHISPER INFERENCE (Temp=0, TS=True) ---")
95
- # Added return_timestamps=True to handle > 30s audio
96
  res = MODELS["stt"](
97
  temp_path,
98
  generate_kwargs={
99
  "language": lang if lang and len(lang) <= 3 else None,
100
- "temperature": 0.0,
101
- "return_timestamps": True
102
- }
103
  )
104
  return res["text"].strip()
105
 
106
- @spaces.GPU(duration=120)
107
- def gpu_tts_inference(text, mapped_lang, speaker_path):
108
  global MODELS
109
- device = "cuda" # Always use CUDA inside the decorator
110
 
111
  if MODELS.get("tts") is None:
112
- print(f"--- [v161] πŸ“₯ LOADING XTTS V2 ON {device} ---")
113
- # We load directly to GPU inside the decorator for max stability
114
  MODELS["tts"] = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
115
-
116
- # Ensure model is on GPU (sometimes ZeroGPU logic moves it back to CPU)
117
- if hasattr(MODELS["tts"], "to"):
118
- MODELS["tts"].to(device)
119
-
120
- print(f"--- [v161] πŸ”Š XTTS INFERENCE ON GPU ---")
121
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_f:
122
  out_p = out_f.name
123
 
@@ -126,9 +126,12 @@ def gpu_tts_inference(text, mapped_lang, speaker_path):
126
  with open(out_p, "rb") as f:
127
  audio_b64 = base64.b64encode(f.read()).decode()
128
 
129
- if os.path.exists(out_p):
130
- os.unlink(out_p)
131
-
 
 
 
132
  return audio_b64
133
 
134
  async def handle_process(request: Request):
@@ -136,37 +139,40 @@ async def handle_process(request: Request):
136
  try:
137
  data = await request.json()
138
  action = data.get("action")
139
- if action == "health": return {"status": "awake", "v": "161"}
140
 
141
- print(f"--- [v161] πŸ› οΈ {action.upper()} REQUESTED ---")
142
 
143
  stt_text = ""
144
- # 🟒 STT
145
  if action in ["stt", "s2st"]:
146
  audio_b64 = data.get("file")
147
- if not audio_b64: return {"error": "Missing audio file"}
 
148
  audio_bytes = base64.b64decode(audio_b64)
149
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
150
  f.write(audio_bytes); temp_path = f.name
151
  try:
152
- stt_text = gpu_stt_large(temp_path, data.get("lang"))
153
- print(f"--- [v161] πŸŽ™οΈ TRANSCRIPT: {stt_text[:100]}... ---")
154
  finally:
155
  if os.path.exists(temp_path): os.unlink(temp_path)
 
156
  if action == "stt": return {"text": stt_text}
157
 
158
- # πŸ”΅ TTS
159
  if action in ["tts", "s2st"]:
160
  text = (data.get("text") if action == "tts" else stt_text).strip()
161
  if not text: return {"error": "Input text is empty"}
162
 
163
  target = data.get("target_lang") or data.get("lang") or "en"
164
  trans_text = text
 
165
  if action == "s2st":
166
- print(f"--- [v161] 🌏 TRANSLATING TO {target} ---")
167
  trans_text = GoogleTranslator(source='auto', target=target).translate(stt_text)
168
  text = trans_text
169
- print(f"--- [v161] πŸ“ TRANSLATED: {text[:100]}... ---")
170
 
171
  XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
172
  clean_lang = target.split('-')[0].lower()
@@ -176,7 +182,7 @@ async def handle_process(request: Request):
176
  if HAS_CHATTERBOX:
177
  audio_bytes = chatterbox_utils.run_chatterbox_inference(text, clean_lang)
178
  audio_b64 = base64.b64encode(audio_bytes).decode()
179
- else: return {"error": f"Lang {clean_lang} unsupported"}
180
  else:
181
  speaker_wav_b64 = data.get("speaker_wav")
182
  speaker_path = None
@@ -188,8 +194,8 @@ async def handle_process(request: Request):
188
  if not os.path.exists(speaker_path): speaker_path = None
189
 
190
  try:
191
- # RUN ON GPU
192
- audio_b64 = gpu_tts_inference(text, mapped_lang, speaker_path)
193
  finally:
194
  if speaker_wav_b64 and speaker_path and os.path.exists(speaker_path): os.unlink(speaker_path)
195
 
@@ -197,10 +203,10 @@ async def handle_process(request: Request):
197
  return {"text": stt_text, "translated": trans_text, "audio": audio_b64}
198
 
199
  except Exception as e:
200
- print(f"❌ [v161] ULTIMATE ERROR: {traceback.format_exc()}")
201
  return {"error": str(e)}
202
  finally:
203
- print(f"--- [v161] ✨ DONE ({time.time()-t1:.1f}s) ---")
204
 
205
  @app.post("/process")
206
  @app.post("/api/v1/process")
@@ -209,15 +215,14 @@ async def api_process(request: Request): return await handle_process(request)
209
  @app.get("/health")
210
  def health():
211
  return {
212
- "status": "ok",
213
- "v": "161",
214
- "mode": "ULTIMATE_GPU",
215
- "gpu_available": torch.cuda.is_available(),
216
- "gpu_count": torch.cuda.device_count() if torch.cuda.is_available() else 0
217
  }
218
 
219
  @app.get("/", response_class=HTMLResponse)
220
- def root(): return "<h1>πŸš€ AI Engine v161 (ULTIMATE GPU)</h1>"
221
 
222
  if __name__ == "__main__":
223
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
11
  from fastapi.middleware.cors import CORSMiddleware
12
  from fastapi.responses import HTMLResponse
13
 
14
+ # --- [v162] πŸš€ PRO GPU ENGINE (SPEED & PRECISION) ---
15
+ # This version enforces GPU usage for STT and TTS to meet high-performance requirements.
16
+ print(f"--- [v162] πŸ“‘ BOOTING ENGINE ---")
17
 
18
  # πŸ› οΈ CRITICAL: TORCHAUDIO MONKEYPATCH πŸ› οΈ
19
  import torchaudio
 
27
  data = data.T
28
  return torch.from_numpy(data).float(), samplerate
29
  except Exception as e:
30
+ print(f"--- [v162] ❌ PATCHED LOAD FAILED: {e} ---")
31
  return torchaudio.load_orig(filepath, **kwargs)
32
 
33
  if not hasattr(torchaudio, 'load_orig'):
34
  torchaudio.load_orig = torchaudio.load
35
  torchaudio.load = HeroLoad
36
+ print("--- [v162] 🩹 TORCHAUDIO PATCH APPLIED ---")
37
 
38
  from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
39
  from TTS.api import TTS
 
63
  app = FastAPI()
64
  app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
65
 
66
+ MODELS = {"stt": None, "tts": None, "gpu_id": 0}
67
 
68
+ def get_best_gpu():
69
+ """Architecture for multi-GPU support (Switch)."""
70
+ if not torch.cuda.is_available(): return "cpu"
71
+ # Select GPU with most free memory if multiple exist
72
+ # For ZeroGPU, this defaults to the allocated MIG instance.
73
+ return f"cuda:{MODELS['gpu_id']}"
 
 
74
 
75
  @spaces.GPU(duration=120)
76
+ def gpu_stt_full(temp_path, lang):
77
  global MODELS
78
+ device = get_best_gpu()
79
 
80
  if MODELS.get("stt") is None:
81
+ print(f"--- [v162] πŸ“₯ LOADING WHISPER LARGE ON {device} ---")
82
  model_id = "openai/whisper-large-v3-turbo"
83
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch.float16).to(device)
84
  processor = AutoProcessor.from_pretrained(model_id)
85
  MODELS["stt"] = pipeline(
86
  "automatic-speech-recognition",
87
  model=model,
88
  tokenizer=processor.tokenizer,
89
  feature_extractor=processor.feature_extractor,
90
+ chunk_length_s=30, # Fixes >30s audio support
91
+ batch_size=8, # Accelerated batch processing
92
  device=device
93
  )
94
 
95
+ print(f"--- [v162] πŸŽ™οΈ WHISPER INFERENCE (TEMP 0) ---")
 
96
  res = MODELS["stt"](
97
  temp_path,
98
  generate_kwargs={
99
  "language": lang if lang and len(lang) <= 3 else None,
100
+ "temperature": 0.0 # High precision as requested
101
+ },
102
+ return_timestamps=True
103
  )
104
  return res["text"].strip()
105
 
106
+ @spaces.GPU(duration=180)
107
+ def gpu_tts_full(text, mapped_lang, speaker_path):
108
  global MODELS
109
+ device = "cuda"
110
 
111
  if MODELS.get("tts") is None:
112
+ print(f"--- [v162] πŸ“₯ LOADING XTTS V2 ON GPU ---")
113
+ # Pre-load to RAM if possible or just load directly
114
  MODELS["tts"] = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
115
+ else:
116
+ # Hybrid management: Ensure model is on CUDA during inference
117
+ try: MODELS["tts"].to(device)
118
+ except: pass
119
+
120
+ print(f"--- [v162] πŸ”Š XTTS GPU INFERENCE ---")
121
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_f:
122
  out_p = out_f.name
123
 
 
126
  with open(out_p, "rb") as f:
127
  audio_b64 = base64.b64encode(f.read()).decode()
128
 
129
+ if os.path.exists(out_p): os.unlink(out_p)
130
+
131
+ # Cleanup to prevent ZeroGPU worker errors
132
+ torch.cuda.empty_cache()
133
+ gc.collect()
134
+
135
  return audio_b64
136
 
137
  async def handle_process(request: Request):
 
139
  try:
140
  data = await request.json()
141
  action = data.get("action")
142
+ if action == "health": return {"status": "awake", "v": "162"}
143
 
144
+ print(f"--- [v162] πŸ› οΈ API REQUEST: {action.upper()} ---")
145
 
146
  stt_text = ""
147
+ # 🟒 SPEECH-TO-TEXT
148
  if action in ["stt", "s2st"]:
149
  audio_b64 = data.get("file")
150
+ if not audio_b64: return {"error": "Missing audio data"}
151
+
152
  audio_bytes = base64.b64decode(audio_b64)
153
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
154
  f.write(audio_bytes); temp_path = f.name
155
  try:
156
+ stt_text = gpu_stt_full(temp_path, data.get("lang"))
157
+ print(f"--- [v162] πŸŽ™οΈ TEXT: {stt_text[:100]}... ---")
158
  finally:
159
  if os.path.exists(temp_path): os.unlink(temp_path)
160
+
161
  if action == "stt": return {"text": stt_text}
162
 
163
+ # πŸ”΅ TEXT-TO-SPEECH
164
  if action in ["tts", "s2st"]:
165
  text = (data.get("text") if action == "tts" else stt_text).strip()
166
  if not text: return {"error": "Input text is empty"}
167
 
168
  target = data.get("target_lang") or data.get("lang") or "en"
169
  trans_text = text
170
+
171
  if action == "s2st":
172
+ print(f"--- [v162] 🌏 TRANSLATING TO {target} ---")
173
  trans_text = GoogleTranslator(source='auto', target=target).translate(stt_text)
174
  text = trans_text
175
+ print(f"--- [v162] πŸ“ TRANS: {text[:100]}... ---")
176
 
177
  XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
178
  clean_lang = target.split('-')[0].lower()
 
182
  if HAS_CHATTERBOX:
183
  audio_bytes = chatterbox_utils.run_chatterbox_inference(text, clean_lang)
184
  audio_b64 = base64.b64encode(audio_bytes).decode()
185
+ else: return {"error": f"Language {clean_lang} not supported by XTTS/Chatterbox"}
186
  else:
187
  speaker_wav_b64 = data.get("speaker_wav")
188
  speaker_path = None
 
194
  if not os.path.exists(speaker_path): speaker_path = None
195
 
196
  try:
197
+ # EXECUTE GPU TTS
198
+ audio_b64 = gpu_tts_full(text, mapped_lang, speaker_path)
199
  finally:
200
  if speaker_wav_b64 and speaker_path and os.path.exists(speaker_path): os.unlink(speaker_path)
201
 
 
203
  return {"text": stt_text, "translated": trans_text, "audio": audio_b64}
204
 
205
  except Exception as e:
206
+ print(f"❌ [v162] ENGINE ERROR: {traceback.format_exc()}")
207
  return {"error": str(e)}
208
  finally:
209
+ print(f"--- [v162] ✨ MISSION COMPLETED ({time.time()-t1:.1f}s) ---")
210
 
211
  @app.post("/process")
212
  @app.post("/api/v1/process")
 
215
  @app.get("/health")
216
  def health():
217
  return {
218
+ "status": "ready",
219
+ "v": "162",
220
+ "gpu": torch.cuda.is_available(),
221
+ "devices": torch.cuda.device_count()
 
222
  }
223
 
224
  @app.get("/", response_class=HTMLResponse)
225
+ def root(): return "<h1>πŸš€ PRO AI Engine v162 (GPU MODE)</h1>"
226
 
227
  if __name__ == "__main__":
228
  uvicorn.run(app, host="0.0.0.0", port=7860)