TGPro1 committed on
Commit
b0d71b5
·
verified ·
1 Parent(s): d9b00f1

Deploy v150 High Stability

Browse files
Files changed (1) hide show
  1. app.py +89 -84
app.py CHANGED
@@ -10,9 +10,8 @@ from fastapi.middleware.cors import CORSMiddleware
10
  from fastapi.responses import HTMLResponse
11
  import uvicorn
12
 
13
- # --- [v149] πŸš€ FULL FEATURE H200 ENGINE (STT + TRANS + TTS + FALLBACK) ---
14
- # Target: https://tgpro1-s2st.hf.space/api/v1/process
15
- print(f"--- [v149] πŸ“‘ BOOTING FULL ENGINE (API v1) ---")
16
 
17
  from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
18
  from TTS.api import TTS
@@ -38,7 +37,6 @@ except ImportError:
38
 
39
  os.environ["COQUI_TOS_AGREED"] = "1"
40
  os.environ["PYTHONWARNINGS"] = "ignore"
41
- os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
42
 
43
  app = FastAPI()
44
  app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
@@ -48,8 +46,9 @@ MODELS = {"stt": None, "tts": None}
48
  def load_stt_gpu():
49
  global MODELS
50
  if MODELS.get("stt") is None:
51
- print("--- [v149] πŸ“₯ LOADING WHISPER (Large-v3-Turbo) ---")
52
- model_id = "openai/whisper-large-v3-turbo"
 
53
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
54
  model_id, torch_dtype=torch.float32, low_cpu_mem_usage=True, use_safetensors=True
55
  ).to("cuda")
@@ -63,107 +62,113 @@ def load_stt_gpu():
63
  device="cuda",
64
  model_kwargs={"attn_implementation": "eager"}
65
  )
66
- print("--- [v149] βœ… WHISPER READY ---")
67
 
68
  def load_tts_gpu():
69
  global MODELS
70
  if MODELS.get("tts") is None:
71
- print("--- [v149] πŸ“₯ LOADING XTTS v2 ---")
72
  MODELS["tts"] = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda")
73
  MODELS["tts"].to(torch.float32)
74
- print("--- [v149] βœ… XTTS READY ---")
75
 
76
  @spaces.GPU(duration=120)
77
- def core_stt(audio_b64, lang):
78
- load_stt_gpu()
79
- audio_bytes = base64.b64decode(audio_b64)
80
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
81
- f.write(audio_bytes); temp_path = f.name
82
- try:
83
- result = MODELS["stt"](temp_path, batch_size=1, generate_kwargs={"language": lang if lang and len(lang) <= 3 else None})
84
- return result["text"].strip()
85
- finally:
86
- if os.path.exists(temp_path): os.unlink(temp_path)
87
-
88
- @spaces.GPU(duration=120)
89
- def core_tts(text, target_lang, speaker_wav_b64=None):
90
- load_tts_gpu()
91
- XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
92
- clean_lang = target_lang.split('-')[0].lower()
93
- mapped_lang = XTTS_MAP.get(clean_lang) or ("zh-cn" if clean_lang == "zh" else None)
94
-
95
- if not mapped_lang:
96
- if HAS_CHATTERBOX:
97
- print(f"--- [v149] πŸ“¦ FALLBACK: CHATTERBOX FOR {clean_lang} ---")
98
- audio_bytes = chatterbox_utils.run_chatterbox_inference(text, clean_lang)
99
- return base64.b64encode(audio_bytes).decode()
100
- return {"error": f"Language {clean_lang} not supported."}
101
 
102
- speaker_wav_path = "default_speaker.wav"
103
- if speaker_wav_b64:
104
- sb = base64.b64decode(speaker_wav_b64)
 
 
105
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
106
- f.write(sb); speaker_wav_path = f.name
107
- elif not os.path.exists(speaker_wav_path):
108
- speaker_wav_path = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
- try:
111
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_f: out_p = out_f.name
112
- MODELS["tts"].tts_to_file(text=text, language=mapped_lang, file_path=out_p, speaker_wav=speaker_wav_path)
113
- with open(out_p, "rb") as f: audio_b64 = base64.b64encode(f.read()).decode()
114
- return audio_b64
115
- finally:
116
- if speaker_wav_b64 and os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
117
- if 'out_p' in locals() and os.path.exists(out_p): os.unlink(out_p)
118
 
119
- async def handle_process(request: Request):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  try:
121
  data = await request.json()
122
  action = data.get("action")
123
- if action == "health": return {"status": "awake", "v": "149", "mode": "FULL_API_V1"}
124
-
125
- print(f"--- [v149] πŸ› οΈ ACTION: {action} ---")
126
- t1 = time.time()
127
-
128
- stt_text = None
129
- if action in ["stt", "s2st"]:
130
- stt_text = core_stt(data.get("file"), data.get("lang"))
131
- if action == "stt": return {"text": stt_text}
132
-
133
- if action in ["tts", "s2st"]:
134
- text = (data.get("text") if action == "tts" else stt_text).strip()
135
- trans_text = text
136
- if action == "s2st":
137
- target = data.get("target_lang") or "en"
138
- trans_text = GoogleTranslator(source='auto', target=target).translate(stt_text)
139
- text = trans_text
140
-
141
- if len(text) < 2: return {"audio": ""} if action == "tts" else {"text": stt_text, "translated": "", "audio": ""}
142
-
143
- audio_res = core_tts(text, (data.get("lang") if action == "tts" else target), data.get("speaker_wav"))
144
-
145
- if isinstance(audio_res, dict) and "error" in audio_res: return audio_res
146
- if action == "tts": return {"audio": audio_res}
147
- return {"text": stt_text, "translated": trans_text, "audio": audio_res}
148
-
149
  except Exception as e:
150
- print(f"❌ [v149] ERROR: {traceback.format_exc()}")
151
  return {"error": str(e)}
152
  finally:
153
- print(f"--- [v149] ✨ DONE ({time.time()-t1:.1f}s) ---")
154
  torch.cuda.empty_cache()
155
 
156
- @app.post("/process")
157
- async def api_process(request: Request): return await handle_process(request)
158
-
159
- @app.post("/api/v1/process")
160
- async def api_v1_process(request: Request): return await handle_process(request)
161
-
162
  @app.get("/health")
163
- def health(): return {"status": "ok", "v": "149", "mode": "FULL_H200", "gpu": HAS_SPACES, "chatterbox": HAS_CHATTERBOX}
 
 
 
 
 
 
 
 
164
 
165
  @app.get("/", response_class=HTMLResponse)
166
- def root(): return "<html><body><h1>πŸš€ AI Engine v149 (FULL API V1 READY)</h1></body></html>"
167
 
168
  if __name__ == "__main__":
169
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
10
  from fastapi.responses import HTMLResponse
11
  import uvicorn
12
 
13
+ # --- [v150] πŸš€ HIGH STABILITY H200 ENGINE ---
14
+ print(f"--- [v150] πŸ“‘ BOOTING ENGINE ---")
 
15
 
16
  from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
17
  from TTS.api import TTS
 
37
 
38
  os.environ["COQUI_TOS_AGREED"] = "1"
39
  os.environ["PYTHONWARNINGS"] = "ignore"
 
40
 
41
  app = FastAPI()
42
  app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
 
46
  def load_stt_gpu():
47
  global MODELS
48
  if MODELS.get("stt") is None:
49
+ print("--- [v150] πŸ“₯ LOADING WHISPER (Base for Stability) ---")
50
+ # Use Base model to ensure it fits and loads quickly in serverless context
51
+ model_id = "openai/whisper-base"
52
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
53
  model_id, torch_dtype=torch.float32, low_cpu_mem_usage=True, use_safetensors=True
54
  ).to("cuda")
 
62
  device="cuda",
63
  model_kwargs={"attn_implementation": "eager"}
64
  )
65
+ print("--- [v150] βœ… WHISPER READY ---")
66
 
67
  def load_tts_gpu():
68
  global MODELS
69
  if MODELS.get("tts") is None:
70
+ print("--- [v150] πŸ“₯ LOADING XTTS v2 ---")
71
  MODELS["tts"] = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda")
72
  MODELS["tts"].to(torch.float32)
73
+ print("--- [v150] βœ… XTTS READY ---")
74
 
75
  @spaces.GPU(duration=120)
76
+ def unified_gpu_process(action, data):
77
+ """Single GPU entry point to avoid context flip-flops."""
78
+ global MODELS
79
+ res = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
+ # πŸŽ™οΈ STT
82
+ stt_text = ""
83
+ if action in ["stt", "s2st"]:
84
+ load_stt_gpu()
85
+ audio_bytes = base64.b64decode(data.get("file"))
86
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
87
+ f.write(audio_bytes); temp_path = f.name
88
+ try:
89
+ lang = data.get("lang")
90
+ # Batch size 1 for stability
91
+ stt_res = MODELS["stt"](temp_path, batch_size=1, generate_kwargs={"language": lang if lang and len(lang) <= 3 else None})
92
+ stt_text = stt_res["text"].strip()
93
+ if action == "stt": return {"text": stt_text}
94
+ finally:
95
+ if os.path.exists(temp_path): os.unlink(temp_path)
96
+
97
+ # πŸ”Š TTS
98
+ if action in ["tts", "s2st"]:
99
+ load_tts_gpu()
100
+ text = (data.get("text") if action == "tts" else stt_text).strip()
101
+ trans_text = text
102
+
103
+ target_lang = data.get("target_lang") or data.get("lang") or "en"
104
+
105
+ if action == "s2st":
106
+ trans_text = GoogleTranslator(source='auto', target=target_lang).translate(stt_text)
107
+ text = trans_text
108
 
109
+ if len(text) < 2: return {"text": stt_text, "translated": "", "audio": ""} if action == "s2st" else {"audio": ""}
 
 
 
 
 
 
 
110
 
111
+ XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
112
+ clean_lang = target_lang.split('-')[0].lower()
113
+ mapped_lang = XTTS_MAP.get(clean_lang) or ("zh-cn" if clean_lang == "zh" else None)
114
+
115
+ if not mapped_lang:
116
+ if HAS_CHATTERBOX:
117
+ print(f"--- [v150] πŸ“¦ FALLBACK: CHATTERBOX FOR {clean_lang} ---")
118
+ audio_bytes = chatterbox_utils.run_chatterbox_inference(text, clean_lang)
119
+ audio_b64 = base64.b64encode(audio_bytes).decode()
120
+ else:
121
+ return {"error": f"Language {clean_lang} not supported."}
122
+ else:
123
+ speaker_wav_path = "default_speaker.wav"
124
+ if data.get("speaker_wav"):
125
+ sb = base64.b64decode(data.get("speaker_wav"))
126
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
127
+ f.write(sb); speaker_wav_path = f.name
128
+ elif not os.path.exists(speaker_wav_path):
129
+ speaker_wav_path = None
130
+
131
+ try:
132
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_f: out_p = out_f.name
133
+ MODELS["tts"].tts_to_file(text=text, language=mapped_lang, file_path=out_p, speaker_wav=speaker_wav_path)
134
+ with open(out_p, "rb") as f: audio_b64 = base64.b64encode(f.read()).decode()
135
+ finally:
136
+ if data.get("speaker_wav") and os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
137
+ if 'out_p' in locals() and os.path.exists(out_p): os.unlink(out_p)
138
+
139
+ if action == "tts": return {"audio": audio_b64}
140
+ return {"text": stt_text, "translated": trans_text, "audio": audio_b64}
141
+
142
+ return {"error": "Invalid action"}
143
+
144
+ @app.post("/process")
145
+ @app.post("/api/v1/process")
146
+ async def api_gateway(request: Request):
147
  try:
148
  data = await request.json()
149
  action = data.get("action")
150
+ if action == "health": return {"status": "awake", "v": "150"}
151
+ print(f"--- [v150] πŸ› οΈ {action} requested ---")
152
+ return unified_gpu_process(action, data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  except Exception as e:
154
+ print(f"❌ [v150] ERROR: {traceback.format_exc()}")
155
  return {"error": str(e)}
156
  finally:
 
157
  torch.cuda.empty_cache()
158
 
 
 
 
 
 
 
159
  @app.get("/health")
160
+ def health():
161
+ diag = {
162
+ "status": "ok", "v": "150",
163
+ "gpu": torch.cuda.is_available(),
164
+ "device": torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None",
165
+ "spaces": HAS_SPACES,
166
+ "chatterbox": HAS_CHATTERBOX
167
+ }
168
+ return diag
169
 
170
  @app.get("/", response_class=HTMLResponse)
171
+ def root(): return "<html><body><h1>πŸš€ AI Engine v150 (HIGH STABILITY)</h1></body></html>"
172
 
173
  if __name__ == "__main__":
174
  uvicorn.run(app, host="0.0.0.0", port=7860)