TGPro1 committed on
Commit
23b6539
·
verified ·
1 Parent(s): 05274d0

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +57 -61
app.py CHANGED
@@ -10,8 +10,8 @@ from fastapi.middleware.cors import CORSMiddleware
10
  from fastapi.responses import HTMLResponse
11
  import uvicorn
12
 
13
- # --- [v150] πŸš€ HIGH STABILITY H200 ENGINE ---
14
- print(f"--- [v150] πŸ“‘ BOOTING ENGINE ---")
15
 
16
  from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
17
  from TTS.api import TTS
@@ -46,8 +46,7 @@ MODELS = {"stt": None, "tts": None}
46
  def load_stt_gpu():
47
  global MODELS
48
  if MODELS.get("stt") is None:
49
- print("--- [v150] πŸ“₯ LOADING WHISPER (Base for Stability) ---")
50
- # Use Base model to ensure it fits and loads quickly in serverless context
51
  model_id = "openai/whisper-base"
52
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
53
  model_id, torch_dtype=torch.float32, low_cpu_mem_usage=True, use_safetensors=True
@@ -62,113 +61,110 @@ def load_stt_gpu():
62
  device="cuda",
63
  model_kwargs={"attn_implementation": "eager"}
64
  )
65
- print("--- [v150] βœ… WHISPER READY ---")
66
 
67
  def load_tts_gpu():
68
  global MODELS
69
  if MODELS.get("tts") is None:
70
- print("--- [v150] πŸ“₯ LOADING XTTS v2 ---")
71
- MODELS["tts"] = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda")
72
- MODELS["tts"].to(torch.float32)
73
- print("--- [v150] βœ… XTTS READY ---")
74
-
75
- @spaces.GPU(duration=120)
76
- def unified_gpu_process(action, data):
77
- """Single GPU entry point to avoid context flip-flops."""
 
 
 
 
78
  global MODELS
79
- res = {}
80
 
81
  # πŸŽ™οΈ STT
82
  stt_text = ""
83
  if action in ["stt", "s2st"]:
84
  load_stt_gpu()
85
- audio_bytes = base64.b64decode(data.get("file"))
86
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
87
- f.write(audio_bytes); temp_path = f.name
88
  try:
 
 
 
89
  lang = data.get("lang")
90
- # Batch size 1 for stability
91
  stt_res = MODELS["stt"](temp_path, batch_size=1, generate_kwargs={"language": lang if lang and len(lang) <= 3 else None})
92
  stt_text = stt_res["text"].strip()
 
93
  if action == "stt": return {"text": stt_text}
94
  finally:
95
- if os.path.exists(temp_path): os.unlink(temp_path)
96
 
97
  # πŸ”Š TTS
98
  if action in ["tts", "s2st"]:
99
- load_tts_gpu()
100
  text = (data.get("text") if action == "tts" else stt_text).strip()
101
  trans_text = text
102
-
103
- target_lang = data.get("target_lang") or data.get("lang") or "en"
104
 
105
  if action == "s2st":
106
- trans_text = GoogleTranslator(source='auto', target=target_lang).translate(stt_text)
 
107
  text = trans_text
 
108
 
109
  if len(text) < 2: return {"text": stt_text, "translated": "", "audio": ""} if action == "s2st" else {"audio": ""}
110
 
111
  XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
112
- clean_lang = target_lang.split('-')[0].lower()
113
  mapped_lang = XTTS_MAP.get(clean_lang) or ("zh-cn" if clean_lang == "zh" else None)
114
 
115
  if not mapped_lang:
116
  if HAS_CHATTERBOX:
117
- print(f"--- [v150] πŸ“¦ FALLBACK: CHATTERBOX FOR {clean_lang} ---")
118
  audio_bytes = chatterbox_utils.run_chatterbox_inference(text, clean_lang)
119
- audio_b64 = base64.b64encode(audio_bytes).decode()
120
- else:
121
- return {"error": f"Language {clean_lang} not supported."}
 
 
 
 
 
 
 
 
122
  else:
123
- speaker_wav_path = "default_speaker.wav"
124
- if data.get("speaker_wav"):
125
- sb = base64.b64decode(data.get("speaker_wav"))
126
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
127
- f.write(sb); speaker_wav_path = f.name
128
- elif not os.path.exists(speaker_wav_path):
129
- speaker_wav_path = None
130
-
131
- try:
132
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_f: out_p = out_f.name
133
- MODELS["tts"].tts_to_file(text=text, language=mapped_lang, file_path=out_p, speaker_wav=speaker_wav_path)
134
- with open(out_p, "rb") as f: audio_b64 = base64.b64encode(f.read()).decode()
135
- finally:
136
- if data.get("speaker_wav") and os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
137
- if 'out_p' in locals() and os.path.exists(out_p): os.unlink(out_p)
138
 
139
  if action == "tts": return {"audio": audio_b64}
140
  return {"text": stt_text, "translated": trans_text, "audio": audio_b64}
141
-
142
- return {"error": "Invalid action"}
143
 
144
  @app.post("/process")
145
  @app.post("/api/v1/process")
146
- async def api_gateway(request: Request):
147
  try:
148
  data = await request.json()
149
  action = data.get("action")
150
- if action == "health": return {"status": "awake", "v": "150"}
151
- print(f"--- [v150] πŸ› οΈ {action} requested ---")
152
- return unified_gpu_process(action, data)
153
  except Exception as e:
154
- print(f"❌ [v150] ERROR: {traceback.format_exc()}")
155
  return {"error": str(e)}
156
- finally:
157
- torch.cuda.empty_cache()
158
 
159
  @app.get("/health")
160
  def health():
161
- diag = {
162
- "status": "ok", "v": "150",
163
- "gpu": torch.cuda.is_available(),
164
- "device": torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None",
165
- "spaces": HAS_SPACES,
166
- "chatterbox": HAS_CHATTERBOX
167
- }
168
- return diag
169
 
170
  @app.get("/", response_class=HTMLResponse)
171
- def root(): return "<html><body><h1>πŸš€ AI Engine v150 (HIGH STABILITY)</h1></body></html>"
172
 
173
  if __name__ == "__main__":
174
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
10
  from fastapi.responses import HTMLResponse
11
  import uvicorn
12
 
13
+ # --- [v151] πŸš€ TTS DEBUG ENGINE ---
14
+ print(f"--- [v151] πŸ“‘ BOOTING DEBUG ENGINE ---")
15
 
16
  from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
17
  from TTS.api import TTS
 
46
  def load_stt_gpu():
47
  global MODELS
48
  if MODELS.get("stt") is None:
49
+ print("--- [v151] πŸ“₯ LOADING WHISPER (Base) ---")
 
50
  model_id = "openai/whisper-base"
51
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
52
  model_id, torch_dtype=torch.float32, low_cpu_mem_usage=True, use_safetensors=True
 
61
  device="cuda",
62
  model_kwargs={"attn_implementation": "eager"}
63
  )
64
+ print("--- [v151] βœ… WHISPER READY ---")
65
 
66
  def load_tts_gpu():
67
  global MODELS
68
  if MODELS.get("tts") is None:
69
+ print("--- [v151] πŸ“₯ LOADING XTTS V2 ---")
70
+ try:
71
+ # Try loading once and keeping in VRAM if possible (ZeroGPU might clear it)
72
+ MODELS["tts"] = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda")
73
+ MODELS["tts"].to(torch.float32)
74
+ print("--- [v151] βœ… XTTS LOADED SUCCESSFULLY ---")
75
+ except Exception as e:
76
+ print(f"--- [v151] ❌ XTTS FAILED TO LOAD: {e} ---")
77
+ raise e
78
+
79
+ @spaces.GPU(duration=180) # Longer duration for XTTS
80
+ def process_full(action, data):
81
  global MODELS
82
+ print(f"--- [v151] πŸš€ STARTING {action} on GPU ---")
83
 
84
  # πŸŽ™οΈ STT
85
  stt_text = ""
86
  if action in ["stt", "s2st"]:
87
  load_stt_gpu()
 
 
 
88
  try:
89
+ audio_bytes = base64.b64decode(data.get("file"))
90
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
91
+ f.write(audio_bytes); temp_path = f.name
92
  lang = data.get("lang")
 
93
  stt_res = MODELS["stt"](temp_path, batch_size=1, generate_kwargs={"language": lang if lang and len(lang) <= 3 else None})
94
  stt_text = stt_res["text"].strip()
95
+ print(f"--- [v151] πŸŽ™οΈ STT: {stt_text[:50]}... ---")
96
  if action == "stt": return {"text": stt_text}
97
  finally:
98
+ if 'temp_path' in locals() and os.path.exists(temp_path): os.unlink(temp_path)
99
 
100
  # πŸ”Š TTS
101
  if action in ["tts", "s2st"]:
 
102
  text = (data.get("text") if action == "tts" else stt_text).strip()
103
  trans_text = text
104
+ target = data.get("target_lang") or data.get("lang") or "en"
 
105
 
106
  if action == "s2st":
107
+ print(f"--- [v151] 🌏 TRANSLATING TO {target}... ---")
108
+ trans_text = GoogleTranslator(source='auto', target=target).translate(stt_text)
109
  text = trans_text
110
+ print(f"--- [v151] πŸ“ TRANS: {trans_text[:50]}... ---")
111
 
112
  if len(text) < 2: return {"text": stt_text, "translated": "", "audio": ""} if action == "s2st" else {"audio": ""}
113
 
114
  XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
115
+ clean_lang = target.split('-')[0].lower()
116
  mapped_lang = XTTS_MAP.get(clean_lang) or ("zh-cn" if clean_lang == "zh" else None)
117
 
118
  if not mapped_lang:
119
  if HAS_CHATTERBOX:
120
+ print(f"--- [v151] πŸ“¦ FALLBACK: CHATTERBOX ---")
121
  audio_bytes = chatterbox_utils.run_chatterbox_inference(text, clean_lang)
122
+ return {"text": stt_text, "translated": trans_text, "audio": base64.b64encode(audio_bytes).decode()}
123
+ return {"error": f"Lang {clean_lang} unsupported"}
124
+
125
+ print(f"--- [v151] πŸ“₯ LOADING XTTS... ---")
126
+ load_tts_gpu()
127
+
128
+ speaker_wav = data.get("speaker_wav")
129
+ speaker_path = None
130
+ if speaker_wav:
131
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
132
+ f.write(base64.b64decode(speaker_wav)); speaker_path = f.name
133
  else:
134
+ speaker_path = "default_speaker.wav"
135
+ if not os.path.exists(speaker_path): speaker_path = None
136
+
137
+ print(f"--- [v151] πŸ”Š RUNNING XTTS INFERENCE... ---")
138
+ try:
139
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_f: out_p = out_f.name
140
+ MODELS["tts"].tts_to_file(text=text, language=mapped_lang, file_path=out_p, speaker_wav=speaker_path)
141
+ with open(out_p, "rb") as f: audio_b64 = base64.b64encode(f.read()).decode()
142
+ print(f"--- [v151] βœ… TTS SUCCESS! ---")
143
+ finally:
144
+ if speaker_wav and speaker_path and os.path.exists(speaker_path): os.unlink(speaker_path)
145
+ if 'out_p' in locals() and os.path.exists(out_p): os.unlink(out_p)
 
 
 
146
 
147
  if action == "tts": return {"audio": audio_b64}
148
  return {"text": stt_text, "translated": trans_text, "audio": audio_b64}
 
 
149
 
150
@app.post("/process")
@app.post("/api/v1/process")
async def api_process(request: Request):
    """JSON gateway for both endpoint paths.

    Answers "health" pings directly; every other action is forwarded to the
    GPU worker. Any exception is logged and surfaced as {"error": ...}.
    """
    try:
        payload = await request.json()
        requested = payload.get("action")
        if requested == "health":
            return {"status": "awake", "v": "151"}
        return process_full(requested, payload)
    except Exception as err:
        print(f"❌ [v151] CRASH: {traceback.format_exc()}")
        return {"error": str(err)}
 
 
161
 
162
@app.get("/health")
def health():
    """Liveness endpoint: version, CUDA visibility, and Chatterbox availability."""
    report = {"status": "ok", "v": "151"}
    report["gpu"] = torch.cuda.is_available()
    report["chatterbox"] = HAS_CHATTERBOX
    return report
 
 
 
 
 
 
 
165
 
166
@app.get("/", response_class=HTMLResponse)
def root():
    """Minimal HTML landing page confirming the engine version."""
    return "<html><body><h1>🚀 AI Engine v151 (DEBUG)</h1></body></html>"
168
 
169
# Entrypoint for direct execution: serve on all interfaces at port 7860
# (the Hugging Face Spaces convention).
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)