TGPro1 committed on
Commit
971c294
·
verified ·
1 Parent(s): ee86840

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +45 -49
app.py CHANGED
@@ -11,10 +11,10 @@ from fastapi.middleware.cors import CORSMiddleware
11
  from fastapi.responses import HTMLResponse
12
  import uvicorn
13
 
14
- # --- [v153] πŸš€ EPHEMERAL GPU ENGINE ---
15
- print(f"--- [v153] πŸ“‘ BOOTING EPHEMERAL ENGINE ---")
16
 
17
- from transformers import pipeline
18
  from TTS.api import TTS
19
  from deep_translator import GoogleTranslator
20
 
@@ -42,68 +42,63 @@ os.environ["PYTHONWARNINGS"] = "ignore"
42
  app = FastAPI()
43
  app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
44
 
45
- MODELS = {"stt": None}
46
 
47
- def load_stt_cpu():
 
 
 
 
 
 
 
 
 
48
  global MODELS
49
  if MODELS.get("stt") is None:
50
- print("--- [v153] πŸ“₯ LOADING WHISPER (Base) ON CPU ---")
51
- MODELS["stt"] = pipeline("automatic-speech-recognition", model="openai/whisper-base", device="cpu")
52
- print("--- [v153] βœ… WHISPER READY (CPU) ---")
53
-
54
- @spaces.GPU(duration=180)
55
- def ephemeral_tts(text, mapped_lang, speaker_path):
56
- """Pure ephemeral loading on GPU to bypass VRAM watchdogs."""
57
- print(f"--- [v153] πŸ“₯ LOADING XTTS EPOCH... ---")
58
- local_tts = None
59
- try:
60
- local_tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda")
61
- local_tts.to(torch.float32)
62
-
63
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_f:
64
- out_p = out_f.name
65
-
66
- print(f"--- [v153] πŸ”Š INFERENCE... ---")
67
- local_tts.tts_to_file(text=text, language=mapped_lang, file_path=out_p, speaker_wav=speaker_path)
68
-
69
- with open(out_p, "rb") as f:
70
- audio_b64 = base64.b64encode(f.read()).decode()
71
- return audio_b64
72
- finally:
73
- print(f"--- [v153] 🧹 CLEANUP ---")
74
- if local_tts:
75
- del local_tts
76
- if 'out_p' in locals() and os.path.exists(out_p): os.unlink(out_p)
77
- gc.collect()
78
- torch.cuda.empty_cache()
79
 
80
  async def handle_process(request: Request):
81
  try:
82
  data = await request.json()
83
  action = data.get("action")
84
- if action == "health": return {"status": "awake", "v": "153"}
85
 
86
- print(f"--- [v153] πŸ› οΈ {action} ---")
87
  t1 = time.time()
88
 
89
- # πŸŽ™οΈ STT (CPU)
90
  stt_text = ""
91
  if action in ["stt", "s2st"]:
92
- load_stt_cpu()
93
  audio_bytes = base64.b64decode(data.get("file"))
94
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
95
  f.write(audio_bytes); temp_path = f.name
96
  try:
97
- lang = data.get("lang")
98
- res = MODELS["stt"](temp_path, generate_kwargs={"language": lang if lang and len(lang) <= 3 else None})
99
- stt_text = res["text"].strip()
100
  finally:
101
  if os.path.exists(temp_path): os.unlink(temp_path)
102
 
103
  if action == "stt": return {"text": stt_text}
104
 
105
- # πŸ”Š TTS (GPU)
106
  if action in ["tts", "s2st"]:
 
107
  text = (data.get("text") if action == "tts" else stt_text).strip()
108
  trans_text = text
109
  target = data.get("target_lang") or data.get("lang") or "en"
@@ -112,8 +107,6 @@ async def handle_process(request: Request):
112
  trans_text = GoogleTranslator(source='auto', target=target).translate(stt_text)
113
  text = trans_text
114
 
115
- if len(text) < 2: return {"text": stt_text, "translated": "", "audio": ""} if action == "s2st" else {"audio": ""}
116
-
117
  XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
118
  clean_lang = target.split('-')[0].lower()
119
  mapped_lang = XTTS_MAP.get(clean_lang) or ("zh-cn" if clean_lang == "zh" else None)
@@ -134,28 +127,31 @@ async def handle_process(request: Request):
134
  if not os.path.exists(speaker_path): speaker_path = None
135
 
136
  try:
137
- audio_b64 = ephemeral_tts(text, mapped_lang, speaker_path)
 
 
138
  finally:
139
  if speaker_wav_b64 and speaker_path and os.path.exists(speaker_path): os.unlink(speaker_path)
 
140
 
141
  if action == "tts": return {"audio": audio_b64}
142
  return {"text": stt_text, "translated": trans_text, "audio": audio_b64}
143
 
144
  except Exception as e:
145
- print(f"❌ [v153] ERROR: {traceback.format_exc()}")
146
  return {"error": str(e)}
147
  finally:
148
- print(f"--- [v153] ✨ DONE ({time.time()-t1:.1f}s) ---")
149
 
150
  @app.post("/process")
151
  @app.post("/api/v1/process")
152
  async def api_process(request: Request): return await handle_process(request)
153
 
154
  @app.get("/health")
155
- def health(): return {"status": "ok", "v": "153", "gpu": torch.cuda.is_available()}
156
 
157
  @app.get("/", response_class=HTMLResponse)
158
- def root(): return "<h1>πŸš€ AI Engine v153 (EPHEMERAL)</h1>"
159
 
160
  if __name__ == "__main__":
161
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
11
  from fastapi.responses import HTMLResponse
12
  import uvicorn
13
 
14
+ # --- [v154] πŸš€ PRO STABLE ENGINE (GPU-STT + CPU-TTS) ---
15
+ print(f"--- [v154] πŸ“‘ BOOTING PRO STABLE ENGINE ---")
16
 
17
+ from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
18
  from TTS.api import TTS
19
  from deep_translator import GoogleTranslator
20
 
 
42
  app = FastAPI()
43
  app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
44
 
45
+ MODELS = {"stt": None, "tts": None}
46
 
47
+ def load_tts_cpu():
48
+ global MODELS
49
+ if MODELS.get("tts") is None:
50
+ print("--- [v154] πŸ“₯ LOADING XTTS V2 (CPU MODE) ---")
51
+ # XTTS on CPU is stable and avoids ZeroGPU kernel crashes
52
+ MODELS["tts"] = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cpu")
53
+ print("--- [v154] βœ… XTTS READY (CPU) ---")
54
+
55
+ @spaces.GPU(duration=60)
56
+ def gpu_stt_inference(temp_path, lang):
57
  global MODELS
58
  if MODELS.get("stt") is None:
59
+ print("--- [v154] πŸ“₯ LOADING WHISPER (Large-v3-Turbo) ON GPU ---")
60
+ model_id = "openai/whisper-large-v3-turbo"
61
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
62
+ model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_safetensors=True
63
+ ).to("cuda")
64
+ processor = AutoProcessor.from_pretrained(model_id)
65
+ MODELS["stt"] = pipeline(
66
+ "automatic-speech-recognition",
67
+ model=model,
68
+ tokenizer=processor.tokenizer,
69
+ feature_extractor=processor.feature_extractor,
70
+ torch_dtype=torch.float16,
71
+ device="cuda"
72
+ )
73
+
74
+ res = MODELS["stt"](temp_path, generate_kwargs={"language": lang if lang and len(lang) <= 3 else None})
75
+ return res["text"].strip()
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
  async def handle_process(request: Request):
78
  try:
79
  data = await request.json()
80
  action = data.get("action")
81
+ if action == "health": return {"status": "awake", "v": "154"}
82
 
83
+ print(f"--- [v154] πŸ› οΈ {action} ---")
84
  t1 = time.time()
85
 
86
+ # πŸŽ™οΈ STT (GPU)
87
  stt_text = ""
88
  if action in ["stt", "s2st"]:
 
89
  audio_bytes = base64.b64decode(data.get("file"))
90
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
91
  f.write(audio_bytes); temp_path = f.name
92
  try:
93
+ stt_text = gpu_stt_inference(temp_path, data.get("lang"))
 
 
94
  finally:
95
  if os.path.exists(temp_path): os.unlink(temp_path)
96
 
97
  if action == "stt": return {"text": stt_text}
98
 
99
+ # πŸ”Š TTS (CPU)
100
  if action in ["tts", "s2st"]:
101
+ load_tts_cpu()
102
  text = (data.get("text") if action == "tts" else stt_text).strip()
103
  trans_text = text
104
  target = data.get("target_lang") or data.get("lang") or "en"
 
107
  trans_text = GoogleTranslator(source='auto', target=target).translate(stt_text)
108
  text = trans_text
109
 
 
 
110
  XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
111
  clean_lang = target.split('-')[0].lower()
112
  mapped_lang = XTTS_MAP.get(clean_lang) or ("zh-cn" if clean_lang == "zh" else None)
 
127
  if not os.path.exists(speaker_path): speaker_path = None
128
 
129
  try:
130
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_f: out_p = out_f.name
131
+ MODELS["tts"].tts_to_file(text=text, language=mapped_lang, file_path=out_p, speaker_wav=speaker_path)
132
+ with open(out_p, "rb") as f: audio_b64 = base64.b64encode(f.read()).decode()
133
  finally:
134
  if speaker_wav_b64 and speaker_path and os.path.exists(speaker_path): os.unlink(speaker_path)
135
+ if 'out_p' in locals() and os.path.exists(out_p): os.unlink(out_p)
136
 
137
  if action == "tts": return {"audio": audio_b64}
138
  return {"text": stt_text, "translated": trans_text, "audio": audio_b64}
139
 
140
  except Exception as e:
141
+ print(f"❌ [v154] ERROR: {traceback.format_exc()}")
142
  return {"error": str(e)}
143
  finally:
144
+ print(f"--- [v154] ✨ DONE ({time.time()-t1:.1f}s) ---")
145
 
146
  @app.post("/process")
147
  @app.post("/api/v1/process")
148
  async def api_process(request: Request): return await handle_process(request)
149
 
150
  @app.get("/health")
151
+ def health(): return {"status": "ok", "v": "154", "gpu": torch.cuda.is_available()}
152
 
153
  @app.get("/", response_class=HTMLResponse)
154
+ def root(): return "<h1>πŸš€ AI Engine v154 (PRO STABLE)</h1>"
155
 
156
  if __name__ == "__main__":
157
  uvicorn.run(app, host="0.0.0.0", port=7860)