TGPro1 committed on
Commit
639ffca
·
verified ·
1 Parent(s): be298b3

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +105 -100
app.py CHANGED
@@ -5,13 +5,14 @@ import base64
5
  import torch
6
  import tempfile
7
  import traceback
 
8
  from fastapi import FastAPI, Request
9
  from fastapi.middleware.cors import CORSMiddleware
10
  from fastapi.responses import HTMLResponse
11
  import uvicorn
12
 
13
- # --- [v151] πŸš€ TTS DEBUG ENGINE ---
14
- print(f"--- [v151] πŸ“‘ BOOTING DEBUG ENGINE ---")
15
 
16
  from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
17
  from TTS.api import TTS
@@ -43,128 +44,132 @@ app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], all
43
 
44
  MODELS = {"stt": None, "tts": None}
45
 
46
def load_stt_gpu():
    """Lazily build the Whisper-Base ASR pipeline on CUDA, caching it in MODELS["stt"]."""
    global MODELS
    if MODELS.get("stt") is not None:
        return  # already loaded once; reuse the cached pipeline
    print("--- [v151] πŸ“₯ LOADING WHISPER (Base) ---")
    model_id = "openai/whisper-base"
    asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=torch.float32,
        low_cpu_mem_usage=True,
        use_safetensors=True,
    ).to("cuda")
    asr_processor = AutoProcessor.from_pretrained(model_id)
    MODELS["stt"] = pipeline(
        "automatic-speech-recognition",
        model=asr_model,
        tokenizer=asr_processor.tokenizer,
        feature_extractor=asr_processor.feature_extractor,
        torch_dtype=torch.float32,
        device="cuda",
        model_kwargs={"attn_implementation": "eager"},
    )
    print("--- [v151] βœ… WHISPER READY ---")
65
 
66
def load_tts_gpu():
    """Lazily load XTTS v2 onto CUDA and cache it in MODELS["tts"].

    Raises:
        Whatever the TTS constructor or the device/dtype moves raise; the
        cache slot is reset first so a later call retries a clean load.
    """
    global MODELS
    if MODELS.get("tts") is None:
        print("--- [v151] πŸ“₯ LOADING XTTS V2 ---")
        try:
            # Try loading once and keeping in VRAM if possible (ZeroGPU might clear it)
            MODELS["tts"] = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda")
            MODELS["tts"].to(torch.float32)
            print("--- [v151] βœ… XTTS LOADED SUCCESSFULLY ---")
        except Exception as e:
            print(f"--- [v151] ❌ XTTS FAILED TO LOAD: {e} ---")
            # Fix: if the dtype move fails after assignment, the cache held a
            # half-initialized model and future calls skipped loading; clear it.
            MODELS["tts"] = None
            # Bare `raise` preserves the original traceback; `raise e` re-anchored it here.
            raise
78
-
79
@spaces.GPU(duration=180)  # Longer duration for XTTS
def process_full(action, data):
    """Run STT / translation / TTS for one request on the GPU.

    Args:
        action: "stt", "tts", or "s2st" (speech-to-speech translation).
        data: request payload dict; keys read here: "file" (b64 wav),
            "lang", "text", "target_lang", "speaker_wav" (b64 wav) —
            schema inferred from usage; confirm against the client.

    Returns:
        A JSON-serializable dict with "text"/"translated"/"audio" or "error".
    """
    global MODELS
    print(f"--- [v151] πŸš€ STARTING {action} on GPU ---")

    # πŸŽ™οΈ STT
    stt_text = ""
    if action in ["stt", "s2st"]:
        load_stt_gpu()
        temp_path = None  # pre-initialized so cleanup never probes locals()
        try:
            audio_bytes = base64.b64decode(data.get("file"))
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                f.write(audio_bytes); temp_path = f.name
            lang = data.get("lang")
            # Whisper accepts short ISO codes only; anything longer means auto-detect.
            stt_res = MODELS["stt"](temp_path, batch_size=1, generate_kwargs={"language": lang if lang and len(lang) <= 3 else None})
            stt_text = stt_res["text"].strip()
            print(f"--- [v151] πŸŽ™οΈ STT: {stt_text[:50]}... ---")
            if action == "stt": return {"text": stt_text}
        finally:
            if temp_path and os.path.exists(temp_path): os.unlink(temp_path)

    # πŸ”Š TTS
    if action in ["tts", "s2st"]:
        text = (data.get("text") if action == "tts" else stt_text).strip()
        trans_text = text
        target = data.get("target_lang") or data.get("lang") or "en"

        if action == "s2st":
            print(f"--- [v151] 🌏 TRANSLATING TO {target}... ---")
            trans_text = GoogleTranslator(source='auto', target=target).translate(stt_text)
            text = trans_text
            print(f"--- [v151] πŸ“ TRANS: {trans_text[:50]}... ---")

        if len(text) < 2: return {"text": stt_text, "translated": "", "audio": ""} if action == "s2st" else {"audio": ""}

        # Languages XTTS v2 speaks natively ("zh" maps to its "zh-cn" variant).
        XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
        clean_lang = target.split('-')[0].lower()
        mapped_lang = XTTS_MAP.get(clean_lang) or ("zh-cn" if clean_lang == "zh" else None)

        if not mapped_lang:
            if HAS_CHATTERBOX:
                print(f"--- [v151] πŸ“¦ FALLBACK: CHATTERBOX ---")
                audio_bytes = chatterbox_utils.run_chatterbox_inference(text, clean_lang)
                return {"text": stt_text, "translated": trans_text, "audio": base64.b64encode(audio_bytes).decode()}
            return {"error": f"Lang {clean_lang} unsupported"}

        print(f"--- [v151] πŸ“₯ LOADING XTTS... ---")
        load_tts_gpu()

        speaker_wav = data.get("speaker_wav")
        speaker_path = None
        if speaker_wav:
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                f.write(base64.b64decode(speaker_wav)); speaker_path = f.name
        else:
            speaker_path = "default_speaker.wav"
            if not os.path.exists(speaker_path): speaker_path = None

        print(f"--- [v151] πŸ”Š RUNNING XTTS INFERENCE... ---")
        out_p = None  # pre-initialized so cleanup never probes locals()
        try:
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_f: out_p = out_f.name
            MODELS["tts"].tts_to_file(text=text, language=mapped_lang, file_path=out_p, speaker_wav=speaker_path)
            with open(out_p, "rb") as f: audio_b64 = base64.b64encode(f.read()).decode()
            print(f"--- [v151] βœ… TTS SUCCESS! ---")
        finally:
            # Only delete caller-supplied speaker clips, never the bundled default.
            if speaker_wav and speaker_path and os.path.exists(speaker_path): os.unlink(speaker_path)
            if out_p and os.path.exists(out_p): os.unlink(out_p)

        if action == "tts": return {"audio": audio_b64}
        return {"text": stt_text, "translated": trans_text, "audio": audio_b64}
149
 
150
@app.post("/process")
@app.post("/api/v1/process")
async def api_process(request: Request):
    """HTTP boundary: parse the JSON body, dispatch to the GPU worker, trap crashes."""
    try:
        payload = await request.json()
        requested = payload.get("action")
        if requested == "health":
            return {"status": "awake", "v": "151"}
        return process_full(requested, payload)
    except Exception as e:
        print(f"❌ [v151] CRASH: {traceback.format_exc()}")
        return {"error": str(e)}
161
 
162
@app.get("/health")
def health():
    """Liveness probe reporting version plus GPU and Chatterbox availability."""
    report = {"status": "ok", "v": "151"}
    report["gpu"] = torch.cuda.is_available()
    report["chatterbox"] = HAS_CHATTERBOX
    return report
165
 
166
@app.get("/", response_class=HTMLResponse)
def root():
    """Minimal HTML landing page."""
    return "<html><body><h1>πŸš€ AI Engine v151 (DEBUG)</h1></body></html>"
168
 
169
if __name__ == "__main__":
    # Serve the FastAPI app directly when executed as a script.
    uvicorn.run(app, host="0.0.0.0", port=7860)
 
5
  import torch
6
  import tempfile
7
  import traceback
8
+ import gc
9
  from fastapi import FastAPI, Request
10
  from fastapi.middleware.cors import CORSMiddleware
11
  from fastapi.responses import HTMLResponse
12
  import uvicorn
13
 
14
+ # --- [v152] πŸš€ REFINED HYBRID ENGINE (CPU-STT + CPU-TRANS + GPU-TTS) ---
15
+ print(f"--- [v152] πŸ“‘ BOOTING REFINED HYBRID ENGINE ---")
16
 
17
  from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
18
  from TTS.api import TTS
 
44
 
45
  MODELS = {"stt": None, "tts": None}
46
 
47
def load_stt_cpu():
    """STT on CPU is stable and fast for Whisper Base."""
    global MODELS
    if MODELS.get("stt") is not None:
        return  # pipeline already cached
    print("--- [v152] πŸ“₯ LOADING WHISPER (Base) ON CPU ---")
    model_id = "openai/whisper-base"
    MODELS["stt"] = pipeline(
        "automatic-speech-recognition",
        model=model_id,
        device="cpu",  # Force CPU for stability
        torch_dtype=torch.float32,
    )
    print("--- [v152] βœ… WHISPER READY (CPU) ---")
60
 
61
def load_tts_gpu():
    """Instantiate XTTS v2 once and cache it in CPU memory."""
    global MODELS
    if MODELS.get("tts") is not None:
        return
    print("--- [v152] πŸ“₯ LOADING XTTS V2 ---")
    # Keep the model on the CPU here; the CUDA move happens inside the
    # @spaces.GPU-decorated inference function.
    MODELS["tts"] = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
    print("--- [v152] βœ… XTTS READY (CPU MEMORY) ---")
69
+
70
@spaces.GPU(duration=120)
def gpu_tts_inference(text, mapped_lang, speaker_path):
    """Isolated GPU inference for XTTS; returns the wav as a base64 string."""
    global MODELS
    if MODELS["tts"] is None:
        load_tts_gpu()

    # Move onto the GPU only within the decorated scope.
    MODELS["tts"].to("cuda")

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as scratch:
        wav_path = scratch.name

    try:
        MODELS["tts"].tts_to_file(text=text, language=mapped_lang, file_path=wav_path, speaker_wav=speaker_path)
        with open(wav_path, "rb") as wav_file:
            return base64.b64encode(wav_file.read()).decode()
    finally:
        # Move back to CPU to release VRAM as per ZeroGPU guidelines,
        # then drop the scratch file.
        MODELS["tts"].to("cpu")
        if os.path.exists(wav_path):
            os.unlink(wav_path)
        torch.cuda.empty_cache()
93
+
94
async def handle_process(request: Request):
    """Shared request handler: STT on CPU, translation, XTTS on GPU.

    Expects a JSON body with "action" ("health", "stt", "tts", "s2st");
    other keys ("file", "lang", "text", "target_lang", "speaker_wav") are
    read per action — schema inferred from usage; confirm with the client.
    Returns a JSON-serializable dict.
    NOTE(review): relies on module-level `time` — confirm it is imported
    in the (unseen) file header.
    """
    # Fix: start the timer BEFORE the try block. The finally clause logs
    # elapsed time, and previously `t1` was still unbound when json parsing
    # failed or the "health" action returned early, raising NameError from
    # inside finally and turning every such request into a 500.
    t1 = time.time()
    try:
        data = await request.json()
        action = data.get("action")
        if action == "health": return {"status": "awake", "v": "152"}

        print(f"--- [v152] πŸ› οΈ ENGINE ACTION: {action} ---")

        # πŸŽ™οΈ STT (CPU)
        stt_text = None
        if action in ["stt", "s2st"]:
            load_stt_cpu()
            audio_bytes = base64.b64decode(data.get("file"))
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                f.write(audio_bytes); temp_path = f.name
            try:
                lang = data.get("lang")
                # Whisper accepts short ISO codes only; anything longer means auto-detect.
                res = MODELS["stt"](temp_path, generate_kwargs={"language": lang if lang and len(lang) <= 3 else None})
                stt_text = res["text"].strip()
                if action == "stt": return {"text": stt_text}
            finally:
                if os.path.exists(temp_path): os.unlink(temp_path)

        # πŸ”Š TTS (Hybrid GPU)
        if action in ["tts", "s2st"]:
            text = (data.get("text") if action == "tts" else stt_text).strip()
            trans_text = text
            target = data.get("target_lang") or data.get("lang") or "en"

            if action == "s2st":
                trans_text = GoogleTranslator(source='auto', target=target).translate(stt_text)
                text = trans_text

            if len(text) < 2: return {"text": stt_text, "translated": "", "audio": ""} if action == "s2st" else {"audio": ""}

            # Languages XTTS v2 speaks natively; others fall back to Chatterbox.
            XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
            clean_lang = target.split('-')[0].lower()
            mapped_lang = XTTS_MAP.get(clean_lang) or ("zh-cn" if clean_lang == "zh" else None)

            if not mapped_lang:
                if HAS_CHATTERBOX:
                    audio_bytes = chatterbox_utils.run_chatterbox_inference(text, clean_lang)
                    audio_b64 = base64.b64encode(audio_bytes).decode()
                else: return {"error": f"Lang {clean_lang} unsupported"}
            else:
                speaker_wav_b64 = data.get("speaker_wav")
                speaker_path = None
                if speaker_wav_b64:
                    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                        f.write(base64.b64decode(speaker_wav_b64)); speaker_path = f.name
                else:
                    speaker_path = "default_speaker.wav"
                    if not os.path.exists(speaker_path): speaker_path = None

                try:
                    audio_b64 = gpu_tts_inference(text, mapped_lang, speaker_path)
                finally:
                    # Only delete caller-supplied speaker clips, never the bundled default.
                    if speaker_wav_b64 and speaker_path and os.path.exists(speaker_path): os.unlink(speaker_path)

            if action == "tts": return {"audio": audio_b64}
            return {"text": stt_text, "translated": trans_text, "audio": audio_b64}

    except Exception as e:
        print(f"❌ [v152] ERROR: {traceback.format_exc()}")
        return {"error": str(e)}
    finally:
        print(f"--- [v152] ✨ DONE ({time.time()-t1:.1f}s) ---")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
 
163
@app.post("/process")
@app.post("/api/v1/process")
async def api_process(request: Request):
    """Thin route wrapper delegating to the shared handler."""
    return await handle_process(request)
 
 
 
 
 
 
 
 
166
 
167
@app.get("/health")
def health():
    """Liveness probe reporting version and GPU availability."""
    gpu_available = torch.cuda.is_available()
    return {"status": "ok", "v": "152", "gpu": gpu_available}
170
 
171
@app.get("/", response_class=HTMLResponse)
def root():
    """Minimal HTML landing page."""
    return "<h1>πŸš€ AI Engine v152 (REFINED HYBRID)</h1>"
173
 
174
if __name__ == "__main__":
    # Serve the FastAPI app directly when executed as a script.
    uvicorn.run(app, host="0.0.0.0", port=7860)