TGPro1 committed on
Commit
0c19477
Β·
verified Β·
1 Parent(s): db6d860

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +41 -25
app.py CHANGED
@@ -10,15 +10,25 @@ from fastapi import FastAPI, Request
10
  from fastapi.middleware.cors import CORSMiddleware
11
  from fastapi.responses import HTMLResponse
12
  import uvicorn
 
 
 
 
 
13
  from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
14
  from TTS.api import TTS
15
-
16
- # --- [v141] πŸš€ HYBRID STABLE ENGINE (CPU STT + GPU TTS) ---
17
- print(f"--- [v141] πŸ“‘ BOOTING HYBRID ENGINE ---")
 
 
 
18
 
19
  try:
20
  import spaces
 
21
  except ImportError:
 
22
  class spaces:
23
  @staticmethod
24
  def GPU(duration=60, f=None):
@@ -38,12 +48,11 @@ def load_cpu_stt():
38
  """Loads Whisper on CPU for maximum stability."""
39
  global MODELS
40
  if MODELS.get("stt") is None:
41
- print("--- [v141] πŸ“₯ LOADING WHISPER ON CPU ---")
42
  model_id = "openai/whisper-large-v3-turbo"
43
- # CPU loading (fp32)
44
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
45
  model_id, torch_dtype=torch.float32, low_cpu_mem_usage=True
46
- )
47
  processor = AutoProcessor.from_pretrained(model_id)
48
  MODELS["stt"] = pipeline(
49
  "automatic-speech-recognition",
@@ -52,18 +61,18 @@ def load_cpu_stt():
52
  feature_extractor=processor.feature_extractor,
53
  device="cpu"
54
  )
55
- print("--- [v141] βœ… WHISPER LOADED (CPU) ---")
56
 
57
- @spaces.GPU(duration=90)
58
  def load_gpu_tts():
59
  """Loads XTTS on GPU for maximum speed."""
60
  global MODELS
61
  if MODELS.get("tts") is None:
62
- print("--- [v141] πŸ“₯ LOADING XTTS ON GPU ---")
63
  MODELS["tts"] = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda")
64
- print("--- [v141] βœ… XTTS LOADED (GPU) ---")
65
 
66
- def stt_process(audio_b64, lang):
67
  load_cpu_stt()
68
  audio_bytes = base64.b64decode(audio_b64)
69
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
@@ -74,20 +83,27 @@ def stt_process(audio_b64, lang):
74
  finally:
75
  if os.path.exists(temp_path): os.unlink(temp_path)
76
 
77
- @spaces.GPU(duration=90)
78
- def tts_process(text, target_lang, speaker_wav_b64=None):
79
  load_gpu_tts()
80
  XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
81
  clean_lang = target_lang.split('-')[0].lower()
82
  mapped_lang = XTTS_MAP.get(clean_lang) or ("zh-cn" if clean_lang == "zh" else None)
83
 
84
- if not mapped_lang: return {"error": f"Language {clean_lang} not supported."}
 
 
 
 
 
85
 
86
  speaker_wav_path = "default_speaker.wav"
87
  if speaker_wav_b64:
88
  sb = base64.b64decode(speaker_wav_b64)
89
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
90
  f.write(sb); speaker_wav_path = f.name
 
 
91
 
92
  try:
93
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_f: out_p = out_f.name
@@ -103,46 +119,46 @@ async def api_process(request: Request):
103
  try:
104
  data = await request.json()
105
  action = data.get("action")
106
- if action == "health": return {"status": "awake", "v": "141"}
107
 
108
- print(f"--- [v141] πŸ› οΈ HYBRID ENGINE: {action} ---")
109
  t1 = time.time()
110
 
111
- # πŸŽ™οΈ STT (CPU)
112
  stt_text = None
113
  if action in ["stt", "s2st"]:
114
- stt_text = stt_process(data.get("file"), data.get("lang"))
115
  if action == "stt": return {"text": stt_text}
116
 
117
- # πŸ”Š TTS (GPU)
118
  if action in ["tts", "s2st"]:
119
  text = (data.get("text") if action == "tts" else stt_text).strip()
120
  trans_text = text
 
121
  if action == "s2st":
122
- from deep_translator import GoogleTranslator
123
  target = data.get("target_lang") or "en"
124
  trans_text = GoogleTranslator(source='auto', target=target).translate(stt_text)
125
  text = trans_text
126
 
127
  if len(text) < 2: return {"audio": ""} if action == "tts" else {"text": stt_text, "translated": "", "audio": ""}
128
 
129
- audio_res = tts_process(text, (data.get("lang") if action == "tts" else data.get("target_lang")), data.get("speaker_wav"))
130
  if isinstance(audio_res, dict) and "error" in audio_res: return audio_res
131
 
132
  if action == "tts": return {"audio": audio_res}
133
  return {"text": stt_text, "translated": trans_text, "audio": audio_res}
134
 
135
  except Exception as e:
136
- print(f"❌ [v141] ERROR: {traceback.format_exc()}")
137
  return {"error": str(e)}
138
  finally:
139
- print(f"--- [v141] ✨ DONE ---")
140
 
141
  @app.get("/health")
142
- def health(): return {"status": "ok", "v": "141", "mode": "HYBRID"}
143
 
144
  @app.get("/", response_class=HTMLResponse)
145
- def root(): return "<html><body><h1>πŸš€ AI Engine v141 (HYBRID STABLE)</h1></body></html>"
146
 
147
  if __name__ == "__main__":
148
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
10
  from fastapi.middleware.cors import CORSMiddleware
11
  from fastapi.responses import HTMLResponse
12
  import uvicorn
13
+
14
+ # --- [v142] πŸš€ REFINED HYBRID ENGINE (CPU STT + GPU TTS) ---
15
+ print(f"--- [v142] πŸ“‘ BOOTING REFINED HYBRID ---")
16
+
17
+ # Top-level imports for stability
18
  from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
19
  from TTS.api import TTS
20
+ from deep_translator import GoogleTranslator
21
# Optional dependency: chatterbox_utils supplies a fallback TTS path for
# languages XTTS cannot handle. Its absence is tolerated; HAS_CHATTERBOX
# gates every use site so the app still boots without it.
try:
    import chatterbox_utils
    HAS_CHATTERBOX = True
except ImportError:
    HAS_CHATTERBOX = False
26
 
27
  try:
28
  import spaces
29
+ HAS_SPACES = True
30
  except ImportError:
31
+ HAS_SPACES = False
32
  class spaces:
33
  @staticmethod
34
  def GPU(duration=60, f=None):
 
48
  """Loads Whisper on CPU for maximum stability."""
49
  global MODELS
50
  if MODELS.get("stt") is None:
51
+ print("--- [v142] πŸ“₯ LOADING WHISPER ON CPU ---")
52
  model_id = "openai/whisper-large-v3-turbo"
 
53
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
54
  model_id, torch_dtype=torch.float32, low_cpu_mem_usage=True
55
+ ) # CPU handles FP32 best
56
  processor = AutoProcessor.from_pretrained(model_id)
57
  MODELS["stt"] = pipeline(
58
  "automatic-speech-recognition",
 
61
  feature_extractor=processor.feature_extractor,
62
  device="cpu"
63
  )
64
+ print("--- [v142] βœ… WHISPER LOADED (CPU) ---")
65
 
66
@spaces.GPU(duration=120)
def load_gpu_tts():
    """Lazily load XTTS v2 onto the GPU, caching the instance in MODELS["tts"].

    Idempotent: repeat calls return immediately once the model is cached.
    """
    global MODELS
    if MODELS.get("tts") is not None:
        return  # already loaded; skip the expensive GPU load
    print("--- [v142] 📥 LOADING XTTS ON GPU ---")
    MODELS["tts"] = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda")
    print("--- [v142] ✅ XTTS LOADED (GPU) ---")
74
 
75
+ def stt_process_internal(audio_b64, lang):
76
  load_cpu_stt()
77
  audio_bytes = base64.b64decode(audio_b64)
78
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
 
83
  finally:
84
  if os.path.exists(temp_path): os.unlink(temp_path)
85
 
86
+ @spaces.GPU(duration=120)
87
+ def tts_process_internal(text, target_lang, speaker_wav_b64=None):
88
  load_gpu_tts()
89
  XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
90
  clean_lang = target_lang.split('-')[0].lower()
91
  mapped_lang = XTTS_MAP.get(clean_lang) or ("zh-cn" if clean_lang == "zh" else None)
92
 
93
+ if not mapped_lang:
94
+ if HAS_CHATTERBOX:
95
+ print(f"--- [v142] πŸ“¦ USING CHATTERBOX FOR {clean_lang} ---")
96
+ audio_bytes = chatterbox_utils.run_chatterbox_inference(text, clean_lang)
97
+ return base64.b64encode(audio_bytes).decode()
98
+ return {"error": f"Language {clean_lang} not supported."}
99
 
100
  speaker_wav_path = "default_speaker.wav"
101
  if speaker_wav_b64:
102
  sb = base64.b64decode(speaker_wav_b64)
103
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
104
  f.write(sb); speaker_wav_path = f.name
105
+ elif not os.path.exists(speaker_wav_path):
106
+ speaker_wav_path = None
107
 
108
  try:
109
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_f: out_p = out_f.name
 
119
  try:
120
  data = await request.json()
121
  action = data.get("action")
122
+ if action == "health": return {"status": "awake", "v": "142", "gpu": HAS_SPACES}
123
 
124
+ print(f"--- [v142] πŸ› οΈ HYBRID ENGINE: {action} ---")
125
  t1 = time.time()
126
 
127
+ # πŸŽ™οΈ STT (CPU Segment)
128
  stt_text = None
129
  if action in ["stt", "s2st"]:
130
+ stt_text = stt_process_internal(data.get("file"), data.get("lang"))
131
  if action == "stt": return {"text": stt_text}
132
 
133
+ # πŸ”Š TTS (GPU Segment)
134
  if action in ["tts", "s2st"]:
135
  text = (data.get("text") if action == "tts" else stt_text).strip()
136
  trans_text = text
137
+
138
  if action == "s2st":
 
139
  target = data.get("target_lang") or "en"
140
  trans_text = GoogleTranslator(source='auto', target=target).translate(stt_text)
141
  text = trans_text
142
 
143
  if len(text) < 2: return {"audio": ""} if action == "tts" else {"text": stt_text, "translated": "", "audio": ""}
144
 
145
+ audio_res = tts_process_internal(text, (data.get("lang") if action == "tts" else target), data.get("speaker_wav"))
146
  if isinstance(audio_res, dict) and "error" in audio_res: return audio_res
147
 
148
  if action == "tts": return {"audio": audio_res}
149
  return {"text": stt_text, "translated": trans_text, "audio": audio_res}
150
 
151
  except Exception as e:
152
+ print(f"❌ [v142] ERROR: {traceback.format_exc()}")
153
  return {"error": str(e)}
154
  finally:
155
+ print(f"--- [v142] ✨ DONE ({time.time()-t1:.1f}s) ---")
156
 
157
  @app.get("/health")
158
+ def health(): return {"status": "ok", "v": "142", "mode": "HYBRID", "gpu": HAS_SPACES}
159
 
160
  @app.get("/", response_class=HTMLResponse)
161
+ def root(): return "<html><body><h1>πŸš€ AI Engine v142 (PRODUCTION HYBRID)</h1></body></html>"
162
 
163
  if __name__ == "__main__":
164
  uvicorn.run(app, host="0.0.0.0", port=7860)