TGPro1 committed on
Commit
69d9eef
·
verified ·
1 Parent(s): f494f8d

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +78 -80
app.py CHANGED
@@ -6,145 +6,143 @@ import torch
6
  import tempfile
7
  import traceback
8
  import gc
9
- from fastapi import FastAPI, Request, Response
10
  from fastapi.middleware.cors import CORSMiddleware
11
  from fastapi.responses import HTMLResponse
12
  import uvicorn
13
  from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
14
  from TTS.api import TTS
15
 
16
- # --- [v140] πŸš€ H200 CORE STABILIZATION (Dynamic Workspace + Explicit FP32) ---
17
- print(f"--- [v140] πŸ“‘ BOOTING CORE ENGINE ---")
18
 
19
  try:
20
  import spaces
21
- HAS_SPACES = True
22
  except ImportError:
23
- HAS_SPACES = False
24
  class spaces:
25
  @staticmethod
26
  def GPU(duration=60, f=None):
27
  if f is None: return lambda x: x
28
  return f
29
 
30
- # --- System Config ---
31
  os.environ["COQUI_TOS_AGREED"] = "1"
32
  os.environ["PYTHONWARNINGS"] = "ignore"
33
- # REMOVED: CUBLAS_WORKSPACE_CONFIG (Let the driver decide)
34
- torch.backends.cuda.matmul.allow_tf32 = True # Allow TF32 for better alignment on Hopper
35
- torch.backends.cudnn.allow_tf32 = True
36
- torch.backends.cudnn.benchmark = False # Avoid erratic kernel selection
37
 
38
  app = FastAPI()
39
  app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
40
 
41
  MODELS = {"stt": None, "tts": None}
42
 
43
- def load_gpu_models():
44
- """Persistent loading with explicit casting."""
45
  global MODELS
46
- device = "cuda"
47
-
48
  if MODELS.get("stt") is None:
49
- print("--- [v140] πŸ“₯ LOADING WHISPER (EXPLICIT FP32) ---")
50
  model_id = "openai/whisper-large-v3-turbo"
51
- # Load and force cast to float()
52
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
53
- model_id, low_cpu_mem_usage=True, use_safetensors=True
54
- ).to(device).float() # FORCE FLOAT32
55
-
56
  processor = AutoProcessor.from_pretrained(model_id)
57
-
58
  MODELS["stt"] = pipeline(
59
  "automatic-speech-recognition",
60
  model=model,
61
  tokenizer=processor.tokenizer,
62
  feature_extractor=processor.feature_extractor,
63
- torch_dtype=torch.float32,
64
- device=device,
65
- model_kwargs={"attn_implementation": "eager"}
66
  )
67
- print("--- [v140] βœ… WHISPER LOADED (FORCED FP32) ---")
68
 
69
- if MODELS.get("tts") is None:
70
- print("--- [v140] πŸ“₯ LOADING XTTS ---")
71
- MODELS["tts"] = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
72
- print("--- [v140] βœ… XTTS LOADED ---")
73
-
74
- @spaces.GPU(duration=120)
75
- def core_process(request_dict):
76
  global MODELS
77
- action = request_dict.get("action")
78
- print(f"--- [v140] πŸ› οΈ CORE ENGINE: {action} ---")
79
- t1 = time.time()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
 
 
 
 
 
 
81
  try:
82
- load_gpu_models()
 
 
 
 
 
 
83
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  if action in ["stt", "s2st"]:
85
- audio_bytes = base64.b64decode(request_dict.get("file"))
86
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
87
- f.write(audio_bytes); temp_path = f.name
88
- try:
89
- lang = request_dict.get("lang")
90
- result = MODELS["stt"](temp_path, batch_size=1, generate_kwargs={"language": lang if lang and len(lang) <= 3 else None})
91
- stt_text = result["text"].strip()
92
- finally:
93
- if os.path.exists(temp_path): os.unlink(temp_path)
94
-
95
  if action == "stt": return {"text": stt_text}
96
 
 
97
  if action in ["tts", "s2st"]:
98
- text = (request_dict.get("text") if action == "tts" else stt_text).strip()
99
  trans_text = text
100
  if action == "s2st":
101
  from deep_translator import GoogleTranslator
102
- target = request_dict.get("target_lang") or "en"
103
- text = GoogleTranslator(source='auto', target=target).translate(stt_text)
104
- trans_text = text
105
 
106
  if len(text) < 2: return {"audio": ""} if action == "tts" else {"text": stt_text, "translated": "", "audio": ""}
107
-
108
- XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
109
- raw_lang = (request_dict.get("lang") if action == "tts" else target).strip().lower()
110
- clean_lang = raw_lang.split('-')[0]
111
- mapped_lang = XTTS_MAP.get(clean_lang) or ("zh-cn" if clean_lang == "zh" else None)
112
 
113
- if mapped_lang:
114
- speaker_wav_path = "default_speaker.wav"
115
- if not os.path.exists(speaker_wav_path): speaker_wav_path = None
116
- try:
117
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_f: out_p = out_f.name
118
- MODELS["tts"].tts_to_file(text=text, language=mapped_lang, file_path=out_p, speaker_wav=speaker_wav_path)
119
- with open(out_p, "rb") as f: audio_b64 = base64.b64encode(f.read()).decode()
120
- finally:
121
- if 'out_p' in locals() and os.path.exists(out_p): os.unlink(out_p)
122
- else: return {"error": f"Language {clean_lang} not supported."}
123
 
124
- if action == "tts": return {"audio": audio_b64}
125
- return {"text": stt_text, "translated": trans_text, "audio": audio_b64}
126
 
127
  except Exception as e:
128
- print(f"❌ [v140] ERROR: {traceback.format_exc()}")
129
  return {"error": str(e)}
130
  finally:
131
- print(f"--- [v140] ✨ DONE ({time.time()-t1:.1f}s) ---")
132
- torch.cuda.empty_cache()
133
-
134
- @app.post("/process")
135
- async def api_process(request: Request):
136
- try:
137
- data = await request.json()
138
- if data.get("action") == "health": return {"status": "awake", "v": "140"}
139
- return core_process(data)
140
- except Exception as e: return {"error": str(e)}
141
 
142
  @app.get("/health")
143
- def health(): return {"status": "ok", "v": "140", "gpu": HAS_SPACES}
144
 
145
  @app.get("/", response_class=HTMLResponse)
146
- def root():
147
- return f"<html><body><h1>πŸš€ AI Engine v140 (STABLE BASELINE)</h1><p>GPU: {HAS_SPACES}</p></body></html>"
148
 
149
  if __name__ == "__main__":
150
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
6
  import tempfile
7
  import traceback
8
  import gc
9
+ from fastapi import FastAPI, Request
10
  from fastapi.middleware.cors import CORSMiddleware
11
  from fastapi.responses import HTMLResponse
12
  import uvicorn
13
  from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
14
  from TTS.api import TTS
15
 
16
+ # --- [v141] πŸš€ HYBRID STABLE ENGINE (CPU STT + GPU TTS) ---
17
+ print(f"--- [v141] πŸ“‘ BOOTING HYBRID ENGINE ---")
18
 
19
  try:
20
  import spaces
 
21
  except ImportError:
 
22
  class spaces:
23
  @staticmethod
24
  def GPU(duration=60, f=None):
25
  if f is None: return lambda x: x
26
  return f
27
 
28
+ # --- Global Sync ---
29
  os.environ["COQUI_TOS_AGREED"] = "1"
30
  os.environ["PYTHONWARNINGS"] = "ignore"
 
 
 
 
31
 
32
  app = FastAPI()
33
  app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
34
 
35
  MODELS = {"stt": None, "tts": None}
36
 
37
def load_cpu_stt():
    """Lazily initialise the Whisper ASR pipeline on the CPU.

    The pipeline is built once and cached in the module-level MODELS dict;
    subsequent calls are no-ops.
    """
    global MODELS
    if MODELS.get("stt") is not None:
        return  # already loaded
    print("--- [v141] πŸ“₯ LOADING WHISPER ON CPU ---")
    model_id = "openai/whisper-large-v3-turbo"
    # Full-precision CPU weights; low_cpu_mem_usage streams shards to keep peak RSS down.
    whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch.float32, low_cpu_mem_usage=True
    )
    whisper_processor = AutoProcessor.from_pretrained(model_id)
    MODELS["stt"] = pipeline(
        "automatic-speech-recognition",
        model=whisper_model,
        tokenizer=whisper_processor.tokenizer,
        feature_extractor=whisper_processor.feature_extractor,
        device="cpu",
    )
    print("--- [v141] βœ… WHISPER LOADED (CPU) ---")
56
 
57
@spaces.GPU(duration=90)
def load_gpu_tts():
    """Lazily initialise XTTS-v2 on CUDA; the instance is cached in MODELS."""
    global MODELS
    if MODELS.get("tts") is not None:
        return  # already loaded
    print("--- [v141] πŸ“₯ LOADING XTTS ON GPU ---")
    MODELS["tts"] = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda")
    print("--- [v141] βœ… XTTS LOADED (GPU) ---")
65
+
66
def stt_process(audio_b64, lang):
    """Transcribe a base64-encoded audio payload with the CPU Whisper pipeline.

    Args:
        audio_b64: base64 string of the uploaded audio.
        lang: optional language hint; only forwarded when it looks like a
            short ISO code (<= 3 chars), otherwise Whisper auto-detects.

    Returns:
        The stripped transcription text.
    """
    load_cpu_stt()
    raw_audio = base64.b64decode(audio_b64)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp.write(raw_audio)
        wav_path = tmp.name
    try:
        hint = lang if lang and len(lang) <= 3 else None
        transcript = MODELS["stt"](wav_path, generate_kwargs={"language": hint})
        return transcript["text"].strip()
    finally:
        # Temp file was created with delete=False; clean it up ourselves.
        if os.path.exists(wav_path):
            os.unlink(wav_path)
76
+
77
@spaces.GPU(duration=90)
def tts_process(text, target_lang, speaker_wav_b64=None):
    """Synthesise speech for *text* with XTTS-v2 on the GPU.

    Args:
        text: text to speak.
        target_lang: language tag such as "en" or "pt-BR"; may be None/empty.
        speaker_wav_b64: optional base64 WAV used as the voice-cloning reference.

    Returns:
        Base64-encoded WAV audio on success, or {"error": ...} for an
        unsupported language.
    """
    load_gpu_tts()
    # Languages supported by XTTS-v2 (values are the model's own codes).
    XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it",
                "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl",
                "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi",
                "zh": "zh-cn"}
    # FIX: tolerate a missing/None language instead of raising AttributeError
    # (the /process endpoint passes data.get("lang"), which may be absent).
    clean_lang = (target_lang or "").split('-')[0].strip().lower()
    # NOTE: the old `or ("zh-cn" if clean_lang == "zh" else None)` fallback was
    # dead code — "zh" is already in the map — and has been removed.
    mapped_lang = XTTS_MAP.get(clean_lang)
    if not mapped_lang:
        return {"error": f"Language {clean_lang} not supported."}

    speaker_wav_path = "default_speaker.wav"
    if speaker_wav_b64:
        sb = base64.b64decode(speaker_wav_b64)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            f.write(sb)
            speaker_wav_path = f.name
    elif not os.path.exists(speaker_wav_path):
        # FIX: restore the v140 guard — fall back to the model's built-in voice
        # when no reference clip is bundled, instead of handing XTTS a missing path.
        speaker_wav_path = None

    out_p = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_f:
            out_p = out_f.name
        MODELS["tts"].tts_to_file(text=text, language=mapped_lang,
                                  file_path=out_p, speaker_wav=speaker_wav_path)
        with open(out_p, "rb") as f:
            return base64.b64encode(f.read()).decode()
    finally:
        # Remove the decoded reference clip (only when we created one) and the
        # output WAV — its bytes are already captured in the return value.
        if speaker_wav_b64 and os.path.exists(speaker_wav_path):
            os.unlink(speaker_wav_path)
        if out_p and os.path.exists(out_p):
            os.unlink(out_p)
100
 
101
@app.post("/process")
async def api_process(request: Request):
    """Single JSON endpoint dispatching the supported actions.

    Expected JSON body: {"action": "health" | "stt" | "tts" | "s2st", ...}
      stt:  file (base64 wav), lang
      tts:  text, lang, optional speaker_wav (base64)
      s2st: file, lang, target_lang, optional speaker_wav

    Always answers HTTP 200 with a JSON dict; failures are reported as
    {"error": ...} rather than raised.
    """
    # Started before the try so the `finally` log below can never hit an
    # unbound name on early-exit paths (e.g. a failed request.json()).
    t1 = time.time()
    try:
        data = await request.json()
        action = data.get("action")
        if action == "health":
            return {"status": "awake", "v": "141"}

        print(f"--- [v141] πŸ› οΈ HYBRID ENGINE: {action} ---")

        # πŸŽ™οΈ STT (CPU)
        stt_text = None
        if action in ["stt", "s2st"]:
            stt_text = stt_process(data.get("file"), data.get("lang"))
        if action == "stt":
            return {"text": stt_text}

        # πŸ”Š TTS (GPU)
        if action in ["tts", "s2st"]:
            # FIX: guard against a missing "text" field — None.strip() used to
            # crash a bare tts request straight into the error handler.
            text = ((data.get("text") if action == "tts" else stt_text) or "").strip()
            trans_text = text
            if action == "s2st":
                from deep_translator import GoogleTranslator
                target = data.get("target_lang") or "en"
                trans_text = GoogleTranslator(source='auto', target=target).translate(stt_text)
                text = trans_text

            # Nothing meaningful to synthesise: return an empty-audio payload.
            if len(text) < 2:
                return {"audio": ""} if action == "tts" else {"text": stt_text, "translated": "", "audio": ""}

            audio_res = tts_process(
                text,
                (data.get("lang") if action == "tts" else data.get("target_lang")),
                data.get("speaker_wav"),
            )
            # tts_process signals unsupported languages with an error dict.
            if isinstance(audio_res, dict) and "error" in audio_res:
                return audio_res

            if action == "tts":
                return {"audio": audio_res}
            return {"text": stt_text, "translated": trans_text, "audio": audio_res}

    except Exception as e:
        print(f"❌ [v141] ERROR: {traceback.format_exc()}")
        return {"error": str(e)}
    finally:
        # FIX: t1 was captured but never used — restore the per-request
        # timing that the v140 DONE log carried.
        print(f"--- [v141] ✨ DONE ({time.time() - t1:.1f}s) ---")
 
 
 
 
 
 
 
 
 
140
 
141
@app.get("/health")
def health():
    """Liveness probe: static payload identifying the hybrid build."""
    return {"status": "ok", "v": "141", "mode": "HYBRID"}
143
 
144
@app.get("/", response_class=HTMLResponse)
def root():
    """Minimal HTML landing page for the Space."""
    return "<html><body><h1>πŸš€ AI Engine v141 (HYBRID STABLE)</h1></body></html>"
 
146
 
147
# Entry point: serve the ASGI app directly (7860 is the Hugging Face Spaces default port).
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)