File size: 8,710 Bytes
7454cce
 
811f60e
 
 
 
 
6fd2aaf
7bc2a36
69d9eef
811f60e
 
0c19477
0695186
 
 
6fd2aaf
6ce2f3c
6fd2aaf
6ce2f3c
3d20be5
6ce2f3c
 
 
 
 
7bc2a36
6ce2f3c
 
4bfa772
6ce2f3c
 
 
 
 
0695186
0c19477
971c294
6a4b0e8
0c19477
376aa42
6a4b0e8
 
 
 
 
 
ea78b72
 
 
 
 
 
 
 
 
 
 
 
2ebc6b4
 
 
811f60e
 
 
4bfa772
b93eba5
4bfa772
 
 
 
 
 
971c294
7bc2a36
4bfa772
71c50e8
4bfa772
7bc2a36
cb0d204
0695186
7bc2a36
0695186
 
971c294
 
 
 
 
 
0695186
7bc2a36
971c294
7bc2a36
0695186
7bc2a36
 
0695186
7bc2a36
 
0695186
 
0af9862
7bc2a36
0695186
 
 
 
 
971c294
639ffca
4bfa772
 
7bc2a36
4bfa772
7bc2a36
 
0695186
7bc2a36
4bfa772
 
 
 
0695186
7bc2a36
 
 
 
 
 
 
 
4bfa772
 
 
 
 
 
7bc2a36
 
639ffca
d8da089
639ffca
 
 
0695186
639ffca
0695186
639ffca
22c6fab
4bfa772
639ffca
6ce2f3c
4bfa772
 
6ce2f3c
23b6539
7bc2a36
639ffca
4bfa772
 
639ffca
 
4bfa772
22c6fab
639ffca
4bfa772
639ffca
 
6ce2f3c
 
639ffca
7bc2a36
4bfa772
639ffca
0695186
639ffca
 
0695186
639ffca
 
 
 
 
 
 
 
 
4bfa772
639ffca
 
 
 
 
7bc2a36
639ffca
 
 
 
 
4bfa772
 
639ffca
 
 
 
 
71c50e8
639ffca
0695186
639ffca
 
0695186
b0d71b5
 
 
639ffca
32297a1
811f60e
7bc2a36
 
4bfa772
0695186
4bfa772
0af9862
0695186
 
0af9862
7bc2a36
811f60e
 
0695186
e0a0f24
92366fd
811f60e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
import os
import sys
import time
import base64
import torch
import tempfile
import traceback
import uvicorn
import gc
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse

# --- [v164] πŸš€ PRO GPU ENGINE (ULTIMATE STABILITY) ---
# This version enforces float32 for STT to avoid CUBLAS errors and uses batch_size=1.
print(f"--- [v164] πŸ“‘ BOOTING ENGINE ---")

# πŸ› οΈ CRITICAL: TORCHAUDIO MONKEYPATCH πŸ› οΈ
import torchaudio
import soundfile as sf
def HeroLoad(filepath, **kwargs):
    """Drop-in replacement for ``torchaudio.load`` backed by soundfile.

    Returns ``(tensor, samplerate)`` with tensor shape ``(channels, frames)``,
    matching torchaudio's convention. On any failure it logs and falls back to
    the original loader saved as ``torchaudio.load_orig``.
    """
    try:
        data, samplerate = sf.read(filepath)
        if data.ndim == 1:
            # Mono: soundfile yields (frames,); torchaudio expects (1, frames).
            data = data.reshape(1, -1)
        else:
            # Multi-channel: soundfile is (frames, channels); transpose to
            # (channels, frames) for torchaudio compatibility.
            data = data.T
        return torch.from_numpy(data).float(), samplerate
    except Exception as e:
        # Version tag fixed from stale [v162] to match the rest of the file.
        print(f"--- [v164] ❌ PATCHED LOAD FAILED: {e} ---")
        return torchaudio.load_orig(filepath, **kwargs)

# Apply the monkeypatch exactly once: stash the original loader under
# `load_orig` (so HeroLoad can fall back to it) and swap in HeroLoad.
# The hasattr guard makes re-imports / hot reloads idempotent.
if not hasattr(torchaudio, 'load_orig'):
    torchaudio.load_orig = torchaudio.load
    torchaudio.load = HeroLoad
    print("--- [v164] 🩹 TORCHAUDIO PATCH APPLIED ---")

from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
from TTS.api import TTS
from deep_translator import GoogleTranslator

try:
    # Optional fallback TTS engine for languages XTTS v2 does not cover.
    import chatterbox_utils
    HAS_CHATTERBOX = True
except ImportError:
    HAS_CHATTERBOX = False

try:
    # Hugging Face Spaces ZeroGPU decorator (allocates a GPU per decorated call).
    import spaces
    HAS_SPACES = True
except ImportError:
    HAS_SPACES = False
    # Local/dev shim: keeps `@spaces.GPU(duration=...)` usable as a no-op decorator.
    class spaces:
        @staticmethod
        def GPU(duration=60):
            def decorator(func):
                return func
            return decorator

os.environ["COQUI_TOS_AGREED"] = "1"  # auto-accept the Coqui license prompt (non-interactive container)
os.environ["PYTHONWARNINGS"] = "ignore"

app = FastAPI()
# Wide-open CORS: the engine is called cross-origin from a browser front end.
app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])

# Lazily-populated model cache shared across requests (per worker process).
MODELS = {"stt": None, "tts": None, "gpu_id": 0}

def get_best_gpu():
    """Return the torch device string to run inference on.

    Architectural switch point for future multi-GPU selection; today it
    resolves to the cached `gpu_id` (on ZeroGPU that is the allocated MIG
    instance) or "cpu" when CUDA is absent.
    """
    if torch.cuda.is_available():
        return f"cuda:{MODELS['gpu_id']}"
    return "cpu"

@spaces.GPU(duration=120)
def gpu_stt_full(temp_path, lang):
    global MODELS
    device = get_best_gpu()
    
    if MODELS.get("stt") is None:
        print(f"--- [v164] πŸ“₯ LOADING WHISPER LARGE (FP32) ON {device} ---")
        model_id = "openai/whisper-large-v3-turbo"
        # Using float32 to resolve CUBLAS_STATUS_EXECUTION_FAILED on H200/A10G MIG
        model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch.float32).to(device)
        processor = AutoProcessor.from_pretrained(model_id)
        MODELS["stt"] = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            chunk_length_s=30, 
            device=device
        )
    
    print(f"--- [v164] πŸŽ™οΈ WHISPER INFERENCE (TEMP 0, BS 1) ---")
    res = MODELS["stt"](
        temp_path, 
        batch_size=1, # Ultimate stability
        generate_kwargs={
            "language": lang if lang and len(lang) <= 3 else None,
            "temperature": 0.0,
            "return_timestamps": True
        }
    )
    
    # Post-inference cleanup
    torch.cuda.empty_cache()
    gc.collect()
    
    return res["text"].strip()

@spaces.GPU(duration=180)
def gpu_tts_full(text, mapped_lang, speaker_path):
    """Synthesize `text` with XTTS v2 on the GPU and return base64-encoded wav.

    Lazily loads the model into the module-level MODELS cache on first call.
    `mapped_lang` must be an XTTS-supported language code; `speaker_path` is an
    optional reference wav for voice cloning (None uses the default voice).
    """
    global MODELS
    device = "cuda"
    
    if MODELS.get("tts") is None:
        print(f"--- [v164] 📥 LOADING XTTS V2 ON GPU ---")
        MODELS["tts"] = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
    else:
        # Re-pin the cached model to the GPU; best-effort because the worker
        # may have been re-allocated a different device between calls.
        try:
            MODELS["tts"].to(device)
        except Exception:  # narrowed from bare except (was swallowing SystemExit etc.)
            pass

    print(f"--- [v164] 🔊 XTTS GPU INFERENCE ---")
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_f:
        out_p = out_f.name
    
    try:
        MODELS["tts"].tts_to_file(text=text, language=mapped_lang, file_path=out_p, speaker_wav=speaker_path)
        with open(out_p, "rb") as f:
            audio_b64 = base64.b64encode(f.read()).decode()
    finally:
        # Always remove the temp wav — it was previously leaked when
        # tts_to_file raised before the unlink line.
        if os.path.exists(out_p):
            os.unlink(out_p)
    
    # Cleanup to prevent ZeroGPU worker errors
    torch.cuda.empty_cache()
    gc.collect()
    
    return audio_b64

async def handle_process(request: Request):
    """Handle one engine request.

    Expected JSON body:
      action: "health" | "stt" | "tts" | "s2st"
      file: base64 wav audio (stt/s2st) · text: input text (tts)
      lang / target_lang: language codes · speaker_wav: base64 reference voice
    Always returns a dict; every failure is reported as {"error": ...}
    rather than raised, so the HTTP layer never sees an exception.
    """
    t1 = time.time()
    try:
        data = await request.json()
        action = data.get("action")
        if action == "health": return {"status": "awake", "v": "164"}
        # Guard before action.upper(): a missing action previously raised
        # AttributeError and surfaced as a generic engine error.
        if not action: return {"error": "Missing action"}
        
        print(f"--- [v164] 🛠️ API REQUEST: {action.upper()} ---")
        
        stt_text = ""
        # 🟢 SPEECH-TO-TEXT
        if action in ["stt", "s2st"]:
            audio_b64 = data.get("file")
            if not audio_b64: return {"error": "Missing audio data"}
            
            # Whisper wants a file path, so spill the decoded audio to a temp wav.
            audio_bytes = base64.b64decode(audio_b64)
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                f.write(audio_bytes); temp_path = f.name
            try:
                stt_text = gpu_stt_full(temp_path, data.get("lang"))
                print(f"--- [v164] 🎙️ TEXT: {stt_text[:100]}... ---")
            finally:
                if os.path.exists(temp_path): os.unlink(temp_path)
            
            if action == "stt": return {"text": stt_text}

        # 🔵 TEXT-TO-SPEECH
        if action in ["tts", "s2st"]:
            text = (data.get("text") if action == "tts" else stt_text).strip()
            if not text: return {"error": "Input text is empty"}
            
            target = data.get("target_lang") or data.get("lang") or "en"
            trans_text = text
            
            if action == "s2st":
                print(f"--- [v164] 🌏 TRANSLATING TO {target} ---")
                trans_text = GoogleTranslator(source='auto', target=target).translate(stt_text)
                text = trans_text
                print(f"--- [v164] 📝 TRANS: {text[:100]}... ---")

            # Languages natively supported by XTTS v2; anything else falls back
            # to Chatterbox when that module is installed.
            XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
            clean_lang = target.split('-')[0].lower()
            mapped_lang = XTTS_MAP.get(clean_lang) or ("zh-cn" if clean_lang == "zh" else None)
            
            if not mapped_lang:
                if HAS_CHATTERBOX:
                    audio_bytes = chatterbox_utils.run_chatterbox_inference(text, clean_lang)
                    audio_b64 = base64.b64encode(audio_bytes).decode()
                else: return {"error": f"Language {clean_lang} not supported by XTTS/Chatterbox"}
            else:
                # Voice-cloning reference: caller-supplied sample, else a bundled
                # default speaker file, else None (XTTS's built-in default voice).
                speaker_wav_b64 = data.get("speaker_wav")
                speaker_path = None
                if speaker_wav_b64:
                    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                        f.write(base64.b64decode(speaker_wav_b64)); speaker_path = f.name
                else:
                    speaker_path = "default_speaker.wav"
                    if not os.path.exists(speaker_path): speaker_path = None
                
                try:
                    # EXECUTE GPU TTS
                    audio_b64 = gpu_tts_full(text, mapped_lang, speaker_path)
                finally:
                    # Only delete caller-uploaded temp samples, never the bundled default.
                    if speaker_wav_b64 and speaker_path and os.path.exists(speaker_path): os.unlink(speaker_path)
            
            if action == "tts": return {"audio": audio_b64}
            return {"text": stt_text, "translated": trans_text, "audio": audio_b64}

        # Unknown action: previously fell through and the endpoint returned null.
        return {"error": f"Unknown action: {action}"}

    except Exception as e:
        print(f"❌ [v164] ENGINE ERROR: {traceback.format_exc()}")
        return {"error": str(e)}
    finally:
        print(f"--- [v164] ✨ MISSION COMPLETED ({time.time()-t1:.1f}s) ---")

# Legacy /process and versioned /api/v1/process routes share one handler.
@app.post("/process")
@app.post("/api/v1/process")
async def api_process(request: Request): return await handle_process(request)

@app.get("/health")
def health():
    """Readiness probe: engine version plus live CUDA device information."""
    report = {
        "status": "ready",
        "v": "164",
        "gpu": torch.cuda.is_available(),
        "devices": torch.cuda.device_count(),
        "engine": "Full GPU PRO (Stable)",
        "stt": "Whisper-v3-Turbo (FP32-GPU)",
        "tts": "XTTS-v2 (GPU)",
    }
    return report

@app.get("/", response_class=HTMLResponse)
def root():
    """Minimal landing page for a human-visible liveness check."""
    return "<h1>🚀 PRO AI Engine v164 (GPU MODE)</h1>"

# Standalone entry point: serve on 0.0.0.0:7860 (the Hugging Face Spaces default port).
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)