talha77 commited on
Commit
23193ac
·
verified ·
1 Parent(s): e1abe38

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +266 -259
app.py CHANGED
@@ -1,260 +1,267 @@
1
- import asyncio
2
- import os
3
- import tempfile
4
- import time
5
-
6
- from fastapi import FastAPI, File, Form, UploadFile, HTTPException
7
- from fastapi.responses import StreamingResponse, JSONResponse
8
-
9
- # Ensure OMP_NUM_THREADS is a valid positive integer before importing heavy libs.
10
- _omp_val = os.environ.get("OMP_NUM_THREADS")
11
- if not _omp_val or not _omp_val.isdigit() or int(_omp_val) <= 0:
12
- os.environ["OMP_NUM_THREADS"] = "1"
13
-
14
- from auralis import TTS, TTSRequest, AudioPreprocessingConfig
15
-
16
-
17
- BASE_DIR = os.path.dirname(os.path.abspath(__file__))
18
-
19
- # Default reference voices (you must add these files to the repo)
20
- DEFAULT_MALE_VOICE = os.path.join(BASE_DIR, "malear.mp3")
21
- DEFAULT_FEMALE_VOICE = os.path.join(BASE_DIR, "femalten.wav")
22
-
23
-
24
- app = FastAPI(
25
- title="Auralis XTTS2-GPT TTS API",
26
- version="1.1.0",
27
- )
28
-
29
- tts: TTS | None = None
30
-
31
-
32
- @app.get("/")
33
- async def root():
34
- """
35
- Basic root endpoint so that GET / returns 200 instead of 404.
36
- Useful for Hugging Face's automatic health/log checks.
37
- """
38
- return {"status": "ok", "message": "Auralis TTS running. See /docs for API usage."}
39
-
40
-
41
- @app.on_event("startup")
42
- async def load_model() -> None:
43
- """
44
- Load the XTTSv2 + GPT model once when the application starts.
45
-
46
- We create the model inside a worker thread so Auralis can freely
47
- manage its own event loop without conflicting with FastAPI/uvicorn.
48
- """
49
- global tts
50
-
51
- if tts is not None:
52
- return
53
-
54
- loop = asyncio.get_event_loop()
55
-
56
- def _init_model() -> TTS:
57
- return TTS().from_pretrained(
58
- "AstraMindAI/xttsv2",
59
- gpt_model="AstraMindAI/xtts2-gpt",
60
- )
61
-
62
- tts = await loop.run_in_executor(None, _init_model)
63
-
64
-
65
- @app.get("/health")
66
- async def health():
67
- """
68
- Simple health check endpoint.
69
- """
70
- is_loaded = tts is not None
71
- return JSONResponse(
72
- {
73
- "status": "Model is ready" if is_loaded else "Model is loading",
74
- "model_loaded": is_loaded,
75
- }
76
- )
77
-
78
-
79
- @app.post("/tts")
80
- async def tts_endpoint(
81
- text: str = Form(..., description="Text to synthesize"),
82
- language: str = Form(
83
- "auto",
84
- description="Language code: 'auto', 'en', or 'ar'",
85
- ),
86
- gender: str = Form(
87
- "male",
88
- description="Used when no voice cloning file is provided: 'male' or 'female'",
89
- ),
90
- use_voice_cloning: bool = Form(
91
- False,
92
- description="If true, use uploaded speaker_file for cloning. "
93
- "If false or no file, use default male/female reference.",
94
- ),
95
- speaker_file: UploadFile | None = File(
96
- None,
97
- description="Optional reference speaker audio for voice cloning (WAV/FLAC/MP3). "
98
- "If omitted or use_voice_cloning=False, a default male/female voice is used.",
99
- ),
100
- ):
101
- """
102
- Generate speech from text.
103
-
104
- - If use_voice_cloning is true AND speaker_file is provided: use that as the voice.
105
- - Otherwise, fall back to bundled default voices: malear.wav / femalten.wav.
106
-
107
- Returns raw WAV audio as the response body.
108
- """
109
- if tts is None:
110
- raise HTTPException(
111
- status_code=503,
112
- detail="Model is still loading, please try again in a few seconds.",
113
- )
114
-
115
- if not text.strip():
116
- raise HTTPException(status_code=400, detail="Text must not be empty.")
117
-
118
- # Normalize language selection
119
- lang = language.lower()
120
- if lang not in {"auto", "en", "ar"}:
121
- raise HTTPException(
122
- status_code=400,
123
- detail="Invalid language. Use 'auto', 'en', or 'ar'.",
124
- )
125
-
126
- # Decide which speaker reference file to use
127
- speaker_path = None
128
-
129
- if use_voice_cloning:
130
- # Require a valid uploaded file for cloning
131
- if speaker_file is None:
132
- raise HTTPException(
133
- status_code=400,
134
- detail="use_voice_cloning is true but no speaker_file was uploaded.",
135
- )
136
-
137
- # Basic content-type guard; Auralis can read various formats
138
- allowed_types = {
139
- "audio/wav",
140
- "audio/x-wav",
141
- "audio/flac",
142
- "audio/x-flac",
143
- "audio/mpeg",
144
- "audio/mp3",
145
- "audio/ogg",
146
- }
147
- if speaker_file.content_type not in allowed_types:
148
- raise HTTPException(
149
- status_code=400,
150
- detail=(
151
- "Unsupported speaker_file content-type: "
152
- f"{speaker_file.content_type}"
153
- ),
154
- )
155
-
156
- # Save uploaded speaker file to a temporary path Auralis can use
157
- try:
158
- data = await speaker_file.read()
159
- if not data:
160
- raise HTTPException(status_code=400, detail="Empty speaker_file.")
161
-
162
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
163
- tmp.write(data)
164
- speaker_path = tmp.name
165
-
166
- except HTTPException:
167
- raise
168
- except Exception as e:
169
- raise HTTPException(
170
- status_code=500,
171
- detail=f"Failed to read speaker_file: {e}",
172
- )
173
- else:
174
- # Use default bundled voice based on gender
175
- g = gender.lower()
176
- if g not in {"male", "female"}:
177
- raise HTTPException(
178
- status_code=400,
179
- detail="Invalid gender. Use 'male' or 'female'.",
180
- )
181
-
182
- speaker_path = (
183
- DEFAULT_MALE_VOICE if g == "male" else DEFAULT_FEMALE_VOICE
184
- )
185
-
186
- if not os.path.exists(speaker_path):
187
- # This is a deployment/config error; make it clear.
188
- raise HTTPException(
189
- status_code=500,
190
- detail=(
191
- f"Default reference voice file not found at {speaker_path}. "
192
- "Make sure malear.wav and femalten.wav are present next to app.py."
193
- ),
194
- )
195
-
196
- # Build TTSRequest with audio enhancement config
197
- request = TTSRequest(
198
- text=text,
199
- speaker_files=[speaker_path],
200
- language=lang,
201
- audio_config=AudioPreprocessingConfig(
202
- # Use fixed, sensible defaults; no need to expose as API params
203
- normalize=True,
204
- trim_silence=True,
205
- enhance_speech=True,
206
- ),
207
- # Generation parameters; tweak if needed
208
- temperature=0.75,
209
- top_p=0.85,
210
- top_k=50,
211
- stream=False,
212
- )
213
-
214
- # Run blocking generation in a thread so FastAPI's event loop is not blocked
215
- loop = asyncio.get_event_loop()
216
-
217
- def _generate():
218
- return tts.generate_speech(request)
219
-
220
- try:
221
- start = time.perf_counter()
222
- output = await loop.run_in_executor(None, _generate)
223
- elapsed_ms = int((time.perf_counter() - start) * 1000)
224
-
225
- # Get audio duration information for the client
226
- _num_samples, _sr, duration = output.get_info()
227
-
228
- audio_bytes = output.to_bytes() # WAV bytes
229
- except RuntimeError as exc:
230
- # Gracefully surface CUDA OOM errors instead of crashing the app
231
- message = str(exc)
232
- if "CUDA out of memory" in message:
233
- raise HTTPException(
234
- status_code=503,
235
- detail="CUDA out of memory on the Space GPU. Try shorter text, shorter speaker audio, or fewer concurrent requests.",
236
- )
237
- raise
238
- finally:
239
- # Cleanup temp file used for cloning (if any)
240
- if use_voice_cloning and speaker_path and os.path.isfile(speaker_path):
241
- try:
242
- os.remove(speaker_path)
243
- except OSError:
244
- pass
245
-
246
- return StreamingResponse(
247
- iter([audio_bytes]),
248
- media_type="audio/wav",
249
- headers={
250
- "Content-Disposition": 'attachment; filename="output.wav"',
251
- "X-Generation-Time-ms": str(elapsed_ms),
252
- "X-Audio-Duration-sec": f"{duration:.3f}",
253
- },
254
- )
255
-
256
-
257
- if __name__ == "__main__":
258
- import uvicorn
259
-
 
 
 
 
 
 
 
260
  uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 7860)))
 
import asyncio
import logging
import os
import tempfile
import time

from fastapi import FastAPI, File, Form, UploadFile, HTTPException
from fastapi.responses import StreamingResponse, JSONResponse

# OMP_NUM_THREADS must hold a positive integer before the heavy native
# libraries below are imported; sanitize any missing/invalid value to "1".
_omp_val = os.environ.get("OMP_NUM_THREADS")
if not (_omp_val and _omp_val.isdigit() and int(_omp_val) > 0):
    os.environ["OMP_NUM_THREADS"] = "1"

# Imported after the environment fix-up on purpose — do not move above it.
from auralis import TTS, TTSRequest, AudioPreprocessingConfig
16
+
17
+
# Directory holding this file, so the bundled reference voices are found
# regardless of the process's current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# Bundled default reference voices — these files must ship in the repo
# next to app.py.
DEFAULT_MALE_VOICE = os.path.join(BASE_DIR, "malear.wav")
DEFAULT_FEMALE_VOICE = os.path.join(BASE_DIR, "femalten.wav")
23
+
24
+
# FastAPI application instance.
app = FastAPI(title="TTS API", version="1.1.0")

# Reuse uvicorn's error logger so our messages land in the server log.
logger = logging.getLogger("uvicorn.error")

# Global model handle: None until the startup hook finishes loading.
tts: TTS | None = None
33
+
34
+
@app.get("/")
async def root():
    """Return a small status payload at GET /.

    Answering 200 here keeps Hugging Face's automatic health/log checks
    happy (instead of a 404) and doubles as a quick readiness probe.
    """
    ready = tts is not None
    status = "Model is ready" if ready else "Model is loading"
    return {"status": status, "model_loaded": ready}
46
+
47
+
@app.on_event("startup")
async def load_model() -> None:
    """Load the XTTSv2 + GPT model once when the application starts.

    The blocking model initialisation runs in a worker thread so it does
    not stall the event loop, and so Auralis can manage its own event
    loop without conflicting with FastAPI/uvicorn.
    """
    global tts

    # Idempotent: a repeated startup invocation must not reload the model.
    if tts is not None:
        return

    def _init_model() -> TTS:
        # Blocking download/initialisation; executed off the event loop.
        return TTS().from_pretrained(
            "AstraMindAI/xttsv2",
            gpt_model="AstraMindAI/xtts2-gpt",
        )

    # get_running_loop() is the correct call inside a coroutine;
    # get_event_loop() is deprecated in that context since Python 3.10.
    loop = asyncio.get_running_loop()
    tts = await loop.run_in_executor(None, _init_model)
70
+
71
+
@app.get("/health")
async def health():
    """Health-check endpoint: reports whether the model has finished loading."""
    ready = tts is not None
    payload = {
        "status": "Model is ready" if ready else "Model is loading",
        "model_loaded": ready,
    }
    return JSONResponse(payload)
84
+
85
+
@app.post("/tts")
async def tts_endpoint(
    text: str = Form(..., description="Text to synthesize"),
    language: str = Form(
        "English",
        description="Language name, e.g. 'English' or 'Arabic' (case-insensitive).",
    ),
    gender: str = Form(
        "Male",
        description="Used when no clone_voice file is provided: 'Male' or 'Female'.",
    ),
    clone_voice: UploadFile | None = File(
        None,
        description=(
            "Optional reference audio for voice cloning (WAV/FLAC/MP3). "
            "If omitted, a default male/female voice is used."
        ),
    ),
):
    """
    Generate speech from text.

    - If ``clone_voice`` is uploaded, it is used as the reference voice.
    - Otherwise the bundled default voice for ``gender`` is used
      (malear.wav / femalten.wav next to app.py).

    Returns raw WAV audio as the response body, with timing metadata in the
    ``X-Generation-Time-ms`` and ``X-Audio-Duration-sec`` headers.

    Raises:
        HTTPException 400: empty text, bad gender, unsupported/empty upload.
        HTTPException 500: missing default voice file or upload read failure.
        HTTPException 503: model still loading, or CUDA out of memory.
    """
    if tts is None:
        raise HTTPException(
            status_code=503,
            detail="Model is still loading, please try again in a few seconds.",
        )

    if not text.strip():
        raise HTTPException(status_code=400, detail="Text must not be empty.")

    # Normalize the language selection to the short codes Auralis expects.
    lang_name = language.strip().lower()
    if lang_name in {"english", "en", "eng"}:
        lang = "en"
    elif lang_name in {"arabic", "ar", "arb"}:
        lang = "ar"
    else:
        # Covers "auto"/"" and any unknown name: fall back to auto-detection
        # so behavior stays predictable instead of erroring out.
        lang = "auto"

    # Decide which speaker reference file to use.
    speaker_path = None
    # Set only when a temporary upload copy exists and must be deleted.
    temp_path = None

    if clone_voice is not None:
        # Basic content-type guard; Auralis can read various formats.
        allowed_types = {
            "audio/wav",
            "audio/x-wav",
            "audio/flac",
            "audio/x-flac",
            "audio/mpeg",
            "audio/mp3",
            "audio/ogg",
        }
        if clone_voice.content_type not in allowed_types:
            raise HTTPException(
                status_code=400,
                detail=(
                    "Unsupported clone_voice content-type: "
                    f"{clone_voice.content_type}"
                ),
            )

        # Save the uploaded reference audio to a temporary path Auralis can use.
        try:
            data = await clone_voice.read()
            if not data:
                raise HTTPException(status_code=400, detail="Empty clone_voice file.")

            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
                tmp.write(data)
                temp_path = tmp.name
            speaker_path = temp_path

        except HTTPException:
            raise
        except Exception as e:
            raise HTTPException(
                status_code=500,
                detail=f"Failed to read clone_voice: {e}",
            )
    else:
        # Use the bundled default voice matching the requested gender.
        g = gender.lower()
        if g not in {"male", "female"}:
            raise HTTPException(
                status_code=400,
                detail="Invalid gender. Use 'male' or 'female'.",
            )

        speaker_path = (
            DEFAULT_MALE_VOICE if g == "male" else DEFAULT_FEMALE_VOICE
        )

        if not os.path.exists(speaker_path):
            # This is a deployment/config error; make it clear.
            raise HTTPException(
                status_code=500,
                detail=(
                    f"Default reference voice file not found at {speaker_path}. "
                    "Make sure malear.wav and femalten.wav are present next to app.py."
                ),
            )

    # Everything from here on is wrapped so the temporary upload copy is
    # removed even if request construction (not only generation) fails.
    try:
        # Build TTSRequest with audio enhancement config.
        request = TTSRequest(
            text=text,
            speaker_files=[speaker_path],
            language=lang,
            audio_config=AudioPreprocessingConfig(
                # Fixed, sensible defaults; no need to expose as API params.
                normalize=True,
                trim_silence=True,
                enhance_speech=True,
            ),
            # Generation parameters; tweak if needed.
            temperature=0.75,
            top_p=0.85,
            top_k=50,
            stream=False,
        )

        # get_running_loop() is correct inside a coroutine; get_event_loop()
        # is deprecated in that context since Python 3.10.
        loop = asyncio.get_running_loop()

        def _generate():
            # Blocking generation, run in a thread so the event loop stays free.
            return tts.generate_speech(request)

        start = time.perf_counter()
        output = await loop.run_in_executor(None, _generate)
        elapsed_ms = int((time.perf_counter() - start) * 1000)

        # Get audio duration information for the client.
        _num_samples, _sr, duration = output.get_info()

        audio_bytes = output.to_bytes()  # WAV bytes
    except RuntimeError as exc:
        # Gracefully surface CUDA OOM errors instead of crashing the app.
        message = str(exc)
        if "CUDA out of memory" in message:
            raise HTTPException(
                status_code=503,
                detail="CUDA out of memory on the Space GPU. Try shorter text, shorter speaker audio, or fewer concurrent requests.",
            )
        raise
    finally:
        # Cleanup the temp file used for cloning (if any).
        if temp_path and os.path.isfile(temp_path):
            try:
                os.remove(temp_path)
            except OSError:
                pass

    logger.info(
        "Generated audio in %.3f seconds (duration=%.3f sec)",
        elapsed_ms / 1000.0,
        duration,
    )

    return StreamingResponse(
        iter([audio_bytes]),
        media_type="audio/wav",
        headers={
            "Content-Disposition": 'attachment; filename="output.wav"',
            "X-Generation-Time-ms": str(elapsed_ms),
            "X-Audio-Duration-sec": f"{duration:.3f}",
        },
    )
262
+
263
+
if __name__ == "__main__":
    # Local/dev entry point; Spaces supplies PORT, defaulting to 7860.
    import uvicorn

    port = int(os.getenv("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=port)