talha77 commited on
Commit
2ae6553
·
verified ·
1 Parent(s): d79a47d

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile +22 -0
  2. app.py +220 -0
  3. requirements.txt +8 -0
Dockerfile ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
FROM python:3.10-slim

ENV PYTHONUNBUFFERED=1 \
    HF_HOME=/data/.cache/huggingface

WORKDIR /app

# Basic system deps (ffmpeg needed for some audio operations)
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    git \
    && rm -rf /var/lib/apt/lists/*

# Pre-create the HF cache dir declared in HF_HOME and open its permissions.
# On HF Spaces the container may run as a non-root user; without this the
# model download at startup fails with a permission error.
RUN mkdir -p /data/.cache/huggingface && chmod -R 777 /data

# Install Python deps before copying the code so Docker layer caching
# survives application-code edits.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

EXPOSE 7860

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import os
3
+ import tempfile
4
+
5
+ from fastapi import FastAPI, File, Form, UploadFile, HTTPException
6
+ from fastapi.responses import StreamingResponse, JSONResponse
7
+
8
+ from auralis import TTS, TTSRequest, AudioPreprocessingConfig
9
+
10
+
# Directory that contains this file; the bundled voice samples sit beside it.
_here = os.path.abspath(__file__)
BASE_DIR = os.path.dirname(_here)

# Default reference voices (these files must be committed to the repo).
DEFAULT_MALE_VOICE = os.path.join(BASE_DIR, "malear.wav")
DEFAULT_FEMALE_VOICE = os.path.join(BASE_DIR, "femalten.wav")
16
+
17
+
# ASGI application served by uvicorn (see Dockerfile CMD).
app = FastAPI(title="Auralis XTTS2-GPT TTS API", version="1.1.0")

# Shared TTS model instance. Stays None until the startup hook finishes
# loading; endpoints must check for None before using it.
tts = None
25
+
26
+
# NOTE(review): @app.on_event is deprecated in recent FastAPI in favor of
# lifespan handlers — consider migrating when touching app construction.
@app.on_event("startup")
async def load_model() -> None:
    """
    Load the XTTSv2 + GPT model once when the Space starts.

    Assigns the module-level `tts` instance; until this completes, `tts`
    remains None and the /tts endpoint responds with HTTP 503.
    """
    global tts
    tts = TTS().from_pretrained(
        "AstraMindAI/xttsv2",
        gpt_model="AstraMindAI/xtts2-gpt",
    )
37
+
38
+
@app.get("/health")
async def health():
    """Report liveness and whether the TTS model has finished loading."""
    payload = {"status": "ok", "model_loaded": tts is not None}
    return JSONResponse(payload)
45
+
46
+
@app.post("/tts")
async def tts_endpoint(
    text: str = Form(..., description="Text to synthesize"),
    language: str = Form(
        "auto",
        description="Language code: 'auto', 'en', or 'ar'",
    ),
    gender: str = Form(
        "male",
        description="Used when no voice cloning file is provided: 'male' or 'female'",
    ),
    use_voice_cloning: bool = Form(
        False,
        description="If true, use uploaded speaker_file for cloning. "
        "If false or no file, use default male/female reference.",
    ),
    enhance_speech: bool = Form(
        True,
        description="Apply speech enhancement/denoising",
    ),
    normalize: bool = Form(
        True,
        description="Normalize loudness",
    ),
    trim_silence: bool = Form(
        True,
        description="Trim leading/trailing silence",
    ),
    speaker_file: UploadFile | None = File(
        None,
        description="Optional reference speaker audio for voice cloning (WAV/FLAC/MP3). "
        "If omitted or use_voice_cloning=False, a default male/female voice is used.",
    ),
):
    """
    Generate speech from text.

    - If use_voice_cloning is true AND speaker_file is provided: use that as the voice.
    - Otherwise, fall back to bundled default voices: malear.wav / femalten.wav.

    Returns raw WAV audio as the response body.

    Raises:
        HTTPException 503: model not loaded yet.
        HTTPException 400: empty text, invalid language/gender, or a missing,
            empty, or unsupported speaker_file when cloning was requested.
        HTTPException 500: unreadable upload or missing default voice file.
    """
    if tts is None:
        raise HTTPException(
            status_code=503,
            detail="Model is still loading, please try again in a few seconds.",
        )

    if not text.strip():
        raise HTTPException(status_code=400, detail="Text must not be empty.")

    # Normalize language selection
    lang = language.lower()
    if lang not in {"auto", "en", "ar"}:
        raise HTTPException(
            status_code=400,
            detail="Invalid language. Use 'auto', 'en', or 'ar'.",
        )

    # Decide which speaker reference file to use.
    speaker_path = None
    # Set only when we persist an uploaded clone reference; used by the
    # cleanup below so we never delete the bundled default voices.
    temp_path = None

    if use_voice_cloning:
        # Require a valid uploaded file for cloning
        if speaker_file is None:
            raise HTTPException(
                status_code=400,
                detail="use_voice_cloning is true but no speaker_file was uploaded.",
            )

        # Basic content-type guard; Auralis can read various formats
        allowed_types = {
            "audio/wav",
            "audio/x-wav",
            "audio/flac",
            "audio/x-flac",
            "audio/mpeg",
            "audio/mp3",
            "audio/ogg",
        }
        if speaker_file.content_type not in allowed_types:
            raise HTTPException(
                status_code=400,
                detail=(
                    "Unsupported speaker_file content-type: "
                    f"{speaker_file.content_type}"
                ),
            )

        # Save uploaded speaker file to a temporary path Auralis can use
        try:
            data = await speaker_file.read()
            if not data:
                raise HTTPException(status_code=400, detail="Empty speaker_file.")

            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
                tmp.write(data)
                temp_path = tmp.name
            speaker_path = temp_path

        except HTTPException:
            raise
        except Exception as e:
            raise HTTPException(
                status_code=500,
                detail=f"Failed to read speaker_file: {e}",
            ) from e
    else:
        # Use default bundled voice based on gender
        g = gender.lower()
        if g not in {"male", "female"}:
            raise HTTPException(
                status_code=400,
                detail="Invalid gender. Use 'male' or 'female'.",
            )

        speaker_path = DEFAULT_MALE_VOICE if g == "male" else DEFAULT_FEMALE_VOICE

        if not os.path.exists(speaker_path):
            # This is a deployment/config error; make it clear.
            raise HTTPException(
                status_code=500,
                detail=(
                    f"Default reference voice file not found at {speaker_path}. "
                    "Make sure malear.wav and femalten.wav are present next to app.py."
                ),
            )

    # The try/finally starts BEFORE building the request so the temp clone
    # file is also cleaned up if TTSRequest construction itself fails
    # (previously it would have leaked in that case).
    try:
        # Build TTSRequest with audio enhancement config
        request = TTSRequest(
            text=text,
            speaker_files=[speaker_path],
            language=lang,
            audio_config=AudioPreprocessingConfig(
                normalize=normalize,
                trim_silence=trim_silence,
                enhance_speech=enhance_speech,
            ),
            # Generation parameters; tweak if needed
            temperature=0.75,
            top_p=0.85,
            top_k=50,
            stream=False,
        )

        # Run blocking generation in a worker thread so FastAPI's event loop
        # is not blocked. asyncio.to_thread replaces the deprecated
        # get_event_loop()/run_in_executor pattern inside coroutines.
        output = await asyncio.to_thread(tts.generate_speech, request)
        audio_bytes = output.to_bytes()  # WAV bytes
    finally:
        # Cleanup temp file used for cloning (if any)
        if temp_path and os.path.isfile(temp_path):
            try:
                os.remove(temp_path)
            except OSError:
                pass

    return StreamingResponse(
        iter([audio_bytes]),
        media_type="audio/wav",
        headers={"Content-Disposition": 'inline; filename="output.wav"'},
    )
215
+
216
+
if __name__ == "__main__":
    # Local/dev entry point; inside the container uvicorn is started by the
    # Dockerfile CMD instead.
    import uvicorn

    port = int(os.getenv("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=port)
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
# Web framework and ASGI server
fastapi
uvicorn
# Needed by FastAPI for Form/File (multipart) endpoints in app.py
python-multipart
# TTS engine (XTTSv2 + GPT backend) used by app.py
auralis
nest_asyncio

# Pinned versions — presumably required for compatibility with auralis; verify before bumping
transformers==4.46.2
vllm==0.6.4.post1