muhammadnoman76 commited on
Commit
9ebf02b
·
1 Parent(s): 20291d1
Files changed (3) hide show
  1. Dockerfile +44 -0
  2. app.py +338 -0
  3. requirements.txt +14 -0
Dockerfile ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Container image for the Kokoro TTS FastAPI service (CPU-only PyTorch),
# listening on port 7860.
FROM python:3.10-slim

# Install system dependencies
# ffmpeg: required by pydub for MP3 export; libsndfile1: soundfile backend;
# curl: used by the HEALTHCHECK below; wget/git: presumably for runtime
# model fetching -- TODO confirm they are actually needed.
RUN apt-get update && apt-get install -y \
    ffmpeg \
    libsndfile1 \
    wget \
    git \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Copy requirements first for better caching
COPY requirements.txt .

# Install PyTorch CPU-only version first (much smaller and faster)
RUN pip install --no-cache-dir torch==2.5.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cpu

# Install remaining Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Download spacy model during build to avoid runtime network calls
RUN python -m spacy download en_core_web_sm

# Copy application code
COPY app.py .

# Create directory for models (they will be downloaded on first run)
RUN mkdir -p /app/models

# Set environment variables
ENV PYTHONUNBUFFERED=1

# Expose port
EXPOSE 7860

# Health check
# --start-period=120s gives the app time to download models on first boot
# before failed probes count against --retries.
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
    CMD curl -f http://localhost:7860/health || exit 1

# Run the application (single worker: models are loaded per process)
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
app.py ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Standard library
import asyncio
import hmac
import io
import logging
import os
import re
import time
from concurrent.futures import ThreadPoolExecutor
from typing import Optional, List

# Third-party
import numpy as np
import soundfile as sf
import torch
from fastapi import FastAPI, HTTPException, Depends, Security
from fastapi.responses import Response
from fastapi.security import APIKeyHeader
from kokoro import KModel, KPipeline
from pydantic import BaseModel
from pydub import AudioSegment
18
+
19
# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configuration
# NOTE(review): a hard-coded fallback key means the API is effectively
# unauthenticated when API_SECRET_KEY is unset -- consider failing fast.
SECRET_KEY = os.getenv("API_SECRET_KEY", "your-default-secret-key")
CUDA_AVAILABLE = torch.cuda.is_available()

# Maximum characters accepted per request. os.getenv with a default always
# returns a str, and isdigit() already guards the int() call, so the
# original try/except (ValueError, AttributeError) was dead code.
char_limit_env = os.getenv("CHAR_LIMIT", "5000")
CHAR_LIMIT = int(char_limit_env) if char_limit_env.isdigit() else 5000

# FastAPI app
app = FastAPI(title="Kokoro TTS API", version="1.0.0")

# API Key Security (auto_error=False: missing header arrives as None and is
# rejected by verify_api_key rather than by FastAPI itself)
api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False)
38
+
39
async def verify_api_key(api_key: str = Security(api_key_header)):
    """FastAPI dependency validating the X-API-Key header.

    Uses hmac.compare_digest instead of `!=` so the comparison runs in
    constant time (a plain `!=` can leak key prefixes through response
    timing), and rejects a missing header (None) explicitly.

    Returns:
        The validated API key string.

    Raises:
        HTTPException: 403 when the key is absent or incorrect.
    """
    if api_key is None or not hmac.compare_digest(api_key, SECRET_KEY):
        raise HTTPException(
            status_code=403,
            detail="Invalid API Key"
        )
    return api_key
46
+
47
# Initialize models and pipelines
logger.info(f"Initializing models... CUDA Available: {CUDA_AVAILABLE}")
# Populated by startup_event(): `models` is keyed by a bool (use_gpu flag),
# `pipelines` by the voice-id prefix letter ('a' / 'b' -- presumably US vs
# British English given the flag emojis in VOICE_CHOICES; confirm against
# the kokoro docs).
models = {}
pipelines = {}
51
+
52
# Voice choices mapping
# Keys are Kokoro voice ids; the first character selects the pipeline in
# `pipelines` ('a' or 'b'). Values are human-readable display labels only.
VOICE_CHOICES = {
    'af_heart': '🇺🇸 🚺 Heart ❤️',
    'af_bella': '🇺🇸 🚺 Bella 🔥',
    'af_nicole': '🇺🇸 🚺 Nicole 🎧',
    'af_aoede': '🇺🇸 🚺 Aoede',
    'af_kore': '🇺🇸 🚺 Kore',
    'af_sarah': '🇺🇸 🚺 Sarah',
    'af_nova': '🇺🇸 🚺 Nova',
    'af_sky': '🇺🇸 🚺 Sky',
    'af_alloy': '🇺🇸 🚺 Alloy',
    'af_jessica': '🇺🇸 🚺 Jessica',
    'af_river': '🇺🇸 🚺 River',
    'am_michael': '🇺🇸 🚹 Michael',
    'am_fenrir': '🇺🇸 🚹 Fenrir',
    'am_puck': '🇺🇸 🚹 Puck',
    'am_echo': '🇺🇸 🚹 Echo',
    'am_eric': '🇺🇸 🚹 Eric',
    'am_liam': '🇺🇸 🚹 Liam',
    'am_onyx': '🇺🇸 🚹 Onyx',
    'am_santa': '🇺🇸 🚹 Santa',
    'am_adam': '🇺🇸 🚹 Adam',
    'bf_emma': '🇬🇧 🚺 Emma',
    'bf_isabella': '🇬🇧 🚺 Isabella',
    'bf_alice': '🇬🇧 🚺 Alice',
    'bf_lily': '🇬🇧 🚺 Lily',
    'bm_george': '🇬🇧 🚹 George',
    'bm_fable': '🇬🇧 🚹 Fable',
    'bm_lewis': '🇬🇧 🚹 Lewis',
    'bm_daniel': '🇬🇧 🚹 Daniel',
}
83
+
84
# Request/Response Models
class TTSRequest(BaseModel):
    """Request body for POST /generate."""
    text: str                        # input text to synthesize
    voice: str = "af_heart"          # must be a key of VOICE_CHOICES
    use_gpu: Optional[bool] = None   # None -> auto (CUDA when available)
    speed: float = 1.0               # synthesis speed multiplier
90
+
91
class TTSResponse(BaseModel):
    """Generic success/message payload.

    NOTE(review): not referenced by any endpoint visible in this file --
    /generate returns raw MP3 bytes instead. Confirm before removing.
    """
    success: bool  # whether the operation succeeded
    message: str   # human-readable status message
94
+
95
# Initialize models on startup
@app.on_event("startup")
async def startup_event():
    """Load Kokoro models/pipelines into the module-level globals.

    Runs once when the server starts. Any failure is logged and
    re-raised so the process does not come up half-initialized.
    NOTE(review): @app.on_event is deprecated in recent FastAPI in
    favor of lifespan handlers -- consider migrating.
    """
    global models, pipelines

    try:
        # Initialize models for CPU and GPU if available.
        # Keyed by the use_gpu flag so callers can index models[use_gpu].
        models = {
            False: KModel().to('cpu').eval()
        }
        if CUDA_AVAILABLE:
            models[True] = KModel().to('cuda').eval()

        # Initialize pipelines (model=False: G2P-only pipelines; the
        # acoustic model is applied separately in generate_audio_chunk)
        pipelines = {
            'a': KPipeline(lang_code='a', model=False),
            'b': KPipeline(lang_code='b', model=False)
        }

        # Set up lexicon: pin the pronunciation of the word "kokoro"
        # per language variant.
        pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
        pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'

        # Preload voices so the first request doesn't pay the load cost;
        # a failed preload is non-fatal (load_voice is retried on use).
        for voice_code in VOICE_CHOICES.keys():
            try:
                pipelines[voice_code[0]].load_voice(voice_code)
            except Exception as e:
                logger.warning(f"Could not preload voice {voice_code}: {e}")

        logger.info("Models and pipelines initialized successfully")
    except Exception as e:
        logger.error(f"Failed to initialize models: {e}")
        raise
129
+
130
def split_text_into_chunks(text: str, max_chars: int = 500) -> List[str]:
    """Split text into chunks of at most max_chars characters.

    Prefers sentence boundaries (., !, ?); sentences longer than
    max_chars fall back to word boundaries; and a single word longer
    than max_chars is hard-split so no chunk ever exceeds the limit
    (the original emitted an oversized chunk in that case).

    Args:
        text: Input text; empty/whitespace input yields [].
        max_chars: Maximum chunk length in characters.

    Returns:
        List of non-empty chunks, in original order.
    """
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks: List[str] = []
    current_chunk = ""

    def _flush():
        # Emit the accumulated chunk, if any, and reset the accumulator.
        nonlocal current_chunk
        if current_chunk:
            chunks.append(current_chunk)
            current_chunk = ""

    for sentence in sentences:
        if len(current_chunk) + len(sentence) + 1 <= max_chars:
            current_chunk += (" " if current_chunk else "") + sentence
        else:
            _flush()
            if len(sentence) > max_chars:
                # Oversized sentence: pack word by word.
                for word in sentence.split():
                    # Hard-split a word that alone exceeds the limit.
                    while len(word) > max_chars:
                        _flush()
                        chunks.append(word[:max_chars])
                        word = word[max_chars:]
                    if len(current_chunk) + len(word) + 1 <= max_chars:
                        current_chunk += (" " if current_chunk else "") + word
                    else:
                        _flush()
                        current_chunk = word
            else:
                current_chunk = sentence

    _flush()
    return chunks
159
+
160
def generate_audio_chunk(text: str, voice: str, speed: float, use_gpu: bool):
    """Synthesize one text chunk; returns float32 samples or None.

    Fixes over the original:
      * `.cpu()` before `.numpy()` -- calling .numpy() on a CUDA tensor
        raises a TypeError, so the GPU path could never return audio.
      * Synthesizes every segment the pipeline yields instead of
        returning after the first one (the original silently dropped
        all audio past the first segment).

    Args:
        text: Chunk of text to synthesize (<= ~500 chars by caller).
        voice: Voice id; its first character selects the pipeline.
        speed: Synthesis speed multiplier.
        use_gpu: Use the CUDA model; falls back to CPU on failure.

    Returns:
        np.ndarray of samples, or None when the pipeline yields nothing.
    """
    pipeline = pipelines[voice[0]]
    pack = pipeline.load_voice(voice)

    segments = []
    for _, ps, _ in pipeline(text, voice, speed):
        # Reference style vector indexed by phoneme-sequence length.
        ref_s = pack[len(ps) - 1]
        try:
            with torch.no_grad():
                audio = models[use_gpu](ps, ref_s, speed)
        except Exception as e:
            if not use_gpu:
                raise
            logger.warning(f"GPU processing failed, falling back to CPU: {e}")
            with torch.no_grad():
                audio = models[False](ps, ref_s, speed)
        # .cpu() is a no-op for CPU tensors and required for CUDA ones.
        segments.append(audio.cpu().numpy())

    if not segments:
        return None
    return segments[0] if len(segments) == 1 else np.concatenate(segments)
186
+
187
async def generate_audio(text: str, voice: str = 'af_heart', speed: float = 1.0, use_gpu: bool = None):
    """Generate a full waveform from text using Kokoro TTS.

    Long input is split into <=500-char chunks synthesized in parallel
    (up to 4 worker threads) and concatenated with 100 ms silence gaps.

    Fixes over the original:
      * Empty input returned via an early None instead of crashing with
        ThreadPoolExecutor(max_workers=0) -> ValueError.
      * asyncio.get_running_loop() replaces get_event_loop(), which is
        deprecated inside a coroutine.
      * CHAR_LIMIT is actually enforced (it was configured but unused).

    Args:
        text: Input text; stripped and truncated to CHAR_LIMIT chars.
        voice: Voice id; its first character selects the pipeline.
        speed: Synthesis speed multiplier.
        use_gpu: None -> auto-select CUDA when available; True is only
            honored when CUDA is actually available.

    Returns:
        np.ndarray of float32 samples at 24 kHz, or None when nothing
        could be synthesized.
    """
    text = text.strip()

    # Enforce the configured request-size cap (previously dead config).
    if CHAR_LIMIT and len(text) > CHAR_LIMIT:
        logger.warning(f"Input truncated from {len(text)} to {CHAR_LIMIT} characters")
        text = text[:CHAR_LIMIT]

    if use_gpu is None:
        use_gpu = CUDA_AVAILABLE
    else:
        use_gpu = use_gpu and CUDA_AVAILABLE

    chunks = split_text_into_chunks(text, max_chars=500)
    if not chunks:
        # Empty/whitespace-only input: nothing to synthesize.
        return None
    logger.info(f"Split text into {len(chunks)} chunks for parallel processing")

    start_time = time.time()

    loop = asyncio.get_running_loop()
    max_parallel = min(len(chunks), 4)
    with ThreadPoolExecutor(max_workers=max_parallel) as executor:
        tasks = [
            loop.run_in_executor(executor, generate_audio_chunk, chunk, voice, speed, use_gpu)
            for chunk in chunks
        ]
        audio_results = await asyncio.gather(*tasks)

    process_time = time.time() - start_time
    logger.info(f"Parallel processing completed in {process_time:.2f}s")

    sample_rate = 24000
    # 100 ms of silence between chunks to avoid audible hard joins.
    silence_gap = np.zeros(int(0.1 * sample_rate), dtype=np.float32)

    audio_chunks = []
    for i, audio_chunk in enumerate(audio_results):
        if audio_chunk is not None:
            audio_chunks.append(audio_chunk)
            if i < len(audio_results) - 1:
                audio_chunks.append(silence_gap)

    if not audio_chunks:
        return None

    if len(audio_chunks) == 1:
        return audio_chunks[0]

    merged_audio = np.concatenate(audio_chunks)
    logger.info(f"Successfully merged {len(chunks)} chunks into final audio of {len(merged_audio)} samples ({process_time:.2f}s total)")

    return merged_audio
242
+
243
def numpy_to_mp3(audio_array: np.ndarray, sample_rate: int = 24000) -> bytes:
    """Convert a float waveform (nominally in [-1, 1]) to MP3 bytes.

    Clips to [-1, 1] before the int16 conversion: the original
    `(x * 32767).astype(np.int16)` wraps around for any sample outside
    that range, producing loud crackle instead of mild clipping.

    Args:
        audio_array: 1-D float samples.
        sample_rate: Sample rate in Hz (Kokoro outputs 24 kHz).

    Returns:
        MP3-encoded bytes at 192 kbps.
    """
    # Convert to int16 for better compatibility; clip to avoid wraparound.
    audio_int16 = (np.clip(audio_array, -1.0, 1.0) * 32767).astype(np.int16)

    # Create WAV in memory
    wav_buffer = io.BytesIO()
    sf.write(wav_buffer, audio_int16, sample_rate, format='WAV', subtype='PCM_16')
    wav_buffer.seek(0)

    # Convert WAV to MP3 using pydub (requires ffmpeg at runtime)
    audio_segment = AudioSegment.from_wav(wav_buffer)

    # Export as MP3
    mp3_buffer = io.BytesIO()
    audio_segment.export(mp3_buffer, format="mp3", bitrate="192k")
    mp3_buffer.seek(0)

    return mp3_buffer.read()
263
+
264
# API Endpoints
@app.get("/")
async def root():
    """Unauthenticated root endpoint: basic liveness + CUDA flag."""
    payload = {
        "message": "Kokoro TTS API is running",
        "cuda_available": CUDA_AVAILABLE,
    }
    return payload
268
+
269
@app.get("/health")
async def health_check():
    """Health probe used by the container HEALTHCHECK."""
    status = {"status": "healthy", "cuda_available": CUDA_AVAILABLE}
    return status
272
+
273
@app.post("/generate")
async def generate_tts(
    request: TTSRequest,
    api_key: str = Depends(verify_api_key)
):
    """Generate TTS audio from text and return it as an MP3 download.

    Raises:
        HTTPException: 400 for an unknown voice or empty text,
            500 on synthesis/encoding failure.
    """
    try:
        # Validate voice
        if request.voice not in VOICE_CHOICES:
            raise HTTPException(
                status_code=400,
                detail=f"Invalid voice. Available voices: {list(VOICE_CHOICES.keys())}"
            )

        # Validate text (idiomatic truthiness check, not len(...) == 0)
        if not request.text or not request.text.strip():
            raise HTTPException(
                status_code=400,
                detail="Text cannot be empty"
            )

        # Generate audio
        logger.info(f"Generating audio for voice: {request.voice}, text length: {len(request.text)}")
        audio_array = await generate_audio(
            text=request.text,
            voice=request.voice,
            speed=request.speed,
            use_gpu=request.use_gpu
        )

        if audio_array is None:
            raise HTTPException(
                status_code=500,
                detail="Failed to generate audio"
            )

        # Convert to MP3
        mp3_bytes = numpy_to_mp3(audio_array)

        # Return MP3 file (plain string -- the original used a pointless
        # f-string with no placeholders)
        return Response(
            content=mp3_bytes,
            media_type="audio/mpeg",
            headers={
                "Content-Disposition": "attachment; filename=tts_output.mp3"
            }
        )

    except HTTPException:
        raise
    except Exception as e:
        # logger.exception records the traceback, unlike logger.error.
        logger.exception("Error generating TTS")
        raise HTTPException(
            status_code=500,
            detail=f"Internal server error: {str(e)}"
        )
330
+
331
@app.get("/voices")
async def get_voices(api_key: str = Depends(verify_api_key)):
    """Return the voice-id -> display-label mapping (requires API key)."""
    response = {"voices": VOICE_CHOICES}
    return response
335
+
336
if __name__ == "__main__":
    import uvicorn
    # Local development entry point; the Docker image runs uvicorn via CMD
    # instead of this block.
    uvicorn.run(app, host="0.0.0.0", port=7860)
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Web framework and ASGI server
fastapi==0.115.6
uvicorn[standard]==0.34.0
python-multipart==0.0.6
# TTS engine and audio I/O (torch/torchaudio installed separately in Dockerfile)
kokoro==0.9.4
numpy>=1.26.0
soundfile==0.13.0
pydub>=0.25.1
# Validation and scientific dependencies
pydantic==2.10.4
scipy==1.14.1
munch==4.0.0
huggingface-hub>=0.20.0
# G2P / text processing
espeakng-loader==0.2.4
misaki==0.9.4
spacy==3.8.5