muhammadnoman76 committed on
Commit
62b2615
·
1 Parent(s): 96099ed
Files changed (3) hide show
  1. Dockerfile +44 -0
  2. app.py +381 -0
  3. requirements.txt +14 -0
Dockerfile ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

# Install system dependencies.
# --no-install-recommends keeps optional "recommended" packages out of the
# image, and the apt lists are removed in the same layer so they are never
# stored in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    libsndfile1 \
    wget \
    git \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Copy requirements first so the dependency layers below stay cached when
# only app.py changes
COPY requirements.txt .

# Install the CPU-only PyTorch wheels first (much smaller than the default build)
RUN pip install --no-cache-dir torch==2.5.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cpu

# Install remaining Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Download spacy model during build to avoid runtime network calls
RUN python -m spacy download en_core_web_sm

# Copy application code
COPY app.py .

# Create directory for models (they will be downloaded on first run)
RUN mkdir -p /app/models

# Unbuffered stdout/stderr so log lines show up immediately in container logs
ENV PYTHONUNBUFFERED=1

# Port the uvicorn server below listens on
EXPOSE 7860

# Health check against the app's /health endpoint; the 120s start period
# gives the startup handler time to load models before failures count
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
    CMD curl -f http://localhost:7860/health || exit 1

# Run the application with a single worker process
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
app.py ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import torch
4
+ import numpy as np
5
+ from fastapi import FastAPI, HTTPException, Depends, Security
6
+ from fastapi.security import APIKeyHeader
7
+ from fastapi.responses import Response
8
+ from pydantic import BaseModel
9
+ from typing import Optional, List
10
+ import soundfile as sf
11
+ from pydub import AudioSegment
12
+ from kokoro import KModel, KPipeline
13
+ import logging
14
+ import re
15
+ import asyncio
16
+ from concurrent.futures import ThreadPoolExecutor
17
+ import time
18
+
19
# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configuration
# Fallback API key used only when API_SECRET_KEY is not set in the environment.
_DEFAULT_SECRET_KEY = "your-default-secret-key"
SECRET_KEY = os.getenv("API_SECRET_KEY", _DEFAULT_SECRET_KEY)
if SECRET_KEY == _DEFAULT_SECRET_KEY:
    # The fallback value is visible in the source, so anyone can authenticate
    # against a deployment that forgot to configure its own key.
    logger.warning(
        "API_SECRET_KEY is not set; falling back to the built-in default key. "
        "Set API_SECRET_KEY to a private value before exposing this service."
    )
CUDA_AVAILABLE = torch.cuda.is_available()

# CHAR_LIMIT: read from the environment, falling back to 5000 when the value
# is missing or is not a plain non-negative integer.
try:
    char_limit_env = os.getenv("CHAR_LIMIT", "5000")
    CHAR_LIMIT = int(char_limit_env) if char_limit_env.isdigit() else 5000
except (ValueError, AttributeError):
    # str.isdigit() accepts some characters that int() rejects (e.g. "²").
    CHAR_LIMIT = 5000
32
+
33
# FastAPI app
app = FastAPI(title="Kokoro TTS API", version="1.0.0")

# API Key Security
# auto_error=False: a missing header reaches verify_api_key as None, so the
# rejection response is produced in one place below.
api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False)


async def verify_api_key(api_key: str = Security(api_key_header)):
    """Validate the ``X-API-Key`` request header against ``SECRET_KEY``.

    Args:
        api_key: Value of the ``X-API-Key`` header, or ``None`` when absent.

    Returns:
        The supplied API key when it matches ``SECRET_KEY``.

    Raises:
        HTTPException: 403 when the header is missing or does not match.
    """
    # Local import keeps this block self-contained.
    import secrets

    # compare_digest runs in time independent of where the inputs differ, so
    # response timing does not reveal how much of a guessed key is correct.
    # Comparing UTF-8 bytes avoids its ASCII-only restriction on str inputs.
    key_matches = api_key is not None and secrets.compare_digest(
        api_key.encode("utf-8"), SECRET_KEY.encode("utf-8")
    )
    if not key_matches:
        raise HTTPException(
            status_code=403,
            detail="Invalid API Key"
        )
    return api_key
46
+
47
# Initialize models and pipelines
logger.info(f"Initializing models... CUDA Available: {CUDA_AVAILABLE}")

# Both registries start empty and are filled by the startup handler:
#   pipelines -- language code -> KPipeline
#   models    -- use-GPU flag (bool) -> KModel
pipelines = {}
models = {}
51
+
52
# Supported language codes mapped to their display labels.
LANGUAGES = {
    "a": "🇺🇸 American English",
    "b": "🇬🇧 British English",
    "e": "🇪🇸 Spanish",
    "f": "🇫🇷 French",
    "h": "🇮🇳 Hindi",
    "i": "🇮🇹 Italian",
    "j": "🇯🇵 Japanese",
    "p": "🇧🇷 Brazilian Portuguese",
    "z": "🇨🇳 Mandarin Chinese",
}
63
+
64
# Voice codes mapped to display labels. The first character of a voice code
# is the language code used to pick its pipeline.
VOICE_CHOICES = {
    # American English, female
    "af_heart": "🇺🇸 🚺 Heart ❤️",
    "af_bella": "🇺🇸 🚺 Bella 🔥",
    "af_nicole": "🇺🇸 🚺 Nicole 🎧",
    "af_aoede": "🇺🇸 🚺 Aoede",
    "af_kore": "🇺🇸 🚺 Kore",
    "af_sarah": "🇺🇸 🚺 Sarah",
    "af_nova": "🇺🇸 🚺 Nova",
    "af_sky": "🇺🇸 🚺 Sky",
    "af_alloy": "🇺🇸 🚺 Alloy",
    "af_jessica": "🇺🇸 🚺 Jessica",
    "af_river": "🇺🇸 🚺 River",
    # American English, male
    "am_michael": "🇺🇸 🚹 Michael",
    "am_fenrir": "🇺🇸 🚹 Fenrir",
    "am_puck": "🇺🇸 🚹 Puck",
    "am_echo": "🇺🇸 🚹 Echo",
    "am_eric": "🇺🇸 🚹 Eric",
    "am_liam": "🇺🇸 🚹 Liam",
    "am_onyx": "🇺🇸 🚹 Onyx",
    "am_santa": "🇺🇸 🚹 Santa",
    "am_adam": "🇺🇸 🚹 Adam",
    # British English, female
    "bf_emma": "🇬🇧 🚺 Emma",
    "bf_isabella": "🇬🇧 🚺 Isabella",
    "bf_alice": "🇬🇧 🚺 Alice",
    "bf_lily": "🇬🇧 🚺 Lily",
    # British English, male
    "bm_george": "🇬🇧 🚹 George",
    "bm_fable": "🇬🇧 🚹 Fable",
    "bm_lewis": "🇬🇧 🚹 Lewis",
    "bm_daniel": "🇬🇧 🚹 Daniel",
}
94
+
95
+ # Request/Response Models
96
class TTSRequest(BaseModel):
    """Request body accepted by the ``/generate`` endpoint."""

    # Text to synthesize.
    text: str
    # Voice code; the endpoint requires it to be a key of VOICE_CHOICES.
    voice: str = "af_heart"
    # Language code; when omitted the endpoint uses the first character of ``voice``.
    language: Optional[str] = None
    # Request GPU inference; None means "use the GPU whenever CUDA is available".
    use_gpu: Optional[bool] = None
    # Speed value forwarded unchanged to the pipeline and the model.
    speed: float = 1.0
102
+
103
+
104
+
105
+ # Initialize models on startup
106
@app.on_event("startup")
async def startup_event():
    """Load models, language pipelines and voices once at application startup.

    A failure to build an individual language pipeline or to preload an
    individual voice is logged as a warning and skipped. Any other failure
    (for example loading the model itself) is logged with its traceback and
    re-raised.
    """
    global models, pipelines

    try:
        # One model per device, keyed by the "use GPU" flag.
        models = {
            False: KModel().to('cpu').eval()
        }
        if CUDA_AVAILABLE:
            models[True] = KModel().to('cuda').eval()

        # Pipelines are created with model=False; the models above are called
        # separately when audio is generated.
        for lang_code, lang_name in LANGUAGES.items():
            try:
                pipelines[lang_code] = KPipeline(lang_code=lang_code, model=False)
                logger.info(f"Initialized pipeline for language: {lang_code} - {lang_name}")
            except Exception as e:
                logger.warning(f"Could not initialize pipeline for {lang_code}: {e}")

        # Pronunciation overrides for the word "kokoro" in the English lexicons
        if 'a' in pipelines:
            pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
        if 'b' in pipelines:
            pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'

        # Preload voices; the first character of a voice code selects its pipeline.
        for voice_code in VOICE_CHOICES:
            try:
                pipelines[voice_code[0]].load_voice(voice_code)
            except Exception as e:
                logger.warning(f"Could not preload voice {voice_code}: {e}")

        logger.info("Models and pipelines initialized successfully")
    except Exception as e:
        # logger.exception records the traceback along with the message.
        logger.exception(f"Failed to initialize models: {e}")
        raise
143
+
144
def split_text_into_chunks(text: str, max_chars: int = 500) -> List[str]:
    """Split text into chunks of at most ``max_chars`` characters.

    Text is split at sentence boundaries (whitespace following ``.``, ``!``
    or ``?``) and consecutive sentences are packed into one chunk, joined by
    a single space, while they fit. A sentence longer than ``max_chars`` is
    split at whitespace, and a single token longer than ``max_chars`` (for
    example text written without spaces) is cut into ``max_chars``-sized
    slices so that no chunk exceeds the limit.

    Args:
        text: Text to split.
        max_chars: Maximum length of each returned chunk; must be >= 1.

    Returns:
        The chunks in their original order; an empty list for empty text.

    Raises:
        ValueError: If ``max_chars`` is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be at least 1")

    chunks: List[str] = []
    current = ""

    for sentence in re.split(r'(?<=[.!?])\s+', text):
        # The +1 accounts for the joining space.
        if len(current) + len(sentence) + 1 <= max_chars:
            current += (" " if current else "") + sentence
            continue

        if current:
            chunks.append(current)

        if len(sentence) <= max_chars:
            current = sentence
            continue

        # The sentence alone is too long: pack it word by word.
        current = ""
        for word in sentence.split():
            if len(current) + len(word) + 1 <= max_chars:
                current += (" " if current else "") + word
                continue

            if current:
                chunks.append(current)

            # Slice the word so even an oversize token respects max_chars.
            # A word that already fits yields exactly one slice.
            slices = [word[i:i + max_chars] for i in range(0, len(word), max_chars)]
            chunks.extend(slices[:-1])
            current = slices[-1]

    if current:
        chunks.append(current)

    return chunks
173
+
174
def generate_audio_chunk(text: str, voice: str, speed: float, use_gpu: bool, lang_code: str):
    """Generate audio for a single text chunk.

    Every segment the pipeline yields for ``text`` is synthesized and the
    results are concatenated in order, so the returned audio covers the whole
    chunk and not just its first segment.

    Args:
        text: Text chunk to synthesize.
        voice: Voice code passed to the pipeline.
        speed: Speed value passed to the pipeline and the model.
        use_gpu: Run on ``models[True]`` when true, else on ``models[False]``.
        lang_code: Key into ``pipelines`` selecting the language pipeline.

    Returns:
        A numpy array with the audio samples, or ``None`` when the pipeline
        yields nothing for ``text``.

    Raises:
        Exception: Whatever the CPU model raises. A failure on the GPU model
            is logged and retried on the CPU model instead of propagating.
    """
    pipeline = pipelines[lang_code]
    pack = pipeline.load_voice(voice)

    segments = []
    for _, ps, _ in pipeline(text, voice, speed):
        # The voice pack is indexed by the length of the yielded sequence.
        ref_s = pack[len(ps) - 1]

        try:
            with torch.no_grad():
                if use_gpu:
                    audio = models[True](ps, ref_s, speed)
                else:
                    audio = models[False](ps, ref_s, speed)
        except Exception as e:
            if not use_gpu:
                raise
            logger.warning(f"GPU processing failed, falling back to CPU: {e}")
            with torch.no_grad():
                audio = models[False](ps, ref_s, speed)

        # .cpu() is a no-op for tensors that already live on the CPU.
        segments.append(audio.cpu().numpy())

    if not segments:
        return None
    if len(segments) == 1:
        return segments[0]
    return np.concatenate(segments)
200
+
201
async def generate_audio(text: str, voice: str = 'af_heart', speed: float = 1.0, use_gpu: Optional[bool] = None, lang_code: str = 'a'):
    """Generate audio from text using Kokoro TTS with parallel chunking.

    The text is split into chunks of at most 500 characters, the chunks are
    synthesized on a thread pool of up to four workers, and the results are
    joined in order with 0.1 s of silence between consecutive chunks.

    Args:
        text: Text to synthesize; surrounding whitespace is stripped.
        voice: Voice code passed on to ``generate_audio_chunk``.
        speed: Speed value passed on to ``generate_audio_chunk``.
        use_gpu: ``None`` uses the GPU whenever CUDA is available; otherwise
            the GPU is used only if requested *and* CUDA is available.
        lang_code: Key into ``pipelines`` selecting the language pipeline.

    Returns:
        A ``(audio, seconds)`` tuple: the merged numpy audio array and the
        time spent synthesizing, or ``(None, 0)`` when there is nothing to
        synthesize or no chunk produced audio.

    Raises:
        ValueError: If ``lang_code`` has no initialized pipeline.
    """
    text = text.strip()

    if use_gpu is None:
        use_gpu = CUDA_AVAILABLE
    else:
        use_gpu = use_gpu and CUDA_AVAILABLE

    if lang_code not in pipelines:
        raise ValueError(f"Language '{lang_code}' not supported or not initialized")

    chunks = split_text_into_chunks(text, max_chars=500)
    if not chunks:
        # Nothing to synthesize; ThreadPoolExecutor rejects max_workers=0.
        return None, 0
    logger.info(f"Split text into {len(chunks)} chunks for parallel processing")

    start_time = time.time()

    # We are inside a coroutine, so a running loop is guaranteed to exist.
    loop = asyncio.get_running_loop()
    max_parallel = min(len(chunks), 4)
    with ThreadPoolExecutor(max_workers=max_parallel) as executor:
        tasks = [
            loop.run_in_executor(
                executor,
                generate_audio_chunk,
                chunk,
                voice,
                speed,
                use_gpu,
                lang_code
            )
            for chunk in chunks
        ]
        # gather() returns the results in the order the tasks were submitted.
        audio_results = await asyncio.gather(*tasks)

    process_time = time.time() - start_time
    logger.info(f"Parallel processing completed in {process_time:.2f}s")

    # Drop chunks that produced no audio before inserting gaps, so silence
    # only ever sits between two real pieces of audio.
    rendered = [audio for audio in audio_results if audio is not None]

    if not rendered:
        return None, 0

    if len(rendered) == 1:
        return rendered[0], process_time

    sample_rate = 24000
    silence_gap = np.zeros(int(0.1 * sample_rate), dtype=np.float32)

    audio_chunks = []
    for index, audio in enumerate(rendered):
        if index:
            audio_chunks.append(silence_gap)
        audio_chunks.append(audio)

    merged_audio = np.concatenate(audio_chunks)
    logger.info(f"Successfully merged {len(chunks)} chunks into final audio of {len(merged_audio)} samples ({process_time:.2f}s total)")

    return merged_audio, process_time
260
+
261
def numpy_to_mp3(audio_array: np.ndarray, sample_rate: int = 24000) -> bytes:
    """Convert a numpy audio array to MP3 bytes.

    The samples are scaled by 32767 to 16-bit PCM, written to an in-memory
    WAV file and re-encoded to MP3 at 192 kbit/s with pydub.

    Args:
        audio_array: Audio samples; values outside [-1.0, 1.0] are clipped.
        sample_rate: Sample rate written into the WAV header.

    Returns:
        The encoded MP3 file contents.
    """
    # Clip before scaling: a sample outside [-1, 1] would land outside the
    # int16 range and the cast below would corrupt it instead of saturating.
    clipped = np.clip(audio_array, -1.0, 1.0)

    # Convert to int16 for better compatibility
    audio_int16 = (clipped * 32767).astype(np.int16)

    # Create WAV in memory
    wav_buffer = io.BytesIO()
    sf.write(wav_buffer, audio_int16, sample_rate, format='WAV', subtype='PCM_16')
    wav_buffer.seek(0)

    # Convert WAV to MP3 using pydub
    audio_segment = AudioSegment.from_wav(wav_buffer)

    # Export as MP3
    mp3_buffer = io.BytesIO()
    audio_segment.export(mp3_buffer, format="mp3", bitrate="192k")

    # getvalue() returns the whole buffer regardless of the stream position.
    return mp3_buffer.getvalue()
281
+
282
+ # API Endpoints
283
@app.get("/")
async def root():
    """Report that the API is running and whether CUDA is available."""
    return dict(message="Kokoro TTS API is running", cuda_available=CUDA_AVAILABLE)
286
+
287
@app.get("/health")
async def health_check():
    """Return a static "healthy" status together with the CUDA availability flag."""
    return dict(status="healthy", cuda_available=CUDA_AVAILABLE)
290
+
291
@app.post("/generate")
async def generate_tts(
    request: TTSRequest,
    api_key: str = Depends(verify_api_key)
):
    """Generate TTS audio from text and return it as an MP3 attachment.

    Args:
        request: Synthesis parameters (text, voice, language, GPU flag, speed).
        api_key: Validated API key supplied by the ``verify_api_key`` dependency.

    Returns:
        An ``audio/mpeg`` response whose ``X-Audio-Duration``,
        ``X-Generation-Time`` and ``X-Sample-Rate`` headers carry metadata.

    Raises:
        HTTPException: 400 for an unknown voice or language, empty text or a
            non-positive speed; 503 when the language pipeline failed to
            initialize on this server; 500 when generation fails.
    """

    try:
        # Validate voice
        if request.voice not in VOICE_CHOICES:
            raise HTTPException(
                status_code=400,
                detail=f"Invalid voice. Available voices: {list(VOICE_CHOICES.keys())}"
            )

        # Determine language from voice or use provided language
        lang_code = request.language
        if lang_code is None:
            lang_code = request.voice[0]

        # Validate language
        if lang_code not in LANGUAGES:
            raise HTTPException(
                status_code=400,
                detail=f"Invalid language. Available languages: {list(LANGUAGES.keys())}"
            )

        # A known language can still be missing here: startup only logs a
        # warning when a pipeline fails to initialize.
        if lang_code not in pipelines:
            raise HTTPException(
                status_code=503,
                detail=f"Language '{lang_code}' is not available on this server"
            )

        # Validate text
        if not request.text or not request.text.strip():
            raise HTTPException(
                status_code=400,
                detail="Text cannot be empty"
            )

        # Validate speed; written as "not (> 0)" so NaN is rejected as well.
        if not (request.speed > 0):
            raise HTTPException(
                status_code=400,
                detail="Speed must be greater than 0"
            )

        # Generate audio
        logger.info(f"Generating audio for voice: {request.voice}, language: {lang_code}, text length: {len(request.text)}")
        audio_array, generation_time = await generate_audio(
            text=request.text,
            voice=request.voice,
            speed=request.speed,
            use_gpu=request.use_gpu,
            lang_code=lang_code
        )

        if audio_array is None:
            raise HTTPException(
                status_code=500,
                detail="Failed to generate audio"
            )

        # Calculate audio duration
        sample_rate = 24000
        audio_duration = len(audio_array) / sample_rate

        # Convert to MP3 on a worker thread: the encoding is blocking work
        # and would otherwise stall the event loop for every other request.
        mp3_bytes = await asyncio.get_running_loop().run_in_executor(
            None, numpy_to_mp3, audio_array, sample_rate
        )

        # Return MP3 file with metadata in headers
        return Response(
            content=mp3_bytes,
            media_type="audio/mpeg",
            headers={
                "Content-Disposition": "attachment; filename=tts_output.mp3",
                "X-Audio-Duration": str(audio_duration),
                "X-Generation-Time": str(generation_time),
                "X-Sample-Rate": str(sample_rate)
            }
        )

    except HTTPException:
        # Deliberate HTTP errors raised above pass through unchanged.
        raise
    except Exception as e:
        # logger.exception keeps the traceback in the server log.
        logger.exception(f"Error generating TTS: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Internal server error: {str(e)}"
        )
368
+
369
@app.get("/voices")
async def get_voices(api_key: str = Depends(verify_api_key)):
    """Return the voice-code-to-label mapping; requires a valid API key."""
    return dict(voices=VOICE_CHOICES)
373
+
374
@app.get("/languages")
async def get_languages(api_key: str = Depends(verify_api_key)):
    """Return the language-code-to-label mapping; requires a valid API key."""
    return dict(languages=LANGUAGES)
378
+
379
if __name__ == "__main__":
    # Direct-execution entry point: serve the app on all interfaces, port 7860.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
fastapi==0.115.6
uvicorn[standard]==0.34.0
# Raised from 0.0.6, which predates published denial-of-service fixes
# (shipped in 0.0.7 and 0.0.18).
python-multipart==0.0.20
kokoro==0.9.4
numpy>=1.26.0
soundfile==0.13.0
pydub>=0.25.1
pydantic==2.10.4
scipy==1.14.1
munch==4.0.0
huggingface-hub>=0.20.0
espeakng-loader==0.2.4
misaki==0.9.4
spacy==3.8.5