Spaces:

muhammadnoman76
/

text_to_speech_2

Running

App Files Files Community

muhammadnoman76 committed 24 days ago

Commit

62b2615

1 Parent(s): 96099ed

update

Browse files

Files changed (3) hide show

Dockerfile +44 -0
app.py +381 -0
requirements.txt +14 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,44 @@

+# Base image: slim Python 3.10
+FROM python:3.10-slim
+# Install system dependencies:
+#   ffmpeg      - presumably the encoder pydub uses for the MP3 export in app.py
+#   libsndfile1 - presumably the native library behind the soundfile package
+#   curl        - used by the HEALTHCHECK below
+#   wget, git   - NOTE(review): not referenced elsewhere in this commit; confirm they are needed
+# The apt package lists are removed in the same layer to keep the image small.
+RUN apt-get update && apt-get install -y \
+    ffmpeg \
+    libsndfile1 \
+    wget \
+    git \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+# Set working directory
+WORKDIR /app
+# Copy requirements first so the dependency layers below stay cached when only app.py changes
+COPY requirements.txt .
+# Install PyTorch CPU-only version first (much smaller and faster)
+RUN pip install --no-cache-dir torch==2.5.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cpu
+# Install remaining Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+# Download spacy model during build to avoid runtime network calls
+RUN python -m spacy download en_core_web_sm
+# Copy application code
+COPY app.py .
+# Create directory for models (they will be downloaded on first run)
+# NOTE(review): app.py as shown never references /app/models; confirm where the models are actually cached
+RUN mkdir -p /app/models
+# Set environment variables (unbuffered stdout/stderr so log lines appear immediately)
+ENV PYTHONUNBUFFERED=1
+# Expose port (must match the --port passed to uvicorn below)
+EXPOSE 7860
+# Health check: poll the /health endpoint defined in app.py; the 120s start period
+# leaves time for the startup hook to finish before failures are counted
+HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
+    CMD curl -f http://localhost:7860/health || exit 1
+# Run the application with a single uvicorn worker
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]

app.py ADDED Viewed

	@@ -0,0 +1,381 @@

+import os
+import io
+import torch
+import numpy as np
+from fastapi import FastAPI, HTTPException, Depends, Security
+from fastapi.security import APIKeyHeader
+from fastapi.responses import Response
+from pydantic import BaseModel
+from typing import Optional, List
+import soundfile as sf
+from pydub import AudioSegment
+from kokoro import KModel, KPipeline
+import logging
+import re
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+import time
+# Setup logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Configuration
+# Placeholder key used when API_SECRET_KEY is unset. It is visible in this source,
+# so a deployment that relies on it is effectively unauthenticated.
+DEFAULT_SECRET_KEY = "your-default-secret-key"
+SECRET_KEY = os.getenv("API_SECRET_KEY", DEFAULT_SECRET_KEY)
+if SECRET_KEY == DEFAULT_SECRET_KEY:
+    # Warn instead of failing so existing deployments keep starting
+    logger.warning(
+        "API_SECRET_KEY is not set; using the built-in default key. "
+        "Set API_SECRET_KEY before exposing this service."
+    )
+CUDA_AVAILABLE = torch.cuda.is_available()
+# Character limit taken from the CHAR_LIMIT environment variable (default 5000).
+# NOTE(review): CHAR_LIMIT is not referenced anywhere else in the code shown here.
+try:
+    char_limit_env = os.getenv("CHAR_LIMIT", "5000")
+    # str.isdigit() accepts some characters int() cannot parse (e.g. superscript
+    # digits); the except clause below covers that case.
+    CHAR_LIMIT = int(char_limit_env) if char_limit_env.isdigit() else 5000
+except (ValueError, AttributeError):
+    CHAR_LIMIT = 5000
+# FastAPI application instance; every route below is registered on it
+app = FastAPI(
+    title="Kokoro TTS API",
+    version="1.0.0",
+)
+# API Key Security
+# auto_error=False: a request without the X-API-Key header reaches
+# verify_api_key with api_key=None, so the 403 below is raised by this code.
+api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False)
+async def verify_api_key(api_key: Optional[str] = Security(api_key_header)):
+    """Dependency that only lets requests through when X-API-Key equals SECRET_KEY.
+
+    Args:
+        api_key: Value of the X-API-Key header, or None when the header is absent.
+
+    Returns:
+        The supplied API key when it matches.
+
+    Raises:
+        HTTPException: 403 when the header is missing or does not match.
+    """
+    import secrets  # local import: only this function needs it
+    # Compare in constant time so response timing does not reveal how much of
+    # the key matched. Bytes are used because compare_digest rejects
+    # non-ASCII str arguments.
+    if api_key is None or not secrets.compare_digest(
+        api_key.encode("utf-8"), SECRET_KEY.encode("utf-8")
+    ):
+        raise HTTPException(
+            status_code=403,
+            detail="Invalid API Key"
+        )
+    return api_key
+# Initialize models and pipelines
+logger.info(f"Initializing models... CUDA Available: {CUDA_AVAILABLE}")
+# models: keyed by the use_gpu flag (False -> CPU model, True -> CUDA model);
+# filled in by startup_event.
+models = {}
+# pipelines: keyed by language code (the keys of LANGUAGES); filled in by startup_event.
+pipelines = {}
+# Language code -> display label. startup_event tries to build one pipeline per code.
+LANGUAGES = {
+    'a': '🇺🇸 American English',
+    'b': '🇬🇧 British English',
+    'e': '🇪🇸 Spanish',
+    'f': '🇫🇷 French',
+    'h': '🇮🇳 Hindi',
+    'i': '🇮🇹 Italian',
+    'j': '🇯🇵 Japanese',
+    'p': '🇧🇷 Brazilian Portuguese',
+    'z': '🇨🇳 Mandarin Chinese'
+}
+# Voice id -> display label. The first character of a voice id is used as its
+# language code (see startup_event and generate_tts), so only 'a' and 'b'
+# voices are listed here.
+VOICE_CHOICES = {
+    'af_heart': '🇺🇸 🚺 Heart ❤️',
+    'af_bella': '🇺🇸 🚺 Bella 🔥',
+    'af_nicole': '🇺🇸 🚺 Nicole 🎧',
+    'af_aoede': '🇺🇸 🚺 Aoede',
+    'af_kore': '🇺🇸 🚺 Kore',
+    'af_sarah': '🇺🇸 🚺 Sarah',
+    'af_nova': '🇺🇸 🚺 Nova',
+    'af_sky': '🇺🇸 🚺 Sky',
+    'af_alloy': '🇺🇸 🚺 Alloy',
+    'af_jessica': '🇺🇸 🚺 Jessica',
+    'af_river': '🇺🇸 🚺 River',
+    'am_michael': '🇺🇸 🚹 Michael',
+    'am_fenrir': '🇺🇸 🚹 Fenrir',
+    'am_puck': '🇺🇸 🚹 Puck',
+    'am_echo': '🇺🇸 🚹 Echo',
+    'am_eric': '🇺🇸 🚹 Eric',
+    'am_liam': '🇺🇸 🚹 Liam',
+    'am_onyx': '🇺🇸 🚹 Onyx',
+    'am_santa': '🇺🇸 🚹 Santa',
+    'am_adam': '🇺🇸 🚹 Adam',
+    'bf_emma': '🇬🇧 🚺 Emma',
+    'bf_isabella': '🇬🇧 🚺 Isabella',
+    'bf_alice': '🇬🇧 🚺 Alice',
+    'bf_lily': '🇬🇧 🚺 Lily',
+    'bm_george': '🇬🇧 🚹 George',
+    'bm_fable': '🇬🇧 🚹 Fable',
+    'bm_lewis': '🇬🇧 🚹 Lewis',
+    'bm_daniel': '🇬🇧 🚹 Daniel',
+}
+# Request/Response Models
+class TTSRequest(BaseModel):
+    """JSON body accepted by POST /generate."""
+    # Text to synthesise; generate_tts rejects empty or whitespace-only values with 400
+    text: str
+    # Voice id; generate_tts requires it to be a key of VOICE_CHOICES
+    voice: str = "af_heart"
+    # Language code (a key of LANGUAGES); when omitted, generate_tts uses the
+    # first character of `voice`
+    language: Optional[str] = None
+    # None -> use CUDA whenever it is available; True is only honoured when
+    # CUDA is available (see generate_audio)
+    use_gpu: Optional[bool] = None
+    # Passed unchanged to the pipeline and the model; not range-checked here
+    speed: float = 1.0
+# Initialize models on startup
+@app.on_event("startup")
+async def startup_event():
+    """Populate the module-level `models` and `pipelines` when the app starts.
+
+    A CPU model is always created and a CUDA model is added when CUDA is
+    available. One pipeline is attempted per LANGUAGES entry, and load_voice is
+    called for every VOICE_CHOICES entry. Failures for an individual language
+    or voice are logged as warnings; any other failure is logged and re-raised.
+    """
+    global models, pipelines
+    try:
+        # CPU model always, CUDA model only when the hardware is there
+        models = {False: KModel().to('cpu').eval()}
+        if CUDA_AVAILABLE:
+            models[True] = KModel().to('cuda').eval()
+        # One pipeline per supported language; a failing language is skipped
+        for lang_code in LANGUAGES:
+            try:
+                pipelines[lang_code] = KPipeline(lang_code=lang_code, model=False)
+                logger.info(f"Initialized pipeline for language: {lang_code} - {LANGUAGES[lang_code]}")
+            except Exception as exc:
+                logger.warning(f"Could not initialize pipeline for {lang_code}: {exc}")
+        # Register the pronunciation of "kokoro" for the English variants that loaded
+        for english_code, pronunciation in (('a', 'kˈOkəɹO'), ('b', 'kˈQkəɹQ')):
+            if english_code in pipelines:
+                pipelines[english_code].g2p.lexicon.golds['kokoro'] = pronunciation
+        # Preload voices; the first character of the voice id selects the pipeline
+        for voice_code in VOICE_CHOICES:
+            try:
+                pipelines[voice_code[0]].load_voice(voice_code)
+            except Exception as exc:
+                logger.warning(f"Could not preload voice {voice_code}: {exc}")
+        logger.info("Models and pipelines initialized successfully")
+    except Exception as exc:
+        logger.error(f"Failed to initialize models: {exc}")
+        raise
+def split_text_into_chunks(text: str, max_chars: int = 500) -> List[str]:
+    """Split text into chunks of at most `max_chars` characters.
+
+    Text is split at sentence boundaries first. A sentence that is still too
+    long is split at whitespace, and a single token longer than `max_chars`
+    is sliced into fixed-size pieces so that no chunk exceeds the limit.
+    Adjacent pieces are then packed greedily, joined by a single space.
+
+    Args:
+        text: Input text; may be empty.
+        max_chars: Maximum chunk length in characters; must be >= 1.
+
+    Returns:
+        The chunks in reading order; an empty list when `text` has no
+        non-whitespace content.
+
+    Raises:
+        ValueError: If `max_chars` is smaller than 1.
+    """
+    if max_chars < 1:
+        raise ValueError("max_chars must be a positive integer")
+    # ASCII terminators need following whitespace (avoids splitting "3.14");
+    # CJK / Devanagari terminators are usually not followed by a space, so
+    # they split even without one.
+    sentences = re.split(r'(?<=[.!?])\s+|(?<=[。！？।])\s*', text)
+    pieces = []
+    for sentence in sentences:
+        sentence = sentence.strip()
+        if not sentence:
+            # Drop empty fragments so chunks never carry stray spaces
+            continue
+        if len(sentence) <= max_chars:
+            pieces.append(sentence)
+            continue
+        for word in sentence.split():
+            if len(word) <= max_chars:
+                pieces.append(word)
+            else:
+                # Slices are full-length except the last one, so the packing
+                # below can never glue two slices of the same word together
+                # with a space.
+                pieces.extend(
+                    word[start:start + max_chars]
+                    for start in range(0, len(word), max_chars)
+                )
+    chunks = []
+    current_chunk = ""
+    for piece in pieces:
+        if not current_chunk:
+            current_chunk = piece
+        elif len(current_chunk) + 1 + len(piece) <= max_chars:
+            current_chunk += " " + piece
+        else:
+            chunks.append(current_chunk)
+            current_chunk = piece
+    if current_chunk:
+        chunks.append(current_chunk)
+    return chunks
+def generate_audio_chunk(text: str, voice: str, speed: float, use_gpu: bool, lang_code: str) -> Optional[np.ndarray]:
+    """Synthesise one text chunk and return its audio as a NumPy array.
+
+    The pipeline is iterated to the end and the audio of every segment it
+    yields is concatenated in order, so nothing after the first segment is
+    lost.
+
+    Args:
+        text: Chunk of text to synthesise.
+        voice: Voice id passed to the pipeline and to load_voice.
+        speed: Speed value passed to the pipeline and the model.
+        use_gpu: Use models[True] when set; a failure there falls back to the
+            CPU model for that segment.
+        lang_code: Key into `pipelines`.
+
+    Returns:
+        The audio samples, or None when the pipeline yields no segment.
+
+    Raises:
+        KeyError: If `lang_code` has no pipeline.
+        Exception: Whatever the CPU model raises; only GPU failures are retried.
+    """
+    pipeline = pipelines[lang_code]
+    pack = pipeline.load_voice(voice)
+    segments = []
+    for _, ps, _ in pipeline(text, voice, speed):
+        ref_s = pack[len(ps) - 1]
+        with torch.no_grad():
+            if use_gpu:
+                try:
+                    audio = models[True](ps, ref_s, speed)
+                except Exception as e:
+                    logger.warning(f"GPU processing failed, falling back to CPU: {e}")
+                    audio = models[False](ps, ref_s, speed)
+            else:
+                audio = models[False](ps, ref_s, speed)
+        # .cpu() is required before .numpy() for a CUDA tensor and is a no-op
+        # for a CPU tensor
+        segments.append(audio.cpu().numpy())
+    if not segments:
+        return None
+    if len(segments) == 1:
+        return segments[0]
+    # assumes each segment is a 1-D sample array, as generate_audio's own
+    # concatenation does - TODO confirm
+    return np.concatenate(segments)
+async def generate_audio(text: str, voice: str = 'af_heart', speed: float = 1.0, use_gpu: Optional[bool] = None, lang_code: str = 'a'):
+    """Generate audio for `text` by synthesising chunks in worker threads and merging them.
+
+    Args:
+        text: Text to synthesise; surrounding whitespace is stripped.
+        voice: Voice id forwarded to generate_audio_chunk.
+        speed: Speed value forwarded to generate_audio_chunk.
+        use_gpu: None means "use CUDA when available"; True is only honoured
+            when CUDA is available.
+        lang_code: Key into `pipelines`.
+
+    Returns:
+        A tuple (audio, seconds). `audio` is a NumPy array, or None when there
+        was nothing to synthesise or no chunk produced audio, in which case
+        seconds is 0.
+
+    Raises:
+        ValueError: If `lang_code` has no initialised pipeline.
+    """
+    text = text.strip()
+    use_gpu = CUDA_AVAILABLE if use_gpu is None else (use_gpu and CUDA_AVAILABLE)
+    if lang_code not in pipelines:
+        raise ValueError(f"Language '{lang_code}' not supported or not initialized")
+    chunks = split_text_into_chunks(text, max_chars=500)
+    if not chunks:
+        # ThreadPoolExecutor rejects max_workers=0, so stop before building it
+        return None, 0
+    logger.info(f"Split text into {len(chunks)} chunks for parallel processing")
+    start_time = time.time()
+    # Inside a coroutine the running loop is the right one to submit work to
+    loop = asyncio.get_running_loop()
+    max_parallel = min(len(chunks), 4)
+    with ThreadPoolExecutor(max_workers=max_parallel) as executor:
+        tasks = [
+            loop.run_in_executor(
+                executor,
+                generate_audio_chunk,
+                chunk,
+                voice,
+                speed,
+                use_gpu,
+                lang_code
+            )
+            for chunk in chunks
+        ]
+        # gather preserves submission order, so results line up with chunks
+        audio_results = await asyncio.gather(*tasks)
+    process_time = time.time() - start_time
+    logger.info(f"Parallel processing completed in {process_time:.2f}s")
+    valid_chunks = [audio for audio in audio_results if audio is not None]
+    if not valid_chunks:
+        return None, 0
+    if len(valid_chunks) == 1:
+        return valid_chunks[0], process_time
+    sample_rate = 24000
+    silence_gap = np.zeros(int(0.1 * sample_rate), dtype=np.float32)
+    # Put the 0.1 s gap only between chunks that produced audio, never after
+    # the last one
+    audio_chunks = []
+    for index, audio in enumerate(valid_chunks):
+        if index:
+            audio_chunks.append(silence_gap)
+        audio_chunks.append(audio)
+    merged_audio = np.concatenate(audio_chunks)
+    logger.info(f"Successfully merged {len(valid_chunks)} chunks into final audio of {len(merged_audio)} samples ({process_time:.2f}s total)")
+    return merged_audio, process_time
+def numpy_to_mp3(audio_array: np.ndarray, sample_rate: int = 24000) -> bytes:
+    """Convert a float audio array to MP3 bytes.
+
+    The samples are scaled to 16-bit PCM, written to an in-memory WAV with
+    soundfile, and re-encoded to 192k MP3 with pydub.
+
+    Args:
+        audio_array: Audio samples, presumably floats in [-1.0, 1.0]; values
+            outside that range are clipped.
+        sample_rate: Sample rate written into the WAV header.
+
+    Returns:
+        The encoded MP3 data.
+    """
+    # Clip before scaling: a sample outside [-1.0, 1.0] would overflow int16
+    # and wrap around to the opposite sign
+    clipped = np.clip(np.asarray(audio_array, dtype=np.float32), -1.0, 1.0)
+    audio_int16 = (clipped * 32767).astype(np.int16)
+    # Create WAV in memory
+    wav_buffer = io.BytesIO()
+    sf.write(wav_buffer, audio_int16, sample_rate, format='WAV', subtype='PCM_16')
+    wav_buffer.seek(0)
+    # Convert WAV to MP3 using pydub
+    audio_segment = AudioSegment.from_wav(wav_buffer)
+    mp3_buffer = io.BytesIO()
+    audio_segment.export(mp3_buffer, format="mp3", bitrate="192k")
+    return mp3_buffer.getvalue()
+# API Endpoints
+@app.get("/")
+async def root():
+    """Return a short status message together with the CUDA availability flag."""
+    payload = {"message": "Kokoro TTS API is running"}
+    payload["cuda_available"] = CUDA_AVAILABLE
+    return payload
+@app.get("/health")
+async def health_check():
+    """Unauthenticated health probe reporting status and CUDA availability."""
+    return dict(status="healthy", cuda_available=CUDA_AVAILABLE)
+@app.post("/generate")
+async def generate_tts(
+    request: TTSRequest,
+    api_key: str = Depends(verify_api_key)
+):
+    """Generate TTS audio from text and return it as an MP3 attachment.
+
+    Args:
+        request: Parsed request body.
+        api_key: Present only to enforce the API key dependency.
+
+    Returns:
+        An audio/mpeg Response with X-Audio-Duration, X-Generation-Time and
+        X-Sample-Rate headers.
+
+    Raises:
+        HTTPException: 400 for an unknown voice or language or empty text,
+            503 when the language's pipeline was not initialised, 500 when
+            synthesis fails.
+    """
+    try:
+        # Validate voice
+        if request.voice not in VOICE_CHOICES:
+            raise HTTPException(
+                status_code=400,
+                detail=f"Invalid voice. Available voices: {list(VOICE_CHOICES.keys())}"
+            )
+        # Determine language from voice or use provided language
+        lang_code = request.language
+        if lang_code is None:
+            lang_code = request.voice[0]
+        # Validate language
+        if lang_code not in LANGUAGES:
+            raise HTTPException(
+                status_code=400,
+                detail=f"Invalid language. Available languages: {list(LANGUAGES.keys())}"
+            )
+        # A known language whose pipeline failed to load at startup is a
+        # server-side condition, reported explicitly rather than as a generic 500
+        if lang_code not in pipelines:
+            raise HTTPException(
+                status_code=503,
+                detail=f"Language '{lang_code}' is not available on this server"
+            )
+        # Validate text
+        if not request.text or not request.text.strip():
+            raise HTTPException(
+                status_code=400,
+                detail="Text cannot be empty"
+            )
+        # Generate audio
+        logger.info(f"Generating audio for voice: {request.voice}, language: {lang_code}, text length: {len(request.text)}")
+        audio_array, generation_time = await generate_audio(
+            text=request.text,
+            voice=request.voice,
+            speed=request.speed,
+            use_gpu=request.use_gpu,
+            lang_code=lang_code
+        )
+        if audio_array is None:
+            raise HTTPException(
+                status_code=500,
+                detail="Failed to generate audio"
+            )
+        # Calculate audio duration
+        sample_rate = 24000
+        audio_duration = len(audio_array) / sample_rate
+        # Encode in a worker thread so the event loop is not blocked while the
+        # MP3 is being produced
+        mp3_bytes = await asyncio.get_running_loop().run_in_executor(
+            None, numpy_to_mp3, audio_array, sample_rate
+        )
+        # Return MP3 file with metadata in headers
+        return Response(
+            content=mp3_bytes,
+            media_type="audio/mpeg",
+            headers={
+                "Content-Disposition": "attachment; filename=tts_output.mp3",
+                "X-Audio-Duration": str(audio_duration),
+                "X-Generation-Time": str(generation_time),
+                "X-Sample-Rate": str(sample_rate)
+            }
+        )
+    except HTTPException:
+        raise
+    except Exception:
+        # Keep the traceback in the server log; do not send internal error
+        # text to the client
+        logger.exception("Error generating TTS")
+        raise HTTPException(
+            status_code=500,
+            detail="Internal server error"
+        )
+@app.get("/voices")
+async def get_voices(api_key: str = Depends(verify_api_key)):
+    """Return the VOICE_CHOICES table; requires a valid API key."""
+    response_body = {"voices": VOICE_CHOICES}
+    return response_body
+@app.get("/languages")
+async def get_languages(api_key: str = Depends(verify_api_key)):
+    """Return the LANGUAGES table; requires a valid API key."""
+    response_body = {"languages": LANGUAGES}
+    return response_body
+if __name__ == "__main__":
+    # Direct execution (python app.py): serve on all interfaces, port 7860
+    from uvicorn import run as run_server
+    run_server(app, host="0.0.0.0", port=7860)

requirements.txt ADDED Viewed

	@@ -0,0 +1,14 @@

+# Python dependencies for app.py.
+# torch / torchaudio are not listed here: the Dockerfile installs CPU-only builds in a separate step.
+# Web framework and ASGI server
+fastapi==0.115.6
+uvicorn[standard]==0.34.0
+# NOTE(review): app.py as shown has no form or file-upload endpoints; confirm python-multipart is needed
+python-multipart==0.0.6
+# Kokoro TTS model and pipeline (imported by app.py)
+kokoro==0.9.4
+numpy>=1.26.0
+# Audio I/O: soundfile writes the in-memory WAV, pydub converts it to MP3
+soundfile==0.13.0
+pydub>=0.25.1
+# Request body validation (TTSRequest)
+pydantic==2.10.4
+# NOTE(review): the packages below are not imported directly by app.py; presumably needed by kokoro - confirm
+scipy==1.14.1
+munch==4.0.0
+huggingface-hub>=0.20.0
+espeakng-loader==0.2.4
+misaki==0.9.4
+# The Dockerfile downloads the en_core_web_sm model for this package at build time
+spacy==3.8.5