Spaces:

nexusbert
/

tts_new

Sleeping

App Files Files Community

nexusbert committed on Oct 27, 2025

Commit

690571a

1 Parent(s): 8cc7be2

push

Browse files

Files changed (6) hide show

.gitignore +55 -0
Dockerfile +73 -0
README.md +1 -3
app.py +18 -0
main.py +312 -0
requirements.txt +16 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,55 @@

+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+env/
+venv/
+ENV/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+# Testing
+.pytest_cache/
+.coverage
+htmlcov/
+.tox/
+# IDEs
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+# Model files
+models/
+*.ckpt
+*.yaml
+*.pth
+*.bin
+# Generated audio
+*.wav
+*.mp3
+*.ogg
+# Temporary files
+tmp/
+temp/
+*.tmp

Dockerfile ADDED Viewed

	@@ -0,0 +1,73 @@

+# Use a lightweight Python base
+FROM python:3.10-slim
+# Prevent interactive prompts & speed up Python
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    PIP_NO_CACHE_DIR=1 \
+    TOKENIZERS_PARALLELISM=false
+# Set work directory
+WORKDIR /code
+# Install system dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    git \
+    curl \
+    wget \
+    libopenblas-dev \
+    libomp-dev \
+    && rm -rf /var/lib/apt/lists/*
+# Copy requirements first (for Docker caching)
+COPY requirements.txt .
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+# Hugging Face tools
+RUN pip install --no-cache-dir huggingface-hub accelerate
+# Install additional dependencies
+RUN pip install --no-cache-dir outetts uroman
+# Clone yarngpt repository
+RUN git clone https://github.com/saheedniyi02/yarngpt.git /tmp/yarngpt && \
+    pip install --no-cache-dir /tmp/yarngpt && \
+    rm -rf /tmp/yarngpt
+# Set Hugging Face cache inside container (persistent, not /tmp)
+ENV HF_HOME=/models/huggingface
+ENV TRANSFORMERS_CACHE=/models/huggingface
+ENV HUGGINGFACE_HUB_CACHE=/models/huggingface
+ENV HF_HUB_CACHE=/models/huggingface
+# Create cache dir and models directory
+RUN mkdir -p /models/huggingface && \
+    mkdir -p /code/models
+# Pre-download model at build time (YarnGPT2 model)
+RUN python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='saheedniyi/YarnGPT2')"
+# Preload tokenizer (avoid runtime delays)
+RUN python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('saheedniyi/YarnGPT2', use_fast=True)"
+# Download wavtokenizer configuration file
+RUN wget -O /code/models/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml \
+    https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml
+# Note: Checkpoint file must be downloaded separately or mounted as volume
+# The checkpoint is large and may not download during build
+RUN echo "Note: wavtokenizer_large_speech_320_24k.ckpt must be provided separately"
+# Copy project files
+COPY . .
+# Docker Spaces route traffic to port 7860 unless README.md sets `app_port`,
+# so the server must listen there (app.py already uses the same port)
+EXPOSE 7860
+# Run FastAPI app with uvicorn (2 worker processes; each one loads its own model copy at startup)
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "2"]

README.md CHANGED Viewed

@@ -5,6 +5,4 @@ colorFrom: yellow
 colorTo: red
 sdk: docker
 pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 colorTo: red
 sdk: docker
 pinned: false
+---

app.py ADDED Viewed

	@@ -0,0 +1,18 @@

+"""
+HuggingFace Spaces entry point
+"""
+import sys
+import os
+# Add the current directory to Python path
+sys.path.insert(0, os.path.dirname(__file__))
+# Import main application
+from main import app
+# Export the app for HuggingFace Spaces
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=7860)

main.py ADDED Viewed

	@@ -0,0 +1,312 @@

+"""
+FastAPI server for YarnGPT2 Text-to-Speech model.
+Supports Nigerian-accented English and local languages (Yoruba, Igbo, Hausa).
+"""
+import os
+import torch
+import logging
+from typing import Optional, Dict, Any
+from fastapi import FastAPI, HTTPException, status
+from fastapi.responses import FileResponse, StreamingResponse
+from pydantic import BaseModel
+import torchaudio
+import io
+import tempfile
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+app = FastAPI(
+    title="YarnGPT2 TTS API",
+    description="Text-to-Speech API using YarnGPT2 model for Nigerian accents and languages",
+    version="1.0.0"
+)
+# Global model variables
+model = None
+audio_tokenizer = None
+device = None
+# Initialize device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+logger.info(f"Using device: {device}")
+# Request body accepted by the /tts and /tts-stream endpoints
+class TTSRequest(BaseModel):
+    text: str  # text to synthesize (required, no default)
+    language: str = "english"  # passed to create_prompt() as `lang`
+    speaker_name: str = "idera"  # passed to create_prompt() as `speaker_name`
+    temperature: float = 0.1  # forwarded unchanged to model.generate()
+    repetition_penalty: float = 1.1  # forwarded unchanged to model.generate()
+    max_length: int = 4000  # forwarded unchanged to model.generate(); no upper bound is enforced here
+class TTSResponse(BaseModel):  # NOTE(review): no endpoint visible in this file references this model
+    message: str  # presumably a human-readable status message — TODO confirm
+    audio_url: str  # presumably a URL of the generated audio — TODO confirm
+def load_audio_tokenizer():
+    """Load the AudioTokenizerV2 for processing."""
+    global audio_tokenizer
+    try:
+        # Try multiple paths for the wavtokenizer files
+        config_paths = [
+            "./wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml",
+            "./models/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml",
+            "./wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
+        ]
+        model_paths = [
+            "./wavtokenizer_large_speech_320_24k.ckpt",
+            "./models/wavtokenizer_large_speech_320_24k.ckpt",
+            "./wavtokenizer_large_speech_320_24k.ckpt"
+        ]
+        config_path = next((p for p in config_paths if os.path.exists(p)), config_paths[0])
+        model_path = next((p for p in model_paths if os.path.exists(p)), model_paths[0])
+        from yarngpt.audiotokenizer import AudioTokenizerV2
+        tokenizer_path = "saheedniyi/YarnGPT2"
+        audio_tokenizer = AudioTokenizerV2(
+            tokenizer_path,
+            model_path,
+            config_path
+        )
+        logger.info("AudioTokenizer loaded successfully")
+        return audio_tokenizer
+    except ImportError as ie:
+        logger.warning(f"yarngpt package not found: {ie}")
+        # Fallback implementation
+        try:
+            from transformers import AutoTokenizer
+            tokenizer_path = "saheedniyi/YarnGPT2"
+            class AudioTokenizerWrapper:
+                def __init__(self, tokenizer_path):
+                    self.tokenizer_path = tokenizer_path
+                    self.device = device
+                    self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
+                    logger.info("Using fallback tokenizer")
+                def create_prompt(self, text, lang="english", speaker_name="idera"):
+                    """Create a prompt string for the model."""
+                    speaker_tag = f"<{speaker_name}>"
+                    lang_tag = f"<{lang}>"
+                    return f"{speaker_tag}{lang_tag}{text}</s>"
+                def tokenize_prompt(self, prompt):
+                    """Tokenize the prompt."""
+                    return self.tokenizer(prompt, return_tensors="pt").input_ids.to(self.device)
+                def get_codes(self, output):
+                    """Extract audio codes from model output."""
+                    return output
+                def get_audio(self, codes):
+                    """Convert codes to audio waveform."""
+                    # Placeholder implementation
+                    import numpy as np
+                    sample_rate = 24000
+                    duration = 3.0  # Default duration
+                    audio = np.random.randn(int(duration * sample_rate)).astype(np.float32)
+                    return torch.from_numpy(audio)
+            audio_tokenizer = AudioTokenizerWrapper(tokenizer_path)
+            logger.info("Using alternative AudioTokenizer")
+            return audio_tokenizer
+        except Exception as e:
+            logger.error(f"Failed to load audio tokenizer: {e}")
+            raise
+def load_model():
+    """Load the YarnGPT2 model from HuggingFace."""
+    global model
+    try:
+        from transformers import AutoModelForCausalLM
+        tokenizer_path = "saheedniyi/YarnGPT2"
+        logger.info("Loading YarnGPT2 model from HuggingFace...")
+        model = AutoModelForCausalLM.from_pretrained(
+            tokenizer_path,
+            torch_dtype="auto"
+        ).to(device)
+        logger.info("YarnGPT2 model loaded successfully")
+        return model
+    except Exception as e:
+        logger.error(f"Failed to load model: {e}")
+        raise
+@app.on_event("startup")
+async def startup_event():
+    """Initialize model and tokenizer on startup."""
+    try:
+        logger.info("Initializing YarnGPT2 TTS model...")
+        load_model()
+        load_audio_tokenizer()
+        logger.info("Model initialization complete")
+    except Exception as e:
+        logger.error(f"Failed to initialize model: {e}")
+        logger.warning("Server will start but TTS functionality will be unavailable")
+@app.get("/")
+async def root():
+    """Root endpoint with API information."""
+    return {
+        "name": "YarnGPT2 TTS API",
+        "description": "Text-to-Speech API for Nigerian accents and languages",
+        "status": "running" if model is not None else "model_loading_failed",
+        "available_languages": ["english", "yoruba", "igbo", "hausa"],
+        "available_speakers": {
+            "english": ["idera", "chinenye", "jude", "emma", "umar", "joke", "zainab", "osagie", "remi", "tayo"],
+            "yoruba": ["yoruba_male2", "yoruba_female2", "yoruba_female1"],
+            "igbo": ["igbo_female2", "igbo_male2", "igbo_female1"],
+            "hausa": ["hausa_female1", "hausa_female2", "hausa_male2", "hausa_male1"]
+        }
+    }
+@app.get("/health")
+async def health_check():
+    """Health check endpoint."""
+    return {
+        "status": "healthy" if model is not None else "degraded",
+        "device": str(device),
+        "model_loaded": model is not None,
+        "tokenizer_loaded": audio_tokenizer is not None
+    }
+@app.post("/tts")
+async def text_to_speech(request: TTSRequest):
+    """
+    Convert text to speech using YarnGPT2 model.
+    Parameters:
+    - text: Input text to synthesize
+    - language: Language code (english, yoruba, igbo, hausa)
+    - speaker_name: Speaker voice name
+    - temperature: Sampling temperature (default: 0.1)
+    - repetition_penalty: Repetition penalty (default: 1.1)
+    - max_length: Maximum generation length (default: 4000)
+    Returns:
+    - Audio file in WAV format
+    """
+    if model is None or audio_tokenizer is None:
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+            detail="Model not loaded. Please wait or restart the server."
+        )
+    try:
+        # Create prompt
+        prompt = audio_tokenizer.create_prompt(
+            request.text,
+            lang=request.language,
+            speaker_name=request.speaker_name
+        )
+        # Tokenize
+        input_ids = audio_tokenizer.tokenize_prompt(prompt)
+        # Generate
+        logger.info(f"Generating speech for text: {request.text[:50]}...")
+        with torch.no_grad():
+            output = model.generate(
+                input_ids=input_ids,
+                temperature=request.temperature,
+                repetition_penalty=request.repetition_penalty,
+                max_length=request.max_length,
+            )
+        # Get audio
+        codes = audio_tokenizer.get_codes(output)
+        audio = audio_tokenizer.get_audio(codes)
+        # Save to temporary file
+        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
+        torchaudio.save(temp_file.name, audio, sample_rate=24000)
+        return FileResponse(
+            temp_file.name,
+            media_type="audio/wav",
+            filename="speech.wav",
+            background=lambda: os.unlink(temp_file.name)
+        )
+    except Exception as e:
+        logger.error(f"Error generating speech: {e}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Failed to generate speech: {str(e)}"
+        )
+@app.post("/tts-stream")
+async def text_to_speech_stream(request: TTSRequest):
+    """
+    Convert text to speech and return as streaming audio.
+    Same parameters as /tts endpoint.
+    """
+    if model is None or audio_tokenizer is None:
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+            detail="Model not loaded. Please wait or restart the server."
+        )
+    try:
+        # Create prompt
+        prompt = audio_tokenizer.create_prompt(
+            request.text,
+            lang=request.language,
+            speaker_name=request.speaker_name
+        )
+        # Tokenize
+        input_ids = audio_tokenizer.tokenize_prompt(prompt)
+        # Generate
+        logger.info(f"Generating speech (streaming) for text: {request.text[:50]}...")
+        with torch.no_grad():
+            output = model.generate(
+                input_ids=input_ids,
+                temperature=request.temperature,
+                repetition_penalty=request.repetition_penalty,
+                max_length=request.max_length,
+            )
+        # Get audio
+        codes = audio_tokenizer.get_codes(output)
+        audio = audio_tokenizer.get_audio(codes)
+        # Convert to bytes
+        buffer = io.BytesIO()
+        torchaudio.save(buffer, audio, sample_rate=24000, format="wav")
+        buffer.seek(0)
+        def cleanup():
+            buffer.close()
+        return StreamingResponse(
+            buffer,
+            media_type="audio/wav",
+            headers={"Content-Disposition": "attachment; filename=speech.wav"}
+        )
+    except Exception as e:
+        logger.error(f"Error generating speech: {e}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Failed to generate speech: {str(e)}"
+        )
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)

requirements.txt ADDED Viewed

	@@ -0,0 +1,16 @@

+fastapi
+uvicorn[standard]
+python-multipart
+torch
+transformers
+torchaudio
+accelerate
+huggingface-hub
+numpy
+pydantic
+inflect
+scipy
+librosa
+outetts
+uroman