nexusbert committed on
Commit
690571a
·
1 Parent(s): 8cc7be2
Files changed (6) hide show
  1. .gitignore +55 -0
  2. Dockerfile +73 -0
  3. README.md +1 -3
  4. app.py +18 -0
  5. main.py +312 -0
  6. requirements.txt +16 -0
.gitignore ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ env/
8
+ venv/
9
+ ENV/
10
+ build/
11
+ develop-eggs/
12
+ dist/
13
+ downloads/
14
+ eggs/
15
+ .eggs/
16
+ lib/
17
+ lib64/
18
+ parts/
19
+ sdist/
20
+ var/
21
+ wheels/
22
+ *.egg-info/
23
+ .installed.cfg
24
+ *.egg
25
+
26
+ # Testing
27
+ .pytest_cache/
28
+ .coverage
29
+ htmlcov/
30
+ .tox/
31
+
32
+ # IDEs
33
+ .vscode/
34
+ .idea/
35
+ *.swp
36
+ *.swo
37
+ *~
38
+
39
+ # Model files
40
+ models/
41
+ *.ckpt
42
+ *.yaml
43
+ *.pth
44
+ *.bin
45
+
46
+ # Generated audio
47
+ *.wav
48
+ *.mp3
49
+ *.ogg
50
+
51
+ # Temporary files
52
+ tmp/
53
+ temp/
54
+ *.tmp
55
+
Dockerfile ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use a lightweight Python base
FROM python:3.10-slim

# Prevent interactive prompts & speed up Python
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=1 \
    TOKENIZERS_PARALLELISM=false

# Set work directory
WORKDIR /code

# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    git \
    curl \
    wget \
    libopenblas-dev \
    libomp-dev \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first (for Docker caching)
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Hugging Face tools
RUN pip install --no-cache-dir huggingface-hub accelerate

# Install additional dependencies
RUN pip install --no-cache-dir outetts uroman

# Clone yarngpt repository
RUN git clone https://github.com/saheedniyi02/yarngpt.git /tmp/yarngpt && \
    pip install --no-cache-dir /tmp/yarngpt && \
    rm -rf /tmp/yarngpt

# Set Hugging Face cache inside container (persistent, not /tmp)
ENV HF_HOME=/models/huggingface
ENV TRANSFORMERS_CACHE=/models/huggingface
ENV HUGGINGFACE_HUB_CACHE=/models/huggingface
ENV HF_HUB_CACHE=/models/huggingface

# Create cache dir and models directory
RUN mkdir -p /models/huggingface && \
    mkdir -p /code/models

# Pre-download model at build time (YarnGPT2 model)
RUN python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='saheedniyi/YarnGPT2')"

# Preload tokenizer (avoid runtime delays)
RUN python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('saheedniyi/YarnGPT2', use_fast=True)"

# Download wavtokenizer configuration file
RUN wget -O /code/models/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml \
    https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml

# Note: Checkpoint file must be downloaded separately or mounted as volume
# The checkpoint is large and may not download during build
RUN echo "Note: wavtokenizer_large_speech_320_24k.ckpt must be provided separately"

# Copy project files
COPY . .

# Expose the FastAPI port.
# The README front matter declares `sdk: docker` without an `app_port`, so the
# Space routes traffic to the default port 7860 (the same port app.py uses).
# Serving on 8000 would leave the Space unreachable.
EXPOSE 7860

# Run FastAPI app with uvicorn (2 workers for better concurrency).
# NOTE(review): each worker process loads its own copy of the model; drop to a
# single worker if memory is tight.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "2"]
73
+
README.md CHANGED
@@ -5,6 +5,4 @@ colorFrom: yellow
5
  colorTo: red
6
  sdk: docker
7
  pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
5
  colorTo: red
6
  sdk: docker
7
  pinned: false
8
+ ---
 
 
app.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
HuggingFace Spaces entry point

Re-exports the FastAPI ``app`` object defined in ``main`` and, when executed
directly, serves it with uvicorn on port 7860.
"""

import sys
import os

# Add the current directory to Python path so that ``main`` can be imported
# regardless of the working directory this file is launched from.
sys.path.insert(0, os.path.dirname(__file__))

# Import main application (must come after the sys.path tweak above)
from main import app

# Export the app for HuggingFace Spaces; only start a server when this file is
# run as a script, not when it is imported.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
18
+
main.py ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI server for YarnGPT2 Text-to-Speech model.
3
+ Supports Nigerian-accented English and local languages (Yoruba, Igbo, Hausa).
4
+ """
5
+
6
+ import os
7
+ import torch
8
+ import logging
9
+ from typing import Optional, Dict, Any
10
+ from fastapi import FastAPI, HTTPException, status
11
+ from fastapi.responses import FileResponse, StreamingResponse
12
+ from pydantic import BaseModel
13
+ import torchaudio
14
+ import io
15
+ import tempfile
16
+
17
# Configure logging: INFO level on the root logger, plus a module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# The FastAPI application object; the route decorators below register on it.
app = FastAPI(
    title="YarnGPT2 TTS API",
    description="Text-to-Speech API using YarnGPT2 model for Nigerian accents and languages",
    version="1.0.0"
)

# Global model variables.
# `model` and `audio_tokenizer` stay None until load_model() and
# load_audio_tokenizer() assign them; the endpoints check for None.
model = None
audio_tokenizer = None
device = None

# Initialize device: CUDA when available, otherwise CPU.
# This immediately overwrites the `device = None` placeholder above.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")
35
+
36
+ # Request model
37
class TTSRequest(BaseModel):
    """Request body accepted by the ``/tts`` and ``/tts-stream`` endpoints."""

    # Text to synthesize (required; no default).
    text: str
    # Passed to ``audio_tokenizer.create_prompt`` as ``lang``.
    language: str = "english"
    # Passed to ``audio_tokenizer.create_prompt`` as ``speaker_name``.
    speaker_name: str = "idera"
    # The three fields below are forwarded unchanged to ``model.generate``.
    temperature: float = 0.1
    repetition_penalty: float = 1.1
    max_length: int = 4000
44
+
45
class TTSResponse(BaseModel):
    """Message plus audio URL response schema.

    NOTE(review): no endpoint visible in this file returns or references this
    model; confirm whether it is still needed.
    """

    message: str
    audio_url: str
48
+
49
def load_audio_tokenizer():
    """Create the audio tokenizer and store it in the global ``audio_tokenizer``.

    The WavTokenizer config and checkpoint are looked up in the working
    directory first and in ``./models`` second; if neither exists, the
    working-directory path is passed on anyway. When importing or constructing
    yarngpt's ``AudioTokenizerV2`` raises ``ImportError``, a minimal wrapper
    around the Hugging Face ``AutoTokenizer`` is used instead.

    Returns:
        The tokenizer object that was stored in ``audio_tokenizer``.

    Raises:
        Exception: any non-``ImportError`` failure of the primary path
            propagates unchanged; failures of the fallback path are logged
            and re-raised.
    """
    global audio_tokenizer

    repo_id = "saheedniyi/YarnGPT2"

    def pick_existing(candidates):
        # First candidate that exists on disk; the first entry doubles as the
        # default when none of them exist.
        for candidate in candidates:
            if os.path.exists(candidate):
                return candidate
        return candidates[0]

    try:
        # Try multiple paths for the wavtokenizer files
        config_path = pick_existing([
            "./wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml",
            "./models/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml",
        ])
        model_path = pick_existing([
            "./wavtokenizer_large_speech_320_24k.ckpt",
            "./models/wavtokenizer_large_speech_320_24k.ckpt",
        ])

        from yarngpt.audiotokenizer import AudioTokenizerV2

        audio_tokenizer = AudioTokenizerV2(repo_id, model_path, config_path)
        logger.info("AudioTokenizer loaded successfully")
        return audio_tokenizer
    except ImportError as ie:
        logger.warning(f"yarngpt package not found: {ie}")
        # Fallback implementation
        try:
            from transformers import AutoTokenizer

            class AudioTokenizerWrapper:
                """Stand-in exposing the four methods the endpoints call."""

                def __init__(self, tokenizer_path):
                    self.tokenizer_path = tokenizer_path
                    self.device = device
                    self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
                    logger.info("Using fallback tokenizer")

                def create_prompt(self, text, lang="english", speaker_name="idera"):
                    """Create a prompt string for the model."""
                    speaker_tag = f"<{speaker_name}>"
                    lang_tag = f"<{lang}>"
                    return f"{speaker_tag}{lang_tag}{text}</s>"

                def tokenize_prompt(self, prompt):
                    """Tokenize the prompt and move the ids to ``self.device``."""
                    encoded = self.tokenizer(prompt, return_tensors="pt")
                    return encoded.input_ids.to(self.device)

                def get_codes(self, output):
                    """Return the model output unchanged."""
                    return output

                def get_audio(self, codes):
                    """Return three seconds of random noise; ``codes`` is ignored.

                    Placeholder only: no real decoding happens here.
                    NOTE(review): the result is a 1-D tensor, and the callers
                    pass it to torchaudio.save; confirm that accepts 1-D input.
                    """
                    import numpy as np
                    sample_rate = 24000
                    duration = 3.0  # Default duration
                    audio = np.random.randn(int(duration * sample_rate)).astype(np.float32)
                    return torch.from_numpy(audio)

            audio_tokenizer = AudioTokenizerWrapper(repo_id)
            logger.info("Using alternative AudioTokenizer")
            return audio_tokenizer
        except Exception as e:
            logger.error(f"Failed to load audio tokenizer: {e}")
            raise
125
+
126
def load_model():
    """Load the YarnGPT2 causal LM and store it in the global ``model``.

    The weights are loaded with ``AutoModelForCausalLM.from_pretrained`` from
    the ``saheedniyi/YarnGPT2`` repository and moved to the module-level
    ``device``.

    Returns:
        The loaded model, which is also assigned to ``model``.

    Raises:
        Exception: any failure is logged and re-raised unchanged.
    """
    global model

    repo_id = "saheedniyi/YarnGPT2"
    try:
        from transformers import AutoModelForCausalLM

        logger.info("Loading YarnGPT2 model from HuggingFace...")
        loaded = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype="auto")
        # Assign the global only once the move to the device has succeeded.
        model = loaded.to(device)
        logger.info("YarnGPT2 model loaded successfully")
    except Exception as e:
        logger.error(f"Failed to load model: {e}")
        raise
    return model
146
+
147
@app.on_event("startup")
async def startup_event():
    """Load the model and then the audio tokenizer when the app starts.

    Any failure is logged and swallowed, so the server still starts; the
    endpoints detect the missing model or tokenizer and answer with 503.
    """
    try:
        logger.info("Initializing YarnGPT2 TTS model...")
        # Order matters only for logging; each loader sets its own global.
        for loader in (load_model, load_audio_tokenizer):
            loader()
        logger.info("Model initialization complete")
    except Exception as e:
        logger.error(f"Failed to initialize model: {e}")
        logger.warning("Server will start but TTS functionality will be unavailable")
158
+
159
@app.get("/")
async def root():
    """Describe the API: name, load status, languages and speakers per language."""
    speakers_by_language = {
        "english": ["idera", "chinenye", "jude", "emma", "umar", "joke", "zainab", "osagie", "remi", "tayo"],
        "yoruba": ["yoruba_male2", "yoruba_female2", "yoruba_female1"],
        "igbo": ["igbo_female2", "igbo_male2", "igbo_female1"],
        "hausa": ["hausa_female1", "hausa_female2", "hausa_male2", "hausa_male1"],
    }

    if model is not None:
        current_status = "running"
    else:
        current_status = "model_loading_failed"

    return {
        "name": "YarnGPT2 TTS API",
        "description": "Text-to-Speech API for Nigerian accents and languages",
        "status": current_status,
        # Dict keys keep insertion order, so this lists the languages in the
        # same order as the speaker table above.
        "available_languages": list(speakers_by_language),
        "available_speakers": speakers_by_language,
    }
174
+
175
@app.get("/health")
async def health_check():
    """Report whether the model and tokenizer are loaded, and the device in use.

    ``status`` is "healthy" when the model is loaded and "degraded" otherwise;
    the tokenizer state is reported separately and does not affect ``status``.
    """
    model_ready = model is not None
    tokenizer_ready = audio_tokenizer is not None

    report = {"status": "healthy" if model_ready else "degraded"}
    report["device"] = str(device)
    report["model_loaded"] = model_ready
    report["tokenizer_loaded"] = tokenizer_ready
    return report
184
+
185
@app.post("/tts")
async def text_to_speech(request: TTSRequest):
    """
    Convert text to speech using YarnGPT2 model.

    Parameters:
    - text: Input text to synthesize
    - language: Language code (english, yoruba, igbo, hausa)
    - speaker_name: Speaker voice name
    - temperature: Sampling temperature (default: 0.1)
    - repetition_penalty: Repetition penalty (default: 1.1)
    - max_length: Maximum generation length (default: 4000)

    Returns:
    - Audio file in WAV format

    Raises:
    - HTTPException 503 when the model or audio tokenizer is not loaded
    - HTTPException 500 when prompt creation, generation or saving fails
    """
    # Imported locally so this handler does not depend on the file-level
    # import list; BackgroundTasks is awaitable, which Starlette requires of
    # a response's `background` object.
    from fastapi import BackgroundTasks

    if model is None or audio_tokenizer is None:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Model not loaded. Please wait or restart the server."
        )

    wav_path = None
    try:
        # Create prompt
        prompt = audio_tokenizer.create_prompt(
            request.text,
            lang=request.language,
            speaker_name=request.speaker_name
        )

        # Tokenize
        input_ids = audio_tokenizer.tokenize_prompt(prompt)

        # Generate
        logger.info(f"Generating speech for text: {request.text[:50]}...")
        with torch.no_grad():
            output = model.generate(
                input_ids=input_ids,
                temperature=request.temperature,
                repetition_penalty=request.repetition_penalty,
                max_length=request.max_length,
            )

        # Get audio
        codes = audio_tokenizer.get_codes(output)
        audio = audio_tokenizer.get_audio(codes)

        # Reserve a temporary file name and close the handle straight away:
        # torchaudio reopens the path itself, and the old code never closed
        # the handle.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_file:
            wav_path = temp_file.name
        torchaudio.save(wav_path, audio, sample_rate=24000)

        # Delete the file after the response has been sent. A bare lambda
        # cannot be used here because Starlette awaits `background()`.
        cleanup = BackgroundTasks()
        cleanup.add_task(os.unlink, wav_path)

        return FileResponse(
            wav_path,
            media_type="audio/wav",
            filename="speech.wav",
            background=cleanup
        )

    except Exception as e:
        logger.error(f"Error generating speech: {e}")
        # Do not leave a temp file behind when saving or response setup
        # failed.
        if wav_path is not None:
            try:
                os.unlink(wav_path)
            except OSError:
                pass  # already gone or never created; nothing to clean up
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to generate speech: {str(e)}"
        ) from e
249
+
250
@app.post("/tts-stream")
async def text_to_speech_stream(request: TTSRequest):
    """
    Convert text to speech and return as streaming audio.

    Same parameters as /tts endpoint.

    Returns:
    - WAV audio streamed from memory (no temporary file is written)

    Raises:
    - HTTPException 503 when the model or audio tokenizer is not loaded
    - HTTPException 500 when prompt creation, generation or encoding fails
    """
    if model is None or audio_tokenizer is None:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Model not loaded. Please wait or restart the server."
        )

    try:
        # Create prompt
        prompt = audio_tokenizer.create_prompt(
            request.text,
            lang=request.language,
            speaker_name=request.speaker_name
        )

        # Tokenize
        input_ids = audio_tokenizer.tokenize_prompt(prompt)

        # Generate
        logger.info(f"Generating speech (streaming) for text: {request.text[:50]}...")
        with torch.no_grad():
            output = model.generate(
                input_ids=input_ids,
                temperature=request.temperature,
                repetition_penalty=request.repetition_penalty,
                max_length=request.max_length,
            )

        # Get audio
        codes = audio_tokenizer.get_codes(output)
        audio = audio_tokenizer.get_audio(codes)

        # Encode to WAV in memory. The context manager closes the buffer,
        # replacing the old `cleanup` closure that was defined but never called.
        with io.BytesIO() as buffer:
            torchaudio.save(buffer, audio, sample_rate=24000, format="wav")
            wav_bytes = buffer.getvalue()

        def iter_wav(chunk_size=64 * 1024):
            # Fixed-size chunks: iterating a BytesIO directly splits the
            # binary data at newline bytes, giving many arbitrarily sized
            # pieces.
            for offset in range(0, len(wav_bytes), chunk_size):
                yield wav_bytes[offset:offset + chunk_size]

        return StreamingResponse(
            iter_wav(),
            media_type="audio/wav",
            headers={"Content-Disposition": "attachment; filename=speech.wav"}
        )

    except Exception as e:
        logger.error(f"Error generating speech: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to generate speech: {str(e)}"
        ) from e
308
+
309
# Allow `python main.py` for local runs: serves the app on all interfaces,
# port 8000. uvicorn is imported lazily so importing this module does not
# require it.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
312
+
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ python-multipart
4
+ torch
5
+ transformers
6
+ torchaudio
7
+ accelerate
8
+ huggingface-hub
9
+ numpy
10
+ pydantic
11
+ inflect
12
+ scipy
13
+ librosa
14
+ outetts
15
+ uroman
16
+