Spaces:

Hameed13
/

Huggingface_News_Podcast

Build error

App Files Community

Hameed13 committed on May 19, 2025

Commit

6b625c9

verified ·

1 Parent(s): ea51340

Update main.py

Browse files

Files changed (1) hide show

main.py +106 -253

main.py CHANGED Viewed

@@ -1,43 +1,31 @@
 import os
 import sys
 import time
-import uuid
-import logging
-import traceback
-import requests
-from pathlib import Path
-from datetime import datetime, timedelta
-from typing import Optional
 import torch
 import torchaudio
-from fastapi import FastAPI, HTTPException, BackgroundTasks, Request
-from fastapi.responses import FileResponse, JSONResponse
-from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel
-import uvicorn
 from huggingface_hub import hf_hub_download
 # Configure logging
 logging.basicConfig(
     level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    handlers=[
-        logging.StreamHandler(),
-        logging.FileHandler("app.log")
-    ]
 )
 logger = logging.getLogger(__name__)
-# Create necessary directories
-REQUIRED_DIRS = ["audio_files", "models", "saheedniyi_YarnGPT2"]
-for directory in REQUIRED_DIRS:
-    os.makedirs(directory, exist_ok=True)
 # Initialize FastAPI app
 app = FastAPI(
-    title="Nigerian Text-to-Speech API",
-    description="Convert text to Nigerian-accented speech using YarnGPT",
     version="1.0.0"
 )
@@ -50,232 +38,111 @@ app.add_middleware(
     allow_headers=["*"],
 )
-# Input validation models
-class TTSRequest(BaseModel):
-    text: str
-    accent: str = "nigerian"
-    voice: str = None
-    language: str = "english"
-    speed: float = 1.0
-    class Config:
-        schema_extra = {
-            "example": {
-                "text": "Welcome to Nigeria, the giant of Africa.",
-                "accent": "nigerian",
-                "voice": "tayo",
-                "language": "english",
-                "speed": 1.0
-            }
-        }
-class TTSResponse(BaseModel):
-    audio_url: str
-    audio_base64: str = None
-    text: str
-    voice: str
-    language: str
-# Define available voices and languages
 AVAILABLE_VOICES = {
     "female": ["zainab", "idera", "regina", "chinenye", "joke", "remi"],
     "male": ["jude", "tayo", "umar", "osagie", "onye", "emma"]
 }
-ACCENT_TO_VOICE = {
-    "nigerian": "tayo",
-    "yoruba": "idera",
-    "igbo": "emma",
-    "hausa": "umar"
-}
 AVAILABLE_LANGUAGES = ["english", "yoruba", "igbo", "hausa"]
-# Initialize global variables for model components
-model = None
-audio_tokenizer = None
-tts_engine = None
-def download_model_files():
-    """Download required model files from Hugging Face Hub."""
-    files_to_download = [
-        {
-            "repo_id": "novateur/WavTokenizer-small-speech-320token",
-            "filename": "wavtokenizer_large_speech_320_24k.ckpt",
-            "output_path": "models/wavtokenizer_large_speech_320_24k.ckpt"
-        },
-        {
-            "repo_id": "saheedniyi/YarnGPT2",
-            "filename": "config.json",
-            "output_path": "saheedniyi_YarnGPT2/config.json"
-        },
-        {
-            "repo_id": "saheedniyi/YarnGPT2",
-            "filename": "tokenizer_config.json",
-            "output_path": "saheedniyi_YarnGPT2/tokenizer_config.json"
-        },
-        {
-            "repo_id": "saheedniyi/YarnGPT2",
-            "filename": "pytorch_model.bin",
-            "output_path": "saheedniyi_YarnGPT2/pytorch_model.bin"
-        }
-    ]
-    for file_info in files_to_download:
-        try:
-            if not os.path.exists(file_info["output_path"]):
-                logger.info(f"Downloading {file_info['filename']} from {file_info['repo_id']}")
-                hf_hub_download(
-                    repo_id=file_info["repo_id"],
-                    filename=file_info["filename"],
-                    local_dir=".",
-                    local_dir_use_symlinks=False
-                )
-                logger.info(f"Successfully downloaded {file_info['filename']}")
-            else:
-                logger.info(f"File already exists: {file_info['output_path']}")
-        except Exception as e:
-            logger.error(f"Error downloading {file_info['filename']}: {str(e)}")
-            raise
-def load_tts_engine():
-    """Initialize the TTS engine with proper error handling."""
-    global model, audio_tokenizer, tts_engine
     try:
-        # Import required modules
-        from transformers import AutoModelForCausalLM
-        from yarngpt.audiotokenizer import AudioTokenizerV2
-        # Set device
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        logger.info(f"Using device: {device}")
-        # Load tokenizer and model
-        tokenizer_path = "saheedniyi_YarnGPT2"
-        wav_tokenizer_config = "wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
-        wav_tokenizer_model = "models/wavtokenizer_large_speech_320_24k.ckpt"
-        logger.info("Loading audio tokenizer...")
         audio_tokenizer = AudioTokenizerV2(
-            tokenizer_path,
             wav_tokenizer_model,
             wav_tokenizer_config
         )
-        logger.info("Loading model...")
         model = AutoModelForCausalLM.from_pretrained(
-            tokenizer_path,
-            torch_dtype=torch.float16 if device == "cuda" else torch.float32
-        ).to(device)
-        class TextToSpeech:
-            def __init__(self):
-                self.audio_tokenizer = audio_tokenizer
-                self.model = model
-                self.device = device
-            def generate_speech(self, text, language="english", speaker_name="tayo", speed=1.0):
-                prompt = self.audio_tokenizer.create_prompt(
-                    text,
-                    lang=language,
-                    speaker_name=speaker_name
-                )
-                input_ids = self.audio_tokenizer.tokenize_prompt(prompt)
-                with torch.no_grad():
-                    output = self.model.generate(
-                        input_ids=input_ids,
-                        temperature=0.1,
-                        repetition_penalty=1.1,
-                        max_length=4000,
-                        do_sample=True,
-                        top_k=50,
-                        top_p=0.95
-                    )
-                codes = self.audio_tokenizer.get_codes(output)
-                audio = self.audio_tokenizer.get_audio(codes)
-                if speed != 1.0:
-                    import librosa
-                    audio = librosa.effects.time_stretch(audio.numpy().squeeze(), rate=speed)
-                    audio = torch.from_numpy(audio).unsqueeze(0)
-                return audio
-        tts_engine = TextToSpeech()
-        logger.info("TTS engine initialized successfully!")
-        return True
     except Exception as e:
-        logger.error(f"Error initializing TTS engine: {str(e)}")
-        logger.error(traceback.format_exc())
-        return False
-def cleanup_old_files(max_age_hours=6):
-    """Remove audio files older than the specified hours."""
-    try:
-        now = datetime.now()
-        for filename in os.listdir("audio_files"):
-            if filename.endswith(".wav"):
-                file_path = os.path.join("audio_files", filename)
-                if now - datetime.fromtimestamp(os.path.getmtime(file_path)) > timedelta(hours=max_age_hours):
-                    os.remove(file_path)
-                    logger.info(f"Deleted old audio file: {filename}")
-    except Exception as e:
-        logger.error(f"Error cleaning up files: {str(e)}")
-@app.on_event("startup")
-async def startup_event():
-    """Initialize the application on startup."""
     try:
-        # Download model files
-        download_model_files()
-        # Initialize TTS engine
-        success = load_tts_engine()
-        if not success:
-            logger.error("Failed to initialize TTS engine")
-            raise RuntimeError("TTS engine initialization failed")
     except Exception as e:
-        logger.error(f"Startup failed: {str(e)}")
-        logger.error(traceback.format_exc())
-        raise
 @app.get("/")
 async def root():
-    """API health check and info endpoint."""
     return {
-        "status": "ok" if tts_engine is not None else "model_loading_failed",
-        "message": "Nigerian TTS API is running",
         "available_languages": AVAILABLE_LANGUAGES,
-        "available_voices": AVAILABLE_VOICES,
-        "accent_mapping": ACCENT_TO_VOICE
     }
 @app.post("/tts", response_model=TTSResponse)
 async def text_to_speech(request: TTSRequest, background_tasks: BackgroundTasks):
-    """Generate speech from text."""
-    if tts_engine is None:
-        raise HTTPException(
-            status_code=503,
-            detail="TTS engine is not initialized"
-        )
-    # Validate and process parameters
-    voice = request.voice or ACCENT_TO_VOICE.get(request.accent.lower(), "tayo")
-    language = request.language.lower()
-    if language not in AVAILABLE_LANGUAGES:
         raise HTTPException(
             status_code=400,
             detail=f"Language must be one of {AVAILABLE_LANGUAGES}"
         )
     all_voices = AVAILABLE_VOICES["female"] + AVAILABLE_VOICES["male"]
-    if voice not in all_voices:
         raise HTTPException(
             status_code=400,
             detail=f"Voice must be one of {all_voices}"
@@ -284,63 +151,49 @@ async def text_to_speech(request: TTSRequest, background_tasks: BackgroundTasks)
     try:
         # Generate unique filename
         audio_id = str(uuid.uuid4())
-        output_path = f"audio_files/{audio_id}.wav"
         # Generate audio
-        audio = tts_engine.generate_speech(
-            text=request.text,
-            language=language,
-            speaker_name=voice,
-            speed=request.speed
         )
         # Save audio file
         torchaudio.save(output_path, audio, sample_rate=24000)
-        # Generate base64 representation
-        import base64
         with open(output_path, "rb") as audio_file:
-            audio_base64 = base64.b64encode(audio_file.read()).decode('utf-8')
         # Schedule cleanup
         background_tasks.add_task(cleanup_old_files)
-        return {
-            "audio_url": f"/audio/{audio_id}.wav",
-            "audio_base64": audio_base64,
-            "text": request.text,
-            "voice": voice,
-            "language": language
-        }
     except Exception as e:
         logger.error(f"Error generating audio: {str(e)}")
-        logger.error(traceback.format_exc())
-        raise HTTPException(
-            status_code=500,
-            detail=f"Error generating audio: {str(e)}"
-        )
-@app.get("/audio/{filename}")
-async def get_audio(filename: str):
-    """Serve generated audio files."""
-    file_path = f"audio_files/{filename}"
-    if not os.path.exists(file_path):
-        raise HTTPException(
-            status_code=404,
-            detail="Audio file not found"
-        )
-    return FileResponse(file_path, media_type="audio/wav")
-@app.exception_handler(Exception)
-async def global_exception_handler(request: Request, exc: Exception):
-    """Global exception handler."""
-    logger.error(f"Unhandled exception: {str(exc)}")
-    logger.error(traceback.format_exc())
-    return JSONResponse(
-        status_code=500,
-        content={"detail": f"An unexpected error occurred: {str(exc)}"}
-    )
 if __name__ == "__main__":
-    uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=True)

+from fastapi import FastAPI, HTTPException, BackgroundTasks
+from fastapi.responses import FileResponse, JSONResponse
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
 import os
 import sys
 import time
 import torch
 import torchaudio
+import base64
+from transformers import AutoModelForCausalLM
 from huggingface_hub import hf_hub_download
+import logging
+from datetime import datetime, timedelta
+import uuid
+from typing import Optional
 # Configure logging
 logging.basicConfig(
     level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 )
 logger = logging.getLogger(__name__)
 # Initialize FastAPI app
 app = FastAPI(
+    title="Nigerian TTS API",
+    description="API for Nigerian Text-to-Speech using YarnGPT",
     version="1.0.0"
 )
     allow_headers=["*"],
 )
+# Constants
+MODEL_ID = "saheedniyi/YarnGPT2"
+AUDIO_DIR = "audio_files"
+os.makedirs(AUDIO_DIR, exist_ok=True)
+# Available voices and languages
 AVAILABLE_VOICES = {
     "female": ["zainab", "idera", "regina", "chinenye", "joke", "remi"],
     "male": ["jude", "tayo", "umar", "osagie", "onye", "emma"]
 }
 AVAILABLE_LANGUAGES = ["english", "yoruba", "igbo", "hausa"]
+# Model initialization
+def initialize_model():
     try:
+        logger.info("Loading YarnGPT model and tokenizer...")
+        # Download necessary files from HuggingFace Hub
+        wav_tokenizer_config = hf_hub_download(
+            repo_id=MODEL_ID,
+            filename="wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
+        )
+        wav_tokenizer_model = hf_hub_download(
+            repo_id=MODEL_ID,
+            filename="wavtokenizer_large_speech_320_24k.ckpt"
+        )
+        # Import AudioTokenizer here to ensure files are downloaded first
+        from yarngpt.audiotokenizer import AudioTokenizerV2
         audio_tokenizer = AudioTokenizerV2(
+            MODEL_ID,
             wav_tokenizer_model,
             wav_tokenizer_config
         )
         model = AutoModelForCausalLM.from_pretrained(
+            MODEL_ID,
+            torch_dtype="auto"
+        ).to(audio_tokenizer.device)
+        logger.info("Model loaded successfully!")
+        return audio_tokenizer, model
     except Exception as e:
+        logger.error(f"Error initializing model: {str(e)}")
+        raise
+# Initialize model at startup
+audio_tokenizer, model = initialize_model()
+# Pydantic models
+class TTSRequest(BaseModel):
+    text: str
+    language: str = "english"
+    voice: str = "idera"
+class TTSResponse(BaseModel):
+    audio_base64: str
+    text: str
+    voice: str
+    language: str
+# Cleanup function
+def cleanup_old_files(max_age_hours: int = 6):
+    """Delete audio files older than specified hours"""
     try:
+        now = datetime.now()
+        for filename in os.listdir(AUDIO_DIR):
+            if not filename.endswith(".wav"):
+                continue
+            file_path = os.path.join(AUDIO_DIR, filename)
+            file_mod_time = datetime.fromtimestamp(os.path.getmtime(file_path))
+            if now - file_mod_time > timedelta(hours=max_age_hours):
+                os.remove(file_path)
+                logger.info(f"Deleted old audio file: {filename}")
     except Exception as e:
+        logger.error(f"Error cleaning up files: {str(e)}")
+# API endpoints
 @app.get("/")
 async def root():
+    """Health check endpoint"""
     return {
+        "status": "healthy",
+        "model": MODEL_ID,
         "available_languages": AVAILABLE_LANGUAGES,
+        "available_voices": AVAILABLE_VOICES
     }
 @app.post("/tts", response_model=TTSResponse)
 async def text_to_speech(request: TTSRequest, background_tasks: BackgroundTasks):
+    """Generate Nigerian-accented speech from text"""
+    # Validate inputs
+    if request.language not in AVAILABLE_LANGUAGES:
         raise HTTPException(
             status_code=400,
             detail=f"Language must be one of {AVAILABLE_LANGUAGES}"
         )
     all_voices = AVAILABLE_VOICES["female"] + AVAILABLE_VOICES["male"]
+    if request.voice not in all_voices:
         raise HTTPException(
             status_code=400,
             detail=f"Voice must be one of {all_voices}"
     try:
         # Generate unique filename
         audio_id = str(uuid.uuid4())
+        output_path = os.path.join(AUDIO_DIR, f"{audio_id}.wav")
         # Generate audio
+        prompt = audio_tokenizer.create_prompt(
+            request.text,
+            lang=request.language,
+            speaker_name=request.voice
         )
+        input_ids = audio_tokenizer.tokenize_prompt(prompt)
+        with torch.no_grad():
+            output = model.generate(
+                input_ids=input_ids,
+                temperature=0.1,
+                repetition_penalty=1.1,
+                max_length=4000,
+            )
+        codes = audio_tokenizer.get_codes(output)
+        audio = audio_tokenizer.get_audio(codes)
         # Save audio file
         torchaudio.save(output_path, audio, sample_rate=24000)
+        # Read and encode as base64
         with open(output_path, "rb") as audio_file:
+            audio_bytes = audio_file.read()
+            audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
         # Schedule cleanup
         background_tasks.add_task(cleanup_old_files)
+        return TTSResponse(
+            audio_base64=audio_base64,
+            text=request.text,
+            voice=request.voice,
+            language=request.language
+        )
     except Exception as e:
         logger.error(f"Error generating audio: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
 if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=7860)