Spaces:

Hameed13
/

Huggingface_News_Podcast

Build error

App Files Files Community

Hameed13 committed on May 19, 2025

Commit

ea51340

verified ·

1 Parent(s): 91499fa

Update main.py

Browse files

Files changed (1) hide show

main.py +268 -130

main.py CHANGED Viewed

@@ -1,41 +1,47 @@
-from fastapi import FastAPI, HTTPException, BackgroundTasks, Request
-from fastapi.responses import FileResponse, JSONResponse
-from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel
 import os
 import sys
 import time
 import uuid
-import base64
-import datetime
 import logging
 import traceback
 from typing import Optional
 import torch
-import soundfile as sf
 # Configure logging
-logging.basicConfig(level=logging.INFO,
-                   format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
-# Set environment variable to handle PortAudio issues
-os.environ["OUTETTS_NO_PORTAUDIO"] = "1"
-# Import the TextToSpeech class from generate.py
-try:
-    sys.path.append(os.path.dirname(os.path.abspath(__file__)))
-    from generate import TextToSpeech
-    logger.info("Successfully imported TextToSpeech class from generate.py")
-except ImportError as e:
-    logger.error(f"Failed to import TextToSpeech class: {e}")
-    traceback.print_exc()
-    sys.exit(1)
-# Create the FastAPI app
-app = FastAPI(title="Nigerian Text-to-Speech API")
-# Add CORS middleware to allow cross-origin requests
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -44,165 +50,297 @@ app.add_middleware(
     allow_headers=["*"],
 )
-# YarnGPT TTS configuration
-MODEL_CONFIG = {
-    "model_name_or_path": "yarngpt/yarn-tts-demo",
-    "processor_name_or_path": "yarngpt/yarn-tts-demo"
-}
-# Available voices and languages
 AVAILABLE_VOICES = {
     "female": ["zainab", "idera", "regina", "chinenye", "joke", "remi"],
     "male": ["jude", "tayo", "umar", "osagie", "onye", "emma"]
 }
 AVAILABLE_LANGUAGES = ["english", "yoruba", "igbo", "hausa"]
-# Initialize YarnGPT
-yarngpt = None
-class TTSRequest(BaseModel):
-    text: str
-    language: str = "english"
-    voice: str = "idera"
-    speed: float = 1.0
-# Define the directory for storing generated audio files
-AUDIO_DIR = "audio_files"
-os.makedirs(AUDIO_DIR, exist_ok=True)
-def initialize_yarngpt():
-    """Initialize the YarnGPT TTS model with proper error handling."""
-    global yarngpt
     try:
-        logger.info("Initializing YarnGPT TTS model...")
-        # Create TextToSpeech instance with option to disable playback
-        yarngpt = TextToSpeech(
-            model_name_or_path=MODEL_CONFIG["model_name_or_path"],
-            processor_name_or_path=MODEL_CONFIG["processor_name_or_path"],
-            disable_playback=True  # Disable playback to avoid PortAudio issues
         )
-        logger.info("YarnGPT TTS model initialized successfully")
         return True
     except Exception as e:
-        logger.error(f"Failed to initialize YarnGPT: {str(e)}")
-        traceback.print_exc()
         return False
 def cleanup_old_files(max_age_hours=6):
     """Remove audio files older than the specified hours."""
     try:
-        now = time.time()
-        count = 0
-        for filename in os.listdir(AUDIO_DIR):
-            file_path = os.path.join(AUDIO_DIR, filename)
-            if os.path.isfile(file_path):
-                if now - os.path.getmtime(file_path) > max_age_hours * 3600:
                     os.remove(file_path)
-                    count += 1
-        logger.info(f"Cleaned up {count} old audio files")
     except Exception as e:
-        logger.error(f"Error during file cleanup: {str(e)}")
 @app.on_event("startup")
 async def startup_event():
-    """Initialize required components on startup."""
-    success = initialize_yarngpt()
-    if not success:
-        logger.warning("YarnGPT failed to initialize. The API may not function correctly.")
 @app.get("/")
-async def health_check():
-    """API health check endpoint."""
-    current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    yarngpt_status = "initialized" if yarngpt is not None else "not initialized"
     return {
-        "status": "online",
-        "timestamp": current_time,
-        "yarngpt_status": yarngpt_status,
         "available_languages": AVAILABLE_LANGUAGES,
-        "available_voices": AVAILABLE_VOICES
     }
-@app.post("/tts")
 async def text_to_speech(request: TTSRequest, background_tasks: BackgroundTasks):
-    """Convert text to speech using YarnGPT model."""
-    if yarngpt is None:
-        logger.error("YarnGPT model not initialized")
-        success = initialize_yarngpt()
-        if not success:
-            raise HTTPException(status_code=500, detail="YarnGPT model initialization failed. Please check logs.")
     try:
-        # Generate a unique filename
         audio_id = str(uuid.uuid4())
-        output_path = os.path.join(AUDIO_DIR, f"{audio_id}.wav")
-        logger.info(f"Processing TTS request: '{request.text[:50]}...' with voice '{request.voice}' and language '{request.language}'")
-        # Create prompt from voice and language
-        # This adapts to the colab-style API even though we're using a different backend
-        accent = request.language if request.language in ["nigerian"] else "nigerian"
-        # Generate speech
-        try:
-            audio_data, sample_rate = yarngpt.tts(
-                text=request.text,
-                accent=accent,
-                save_path=output_path,
-                speed=request.speed,
-                get_array=True
-            )
-            # Convert audio to base64
-            sf.write(output_path, audio_data, sample_rate)
-            with open(output_path, "rb") as audio_file:
-                audio_bytes = audio_file.read()
-                audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
-        except Exception as e:
-            logger.error(f"Error in speech generation: {str(e)}")
-            traceback.print_exc()
-            raise HTTPException(status_code=500, detail=f"Failed to generate speech: {str(e)}")
-        # Schedule cleanup for old files
         background_tasks.add_task(cleanup_old_files)
-        # Check if file exists
-        if not os.path.exists(output_path):
-            logger.error(f"Output file was not created: {output_path}")
-            raise HTTPException(status_code=500, detail="Failed to create audio file")
-        logger.info(f"Successfully generated audio file: {audio_id}.wav")
-        # Return both file URL and base64 data for compatibility with both APIs
         return {
-            "audio_base64": audio_base64,
             "audio_url": f"/audio/{audio_id}.wav",
             "text": request.text,
-            "voice": request.voice,
-            "language": request.language
         }
     except Exception as e:
-        logger.error(f"Error in TTS processing: {str(e)}")
-        traceback.print_exc()
-        raise HTTPException(status_code=500, detail=f"Failed to generate speech: {str(e)}")
-# File serving endpoint (for backward compatibility)
 @app.get("/audio/{filename}")
 async def get_audio(filename: str):
-    file_path = os.path.join(AUDIO_DIR, filename)
     if not os.path.exists(file_path):
-        raise HTTPException(status_code=404, detail="Audio file not found")
     return FileResponse(file_path, media_type="audio/wav")
 if __name__ == "__main__":
-    import uvicorn
-    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True)

 import os
 import sys
 import time
 import uuid
 import logging
 import traceback
+import requests
+from pathlib import Path
+from datetime import datetime, timedelta
 from typing import Optional
 import torch
+import torchaudio
+from fastapi import FastAPI, HTTPException, BackgroundTasks, Request
+from fastapi.responses import FileResponse, JSONResponse
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+import uvicorn
+from huggingface_hub import hf_hub_download
 # Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.StreamHandler(),
+        logging.FileHandler("app.log")
+    ]
+)
 logger = logging.getLogger(__name__)
+# Create necessary directories
+REQUIRED_DIRS = ["audio_files", "models", "saheedniyi_YarnGPT2"]
+for directory in REQUIRED_DIRS:
+    os.makedirs(directory, exist_ok=True)
+# Initialize FastAPI app
+app = FastAPI(
+    title="Nigerian Text-to-Speech API",
+    description="Convert text to Nigerian-accented speech using YarnGPT",
+    version="1.0.0"
+)
+# Add CORS middleware
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
     allow_headers=["*"],
 )
+# Input validation models
+class TTSRequest(BaseModel):
+    text: str
+    accent: str = "nigerian"
+    voice: str = None
+    language: str = "english"
+    speed: float = 1.0
+    class Config:
+        schema_extra = {
+            "example": {
+                "text": "Welcome to Nigeria, the giant of Africa.",
+                "accent": "nigerian",
+                "voice": "tayo",
+                "language": "english",
+                "speed": 1.0
+            }
+        }
+class TTSResponse(BaseModel):
+    audio_url: str
+    audio_base64: str = None
+    text: str
+    voice: str
+    language: str
+# Define available voices and languages
 AVAILABLE_VOICES = {
     "female": ["zainab", "idera", "regina", "chinenye", "joke", "remi"],
     "male": ["jude", "tayo", "umar", "osagie", "onye", "emma"]
 }
+ACCENT_TO_VOICE = {
+    "nigerian": "tayo",
+    "yoruba": "idera",
+    "igbo": "emma",
+    "hausa": "umar"
+}
 AVAILABLE_LANGUAGES = ["english", "yoruba", "igbo", "hausa"]
+# Initialize global variables for model components
+model = None
+audio_tokenizer = None
+tts_engine = None
+def download_model_files():
+    """Download required model files from Hugging Face Hub."""
+    files_to_download = [
+        {
+            "repo_id": "novateur/WavTokenizer-small-speech-320token",
+            "filename": "wavtokenizer_large_speech_320_24k.ckpt",
+            "output_path": "models/wavtokenizer_large_speech_320_24k.ckpt"
+        },
+        {
+            "repo_id": "saheedniyi/YarnGPT2",
+            "filename": "config.json",
+            "output_path": "saheedniyi_YarnGPT2/config.json"
+        },
+        {
+            "repo_id": "saheedniyi/YarnGPT2",
+            "filename": "tokenizer_config.json",
+            "output_path": "saheedniyi_YarnGPT2/tokenizer_config.json"
+        },
+        {
+            "repo_id": "saheedniyi/YarnGPT2",
+            "filename": "pytorch_model.bin",
+            "output_path": "saheedniyi_YarnGPT2/pytorch_model.bin"
+        }
+    ]
+    for file_info in files_to_download:
+        try:
+            if not os.path.exists(file_info["output_path"]):
+                logger.info(f"Downloading {file_info['filename']} from {file_info['repo_id']}")
+                hf_hub_download(
+                    repo_id=file_info["repo_id"],
+                    filename=file_info["filename"],
+                    local_dir=".",
+                    local_dir_use_symlinks=False
+                )
+                logger.info(f"Successfully downloaded {file_info['filename']}")
+            else:
+                logger.info(f"File already exists: {file_info['output_path']}")
+        except Exception as e:
+            logger.error(f"Error downloading {file_info['filename']}: {str(e)}")
+            raise
+def load_tts_engine():
+    """Initialize the TTS engine with proper error handling."""
+    global model, audio_tokenizer, tts_engine
     try:
+        # Import required modules
+        from transformers import AutoModelForCausalLM
+        from yarngpt.audiotokenizer import AudioTokenizerV2
+        # Set device
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        logger.info(f"Using device: {device}")
+        # Load tokenizer and model
+        tokenizer_path = "saheedniyi_YarnGPT2"
+        wav_tokenizer_config = "wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
+        wav_tokenizer_model = "models/wavtokenizer_large_speech_320_24k.ckpt"
+        logger.info("Loading audio tokenizer...")
+        audio_tokenizer = AudioTokenizerV2(
+            tokenizer_path,
+            wav_tokenizer_model,
+            wav_tokenizer_config
         )
+        logger.info("Loading model...")
+        model = AutoModelForCausalLM.from_pretrained(
+            tokenizer_path,
+            torch_dtype=torch.float16 if device == "cuda" else torch.float32
+        ).to(device)
+        class TextToSpeech:
+            def __init__(self):
+                self.audio_tokenizer = audio_tokenizer
+                self.model = model
+                self.device = device
+            def generate_speech(self, text, language="english", speaker_name="tayo", speed=1.0):
+                prompt = self.audio_tokenizer.create_prompt(
+                    text,
+                    lang=language,
+                    speaker_name=speaker_name
+                )
+                input_ids = self.audio_tokenizer.tokenize_prompt(prompt)
+                with torch.no_grad():
+                    output = self.model.generate(
+                        input_ids=input_ids,
+                        temperature=0.1,
+                        repetition_penalty=1.1,
+                        max_length=4000,
+                        do_sample=True,
+                        top_k=50,
+                        top_p=0.95
+                    )
+                codes = self.audio_tokenizer.get_codes(output)
+                audio = self.audio_tokenizer.get_audio(codes)
+                if speed != 1.0:
+                    import librosa
+                    audio = librosa.effects.time_stretch(audio.numpy().squeeze(), rate=speed)
+                    audio = torch.from_numpy(audio).unsqueeze(0)
+                return audio
+        tts_engine = TextToSpeech()
+        logger.info("TTS engine initialized successfully!")
         return True
     except Exception as e:
+        logger.error(f"Error initializing TTS engine: {str(e)}")
+        logger.error(traceback.format_exc())
         return False
 def cleanup_old_files(max_age_hours=6):
     """Remove audio files older than the specified hours."""
     try:
+        now = datetime.now()
+        for filename in os.listdir("audio_files"):
+            if filename.endswith(".wav"):
+                file_path = os.path.join("audio_files", filename)
+                if now - datetime.fromtimestamp(os.path.getmtime(file_path)) > timedelta(hours=max_age_hours):
                     os.remove(file_path)
+                    logger.info(f"Deleted old audio file: {filename}")
     except Exception as e:
+        logger.error(f"Error cleaning up files: {str(e)}")
 @app.on_event("startup")
 async def startup_event():
+    """Initialize the application on startup."""
+    try:
+        # Download model files
+        download_model_files()
+        # Initialize TTS engine
+        success = load_tts_engine()
+        if not success:
+            logger.error("Failed to initialize TTS engine")
+            raise RuntimeError("TTS engine initialization failed")
+    except Exception as e:
+        logger.error(f"Startup failed: {str(e)}")
+        logger.error(traceback.format_exc())
+        raise
 @app.get("/")
+async def root():
+    """API health check and info endpoint."""
     return {
+        "status": "ok" if tts_engine is not None else "model_loading_failed",
+        "message": "Nigerian TTS API is running",
         "available_languages": AVAILABLE_LANGUAGES,
+        "available_voices": AVAILABLE_VOICES,
+        "accent_mapping": ACCENT_TO_VOICE
     }
+@app.post("/tts", response_model=TTSResponse)
 async def text_to_speech(request: TTSRequest, background_tasks: BackgroundTasks):
+    """Generate speech from text."""
+    if tts_engine is None:
+        raise HTTPException(
+            status_code=503,
+            detail="TTS engine is not initialized"
+        )
+    # Validate and process parameters
+    voice = request.voice or ACCENT_TO_VOICE.get(request.accent.lower(), "tayo")
+    language = request.language.lower()
+    if language not in AVAILABLE_LANGUAGES:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Language must be one of {AVAILABLE_LANGUAGES}"
+        )
+    all_voices = AVAILABLE_VOICES["female"] + AVAILABLE_VOICES["male"]
+    if voice not in all_voices:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Voice must be one of {all_voices}"
+        )
     try:
+        # Generate unique filename
         audio_id = str(uuid.uuid4())
+        output_path = f"audio_files/{audio_id}.wav"
+        # Generate audio
+        audio = tts_engine.generate_speech(
+            text=request.text,
+            language=language,
+            speaker_name=voice,
+            speed=request.speed
+        )
+        # Save audio file
+        torchaudio.save(output_path, audio, sample_rate=24000)
+        # Generate base64 representation
+        import base64
+        with open(output_path, "rb") as audio_file:
+            audio_base64 = base64.b64encode(audio_file.read()).decode('utf-8')
+        # Schedule cleanup
         background_tasks.add_task(cleanup_old_files)
         return {
             "audio_url": f"/audio/{audio_id}.wav",
+            "audio_base64": audio_base64,
             "text": request.text,
+            "voice": voice,
+            "language": language
         }
     except Exception as e:
+        logger.error(f"Error generating audio: {str(e)}")
+        logger.error(traceback.format_exc())
+        raise HTTPException(
+            status_code=500,
+            detail=f"Error generating audio: {str(e)}"
+        )
 @app.get("/audio/{filename}")
 async def get_audio(filename: str):
+    """Serve generated audio files."""
+    file_path = f"audio_files/{filename}"
     if not os.path.exists(file_path):
+        raise HTTPException(
+            status_code=404,
+            detail="Audio file not found"
+        )
     return FileResponse(file_path, media_type="audio/wav")
+@app.exception_handler(Exception)
+async def global_exception_handler(request: Request, exc: Exception):
+    """Global exception handler."""
+    logger.error(f"Unhandled exception: {str(exc)}")
+    logger.error(traceback.format_exc())
+    return JSONResponse(
+        status_code=500,
+        content={"detail": f"An unexpected error occurred: {str(exc)}"}
+    )
 if __name__ == "__main__":
+    uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=True)