Spaces:

Hameed13
/

Huggingface_News_Podcast

Build error

App Files Files Community

Hameed13 committed on May 21, 2025

Commit

71f917b

verified ·

1 Parent(s): 109c3b2

Update main.py

Browse files

Files changed (1) hide show

main.py +135 -173

main.py CHANGED Viewed

@@ -1,105 +1,29 @@
-import os
-from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
-import sys
-import time
-import numpy as np
 import torch
 import logging
-from pathlib import Path
-from huggingface_hub import hf_hub_download
-from datetime import datetime
 # Setup logging
 logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
-# Current timestamp and user
-CURRENT_TIMESTAMP = "2025-05-21 01:58:55"
 CURRENT_USER = "Abdulhameed556"
-# Set cache directory to a writable location
-os.environ['HF_HOME'] = '/code/cache'
-os.environ['TRANSFORMERS_CACHE'] = '/code/cache/transformers'
-# Define all required directories
-CACHE_DIR = '/code/cache'
-MODEL_DIR = '/code/models'
-AUDIO_DIR = '/code/audio_files'
-# Create directories if they don't exist
-for directory in [CACHE_DIR, MODEL_DIR, AUDIO_DIR]:
-    os.makedirs(directory, exist_ok=True)
-# Model configuration
-MODEL_REPO = "Hameed13/News_Podcast_Model"
-MODEL_FILENAME = "model.ckpt"
-# Model config
-MODEL_CONFIG = {
-    "model_type": "speech_to_text_2",
-    "architectures": ["Speech2Text2ForConditionalGeneration"],
-    "activation_dropout": 0.1,
-    "activation_function": "relu",
-    "attention_dropout": 0.1,
-    "d_model": 512,
-    "decoder_attention_heads": 8,
-    "decoder_ffn_dim": 2048,
-    "decoder_layers": 6,
-    "dropout": 0.1,
-    "encoder_attention_heads": 8,
-    "encoder_ffn_dim": 2048,
-    "encoder_layers": 6,
-    "init_std": 0.02,
-    "max_speech_positions": 4000,
-    "max_text_positions": 1024,
-    "num_conv_layers": 2,
-    "num_hidden_layers": 12,
-    "speech_vocab_size": 4096,
-    "vocab_size": 50265,
-    "use_cache": True,
-    "tie_word_embeddings": True,
-    "is_encoder_decoder": True,
-    "pad_token_id": 1,
-    "bos_token_id": 0,
-    "eos_token_id": 2,
-    "_name_or_path": MODEL_REPO,
-    "model_creation_date": CURRENT_TIMESTAMP,
-    "model_creator": CURRENT_USER
-}
-# Download model from Hub
-try:
-    logger.info("Downloading model from Hugging Face Hub")
-    MODEL_CHECKPOINT = hf_hub_download(
-        repo_id=MODEL_REPO,
-        filename=MODEL_FILENAME,
-        token=os.getenv('HF_TOKEN'),
-        cache_dir=CACHE_DIR
-    )
-    logger.info(f"Model downloaded successfully to: {MODEL_CHECKPOINT}")
-except Exception as e:
-    logger.error(f"Failed to download model: {e}")
-    raise
-# Import TTS modules
-try:
-    from yarngpt.generate import TextToSpeech
-    logger.info("Successfully imported yarngpt modules")
-except ImportError as e:
-    logger.error(f"Failed to import YarnGPT modules: {e}")
-    raise
-# Create FastAPI app
-app = FastAPI(
-    title="Nigerian Text-to-Speech API",
-    description="A text-to-speech API for Nigerian English",
-    version="1.0.0"
-)
-# Configure CORS
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -108,107 +32,145 @@ app.add_middleware(
     allow_headers=["*"],
 )
 class TTSRequest(BaseModel):
     text: str
-    temperature: float = 0.2
-    top_p: float = 0.7
-    top_k: int = 50
-    speed: float = 1.0
-    seed: int = 42
 class TTSResponse(BaseModel):
-    audio: str
-    generation_time: float
-    timestamp: str
-    user: str
 @app.get("/")
-def read_root():
-    """Root endpoint returning API status"""
     return {
-        "message": "Nigerian Text-to-Speech API is running",
-        "status": "active",
         "timestamp": CURRENT_TIMESTAMP,
-        "user": CURRENT_USER,
-        "model": MODEL_REPO,
-        "version": "1.0.0"
     }
 @app.post("/tts", response_model=TTSResponse)
-async def text_to_speech(request: TTSRequest):
-    """Generate speech from text"""
     try:
-        logger.info(f"Processing TTS request: {request.text[:50]}...")
-        # Set random seed if provided
-        if request.seed is not None:
-            torch.manual_seed(request.seed)
-            np.random.seed(request.seed)
-        # Initialize TTS with model
-        tts = TextToSpeech(
-            MODEL_REPO,
-            processor_name_or_path=MODEL_REPO
-        )
-        # Generate audio
-        start_time = time.time()
-        audio = tts.tts(
-            request.text,
-            speed=request.speed
         )
-        # Convert audio to base64
-        import base64
-        import io
-        audio_io = io.BytesIO()
-        import scipy.io.wavfile as wav
-        wav.write(audio_io, 24000, audio)
-        audio_io.seek(0)
-        audio_base64 = base64.b64encode(audio_io.read()).decode('utf-8')
-        generation_time = time.time() - start_time
-        logger.info(f"Generated audio in {generation_time:.2f} seconds")
         return TTSResponse(
-            audio=audio_base64,
-            generation_time=generation_time,
-            timestamp=CURRENT_TIMESTAMP,
-            user=CURRENT_USER
         )
     except Exception as e:
-        logger.error(f"Error generating speech: {str(e)}", exc_info=True)
-        raise HTTPException(
-            status_code=500,
-            detail=f"Error generating speech: {str(e)}"
-        )
-@app.get("/health")
-def health_check():
-    """Health check endpoint"""
-    return {
-        "status": "ok",
-        "models_loaded": True,
-        "timestamp": CURRENT_TIMESTAMP,
-        "user": CURRENT_USER,
-        "model": {
-            "repo": MODEL_REPO,
-            "checkpoint": MODEL_CHECKPOINT,
-            "cache_dir": CACHE_DIR
-        },
-        "system": {
-            "cuda_available": torch.cuda.is_available(),
-            "device": "cuda" if torch.cuda.is_available() else "cpu",
-            "python_version": sys.version
-        }
-    }
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(
-        app,
-        host="0.0.0.0",
-        port=7860,
-        log_level="info"
-    )

+from fastapi import FastAPI, HTTPException, BackgroundTasks
+from fastapi.responses import FileResponse
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
+import os
+import uuid
 import torch
+import torchaudio
+import base64
+from transformers import AutoModelForCausalLM
+from yarngpt.audiotokenizer import AudioTokenizerV2
+from datetime import datetime, timedelta
 import logging
 # Setup logging
 logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
+# Constants
+CURRENT_TIMESTAMP = "2025-05-21 02:39:34"
 CURRENT_USER = "Abdulhameed556"
+app = FastAPI(title="Nigerian TTS API")
+# Add CORS middleware
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
     allow_headers=["*"],
 )
+# Model configuration - Using your Hugging Face model
+model_path = "Hameed13/News_Podcast_Model"
+tokenizer_path = "saheedniyi/YarnGPT2"
+wav_tokenizer_config_path = "/code/models/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
+wav_tokenizer_model_path = "/code/models/wavtokenizer_large_speech_320_24k.ckpt"
+# Initialize model
+logger.info("Loading YarnGPT model and tokenizer...")
+try:
+    audio_tokenizer = AudioTokenizerV2(
+        tokenizer_path, wav_tokenizer_model_path, wav_tokenizer_config_path
+    )
+    model = AutoModelForCausalLM.from_pretrained(
+        model_path,
+        torch_dtype="auto",
+        token=os.getenv('HF_TOKEN')  # In case the model requires authentication
+    ).to(audio_tokenizer.device)
+    logger.info("Model loaded successfully!")
+except Exception as e:
+    logger.error(f"Error loading model: {e}")
+    raise
+# Available voices and languages
+AVAILABLE_VOICES = {  # speaker names grouped by gender; /tts accepts a name from either list
+    "female": ["zainab", "idera", "regina", "chinenye", "joke", "remi"],
+    "male": ["jude", "tayo", "umar", "osagie", "onye", "emma"]
+}
+AVAILABLE_LANGUAGES = ["english", "yoruba", "igbo", "hausa"]  # values accepted for TTSRequest.language
 class TTSRequest(BaseModel):  # request body accepted by POST /tts
     text: str  # text to synthesize (required)
+    language: str = "english"  # checked against AVAILABLE_LANGUAGES by the /tts handler
+    voice: str = "idera"  # checked against the names in AVAILABLE_VOICES by the /tts handler
 class TTSResponse(BaseModel):  # response body returned by POST /tts
+    audio_base64: str  # base64-encoded bytes of the generated WAV file
+    audio_url: str  # path of the form /audio/<uuid>.wav, served by the /audio/{filename} route
+    text: str  # echo of the request text
+    voice: str  # echo of the requested voice
+    language: str  # echo of the requested language
 @app.get("/")
+async def root():
+    """API health check and info"""
     return {
+        "status": "ok",
+        "message": "Nigerian TTS API is running",
+        "available_languages": AVAILABLE_LANGUAGES,
+        "available_voices": AVAILABLE_VOICES,
+        "model_path": model_path,
         "timestamp": CURRENT_TIMESTAMP,
+        "user": CURRENT_USER
     }
 @app.post("/tts", response_model=TTSResponse)
+async def text_to_speech(request: TTSRequest, background_tasks: BackgroundTasks):
+    """Convert text to Nigerian-accented speech"""
+    # Validate inputs
+    if request.language not in AVAILABLE_LANGUAGES:
+        raise HTTPException(status_code=400, detail=f"Language must be one of {AVAILABLE_LANGUAGES}")
+    all_voices = AVAILABLE_VOICES["female"] + AVAILABLE_VOICES["male"]
+    if request.voice not in all_voices:
+        raise HTTPException(status_code=400, detail=f"Voice must be one of {all_voices}")
+    # Generate unique filename
+    audio_id = str(uuid.uuid4())
+    output_path = f"audio_files/{audio_id}.wav"
+    os.makedirs("audio_files", exist_ok=True)
     try:
+        # Create prompt and generate audio
+        prompt = audio_tokenizer.create_prompt(request.text, lang=request.language, speaker_name=request.voice)
+        input_ids = audio_tokenizer.tokenize_prompt(prompt)
+        output = model.generate(
+            input_ids=input_ids,
+            temperature=0.1,
+            repetition_penalty=1.1,
+            max_length=4000,
         )
+        codes = audio_tokenizer.get_codes(output)
+        audio = audio_tokenizer.get_audio(codes)
+        # Save audio file
+        torchaudio.save(output_path, audio, sample_rate=24000)
+        # Read the file and encode as base64
+        with open(output_path, "rb") as audio_file:
+            audio_bytes = audio_file.read()
+            audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
+        # Clean up old files after a while
+        background_tasks.add_task(cleanup_old_files)
         return TTSResponse(
+            audio_base64=audio_base64,
+            audio_url=f"/audio/{audio_id}.wav",
+            text=request.text,
+            voice=request.voice,
+            language=request.language
         )
     except Exception as e:
+        logger.error(f"Error generating audio: {e}")
+        raise HTTPException(status_code=500, detail=f"Error generating audio: {str(e)}")
+@app.get("/audio/{filename}")
+async def get_audio(filename: str):
+    file_path = f"audio_files/{filename}"
+    if not os.path.exists(file_path):
+        raise HTTPException(status_code=404, detail="Audio file not found")
+    return FileResponse(file_path, media_type="audio/wav")
+def cleanup_old_files():
+    """Delete audio files older than 6 hours to manage disk space"""
+    try:
+        now = datetime.now()
+        audio_dir = "audio_files"
+        if not os.path.exists(audio_dir):
+            return
+        for filename in os.listdir(audio_dir):
+            if not filename.endswith(".wav"):
+                continue
+            file_path = os.path.join(audio_dir, filename)
+            file_mod_time = datetime.fromtimestamp(os.path.getmtime(file_path))
+            # Delete files older than 6 hours
+            if now - file_mod_time > timedelta(hours=6):
+                os.remove(file_path)
+                logger.info(f"Deleted old audio file: {filename}")
+    except Exception as e:
+        logger.error(f"Error cleaning up old files: {e}")
 if __name__ == "__main__":
     import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=7860)