Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Text-to-Speech API using Edge-TTS with FastAPI | |
| Optimized for Hugging Face Spaces deployment | |
| """ | |
| import edge_tts | |
| import asyncio | |
| import os | |
| import tempfile | |
| import uuid | |
| import re | |
| from fastapi import FastAPI, HTTPException, Form, UploadFile | |
| from fastapi.responses import FileResponse, JSONResponse | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from pydantic import BaseModel, Field, validator | |
| import logging | |
| from typing import Optional | |
| import aiofiles | |
# Configure logging: INFO and above is emitted through the root handler.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# FastAPI app initialization
app = FastAPI(
    title="Text-to-Speech API",
    description="Convert text to speech using Microsoft Edge TTS with customizable voice, pitch, and rate",
    version="1.0.0",
    docs_url="/",  # Swagger UI at root for easy access
    redoc_url="/redoc",
)

# Add CORS middleware for public API access.
# Credentials are disabled on purpose: a wildcard origin must not be combined
# with allow_credentials=True (the CORS middleware would otherwise have to
# echo back any caller's Origin for credentialed requests). Nothing in this
# module reads cookies or auth headers, so no caller needs credentialed CORS.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins for public API
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Configuration
TEMP_DIR = tempfile.gettempdir()  # directory where generated MP3 files are written
MAX_TEXT_LENGTH = 5000  # upper bound on characters accepted per request
| # Pydantic models for request validation | |
class TTSRequest(BaseModel):
    """Request body for speech synthesis.

    ``pitch`` and ``rate`` are checked for format and range, and are returned
    with an explicit leading sign (``"10Hz"`` becomes ``"+10Hz"``).
    """

    text: str = Field(
        ...,
        min_length=1,
        max_length=MAX_TEXT_LENGTH,
        description="Text to convert to speech",
    )
    voice: str = Field(
        default="en-US-AriaNeural",
        description="Voice identifier (e.g., 'en-GB-SoniaNeural')",
    )
    pitch: str = Field(
        default="+0Hz",
        description="Pitch adjustment (e.g., '+10Hz', '-15Hz')",
    )
    rate: str = Field(
        default="+0%",
        description="Rate adjustment (e.g., '+20%', '-10%')",
    )

    # The validators must be registered with @validator; as plain methods they
    # would never be invoked and any pitch/rate string would be accepted.
    @validator("pitch")
    def validate_pitch(cls, v):
        """Validate ``pitch`` as a signed integer followed by ``Hz`` within [-50, 50].

        Raises:
            ValueError: if the format is wrong or the value is out of range.
        """
        # fullmatch (unlike match + '$') also rejects a trailing newline.
        if not re.fullmatch(r'[+-]?\d+Hz', v):
            raise ValueError("Pitch must be in format like '+10Hz' or '-15Hz'")
        pitch_value = int(v[:-2])  # drop the "Hz" suffix; int() accepts the sign
        if not -50 <= pitch_value <= 50:
            raise ValueError("Pitch value must be between -50 and 50")
        # edge-tts expects an explicit sign, so add '+' when the caller omitted it.
        return v if v[0] in "+-" else f"+{v}"

    @validator("rate")
    def validate_rate(cls, v):
        """Validate ``rate`` as a signed integer followed by ``%`` within [-50, 50].

        Raises:
            ValueError: if the format is wrong or the value is out of range.
        """
        if not re.fullmatch(r'[+-]?\d+%', v):
            raise ValueError("Rate must be in format like '+15%' or '-20%'")
        rate_value = int(v[:-1])  # drop the "%" suffix; int() accepts the sign
        if not -50 <= rate_value <= 50:
            raise ValueError("Rate value must be between -50 and 50")
        # edge-tts expects an explicit sign, so add '+' when the caller omitted it.
        return v if v[0] in "+-" else f"+{v}"
# Description of a single TTS voice as exposed by this API.
# (Documented with comments rather than a docstring so the generated
# schema for the model is left exactly as it was.)
class VoiceInfo(BaseModel):
    name: str          # full voice name
    short_name: str    # short identifier, the form used for the request's `voice` field
    gender: str
    locale: str
    language: str
    display_name: str
# Payload returned by the health check: a status string plus the
# service name and version.
class HealthResponse(BaseModel):
    status: str
    service: str
    version: str
# Payload returned by the voice listing: the voices and how many there are.
class VoicesResponse(BaseModel):
    voices: list[VoiceInfo]
    count: int  # number of entries in `voices`
| # Utility functions | |
async def generate_speech_async(text: str, voice: str, pitch: str, rate: str, output_file: str) -> bool:
    """Synthesize ``text`` with edge-tts and save the audio to ``output_file``.

    Returns:
        True when the audio was saved; False when any exception occurred
        (the error is logged, not raised).
    """
    try:
        # Voice, rate and pitch are passed straight to Communicate (no SSML needed).
        tts = edge_tts.Communicate(text=text, voice=voice, rate=rate, pitch=pitch)
        await tts.save(output_file)
    except Exception as exc:
        logger.error(f"Error generating speech: {str(exc)}")
        return False
    else:
        return True
def cleanup_file(file_path: str):
    """Delete a temporary file on a best-effort basis.

    A file that does not exist is silently ignored. Any other failure is
    logged as a warning and never raised, so cleanup cannot mask the error
    (or the response) the caller is actually dealing with.
    """
    try:
        # Remove directly instead of checking existence first: the file may
        # vanish between the check and the removal.
        os.remove(file_path)
    except FileNotFoundError:
        return
    except Exception as e:
        logger.warning(f"Failed to clean up temp file {file_path}: {str(e)}")
    else:
        logger.info(f"Cleaned up temporary file: {file_path}")
| # API Endpoints | |
# Registered explicitly: without a route decorator the handler is never
# reachable. The path matches the "/health" link this module advertises.
@app.get("/health", response_model=HealthResponse)
async def health_check():
    """Health check endpoint"""
    return HealthResponse(
        status="healthy",
        service="TTS API",
        version="1.0.0",
    )
# Registered explicitly: without a route decorator the handler is never
# reachable. The path matches the "/voices" link this module advertises.
@app.get("/voices", response_model=VoicesResponse)
async def get_voices():
    """Get list of available voices"""
    try:
        voices = await edge_tts.list_voices()
        voice_list = [
            VoiceInfo(
                name=voice["Name"],
                short_name=voice["ShortName"],
                gender=voice["Gender"],
                locale=voice["Locale"],
                # Fall back to the language part of the locale ("en" for
                # "en-US") when the entry carries no "Language" key.
                language=voice.get("Language") or voice["Locale"].split("-")[0],
                # Fall back to "FriendlyName" when "DisplayName" is absent.
                display_name=voice.get("DisplayName") or voice.get("FriendlyName", ""),
            )
            for voice in voices
        ]
        return VoicesResponse(voices=voice_list, count=len(voice_list))
    except Exception as e:
        # Any failure (network error, unexpected entry shape) becomes a 500;
        # the cause is logged and chained rather than exposed to the client.
        logger.error(f"Error fetching voices: {str(e)}")
        raise HTTPException(status_code=500, detail="Failed to fetch voices") from e
# Registered explicitly: without a route decorator the handler is never
# reachable. POST /synthesize matches the curl examples printed by this module.
@app.post("/synthesize")
async def synthesize_speech(request: TTSRequest):
    """
    Convert text to speech and return audio file

    - **text**: Text to convert to speech (required)
    - **voice**: Voice identifier (default: en-US-AriaNeural)
    - **pitch**: Pitch adjustment like '+10Hz' or '-15Hz' (default: +0Hz)
    - **rate**: Rate adjustment like '+20%' or '-10%' (default: +0%)
    """
    # Local import keeps this block self-contained; fastapi is already a
    # dependency of this module.
    from fastapi import BackgroundTasks

    # Unique filename so concurrent requests never share an output file.
    file_id = str(uuid.uuid4())
    output_file = os.path.join(TEMP_DIR, f"tts_{file_id}.mp3")
    try:
        success = await generate_speech_async(
            request.text, request.voice, request.pitch, request.rate, output_file
        )
        if not success:
            raise HTTPException(status_code=500, detail="Failed to generate speech")
        if not os.path.exists(output_file):
            raise HTTPException(status_code=500, detail="Audio file was not generated")

        # Delete the temp file only after the response has been sent.
        # Previously nothing removed it on the success path, so every request
        # left an MP3 behind in TEMP_DIR.
        after_response = BackgroundTasks()
        after_response.add_task(cleanup_file, output_file)
        return FileResponse(
            output_file,
            media_type="audio/mpeg",
            filename=f"speech_{file_id}.mp3",
            background=after_response,
        )
    except HTTPException:
        # Known failure: drop any partial output, keep the original status.
        cleanup_file(output_file)
        raise
    except Exception as e:
        cleanup_file(output_file)
        logger.error(f"Error in synthesize_speech: {str(e)}")
        raise HTTPException(status_code=500, detail="Internal server error") from e
# NOTE(review): no route registration for this handler is visible in this
# file, and the intended path cannot be determined from it - confirm where
# it should be mounted.
async def synthesize_speech_form(
    text: str = Form(..., description="Text to convert to speech"),
    voice: str = Form(default="en-US-AriaNeural", description="Voice identifier"),
    pitch: str = Form(default="+0Hz", description="Pitch adjustment (e.g., '+10Hz')"),
    rate: str = Form(default="+0%", description="Rate adjustment (e.g., '+20%')")
):
    """
    Convert text to speech using form data (alternative endpoint)

    Useful for HTML forms or when JSON is not preferred
    """
    try:
        # Build the same model the JSON endpoint uses so both share validation,
        # then delegate to it.
        payload = TTSRequest(text=text, voice=voice, pitch=pitch, rate=rate)
        audio_response = await synthesize_speech(payload)
    except ValueError as exc:
        # A ValueError raised here is reported to the client as 422.
        raise HTTPException(status_code=422, detail=str(exc))
    return audio_response
# NOTE(review): no route registration for this handler is visible in this
# file; "/" itself is taken by the Swagger UI (docs_url="/"), so confirm the
# path this index should be mounted on.
async def root():
    """Return a JSON index of the API's main endpoints (no redirect is performed)."""
    return JSONResponse({
        "message": "Welcome to Text-to-Speech API",
        # The app is created with docs_url="/", so the interactive docs live
        # at "/" and the previously advertised "/docs" does not exist.
        "documentation": "/",
        "health": "/health",
        "voices": "/voices",
        "synthesize": "/synthesize",
    })
| # Exception handlers | |
# Registered explicitly: without registration this handler is dead code.
# It is keyed on status 422, the code it returns; the handler reads
# `exc.detail`, which HTTPException provides.
@app.exception_handler(422)
async def validation_exception_handler(request, exc):
    """Render a 422 error as JSON, exposing the exception's detail under "errors"."""
    return JSONResponse(
        status_code=422,
        content={"detail": "Validation error", "errors": exc.detail},
    )
# Registered explicitly: without registration this handler is dead code.
# It is keyed on status 500, the code it returns.
@app.exception_handler(500)
async def internal_exception_handler(request, exc):
    """Render a 500 error as a generic JSON body without exposing internals."""
    return JSONResponse(
        status_code=500,
        content={"detail": "Internal server error"},
    )
| # Startup event | |
# Registered explicitly: without registration this hook never runs.
@app.on_event("startup")
async def startup_event():
    """Log startup and check that edge-tts can list voices.

    A failed check is only logged; it never prevents the app from starting.
    """
    logger.info("TTS API is starting up...")
    # Test edge-tts functionality
    try:
        voices = await edge_tts.list_voices()
        logger.info(f"Successfully loaded {len(voices)} voices")
    except Exception as e:
        logger.error(f"Failed to load voices: {e}")
# Registered explicitly: without registration this hook never runs.
@app.on_event("shutdown")
async def shutdown_event():
    """Log that the service is shutting down."""
    logger.info("TTS API is shutting down...")
if __name__ == "__main__":
    import uvicorn

    # Startup banner: where to find the docs plus ready-to-paste curl examples.
    banner_lines = (
        "Starting TTS API Server with FastAPI...",
        "API Documentation will be available at: http://localhost:7860/",
        "Health check: http://localhost:7860/health",
        "Available voices: http://localhost:7860/voices",
        "\nExample usage (saves audio file locally):",
        "curl -X POST 'http://localhost:7860/synthesize' \\",
        " -H 'Content-Type: application/json' \\",
        " -d '{\"text\":\"Hello from Hugging Face!\",\"voice\":\"en-GB-SoniaNeural\",\"pitch\":\"-10Hz\",\"rate\":\"+15%\"}' \\",
        " --output speech.mp3",
        "\nFor your deployed space:",
        "curl -X POST 'https://nitinbot001-tts-api.hf.space/synthesize' \\",
        " -H 'Content-Type: application/json' \\",
        " -d '{\"text\":\"hello my name is nitin\",\"voice\":\"en-US-AriaNeural\",\"pitch\":\"+0Hz\",\"rate\":\"+0%\"}' \\",
        " --output speech.mp3",
    )
    for banner_line in banner_lines:
        print(banner_line)

    # Listen on all interfaces on port 7860.
    uvicorn.run(app, host="0.0.0.0", port=7860)