urdu-tts-api / app.py
abd8433's picture
Update app.py
f114682 verified
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from transformers import MarianMTModel, MarianTokenizer, AutoTokenizer, VitsModel
from parler_tts import ParlerTTSForConditionalGeneration
import torch
import scipy.io.wavfile
import base64
import io
import logging
import os
from huggingface_hub import login
# Authenticate with the Hugging Face Hub only when a token is configured.
# With token=None, login() falls back to an interactive prompt, which cannot
# be answered inside a headless server process.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    login(token=_hf_token)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

if not _hf_token:
    logger.warning("HF_TOKEN is not set; skipping Hugging Face Hub login")

app = FastAPI(title="Bilingual TTS API", version="3.0.0")

# CORS is left fully open: any origin, method and header is accepted.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
# ─── Load Models ──────────────────────────────────────────────────────────────
# English TTS — MMS (fast, no gating)
_ENG_TTS_ID = "facebook/mms-tts-eng"
logger.info("Loading English TTS...")
eng_tts = VitsModel.from_pretrained(_ENG_TTS_ID)
eng_tok = AutoTokenizer.from_pretrained(_ENG_TTS_ID)
eng_tts.eval()  # inference only: put the module in evaluation mode
logger.info("English TTS loaded βœ…")

# Translation — Helsinki (accurate, dedicated EN→UR)
_TRANSLATION_ID = "Helsinki-NLP/opus-mt-en-ur"
logger.info("Loading translation model...")
trans_model = MarianMTModel.from_pretrained(_TRANSLATION_ID)
trans_tok = MarianTokenizer.from_pretrained(_TRANSLATION_ID)
trans_model.eval()
logger.info("Translation model loaded βœ…")

# Urdu TTS — Indic Parler-TTS checkpoint (ai4bharat/indic-parler-tts)
_URDU_TTS_ID = "ai4bharat/indic-parler-tts"
logger.info("Loading Urdu TTS...")
urdu_tts = ParlerTTSForConditionalGeneration.from_pretrained(_URDU_TTS_ID)
urdu_tok = AutoTokenizer.from_pretrained(_URDU_TTS_ID)
urdu_tts.eval()
logger.info("Urdu TTS loaded βœ…")
# ─── Helpers ──────────────────────────────────────────────────────────────────
def translate_to_urdu(text: str) -> str:
    """Translate English text to Urdu with the module-level MarianMT model.

    Args:
        text: English source text.

    Returns:
        The decoded translation of the first generated sequence, with special
        tokens removed.
    """
    encoded = trans_tok(text, return_tensors="pt", padding=True)
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        output_ids = trans_model.generate(**encoded)
    first_sequence = output_ids[0]
    return trans_tok.decode(first_sequence, skip_special_tokens=True)
def english_to_audio_b64(text: str) -> str:
    """Synthesize English speech and return it as a base64-encoded WAV.

    Args:
        text: English text to speak.

    Returns:
        Base64 (UTF-8 string) of a WAV file written at the English model's
        configured sampling rate.
    """
    tokens = eng_tok(text, return_tensors="pt")
    with torch.no_grad():
        output = eng_tts(**tokens)
    # Drop singleton dimensions before handing the samples to scipy.
    samples = output.waveform.squeeze().numpy()

    wav_buffer = io.BytesIO()
    scipy.io.wavfile.write(
        wav_buffer,
        rate=eng_tts.config.sampling_rate,
        data=samples,
    )
    wav_bytes = wav_buffer.getvalue()
    return base64.b64encode(wav_bytes).decode("utf-8")
# The voice description is consumed by the model's text encoder, so it is
# tokenized with that encoder's own tokenizer rather than the prompt tokenizer.
# This mirrors the usage documented for ai4bharat/indic-parler-tts.
_urdu_desc_tok = AutoTokenizer.from_pretrained(
    urdu_tts.config.text_encoder._name_or_path
)


def urdu_to_audio_b64(urdu_text: str) -> str:
    """Synthesize Urdu speech and return it as a base64-encoded WAV.

    Args:
        urdu_text: Urdu text to speak (the generation prompt).

    Returns:
        Base64 (UTF-8 string) of a WAV file written at the Urdu model's
        configured sampling rate.
    """
    # Voice description — controls how the speech sounds
    description = "A male speaker delivers clear, natural speech in a calm and neutral tone with no background noise."
    desc_inputs = _urdu_desc_tok(description, return_tensors="pt")
    text_inputs = urdu_tok(urdu_text, return_tensors="pt")

    with torch.no_grad():
        # Attention masks are passed explicitly for both inputs, as in the
        # model's documented usage.
        generation = urdu_tts.generate(
            input_ids=desc_inputs.input_ids,
            attention_mask=desc_inputs.attention_mask,
            prompt_input_ids=text_inputs.input_ids,
            prompt_attention_mask=text_inputs.attention_mask,
        )

    waveform_np = generation.cpu().numpy().squeeze()
    buf = io.BytesIO()
    scipy.io.wavfile.write(buf, rate=urdu_tts.config.sampling_rate, data=waveform_np)
    return base64.b64encode(buf.getvalue()).decode("utf-8")
# ─── Request ──────────────────────────────────────────────────────────────────
class TTSRequest(BaseModel):
    # Request body accepted by both POST endpoints.
    # The text is always English input, even on the Urdu route.
    text: str
# ─── Endpoints ────────────────────────────────────────────────────────────────
@app.get("/")
def root():
    # Landing route: returns a fixed banner showing the service is up.
    banner = "Bilingual TTS API v3 running πŸš€"
    return {"status": banner}
@app.get("/health")
def health():
    # Health probe: always answers with the same fixed payload.
    payload = {"status": "ok"}
    return payload
@app.post("/tts/english")
def tts_english(request: TTSRequest):
    """English text → speaks in English.

    Returns a JSON object with the base64-encoded WAV under "audio", the
    language tag "english", and the input text. Responds with 400 when the
    text is empty or whitespace-only, and with 500 when synthesis fails.
    """
    if not request.text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")

    try:
        audio = english_to_audio_b64(request.text)
    except Exception as e:
        # logger.exception keeps the traceback, which a plain error message
        # would drop; the original exception is chained for debugging.
        logger.exception("English TTS error: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e

    return {"audio": audio, "language": "english", "text": request.text}
@app.post("/tts/english-to-urdu")
def tts_english_to_urdu(request: TTSRequest):
    """English text → translate to Urdu → speaks in Urdu.

    Returns a JSON object with the base64-encoded WAV under "audio", the
    language tag "urdu", the original English text, and the Urdu translation.
    Responds with 400 when the text is empty or whitespace-only, and with 500
    when translation or synthesis fails.
    """
    if not request.text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")

    try:
        urdu_text = translate_to_urdu(request.text)
        # Only the first 60 characters are logged to keep log lines short.
        logger.info("Translated: %s", urdu_text[:60])
        audio = urdu_to_audio_b64(urdu_text)
    except Exception as e:
        # logger.exception keeps the traceback, which a plain error message
        # would drop; the original exception is chained for debugging.
        logger.exception("English→Urdu TTS error: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e

    return {
        "audio": audio,
        "language": "urdu",
        "original_text": request.text,
        "urdu_text": urdu_text,
    }