"""Bilingual TTS API.

Exposes two POST endpoints that accept English text and return base64-encoded
WAV audio: one speaks the text in English, the other translates it to Urdu
first and speaks the translation. All models are loaded once at import time.
"""

import base64
import io
import logging
import os

from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from transformers import MarianMTModel, MarianTokenizer, AutoTokenizer, VitsModel
from parler_tts import ParlerTTSForConditionalGeneration
import torch
import scipy.io.wavfile
from huggingface_hub import login

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Only authenticate when a token is actually configured: login(token=None)
# falls back to an interactive prompt, which cannot be answered by a server
# process.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    login(token=_hf_token)
else:
    logger.warning("HF_TOKEN is not set; skipping Hugging Face login")

app = FastAPI(title="Bilingual TTS API", version="3.0.0")

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# ─── Load Models ──────────────────────────────────────────────────────────────

# English TTS — MMS (fast, no gating)
logger.info("Loading English TTS...")
eng_tts = VitsModel.from_pretrained("facebook/mms-tts-eng")
eng_tok = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
eng_tts.eval()
logger.info("English TTS loaded ✅")

# Translation — Helsinki (accurate, dedicated EN→UR)
logger.info("Loading translation model...")
trans_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-ur")
trans_tok = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ur")
trans_model.eval()
logger.info("Translation model loaded ✅")

# Urdu TTS — Indic Parler-TTS (ai4bharat/indic-parler-tts)
logger.info("Loading Urdu TTS...")
urdu_tts = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts")
# Tokenizer for the text to be spoken (the "prompt").
urdu_tok = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")
# The voice description is consumed by the model's text encoder, which has its
# own vocabulary; the model card loads its tokenizer from the encoder's path.
urdu_desc_tok = AutoTokenizer.from_pretrained(urdu_tts.config.text_encoder._name_or_path)
urdu_tts.eval()
logger.info("Urdu TTS loaded ✅")

# Voice description — controls how the Urdu speech sounds
_URDU_VOICE_DESCRIPTION = (
    "A male speaker delivers clear, natural speech in a calm and neutral tone "
    "with no background noise."
)


# ─── Helpers ──────────────────────────────────────────────────────────────────

def _waveform_to_wav_b64(waveform: torch.Tensor, sampling_rate: int) -> str:
    """Encode a waveform tensor as a base64 string of WAV file bytes.

    Args:
        waveform: Audio samples; singleton dimensions are squeezed away.
        sampling_rate: Sample rate written into the WAV header.

    Returns:
        The WAV file contents, base64-encoded and decoded as UTF-8 text.
    """
    # Cast to float32: scipy's WAV writer does not accept half precision.
    samples = waveform.detach().cpu().float().numpy().squeeze()
    buf = io.BytesIO()
    scipy.io.wavfile.write(buf, rate=sampling_rate, data=samples)
    return base64.b64encode(buf.getvalue()).decode("utf-8")


def translate_to_urdu(text: str) -> str:
    """Translate English ``text`` to Urdu with the Marian model.

    Only the first generated sequence is decoded and returned.
    """
    inputs = trans_tok(text, return_tensors="pt", padding=True)
    with torch.no_grad():
        translated = trans_model.generate(**inputs)
    return trans_tok.decode(translated[0], skip_special_tokens=True)


def english_to_audio_b64(text: str) -> str:
    """Synthesize English speech for ``text`` and return it as base64 WAV."""
    inputs = eng_tok(text, return_tensors="pt")
    with torch.no_grad():
        waveform = eng_tts(**inputs).waveform
    return _waveform_to_wav_b64(waveform, eng_tts.config.sampling_rate)


def urdu_to_audio_b64(urdu_text: str) -> str:
    """Synthesize Urdu speech for ``urdu_text`` and return it as base64 WAV.

    The fixed voice description is tokenized with the text-encoder tokenizer
    and the text to speak with the model's prompt tokenizer; both attention
    masks are forwarded to ``generate``.
    """
    desc_inputs = urdu_desc_tok(_URDU_VOICE_DESCRIPTION, return_tensors="pt")
    text_inputs = urdu_tok(urdu_text, return_tensors="pt")
    with torch.no_grad():
        generation = urdu_tts.generate(
            input_ids=desc_inputs.input_ids,
            attention_mask=desc_inputs.attention_mask,
            prompt_input_ids=text_inputs.input_ids,
            prompt_attention_mask=text_inputs.attention_mask,
        )
    return _waveform_to_wav_b64(generation, urdu_tts.config.sampling_rate)


# ─── Request ──────────────────────────────────────────────────────────────────

class TTSRequest(BaseModel):
    """Request body shared by both TTS endpoints."""

    text: str  # Always English input


# ─── Endpoints ────────────────────────────────────────────────────────────────

@app.get("/")
def root():
    """Return a banner confirming the service is running."""
    return {"status": "Bilingual TTS API v3 running 🚀"}


@app.get("/health")
def health():
    """Liveness probe."""
    return {"status": "ok"}


@app.post("/tts/english")
def tts_english(request: TTSRequest):
    """English text → speaks in English"""
    if not request.text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")
    try:
        audio = english_to_audio_b64(request.text)
    except Exception as e:
        # logger.exception keeps the traceback that logger.error would drop.
        logger.exception("English TTS error: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {"audio": audio, "language": "english", "text": request.text}


@app.post("/tts/english-to-urdu")
def tts_english_to_urdu(request: TTSRequest):
    """English text → translate to Urdu → speaks in Urdu"""
    if not request.text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")
    try:
        urdu_text = translate_to_urdu(request.text)
        logger.info("Translated: %s", urdu_text[:60])
        audio = urdu_to_audio_b64(urdu_text)
    except Exception as e:
        logger.exception("English→Urdu TTS error: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {
        "audio": audio,
        "language": "urdu",
        "original_text": request.text,
        "urdu_text": urdu_text,
    }