Spaces:
Sleeping
Sleeping
| from fastapi import FastAPI, HTTPException | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from pydantic import BaseModel | |
| from transformers import MarianMTModel, MarianTokenizer, AutoTokenizer, VitsModel | |
| from parler_tts import ParlerTTSForConditionalGeneration | |
| import torch | |
| import scipy.io.wavfile | |
| import base64 | |
| import io | |
| import logging | |
| import os | |
| from huggingface_hub import login | |
# Configure logging first so that everything below (including the token
# warning) is actually emitted.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Authenticate with the Hugging Face Hub only when a token is configured.
# Calling login(token=None) falls back to an interactive prompt, which cannot
# be answered in a headless server process.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    login(token=_hf_token)
else:
    logger.warning("HF_TOKEN is not set; skipping Hugging Face Hub login")

app = FastAPI(title="Bilingual TTS API", version="3.0.0")

# CORS: every origin, method and header is allowed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
# ─── Load Models ─────────────────────────────────────────────────────────────
# Each checkpoint id is named once so the model and its tokenizer cannot drift
# apart; every model is switched to eval mode right after loading.
_ENG_TTS_REPO = "facebook/mms-tts-eng"
_TRANSLATION_REPO = "Helsinki-NLP/opus-mt-en-ur"
_URDU_TTS_REPO = "ai4bharat/indic-parler-tts"

# English TTS — MMS VITS checkpoint
logger.info("Loading English TTS...")
eng_tts = VitsModel.from_pretrained(_ENG_TTS_REPO).eval()
eng_tok = AutoTokenizer.from_pretrained(_ENG_TTS_REPO)
logger.info("English TTS loaded β ")

# Translation — Helsinki-NLP Marian model dedicated to EN→UR
logger.info("Loading translation model...")
trans_model = MarianMTModel.from_pretrained(_TRANSLATION_REPO).eval()
trans_tok = MarianTokenizer.from_pretrained(_TRANSLATION_REPO)
logger.info("Translation model loaded β ")

# Urdu TTS — Indic Parler-TTS checkpoint
logger.info("Loading Urdu TTS...")
urdu_tts = ParlerTTSForConditionalGeneration.from_pretrained(_URDU_TTS_REPO).eval()
urdu_tok = AutoTokenizer.from_pretrained(_URDU_TTS_REPO)
logger.info("Urdu TTS loaded β ")
# ─── Helpers ─────────────────────────────────────────────────────────────────
def translate_to_urdu(text: str) -> str:
    """Translate English ``text`` to Urdu with the Marian translation model.

    Args:
        text: English source text.

    Returns:
        The decoded translation with special tokens removed.

    Note:
        Input is truncated to the tokenizer's configured maximum length so
        that over-long text yields a (partial) translation instead of failing
        inside the model.
    """
    inputs = trans_tok(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        translated = trans_model.generate(**inputs)
    # A single string was tokenized, so the batch holds exactly one sequence.
    return trans_tok.decode(translated[0], skip_special_tokens=True)
def english_to_audio_b64(text: str) -> str:
    """Synthesize ``text`` with the English TTS model and return base64 WAV data.

    Args:
        text: Text to speak.

    Returns:
        The WAV file contents, base64-encoded and decoded to a UTF-8 string.
        The sample rate is taken from ``eng_tts.config.sampling_rate``.
    """
    encoded = eng_tok(text, return_tensors="pt")
    with torch.no_grad():
        samples = eng_tts(**encoded).waveform.squeeze().numpy()

    # Write the WAV into memory and encode the whole buffer in one go.
    wav_file = io.BytesIO()
    scipy.io.wavfile.write(wav_file, rate=eng_tts.config.sampling_rate, data=samples)
    return base64.b64encode(wav_file.getvalue()).decode("utf-8")
# The voice description is consumed by the model's text encoder, so it must be
# tokenized with that encoder's own tokenizer (as in the Parler-TTS usage
# instructions), not with the prompt tokenizer ``urdu_tok``.
_urdu_desc_tok = AutoTokenizer.from_pretrained(urdu_tts.config.text_encoder._name_or_path)


def urdu_to_audio_b64(urdu_text: str) -> str:
    """Synthesize ``urdu_text`` with the Urdu TTS model and return base64 WAV data.

    Args:
        urdu_text: Text to speak (the prompt passed to the model).

    Returns:
        The WAV file contents, base64-encoded and decoded to a UTF-8 string.
        The sample rate is taken from ``urdu_tts.config.sampling_rate``.
    """
    # Voice description — controls how the speech sounds
    description = "A male speaker delivers clear, natural speech in a calm and neutral tone with no background noise."
    desc_inputs = _urdu_desc_tok(description, return_tensors="pt")
    text_inputs = urdu_tok(urdu_text, return_tensors="pt")
    with torch.no_grad():
        # Pass the attention masks explicitly instead of leaving generate()
        # to infer them from the input ids.
        generation = urdu_tts.generate(
            input_ids=desc_inputs.input_ids,
            attention_mask=desc_inputs.attention_mask,
            prompt_input_ids=text_inputs.input_ids,
            prompt_attention_mask=text_inputs.attention_mask,
        )
    waveform_np = generation.cpu().numpy().squeeze()
    buf = io.BytesIO()
    scipy.io.wavfile.write(buf, rate=urdu_tts.config.sampling_rate, data=waveform_np)
    return base64.b64encode(buf.getvalue()).decode("utf-8")
# ─── Request ─────────────────────────────────────────────────────────────────
# Request body taken by both TTS handlers below.
class TTSRequest(BaseModel):
    # Input text; always English.
    text: str
# ─── Endpoints ───────────────────────────────────────────────────────────────
# Register the handler on the app; without a route decorator it is never
# reachable over HTTP.
@app.get("/")
def root():
    """Return a status payload indicating that the API is running."""
    return {"status": "Bilingual TTS API v3 running π"}
# Register the handler on the app; without a route decorator it is never
# reachable over HTTP.
@app.get("/health")
def health():
    """Return a minimal health-check payload."""
    return {"status": "ok"}
# Register the handler on the app; without a route decorator it is never
# reachable over HTTP.
# NOTE(review): the path is derived from the function name — confirm it
# matches what existing clients call.
@app.post("/tts/english")
def tts_english(request: TTSRequest):
    """English text → speaks in English.

    Returns:
        A dict with the base64 WAV under ``"audio"``, ``"language"`` set to
        ``"english"`` and the input under ``"text"``.

    Raises:
        HTTPException: 400 when the text is empty or whitespace-only; 500
            when synthesis fails (the detail carries the error text).
    """
    if not request.text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")
    try:
        audio = english_to_audio_b64(request.text)
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception("English TTS error: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {"audio": audio, "language": "english", "text": request.text}
# Register the handler on the app; without a route decorator it is never
# reachable over HTTP.
# NOTE(review): the path is derived from the function name — confirm it
# matches what existing clients call.
@app.post("/tts/english-to-urdu")
def tts_english_to_urdu(request: TTSRequest):
    """English text → translate to Urdu → speaks in Urdu.

    Returns:
        A dict with the base64 WAV under ``"audio"``, ``"language"`` set to
        ``"urdu"``, the input under ``"original_text"`` and the translation
        under ``"urdu_text"``.

    Raises:
        HTTPException: 400 when the text is empty or whitespace-only; 500
            when translation or synthesis fails (the detail carries the
            error text).
    """
    if not request.text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")
    try:
        urdu_text = translate_to_urdu(request.text)
        # Only the first 60 characters of the translation are logged.
        logger.info("Translated: %s", urdu_text[:60])
        audio = urdu_to_audio_b64(urdu_text)
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception("EnglishβUrdu TTS error: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {
        "audio": audio,
        "language": "urdu",
        "original_text": request.text,
        "urdu_text": urdu_text,
    }