Spaces:
Sleeping
Sleeping
File size: 5,259 Bytes
27234d5 f114682 9144e13 27234d5 f114682 27234d5 9144e13 27234d5 9144e13 27234d5 f114682 9144e13 f114682 27234d5 f114682 9144e13 f114682 27234d5 f114682 9144e13 f114682 27234d5 9144e13 27234d5 9144e13 27234d5 9144e13 27234d5 9144e13 f114682 9144e13 27234d5 9144e13 27234d5 9144e13 27234d5 9144e13 27234d5 f114682 27234d5 9144e13 27234d5 9144e13 f114682 27234d5 9144e13 27234d5 f114682 27234d5 9144e13 f114682 27234d5 9144e13 27234d5 9144e13 27234d5 f114682 9144e13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 | from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from transformers import MarianMTModel, MarianTokenizer, AutoTokenizer, VitsModel
from parler_tts import ParlerTTSForConditionalGeneration
import torch
import scipy.io.wavfile
import base64
import io
import logging
import os
from huggingface_hub import login
# Authenticate with the Hugging Face Hub only when a token is configured.
# login(token=None) falls back to an interactive prompt, which cannot be
# answered from a headless server process.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    login(token=_hf_token)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
if not _hf_token:
    logger.warning("HF_TOKEN is not set; skipping Hugging Face Hub login")

app = FastAPI(title="Bilingual TTS API", version="3.0.0")

# Permissive CORS: every origin, method and header is accepted.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
# ─── Load Models ──────────────────────────────────────────────────────────────
# All three models are loaded once at import time and switched to eval mode
# because they are only ever used for inference.

# English TTS — MMS (fast, no gating)
logger.info("Loading English TTS...")
eng_tts = VitsModel.from_pretrained("facebook/mms-tts-eng")
eng_tok = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
eng_tts.eval()
logger.info("English TTS loaded ✅")

# Translation — Helsinki (accurate, dedicated EN→UR)
logger.info("Loading translation model...")
trans_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-ur")
trans_tok = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ur")
trans_model.eval()
logger.info("Translation model loaded ✅")

# Urdu TTS — ai4bharat/indic-parler-tts (smooth, natural voice)
logger.info("Loading Urdu TTS...")
urdu_tts = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts")
urdu_tok = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")
urdu_tts.eval()
logger.info("Urdu TTS loaded ✅")
# ─── Helpers ──────────────────────────────────────────────────────────────────
def translate_to_urdu(text: str) -> str:
    """Translate ``text`` with the module-level translation model.

    The input is tokenized with ``trans_tok``, run through
    ``trans_model.generate`` with gradients disabled, and the first generated
    sequence is decoded with special tokens removed.
    """
    encoded = trans_tok(text, return_tensors="pt", padding=True)
    with torch.no_grad():
        output_ids = trans_model.generate(**encoded)
    first_sequence = output_ids[0]
    return trans_tok.decode(first_sequence, skip_special_tokens=True)
def english_to_audio_b64(text: str) -> str:
    """Synthesize ``text`` with the English TTS model and return base64 WAV data.

    The waveform produced by ``eng_tts`` is written as a WAV file into an
    in-memory buffer at the model's configured sampling rate, and the buffer
    contents are returned as a base64 string.
    """
    tokens = eng_tok(text, return_tensors="pt")
    with torch.no_grad():
        synthesis = eng_tts(**tokens)
    samples = synthesis.waveform.squeeze().numpy()

    wav_buffer = io.BytesIO()
    scipy.io.wavfile.write(
        wav_buffer,
        rate=eng_tts.config.sampling_rate,
        data=samples,
    )
    wav_bytes = wav_buffer.getvalue()
    return base64.b64encode(wav_bytes).decode("utf-8")
def urdu_to_audio_b64(urdu_text: str) -> str:
    """Synthesize Urdu speech for ``urdu_text`` and return base64 WAV data.

    Args:
        urdu_text: Text to be spoken, passed to the model as the prompt.

    Returns:
        The generated audio, written as a WAV file at the model's configured
        sampling rate and encoded as a base64 string.
    """
    # Voice description — controls how the speech sounds
    description = "A male speaker delivers clear, natural speech in a calm and neutral tone with no background noise."

    # The description is consumed by the model's text encoder, so it must be
    # tokenized with that encoder's tokenizer (as in the Indic Parler-TTS model
    # card); the prompt keeps using the model's own tokenizer.
    desc_inputs = _get_urdu_description_tokenizer()(description, return_tensors="pt")
    text_inputs = urdu_tok(urdu_text, return_tensors="pt")

    with torch.no_grad():
        # Attention masks are passed explicitly so generate() does not have to
        # infer which positions are padding.
        generation = urdu_tts.generate(
            input_ids=desc_inputs.input_ids,
            attention_mask=desc_inputs.attention_mask,
            prompt_input_ids=text_inputs.input_ids,
            prompt_attention_mask=text_inputs.attention_mask,
        )
    waveform_np = generation.cpu().numpy().squeeze()

    buf = io.BytesIO()
    scipy.io.wavfile.write(buf, rate=urdu_tts.config.sampling_rate, data=waveform_np)
    return base64.b64encode(buf.getvalue()).decode("utf-8")


# Cache for the description tokenizer; filled on first use.
_urdu_description_tok = None


def _get_urdu_description_tokenizer():
    """Return the tokenizer of the Urdu TTS text encoder, loading it once."""
    global _urdu_description_tok
    if _urdu_description_tok is None:
        _urdu_description_tok = AutoTokenizer.from_pretrained(
            urdu_tts.config.text_encoder._name_or_path
        )
    return _urdu_description_tok
# ─── Request ──────────────────────────────────────────────────────────────────
# Request body accepted by the TTS endpoints: a single text field.
class TTSRequest(BaseModel):
    # Input text; always expected to be English.
    text: str
# ─── Endpoints ────────────────────────────────────────────────────────────────
@app.get("/")
def root():
    # Landing route: returns a fixed status banner.
    banner = "Bilingual TTS API v3 running π"
    return {"status": banner}
@app.get("/health")
def health():
    # Health probe: always answers with the same fixed payload.
    payload = {"status": "ok"}
    return payload
@app.post("/tts/english")
def tts_english(request: TTSRequest):
    """English text → speaks in English"""
    # Reject empty or whitespace-only input before touching the model.
    if not request.text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")
    try:
        audio = english_to_audio_b64(request.text)
    except Exception as e:
        # logger.exception records the traceback in addition to the message;
        # chaining with "from e" keeps the original cause on the HTTP error.
        logger.exception("English TTS error: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
    # "audio" holds the base64-encoded WAV produced by the helper.
    return {"audio": audio, "language": "english", "text": request.text}
@app.post("/tts/english-to-urdu")
def tts_english_to_urdu(request: TTSRequest):
    """English text → translate to Urdu → speaks in Urdu"""
    # Reject empty or whitespace-only input before touching the models.
    if not request.text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")
    try:
        urdu_text = translate_to_urdu(request.text)
        # Only the first 60 characters are logged to keep log lines short.
        logger.info("Translated: %s", urdu_text[:60])
        audio = urdu_to_audio_b64(urdu_text)
    except Exception as e:
        # logger.exception records the traceback in addition to the message;
        # chaining with "from e" keeps the original cause on the HTTP error.
        logger.exception("English→Urdu TTS error: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
    # "audio" holds the base64-encoded WAV of the spoken Urdu translation.
    return {
        "audio": audio,
        "language": "urdu",
        "original_text": request.text,
        "urdu_text": urdu_text,
    }