File size: 5,259 Bytes
27234d5
 
 
f114682
9144e13
27234d5
 
 
 
 
f114682
 
 
 
27234d5
 
 
 
9144e13
27234d5
 
 
 
 
 
 
 
9144e13
27234d5
f114682
9144e13
 
 
 
f114682
27234d5
f114682
 
9144e13
 
 
f114682
27234d5
f114682
9144e13
 
 
 
f114682
27234d5
 
 
 
9144e13
 
27234d5
9144e13
 
27234d5
 
9144e13
 
 
 
 
 
 
 
 
27234d5
 
9144e13
f114682
 
9144e13
 
27234d5
9144e13
 
 
27234d5
9144e13
 
 
 
 
27234d5
 
9144e13
27234d5
 
f114682
27234d5
 
 
 
 
 
9144e13
27234d5
 
 
 
 
 
 
9144e13
f114682
27234d5
 
 
9144e13
 
27234d5
f114682
27234d5
 
 
 
9144e13
f114682
27234d5
 
 
 
9144e13
 
27234d5
9144e13
27234d5
 
 
 
 
f114682
9144e13
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from transformers import MarianMTModel, MarianTokenizer, AutoTokenizer, VitsModel
from parler_tts import ParlerTTSForConditionalGeneration
import torch
import scipy.io.wavfile
import base64
import io
import logging
import os
from huggingface_hub import login

# Configure logging first so the authentication step below can report its outcome.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Authenticate with the Hugging Face Hub only when a token is actually provided.
# login(token=None) falls back to an interactive prompt, which cannot be
# answered in a headless server process.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    login(token=_hf_token)
else:
    logger.warning("HF_TOKEN is not set; skipping Hugging Face login")

app = FastAPI(title="Bilingual TTS API", version="3.0.0")

# CORS is fully open: any origin, HTTP method and header may call the API.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# ─── Load Models ──────────────────────────────────────────────────────────────
# All models are loaded once at import time and switched to eval mode, so
# every request reuses the same in-memory instances.

# English TTS — MMS (fast, no gating)
logger.info("Loading English TTS...")
eng_tts = VitsModel.from_pretrained("facebook/mms-tts-eng")
eng_tok = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
eng_tts.eval()
logger.info("English TTS loaded ✅")

# Translation — Helsinki (accurate, dedicated EN→UR)
logger.info("Loading translation model...")
trans_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-ur")
trans_tok = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ur")
trans_model.eval()
logger.info("Translation model loaded ✅")

# Urdu TTS — Indic Parler-TTS checkpoint (ai4bharat/indic-parler-tts)
logger.info("Loading Urdu TTS...")
urdu_tts = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts")
urdu_tok = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")
urdu_tts.eval()
logger.info("Urdu TTS loaded ✅")


# ─── Helpers ──────────────────────────────────────────────────────────────────

def translate_to_urdu(text: str) -> str:
    """Translate ``text`` with the loaded Marian EN→UR model.

    The text is tokenized as a single input, generation runs with gradient
    tracking disabled, and the first generated sequence is decoded with
    special tokens stripped.
    """
    encoded = trans_tok(text, return_tensors="pt", padding=True)
    with torch.no_grad():
        output_ids = trans_model.generate(**encoded)
    first_sequence = output_ids[0]
    return trans_tok.decode(first_sequence, skip_special_tokens=True)


def english_to_audio_b64(text: str) -> str:
    """Synthesize ``text`` with the English VITS model and return base64 WAV.

    The waveform produced by the model is written as a WAV file into an
    in-memory buffer at the model's configured sampling rate; the buffer
    contents are returned as a UTF-8 base64 string.
    """
    encoded = eng_tok(text, return_tensors="pt")
    with torch.no_grad():
        output = eng_tts(**encoded)
    samples = output.waveform.squeeze().numpy()

    wav_buffer = io.BytesIO()
    scipy.io.wavfile.write(wav_buffer, rate=eng_tts.config.sampling_rate, data=samples)
    wav_bytes = wav_buffer.getvalue()
    return base64.b64encode(wav_bytes).decode("utf-8")


def _urdu_description_tokenizer():
    """Return the tokenizer of the Urdu model's description encoder, loading it once."""
    cached = getattr(_urdu_description_tokenizer, "_cached", None)
    if cached is None:
        # The voice description is consumed by the model's text encoder, whose
        # vocabulary is not the prompt tokenizer's; the checkpoint's usage
        # example loads this tokenizer from the text-encoder config.
        cached = AutoTokenizer.from_pretrained(urdu_tts.config.text_encoder._name_or_path)
        _urdu_description_tokenizer._cached = cached
    return cached


def urdu_to_audio_b64(urdu_text: str) -> str:
    """Synthesize ``urdu_text`` with the Urdu Parler-TTS model and return base64 WAV.

    A fixed English voice description conditions the speaking style, while
    ``urdu_text`` is the prompt that is actually spoken. The generated
    waveform is written as a WAV file into an in-memory buffer at the model's
    configured sampling rate and returned as a UTF-8 base64 string.
    """
    # Voice description — controls how the speech sounds
    description = "A male speaker delivers clear, natural speech in a calm and neutral tone with no background noise."
    # Description and prompt go through different tokenizers (see helper above).
    desc_inputs = _urdu_description_tokenizer()(description, return_tensors="pt")
    text_inputs = urdu_tok(urdu_text, return_tensors="pt")
    with torch.no_grad():
        generation = urdu_tts.generate(
            input_ids=desc_inputs.input_ids,
            attention_mask=desc_inputs.attention_mask,
            prompt_input_ids=text_inputs.input_ids,
            prompt_attention_mask=text_inputs.attention_mask,
        )
    waveform_np = generation.cpu().numpy().squeeze()
    buf = io.BytesIO()
    scipy.io.wavfile.write(buf, rate=urdu_tts.config.sampling_rate, data=waveform_np)
    return base64.b64encode(buf.getvalue()).decode("utf-8")


# ─── Request ──────────────────────────────────────────────────────────────────

class TTSRequest(BaseModel):
    """Request body accepted by the TTS endpoints in this module."""

    # Source text to speak or translate; the endpoints reject it with HTTP 400
    # when it is empty or whitespace-only.
    text: str  # Always English input


# ─── Endpoints ────────────────────────────────────────────────────────────────

@app.get("/")
def root():
    """Return a short banner confirming that the API is running."""
    # The emoji was stored as mis-decoded bytes; restored to the intended rocket.
    return {"status": "Bilingual TTS API v3 running 🚀"}

@app.get("/health")
def health():
    """Report a constant status payload."""
    payload = {"status": "ok"}
    return payload


@app.post("/tts/english")
def tts_english(request: TTSRequest):
    """English text → speaks in English.

    Returns a JSON object with the base64-encoded audio under ``audio``, the
    fixed ``language`` tag ``"english"`` and the input ``text`` echoed back.

    Raises:
        HTTPException: 400 when the text is empty or whitespace-only; 500 when
            synthesis fails, with the underlying error message as the detail.
    """
    if not request.text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")
    try:
        audio = english_to_audio_b64(request.text)
    except Exception as e:
        # logger.exception records the traceback alongside the message.
        logger.exception("English TTS error: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {"audio": audio, "language": "english", "text": request.text}


@app.post("/tts/english-to-urdu")
def tts_english_to_urdu(request: TTSRequest):
    """English text → translate to Urdu → speaks in Urdu.

    Returns a JSON object with the base64-encoded audio under ``audio``, the
    fixed ``language`` tag ``"urdu"``, the input as ``original_text`` and the
    translation as ``urdu_text``.

    Raises:
        HTTPException: 400 when the text is empty or whitespace-only; 500 when
            translation or synthesis fails, with the underlying error message
            as the detail.
    """
    if not request.text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")
    try:
        urdu_text = translate_to_urdu(request.text)
        # Only the first 60 characters are logged to keep log lines short.
        logger.info("Translated: %s", urdu_text[:60])
        audio = urdu_to_audio_b64(urdu_text)
    except Exception as e:
        # logger.exception records the traceback alongside the message.
        logger.exception("English→Urdu TTS error: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {
        "audio": audio,
        "language": "urdu",
        "original_text": request.text,
        "urdu_text": urdu_text,
    }