import os
import torch
import torchaudio
import torchcodec
from fastapi import FastAPI, UploadFile, File, Form, HTTPException
from fastapi.responses import JSONResponse, HTMLResponse, StreamingResponse
from fastapi.middleware.cors import CORSMiddleware
from transformers import Wav2Vec2BertProcessor, AutoModelForCTC, VitsModel, AutoTokenizer
from pydub import AudioSegment
import tempfile
import io
import gradio as gr
from transformers import VitsModel, AutoTokenizer
import torch
import numpy as np
import soundfile as sf
import io
import os
import string
import unicodedata
from pypinyin import pinyin, Style
import re
from umsc import UgMultiScriptConverter
from huggingface_hub import login
from utils import preprocess_uyghur_text
app = FastAPI(title="Uyghur Text To Speech API")

# CORS: allow specific domains or all (*) for testing.
# NOTE(review): "*" origins together with allow_credentials=True is permissive;
# tighten allow_origins before production use.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
@app.get("/", response_class=HTMLResponse)
def greet_html():
    """Serve the root page (currently an empty HTML placeholder)."""
    return """
"""
# In-process caches keyed by model name, so repeated requests reuse
# already-loaded weights instead of reloading them on every call.
model_cache = {}
tokenizer_cache = {}


def load_model_and_tokenizer(model_name: str, hf_token: str):
    """
    Load a VITS model and its tokenizer, caching both to avoid reloading.

    Args:
        model_name (str): Hugging Face model repo id to load.
        hf_token (str): Hugging Face authentication token (needed for
            private/gated repos).

    Returns:
        tuple: (model, tokenizer) for the requested model.
    """
    if model_name not in model_cache:
        # Load both before caching either: if the tokenizer load raises,
        # the original code left a model cached without its tokenizer,
        # making every later call KeyError on tokenizer_cache.
        model = VitsModel.from_pretrained(model_name, token=hf_token)
        tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
        model_cache[model_name] = model
        tokenizer_cache[model_name] = tokenizer
    return model_cache[model_name], tokenizer_cache[model_name]
def generate_speech(text: str, model_name: str, hf_token: str):
    """
    Synthesize speech for Uyghur text and return it as an in-memory WAV.

    Args:
        text (str): Raw input text; normalized via preprocess_uyghur_text
            before tokenization.
        model_name (str): Hugging Face model repo id of the VITS model.
        hf_token (str): Hugging Face authentication token.

    Returns:
        io.BytesIO: WAV-encoded audio, seeked to position 0 so it can be
        streamed directly in a response.
    """
    model, tokenizer = load_model_and_tokenizer(model_name, hf_token)
    fixed_text = preprocess_uyghur_text(text)
    # (Removed leftover debug print() calls of the raw/fixed text.)
    inputs = tokenizer(fixed_text, return_tensors="pt")
    # Inference only: no gradient tracking needed.
    with torch.no_grad():
        output = model(**inputs).waveform
    audio_data = output.squeeze().cpu().numpy()
    sample_rate = model.config.sampling_rate
    # Serialize to WAV in memory so the endpoint can stream it without
    # touching the filesystem.
    byte_io = io.BytesIO()
    sf.write(byte_io, audio_data, sample_rate, format='WAV')
    byte_io.seek(0)
    return byte_io
@app.post("/synthesize")
def synthesize(
    text: str = Form(...),
    model: str = Form("piyazon/TTS-CV-Unique-Ug-2"),
    hf_token: str = Form(..., description="Hugging Face authentication token")):
    """Synthesize speech for *text* and stream it back as a WAV attachment."""
    try:
        wav_stream = generate_speech(text, model, hf_token)
        # Content-Disposition makes browsers download the audio as a file.
        disposition = {"Content-Disposition": "attachment; filename=speech.wav"}
        return StreamingResponse(wav_stream, media_type="audio/wav", headers=disposition)
    except Exception as e:
        # Surface any synthesis failure to the client as a 500 with the message.
        raise HTTPException(status_code=500, detail=str(e))
if __name__ == "__main__":
    # Run the ASGI app directly when executed as a script.
    # NOTE(review): port 7860 — presumably chosen to match the Hugging Face
    # Spaces default; confirm against the deployment target.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)