import os
import torch
import torchaudio
import torchcodec
from fastapi import FastAPI, UploadFile, File, Form, HTTPException
from fastapi.responses import JSONResponse, HTMLResponse, StreamingResponse
from fastapi.middleware.cors import CORSMiddleware
from transformers import Wav2Vec2BertProcessor, AutoModelForCTC, VitsModel, AutoTokenizer
from pydub import AudioSegment
import tempfile
import io
import gradio as gr
from transformers import VitsModel, AutoTokenizer
import torch
import numpy as np
import soundfile as sf
import io
import os
import string
import unicodedata
from pypinyin import pinyin, Style
import re
from umsc import UgMultiScriptConverter
from huggingface_hub import login
from utils import preprocess_uyghur_text
app = FastAPI(title="Uyghur Text To Speech API")

# CORS: allow specific domains or all (*) for testing.
# NOTE(review): "*" origins together with allow_credentials=True is permissive;
# tighten allow_origins before production use.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
@app.get("/", response_class=HTMLResponse)
def greet_html():
    """Serve the root page (currently an empty HTML placeholder)."""
    return """
"""
# In-process caches keyed by model name, so repeated requests reuse
# already-loaded weights instead of reloading them on every call.
model_cache = {}
tokenizer_cache = {}


def load_model_and_tokenizer(model_name: str, hf_token: str):
    """
    Load a VITS model and its tokenizer, caching both to avoid reloading.

    Args:
        model_name (str): Hugging Face model repo id to load.
        hf_token (str): Hugging Face authentication token (needed for
            private/gated repos).

    Returns:
        tuple: (model, tokenizer) for the requested model.
    """
    if model_name not in model_cache:
        # Load both before caching either: if the tokenizer load raises,
        # the original code left a model cached without its tokenizer,
        # making every later call KeyError on tokenizer_cache.
        model = VitsModel.from_pretrained(model_name, token=hf_token)
        tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
        model_cache[model_name] = model
        tokenizer_cache[model_name] = tokenizer
    return model_cache[model_name], tokenizer_cache[model_name]
def generate_speech(text: str, model_name: str, hf_token: str):
    """
    Synthesize speech for Uyghur text and return it as an in-memory WAV.

    Args:
        text (str): Raw input text; normalized via preprocess_uyghur_text
            before tokenization.
        model_name (str): Hugging Face model repo id of the VITS model.
        hf_token (str): Hugging Face authentication token.

    Returns:
        io.BytesIO: WAV-encoded audio, seeked to position 0 so it can be
        streamed directly in a response.
    """
    model, tokenizer = load_model_and_tokenizer(model_name, hf_token)
    fixed_text = preprocess_uyghur_text(text)
    # (Removed leftover debug print() calls of the raw/fixed text.)
    inputs = tokenizer(fixed_text, return_tensors="pt")
    # Inference only: no gradient tracking needed.
    with torch.no_grad():
        output = model(**inputs).waveform
    audio_data = output.squeeze().cpu().numpy()
    sample_rate = model.config.sampling_rate
    # Serialize to WAV in memory so the endpoint can stream it without
    # touching the filesystem.
    byte_io = io.BytesIO()
    sf.write(byte_io, audio_data, sample_rate, format='WAV')
    byte_io.seek(0)
    return byte_io
@app.post("/synthesize")
def synthesize(
    text: str = Form(...),
    model: str = Form("piyazon/TTS-CV-Unique-Ug-2"),
    hf_token: str = Form(..., description="Hugging Face authentication token")):
    """Synthesize speech for *text* and stream it back as a WAV attachment."""
    try:
        wav_stream = generate_speech(text, model, hf_token)
        # Content-Disposition makes browsers download the audio as a file.
        disposition = {"Content-Disposition": "attachment; filename=speech.wav"}
        return StreamingResponse(wav_stream, media_type="audio/wav", headers=disposition)
    except Exception as e:
        # Surface any synthesis failure to the client as a 500 with the message.
        raise HTTPException(status_code=500, detail=str(e))
if __name__ == "__main__":
    # Run the ASGI app directly when executed as a script.
    # NOTE(review): port 7860 — presumably chosen to match the Hugging Face
    # Spaces default; confirm against the deployment target.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)