Spaces:
Running
Running
| import os | |
| import torch | |
| import torchaudio | |
| import torchcodec | |
| from fastapi import FastAPI, UploadFile, File, Form, HTTPException | |
| from fastapi.responses import JSONResponse, HTMLResponse, StreamingResponse | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from transformers import Wav2Vec2BertProcessor, AutoModelForCTC, VitsModel, AutoTokenizer | |
| from pydub import AudioSegment | |
| import tempfile | |
| import io | |
| import gradio as gr | |
| from transformers import VitsModel, AutoTokenizer | |
| import torch | |
| import numpy as np | |
| import soundfile as sf | |
| import io | |
| import os | |
| import string | |
| import unicodedata | |
| from pypinyin import pinyin, Style | |
| import re | |
| from umsc import UgMultiScriptConverter | |
| from huggingface_hub import login | |
| from utils import preprocess_uyghur_text | |
# Application entry object; routes are attached to this instance.
app = FastAPI(title="Uyghur Text To Speech API")

# Allow specific domains or all (*) for testing
# NOTE(review): allow_origins=["*"] together with allow_credentials=True is
# contradictory — Starlette will not send credentialed CORS headers for a
# wildcard origin. Pin explicit origins before production; confirm intent.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
def greet_html() -> str:
    """Return a static HTML landing page linking to the public TTS URL.

    NOTE(review): no route decorator is visible in this chunk — presumably
    this is registered as a GET handler (e.g. ``@app.get("/", response_class=
    HTMLResponse)``) elsewhere, or the decorator was lost in extraction;
    verify against the full file.
    """
    return """
<html>
<body>
<h1>
URL:
<a href="https://tts.piyazon.top">https://tts.piyazon.top</a>
</h1>
</body>
</html>
"""
# Process-wide caches so each checkpoint is downloaded/loaded only once.
model_cache = {}
tokenizer_cache = {}


def load_model_and_tokenizer(model_name: str, hf_token: str):
    """
    Load model and tokenizer with caching to avoid reloading.

    Args:
        model_name (str): Hugging Face model id (e.g. one of MODEL_OPTIONS).
        hf_token (str): Hugging Face authentication token used for gated or
            private repositories. Note: the cache is keyed only by
            ``model_name``, so the token of the first successful load wins.

    Returns:
        tuple: (model, tokenizer)
    """
    # Check BOTH caches: the original populated them one after the other, so
    # a tokenizer-load failure could leave model_cache holding the key and
    # every subsequent call would KeyError on tokenizer_cache. Loading both
    # first and assigning only afterwards keeps the caches consistent.
    if model_name not in model_cache or model_name not in tokenizer_cache:
        model = VitsModel.from_pretrained(model_name, token=hf_token)
        tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
        # Assign only after both loads succeed (atomic w.r.t. exceptions).
        model_cache[model_name] = model
        tokenizer_cache[model_name] = tokenizer
    return model_cache[model_name], tokenizer_cache[model_name]
def generate_speech(text: str, model_name: str, hf_token: str):
    """Synthesize speech for *text* and return it as an in-memory WAV file.

    Args:
        text: Raw input text; normalized by ``preprocess_uyghur_text`` before
            tokenization.
        model_name: Hugging Face VITS model id, passed to the cached loader.
        hf_token: Hugging Face token forwarded to the loader.

    Returns:
        io.BytesIO: WAV-encoded audio, positioned at offset 0 for streaming.
    """
    model, tokenizer = load_model_and_tokenizer(model_name, hf_token)
    fixed_text = preprocess_uyghur_text(text)
    # (removed leftover debug prints of the raw and normalized text)
    inputs = tokenizer(fixed_text, return_tensors="pt")
    # Inference only — no gradients needed.
    with torch.no_grad():
        output = model(**inputs).waveform
    audio_data = output.squeeze().cpu().numpy()
    sample_rate = model.config.sampling_rate
    # Encode to WAV in memory and rewind so callers can read from the start.
    byte_io = io.BytesIO()
    sf.write(byte_io, audio_data, sample_rate, format='WAV')
    byte_io.seek(0)
    return byte_io
def synthesize(
    text: str = Form(...),
    model: str = Form("piyazon/TTS-CV-Unique-Ug-2"),
    hf_token: str = Form(..., description="Hugging Face authentication token")):
    """Form-based TTS endpoint: returns synthesized speech as a WAV download.

    NOTE(review): the ``Form(...)`` defaults mean this is intended as a
    FastAPI POST handler, but no ``@app.post(...)`` decorator is visible in
    this chunk — confirm the route is registered (decorator possibly lost in
    extraction, or added via ``app.add_api_route`` elsewhere).
    NOTE(review): accepting ``hf_token`` from the client means callers submit
    their own credentials per request — confirm this is intentional.
    """
    try:
        audio_bytes = generate_speech(text, model, hf_token)
        # Stream the in-memory WAV back; Content-Disposition forces a download.
        return StreamingResponse(audio_bytes, media_type="audio/wav", headers={"Content-Disposition": "attachment; filename=speech.wav"})
    except Exception as e:
        # Surface any synthesis failure as a 500 with the error text as detail.
        raise HTTPException(status_code=500, detail=str(e))
# Script entry point: serve the FastAPI app on all interfaces, port 7860
# (the conventional Hugging Face Spaces port).
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)