Spaces:

piyazon
/

ug-tts-api

Running

ug-tts-api / app.py

Piyazon

change domain tts

4dd4e6b 3 months ago

3.06 kB

	import os
	import torch
	import torchaudio
	import torchcodec
	from fastapi import FastAPI, UploadFile, File, Form, HTTPException
	from fastapi.responses import JSONResponse, HTMLResponse, StreamingResponse
	from fastapi.middleware.cors import CORSMiddleware
	from transformers import Wav2Vec2BertProcessor, AutoModelForCTC, VitsModel, AutoTokenizer
	from pydub import AudioSegment
	import tempfile
	import io
	import gradio as gr
	from transformers import VitsModel, AutoTokenizer
	import torch
	import numpy as np
	import soundfile as sf
	import io
	import os
	import string
	import unicodedata
	from pypinyin import pinyin, Style
	import re
	from umsc import UgMultiScriptConverter
	from huggingface_hub import login

	from utils import preprocess_uyghur_text

	app = FastAPI(title="Uyghur Text To Speech API")

	# Allow specific domains or all (*) for testing
	app.add_middleware(
	CORSMiddleware,
	allow_origins=["*"],
	allow_credentials=True,
	allow_methods=["*"],
	allow_headers=["*"],
	)

	@app.get("/", response_class=HTMLResponse)
	def greet_html():
	return """
	<html>
	<body>
	<h1>
	URL:
	<a href="https://tts.piyazon.top">https://tts.piyazon.top</a>
	</h1>
	</body>
	</html>
	"""

	model_cache = {}
	tokenizer_cache = {}

	def load_model_and_tokenizer(model_name: str, hf_token: str):
	"""
	Load model and tokenizer with caching to avoid reloading.
	Args:
	model_name (str): Name of the model from MODEL_OPTIONS.
	Returns:
	tuple: (model, tokenizer)
	"""
	if model_name not in model_cache:
	model_cache[model_name] = VitsModel.from_pretrained(model_name, token=hf_token)
	tokenizer_cache[model_name] = AutoTokenizer.from_pretrained(model_name, token=hf_token)
	return model_cache[model_name], tokenizer_cache[model_name]


	def generate_speech(text: str, model_name: str, hf_token: str):
	model, tokenizer = load_model_and_tokenizer(model_name, hf_token)

	fixed_text =preprocess_uyghur_text(text)

	print(text)
	print(fixed_text)

	inputs = tokenizer(fixed_text, return_tensors="pt")

	with torch.no_grad():
	output = model(**inputs).waveform

	audio_data = output.squeeze().cpu().numpy()
	sample_rate = model.config.sampling_rate

	# Save to bytes io
	byte_io = io.BytesIO()
	sf.write(byte_io, audio_data, sample_rate, format='WAV')
	byte_io.seek(0)
	return byte_io

	@app.post("/synthesize")
	def synthesize(
	text: str = Form(...),
	model: str = Form("piyazon/TTS-CV-Unique-Ug-2"),
	hf_token: str = Form(..., description="Hugging Face authentication token")):
	try:
	audio_bytes = generate_speech(text, model, hf_token)
	return StreamingResponse(audio_bytes, media_type="audio/wav", headers={"Content-Disposition": "attachment; filename=speech.wav"})
	except Exception as e:
	raise HTTPException(status_code=500, detail=str(e))

	if __name__ == "__main__":
	import uvicorn
	uvicorn.run(app, host="0.0.0.0", port=7860)