antep-agzi / app.py
Arman Aksoy
Simplify UI and translate to Turkish
9c29482
import os
import subprocess
import tempfile
from functools import lru_cache
import gradio as gr
import numpy as np
from transformers import pipeline
# Optional torch import for tensor -> numpy safety (won't error if missing)
try:
import torch
except Exception: # pragma: no cover
torch = None
# ------------------------------------------------------------
# Model list (label → Hugging Face model id)
# ------------------------------------------------------------
# Selectable voices as (dropdown label, Hugging Face model id) pairs.
MODEL_CHOICES = [
    ("Facebook's Original Turkish (facebook/mms-tts-tur)", "facebook/mms-tts-tur"),
    ("Custom checkpoint for fine tuning (armish/mms-tts-tur-train)", "armish/mms-tts-tur-train"),
    ("Gaziantepagzi.com - Asim Mihcioglu - 50 epoch (armish/mms-tts-antep-agzi1)", "armish/mms-tts-antep-agzi1"),
    ("Gaziantepagzi.com - Hatice Barazi - 50 epoch (armish/mms-tts-antep-agzi2)", "armish/mms-tts-antep-agzi2"),
    ("Antepagzindan.com - 50 epoch (armish/mms-tts-antep-agzi3)", "armish/mms-tts-antep-agzi3"),
    ("Antepagzindan.com - 50 epoch - uroman (armish/mms-tts-antep-agzi3_uroman)", "armish/mms-tts-antep-agzi3_uroman"),
    ("Antepagzindan.com - 100 epoch - uroman (armish/mms-tts-antep-agzi3_uroman_100)", "armish/mms-tts-antep-agzi3_uroman_100"),
]

# Model id used when no other choice is made (currently the last entry above).
DEFAULT_MODEL = "armish/mms-tts-antep-agzi3_uroman_100"

# Label of the default model: the first entry whose model id matches.
DEFAULT_LABEL = next(
    label for label, model_id in MODEL_CHOICES if model_id == DEFAULT_MODEL
)
# ------------------------------------------------------------
# Helpers
# ------------------------------------------------------------
def _uromanize(text: str) -> str:
    """Return *text* romanized by the external uroman Perl script.

    The uroman checkout is located through the ``UROMAN`` environment
    variable; ``<UROMAN>/bin/uroman.pl`` is run with ``perl`` and receives the
    UTF-8 encoded text on stdin. The script's stdout, decoded as UTF-8 and
    stripped of surrounding whitespace, is the result.

    Raises:
        RuntimeError: if ``UROMAN`` is unset or empty, or if the script exits
            with a non-zero status (its stderr is included in the message).
    """
    uroman_root = os.environ.get("UROMAN")
    if not uroman_root:
        raise RuntimeError(
            "UROMAN environment variable is not set. "
            "Add uroman to the repo and set UROMAN to its path in Space settings."
        )

    command = ["perl", os.path.join(uroman_root, "bin", "uroman.pl")]
    result = subprocess.run(
        command,
        input=text.encode("utf-8"),
        capture_output=True,
    )
    if result.returncode:
        raise RuntimeError(f"uroman error: {result.stderr.decode('utf-8')}")

    romanized = result.stdout.decode("utf-8")
    return romanized.strip()
@lru_cache(maxsize=4)
def get_tts(model_name: str, device: int = -1):
    """Build the text-to-speech pipeline for ``model_name`` on ``device``.

    Results are memoized by ``lru_cache``: repeating a call with the same
    arguments returns the already-built pipeline, and at most four results
    are kept before the least recently used one is dropped.
    """
    tts_pipeline = pipeline(task="text-to-speech", model=model_name, device=device)
    return tts_pipeline
def _to_numpy_1d(audio):
    """Turn pipeline audio into a flat float32 numpy array limited to [-1, 1].

    Accepts a torch tensor (when torch is importable) or anything
    ``np.asarray`` understands. NaN becomes 0.0, +inf becomes 1.0 and -inf
    becomes -1.0 before the final clip.
    """
    # Tensors are detached and moved to the CPU before conversion.
    if torch is not None and isinstance(audio, torch.Tensor):
        audio = audio.detach().cpu().numpy()

    # Flattening in C order drops any length-1 batch/channel axes, so shapes
    # such as (1, T) or (T, 1) come out as (T,); a scalar becomes (1,).
    samples = np.asarray(audio).reshape(-1)

    # Cast without copying when the data is already float32.
    samples = samples.astype(np.float32, copy=False)

    # Replace non-finite values, then limit everything to the valid range.
    finite = np.nan_to_num(samples, nan=0.0, posinf=1.0, neginf=-1.0)
    return np.clip(finite, -1.0, 1.0)
def _write_wav_to_temp(sr: int, audio_f32: np.ndarray) -> str:
    """Write audio as a 16-bit PCM WAV temp file and return its path.

    Args:
        sr: Sampling rate in Hz; converted with ``int()`` and must be positive.
        audio_f32: Float samples nominally in [-1, 1]. Out-of-range and
            non-finite values are sanitized here so they cannot overflow the
            int16 conversion.

    Returns:
        Path of the ``.wav`` file. The file is created with ``delete=False``,
        so removing it is the caller's responsibility.

    Raises:
        ValueError: if the sampling rate is not positive.
    """
    # local import to keep import time light
    import scipy.io.wavfile as wavfile

    # Validate before doing any work or creating a file.
    sr = int(sr)
    if sr <= 0:
        raise ValueError(f"Invalid sampling rate: {sr}")

    # Casting out-of-range or non-finite floats to int16 wraps around or gives
    # garbage, so sanitize before scaling.
    samples = np.nan_to_num(np.asarray(audio_f32), nan=0.0, posinf=1.0, neginf=-1.0)
    audio_i16 = (np.clip(samples, -1.0, 1.0) * 32767.0).astype(np.int16)

    # Reserve a unique name, then close the handle so the file can be reopened
    # by name on every platform.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        wav_path = f.name

    try:
        # scipy writes the WAV header along with the samples.
        wavfile.write(wav_path, sr, audio_i16)
    except Exception:
        # Do not leave an empty or partial file behind when writing fails.
        try:
            os.remove(wav_path)
        except OSError:
            pass
        raise
    return wav_path
def synthesize(text, model_label):
    """Synthesize *text* with the selected model and report the outcome.

    Args:
        text: Sentence to speak; must contain non-whitespace characters.
        model_label: A label from ``MODEL_CHOICES``; an unknown label falls
            back to ``DEFAULT_MODEL``.

    Returns:
        A ``(wav_path, status_update)`` pair. ``wav_path`` is the path of the
        generated WAV file, or ``None`` on failure; ``status_update`` is a
        ``gr.update`` carrying a success or error message. Exceptions are
        caught and reported through the status message rather than raised.
    """
    try:
        # Reject blank input up front instead of sending it to the model.
        if not text or not text.strip():
            raise ValueError("Text is empty")

        # Map label → model id
        model_name = dict(MODEL_CHOICES).get(model_label, DEFAULT_MODEL)

        # Always use CPU
        tts = get_tts(model_name, device=-1)
        out = tts(text)

        # The output is read either as a dict with 'audio' and 'sampling_rate'
        # keys or as an object exposing attributes of the same names.
        if isinstance(out, dict):
            audio = out.get("audio")
            sr = out.get("sampling_rate")
        else:
            audio = getattr(out, "audio", None)
            sr = getattr(out, "sampling_rate", None)

        # Without this check a missing waveform would be written out as a
        # one-sample silent file and reported as a success.
        if audio is None:
            raise ValueError("TTS pipeline returned no audio")

        # Some pipelines return [audio] for batch size 1
        if isinstance(audio, (list, tuple)):
            audio = audio[0]

        # Fall back to 16 kHz for both output shapes when no rate is reported.
        sr = 16000 if sr is None else int(sr)

        wav_path = _write_wav_to_temp(sr, _to_numpy_1d(audio))
        # Return FILEPATH (robust) instead of (sr, numpy) to avoid pydub header issues
        return wav_path, gr.update(value=f"✅ Synthesized with {model_name}")
    except Exception as e:
        # UI boundary: surface the failure in the status box.
        return None, gr.update(value=f"❌ {type(e).__name__}: {e}")
# ------------------------------------------------------------
# Gradio UI
# ------------------------------------------------------------
with gr.Blocks(title="Gaziantep Ağzı ile Seslendirme Servisi") as demo:
    # Page heading and a one-line usage hint.
    gr.Markdown(
        "\n"
        "# Gaziantep Ağzı ile Seslendirme Servisi\n"
        "Seslendirmek istediğiniz cümleyi yazın ve seslendir butonuna basın.\n"
    )

    # Inputs: the sentence to speak and the model to speak it with.
    text_in = gr.Textbox(
        value="Ben Antepli bir yapay zekayım.",
        label="Metin",
        placeholder="Sentezlenecek metni girin…",
        lines=3,
    )
    model_in = gr.Dropdown(
        choices=[label for label, _ in MODEL_CHOICES],
        value=DEFAULT_LABEL,
        label="Model",
        interactive=True,
    )
    synth_btn = gr.Button("🎙️ Seslendir", variant="primary")

    # Outputs: the audio component takes a file path; the status box is read-only.
    audio_out = gr.Audio(type="filepath", label="Ses dosyası", autoplay=True)
    status = gr.Textbox(value="", label="Durum", interactive=False)

    # Clicking the button runs synthesize and fills both outputs.
    synth_btn.click(
        fn=synthesize,
        inputs=[text_in, model_in],
        outputs=[audio_out, status],
    )

    # Example sentences, each paired with the default model.
    gr.Examples(
        examples=[
            [sentence, DEFAULT_LABEL]
            for sentence in (
                "Ben Antepli bir yapay zekayım.",
                "Kepeği kesilesice.",
                "Gaziantep.",
                "Ben sana ne hanek anlatıyorum?",
            )
        ],
        inputs=[text_in, model_in],
        label="Örnekler",
    )

if __name__ == "__main__":
    # If the Space shows an SSR warning, SSR can be disabled explicitly with
    # demo.launch(ssr_mode=False).
    demo.launch()