File size: 2,726 Bytes
c210c3a
2011547
19f1a35
ef52462
7bacf1c
9ed7609
 
 
ef52462
7bacf1c
 
ef52462
7bacf1c
ef52462
7bacf1c
b814e71
b1f91a4
ef52462
b1f91a4
c210c3a
4655264
ef52462
 
 
540093e
4655264
ef52462
 
 
 
 
 
 
b814e71
 
 
 
 
 
 
 
 
ef52462
 
c210c3a
ef52462
 
b814e71
ef52462
a766e2a
540093e
 
 
 
921b259
 
a766e2a
921b259
a766e2a
 
 
9ed7609
 
 
921b259
a766e2a
9ed7609
 
b814e71
2011547
9ed7609
2011547
c210c3a
9ed7609
7bacf1c
 
2011547
b814e71
 
7bacf1c
b814e71
 
 
2011547
9ed7609
19f1a35
c210c3a
2011547
9ed7609
ef52462
c210c3a
b814e71
 
ef52462
 
 
b814e71
540093e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import torch
import numpy as np
import gradio as gr
from transformers import pipeline
import logging
from scipy.io.wavfile import write
import uuid
import os
import warnings

# -----------------------------
# SUPPRESS WARNINGS
# -----------------------------
# Silence transformers' FutureWarnings and reduce its logger to errors only,
# so Space logs stay readable.
warnings.filterwarnings("ignore", category=FutureWarning)
logging.getLogger("transformers").setLevel(logging.ERROR)

# -----------------------------
# DEVICE SETUP
# -----------------------------
# transformers pipeline convention: 0 = first CUDA device, -1 = CPU.
device = 0 if torch.cuda.is_available() else -1

# -----------------------------
# PATH TO FINE-TUNED MODEL
# -----------------------------
model_dir = "./"  # path to the fine-tuned Orpheus files inside the Space

# -----------------------------
# LOAD TTS PIPELINE
# -----------------------------
# Loaded once at import time; reused by every tts_generate() call.
tts_pipe = pipeline(
    task="text-to-speech",
    model=model_dir,
    device=device
)

# -----------------------------
# INFERENCE FUNCTION
# -----------------------------
def tts_generate(text):
    """Synthesize speech for *text* and return the path to a 16-bit WAV file.

    Args:
        text: Input text (may contain expressive tags such as <laugh>).

    Returns:
        Path to the generated WAV file, or None for blank input.

    Raises:
        ValueError: If the pipeline output has no "audio" key.
    """
    if not text.strip():
        return None

    # Run the TTS model
    output = tts_pipe(text)

    if "audio" not in output:
        raise ValueError("TTS pipeline did not return audio")

    audio = np.array(output["audio"], dtype=np.float32)

    # HF TTS pipelines often return audio shaped (1, n_samples); a 2-D array
    # would be written by scipy as an n-channel, 1-sample file, so flatten
    # to mono samples first.
    audio = np.squeeze(audio)

    # sanitize audio to avoid RuntimeWarning during the int16 conversion
    audio = np.nan_to_num(audio)          # convert NaN/Inf to 0
    audio = np.clip(audio, -1.0, 1.0)     # limit values to [-1, 1]

    # fall back to 22050 Hz when the pipeline omits the rate;
    # scipy.io.wavfile.write requires an integer sample rate.
    sr = int(output.get("sampling_rate") or 22050)

    # convert float32 in [-1, 1] to int16 PCM
    audio_int16 = (audio * 32767).astype(np.int16)

    # create the output directory and a collision-free filename
    os.makedirs("outputs", exist_ok=True)
    out_path = f"outputs/{uuid.uuid4().hex}.wav"

    # save as WAV
    write(out_path, sr, audio_int16)

    return out_path

# -----------------------------
# SAMPLE TEXTS
# -----------------------------
# -----------------------------
# SAMPLE TEXTS
# -----------------------------
# Example prompts demonstrating the expressive tags the fine-tuned model
# was trained on (<laugh>, <gasp>, <yawn>, ...).
SAMPLES = [
    "Just end up crashing somewhere. <laugh> No, because remember last time?",
    "Hmm… I don't know. <laugh> This feels like a bad idea. <gasp>",
    "I'm so tired today <yawn> but I still have so much work to do.",
]

# -----------------------------
# GRADIO INTERFACE
# -----------------------------
# Single textbox in, audio file out; tts_generate returns a WAV path, which
# matches outputs=gr.Audio(type="filepath").
demo = gr.Interface(
    fn=tts_generate,
    inputs=gr.Textbox(
        label="Enter text (use expressive tags like <laugh>, <sigh>)",
        lines=5,
        placeholder=SAMPLES[0],
    ),
    outputs=gr.Audio(type="filepath", label="Generated Audio"),
    title="Fine-tuned Orpheus-3B Expressive TTS",
    examples=[[s] for s in SAMPLES],
)

# -----------------------------
# CLEAN RUN
# -----------------------------
if __name__ == "__main__":
    demo.launch(ssr_mode=False)  # reduces asyncio / "Invalid file descriptor" errors