File size: 2,726 Bytes
c210c3a 2011547 19f1a35 ef52462 7bacf1c 9ed7609 ef52462 7bacf1c ef52462 7bacf1c ef52462 7bacf1c b814e71 b1f91a4 ef52462 b1f91a4 c210c3a 4655264 ef52462 540093e 4655264 ef52462 b814e71 ef52462 c210c3a ef52462 b814e71 ef52462 a766e2a 540093e 921b259 a766e2a 921b259 a766e2a 9ed7609 921b259 a766e2a 9ed7609 b814e71 2011547 9ed7609 2011547 c210c3a 9ed7609 7bacf1c 2011547 b814e71 7bacf1c b814e71 2011547 9ed7609 19f1a35 c210c3a 2011547 9ed7609 ef52462 c210c3a b814e71 ef52462 b814e71 540093e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
import torch
import numpy as np
import gradio as gr
from transformers import pipeline
import logging
from scipy.io.wavfile import write
import uuid
import os
import warnings
# -----------------------------
# SUPPRESS WARNINGS
# -----------------------------
# Keep the console output readable: only ERROR-and-above records from the
# "transformers" logger are emitted, and FutureWarning notices are ignored.
logging.getLogger("transformers").setLevel(logging.ERROR)
warnings.simplefilter("ignore", category=FutureWarning)
# -----------------------------
# DEVICE SETUP
# -----------------------------
# Device index handed to the pipeline below: 0 when CUDA is available,
# otherwise -1 (the transformers convention for "run on CPU").
if torch.cuda.is_available():
    device = 0
else:
    device = -1
# -----------------------------
# PATH TO FINE-TUNED MODEL
# -----------------------------
model_dir = "./"  # path to the fine-tuned Orpheus files inside the Space
# -----------------------------
# LOAD TTS PIPELINE
# -----------------------------
# Built once at import time; tts_generate reuses this object on every call.
tts_pipe = pipeline("text-to-speech", model=model_dir, device=device)
# -----------------------------
# INFERENCE FUNCTION
# -----------------------------
def _to_pcm16(audio):
    """Convert a float waveform into int16 PCM laid out for ``wavfile.write``.

    ``scipy.io.wavfile.write`` expects ``(n_samples,)`` or
    ``(n_samples, n_channels)``, whereas the transformers text-to-speech
    pipeline documents its audio as ``(nb_channels, audio_length)``.

    Raises:
        ValueError: If the waveform has more than two non-singleton axes.
    """
    samples = np.asarray(audio, dtype=np.float32)
    # Drop singleton axes so a mono (1, n) or (n, 1) waveform becomes (n,).
    samples = np.atleast_1d(np.squeeze(samples))
    if samples.ndim > 2:
        raise ValueError(f"Unexpected audio shape: {samples.shape}")
    if samples.ndim == 2 and samples.shape[0] < samples.shape[1]:
        # Heuristic: the shorter axis is the channel axis, so a channels-first
        # array is transposed to (n_samples, n_channels).
        samples = samples.T
    # NaN becomes 0; +/-Inf become huge finite values that the clip below
    # turns into +/-1, so the int16 cast can never overflow.
    samples = np.nan_to_num(samples)
    samples = np.clip(samples, -1.0, 1.0)
    return (samples * 32767).astype(np.int16)


def tts_generate(text):
    """Synthesize speech for ``text`` and return the path of the WAV file.

    Args:
        text: Text to speak. ``None``, empty and whitespace-only values are
            treated as "nothing to synthesize".

    Returns:
        Path of a newly written 16-bit WAV file under ``outputs/``, or
        ``None`` when there is no text to synthesize.

    Raises:
        ValueError: If the pipeline result has no ``"audio"`` entry or the
            audio has an unusable shape.
    """
    # Guard against None as well as blank strings.
    if not text or not text.strip():
        return None

    # Run the TTS model.
    output = tts_pipe(text)
    if "audio" not in output:
        raise ValueError("TTS pipeline did not return audio")

    audio_int16 = _to_pcm16(output["audio"])

    # Fall back to 22050 Hz when the pipeline reports no sampling rate; the
    # WAV header needs a plain integer.
    sr = int(output.get("sampling_rate") or 22050)

    # Create the output folder and save under a unique name so that
    # concurrent requests never overwrite each other.
    os.makedirs("outputs", exist_ok=True)
    out_path = f"outputs/{uuid.uuid4().hex}.wav"
    write(out_path, sr, audio_int16)
    return out_path
# -----------------------------
# SAMPLE TEXTS
# -----------------------------
# Example prompts for the demo UI; each one contains at least one inline
# expressive tag (<laugh>, <gasp>, <yawn>).
SAMPLES = [
    "Just end up crashing somewhere. <laugh> No, because remember last time?",
    "Hmm… I don't know. <laugh> This feels like a bad idea. <gasp>",
    "I'm so tired today <yawn> but I still have so much work to do.",
]
# -----------------------------
# GRADIO INTERFACE
# -----------------------------
# One textbox in, one audio player out: the entered text is passed to
# tts_generate and the file path it returns is given to the audio component.
demo = gr.Interface(
    title="Fine-tuned Orpheus-3B Expressive TTS",
    fn=tts_generate,
    inputs=gr.Textbox(
        lines=5,
        label="Enter text (use expressive tags like <laugh>, <sigh>)",
        placeholder=SAMPLES[0],  # the first sample doubles as the hint text
    ),
    outputs=gr.Audio(label="Generated Audio", type="filepath"),
    # Each example row holds a single value because there is one input.
    examples=[[sample] for sample in SAMPLES],
)
# -----------------------------
# CLEAN RUN
# -----------------------------
if __name__ == "__main__":
    # ssr_mode=False reduces asyncio / "Invalid file descriptor" errors.
    launch_options = {"ssr_mode": False}
    demo.launch(**launch_options)
|