File size: 1,484 Bytes
16b84f6
 
8b8068e
16b84f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b8068e
16b84f6
8b8068e
 
 
 
 
 
 
16b84f6
8b8068e
16b84f6
 
8b8068e
16b84f6
 
 
 
 
8b8068e
 
16b84f6
 
8b8068e
16b84f6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import gradio as gr
import torch
import os
import soundfile as sf
from transformers import pipeline

# Model selection: try the Unsloth Sesame CSM model first and fall back to a
# CPU-friendly TTS model when anything in the preferred path fails.
try:
    from unsloth import FastLanguageModel
    model_id = "unsloth/sesame-csm-tts"
    # NOTE(review): `model` and `tokenizer` are not used anywhere else in the
    # visible file; only `tts_pipeline` is called by generate_tts. Presumably
    # this load doubles as a "does Unsloth work here" probe — confirm.
    model, tokenizer = FastLanguageModel.from_pretrained(model_name=model_id)
    tts_pipeline = pipeline("text-to-speech", model=model_id)
    print("✅ Using Unsloth Sesame CSM TTS on GPU")
except Exception as load_error:
    # The broad catch is deliberate (import errors, missing GPU and download
    # failures all lead to the same fallback), but the cause is reported so
    # the fallback is not silent.
    print("⚠️ GPU model unavailable or Unsloth not supported on CPU.")
    print(f"   Reason: {type(load_error).__name__}: {load_error}")
    model_id = "facebook/mms-tts-eng"
    print(f"   Falling back to: {model_id}")
    tts_pipeline = pipeline("text-to-speech", model=model_id)

def generate_tts(text):
    """Synthesize speech for ``text`` and save it as a WAV file.

    Args:
        text: The text to speak. ``None`` and whitespace-only strings are
            treated as empty input.

    Returns:
        A ``(path, status)`` tuple. ``path`` is the location of the written
        WAV file, or ``None`` when no text was supplied; ``status`` is a
        human-readable message for the UI.
    """
    # Local import: numpy is not among the file's top-level imports.
    import numpy as np

    # Guard against None as well as blank strings, so a missing value yields
    # the friendly message instead of an AttributeError on .strip().
    if text is None or not text.strip():
        return None, "⚠️ Please enter some text."

    outputs = tts_pipeline(text)

    # soundfile expects data shaped (frames,) or (frames, channels).
    # Text-to-speech pipelines may return a leading batch/channel axis,
    # e.g. (1, n_samples) — drop singleton axes so mono audio becomes 1-D.
    audio = np.atleast_1d(np.squeeze(np.asarray(outputs["audio"])))
    # Assumes a 2-D array with fewer rows than columns is channels-first
    # (channels, frames); transpose it to the frames-first layout.
    if audio.ndim == 2 and audio.shape[0] < audio.shape[1]:
        audio = audio.T

    # Ensure output directory exists
    output_dir = "outputs"
    os.makedirs(output_dir, exist_ok=True)
    # NOTE(review): every call writes to this same path, so concurrent
    # requests would overwrite each other's audio — consider unique names.
    output_path = os.path.join(output_dir, "output.wav")

    # Save audio file
    sf.write(output_path, audio, outputs["sampling_rate"])

    return output_path, "✅ Audio generated successfully!"

# Gradio UI: a single text box in, an audio player plus a status line out.
text_input = gr.Textbox(label="Enter Text", placeholder="Type something...")
speech_output = gr.Audio(label="Generated Speech")
status_output = gr.Textbox(label="Status")

# generate_tts returns (audio_path, status_message), matching the two outputs.
interface = gr.Interface(
    fn=generate_tts,
    inputs=text_input,
    outputs=[speech_output, status_output],
    title="🎙️ Sesame CSM TTS Demo",
    description="Generate speech using Sesame CSM TTS (GPU-supported)",
    allow_flagging="never",
)

# Launch the interface
interface.launch()