"""Gradio demo that converts text to speech with a Hugging Face TTS pipeline.

At import time the script tries to load the Unsloth Sesame CSM model and, if
anything in that attempt raises, falls back to ``facebook/mms-tts-eng``.
"""

import os

import gradio as gr
import numpy as np
import soundfile as sf
import torch  # noqa: F401  (kept: imported by the original script)
from transformers import pipeline

# Try GPU Unsloth model, fallback to CPU-friendly TTS
try:
    from unsloth import FastLanguageModel

    model_id = "unsloth/sesame-csm-tts"
    # NOTE(review): `model` and `tokenizer` are never used after this line;
    # the pipeline below loads the checkpoint again from `model_id`. Confirm
    # whether this first load is needed at all.
    model, tokenizer = FastLanguageModel.from_pretrained(model_name=model_id)
    # NOTE(review): no `device` is passed here, so the "on GPU" message below
    # is not enforced by this call -- confirm the intended placement.
    tts_pipeline = pipeline("text-to-speech", model=model_id)
    print("✅ Using Unsloth Sesame CSM TTS on GPU")
except Exception as exc:
    # Deliberately broad: any failure (missing package, no GPU, download
    # error) switches to the fallback model. Report the cause so the
    # fallback is diagnosable instead of silent.
    print("⚠️ GPU model unavailable or Unsloth not supported on CPU.")
    print(f"   Reason: {exc!r}")
    model_id = "facebook/mms-tts-eng"
    tts_pipeline = pipeline("text-to-speech", model=model_id)


def _as_frames_by_channels(audio):
    """Return *audio* as a 1-D or ``(frames, channels)`` NumPy array.

    ``soundfile.write`` treats the first axis of a 2-D array as frames. A
    pipeline result shaped ``(1, num_samples)`` would therefore be read as a
    single frame with many channels, so singleton axes are dropped and
    channels-first data is transposed.
    """
    samples = np.atleast_1d(np.squeeze(np.asarray(audio)))
    # Heuristic: real audio has far more frames than channels, so a 2-D array
    # whose first axis is the shorter one is assumed to be channels-first.
    if samples.ndim == 2 and samples.shape[0] < samples.shape[1]:
        samples = samples.T
    return samples


def generate_tts(text):
    """Synthesize *text* to a WAV file and report the outcome.

    Args:
        text: The text to speak. ``None``, empty, or whitespace-only input is
            rejected without calling the model.

    Returns:
        A ``(path, status)`` tuple. ``path`` is the written WAV file
        (``outputs/output.wav``) or ``None`` when the input was rejected;
        ``status`` is a human-readable message for the UI.
    """
    # `not text` also covers None, which would otherwise fail on `.strip()`.
    if not text or not text.strip():
        return None, "⚠️ Please enter some text."

    outputs = tts_pipeline(text)

    # Ensure output directory exists
    output_dir = "outputs"
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "output.wav")

    # Save audio file
    sf.write(
        output_path,
        _as_frames_by_channels(outputs["audio"]),
        outputs["sampling_rate"],
    )
    return output_path, "✅ Audio generated successfully!"


# Gradio UI
interface = gr.Interface(
    fn=generate_tts,
    inputs=gr.Textbox(label="Enter Text", placeholder="Type something..."),
    outputs=[gr.Audio(label="Generated Speech"), gr.Textbox(label="Status")],
    title="🎙️ Sesame CSM TTS Demo",
    description="Generate speech using Sesame CSM TTS (GPU-supported)",
    allow_flagging="never",
)

# Launch the interface only when run as a script, so importing this module
# does not start a server.
if __name__ == "__main__":
    interface.launch()