"""Gradio demo for Soprano TTS (Soprano-1.1-80M, 32 kHz high-fidelity audio).

The model is loaded lazily on the first request (CUDA required). On import,
we patch a dtype-kwarg bug in soprano v0.2.0's transformers backend.
"""

import gradio as gr
import io
import tempfile

import numpy as np

# Optional imports for Soprano TTS (lazy load).
try:
    import torch  # type: ignore
except Exception:  # pragma: no cover
    torch = None  # type: ignore


# Patch soprano's TransformersModel to fix a dtype bug in v0.2.0:
# the library incorrectly passes 'dtype=' instead of 'torch_dtype='
# to AutoModelForCausalLM.from_pretrained().
def _patch_soprano_transformers():
    """Replace soprano's TransformersModel with a dtype-corrected subclass.

    Best-effort: on any failure (e.g. soprano/transformers not installed)
    a warning is printed and the original class is left untouched.
    """
    try:
        from transformers import AutoModelForCausalLM, AutoTokenizer
        import soprano.backends.transformers as soprano_transformers

        class PatchedTransformersModel(soprano_transformers.BaseModel):
            def __init__(self, device='cuda', model_path=None, **kwargs):
                self.device = device
                model_name_or_path = model_path if model_path else 'ekwek/Soprano-1.1-80M'
                self.model = AutoModelForCausalLM.from_pretrained(
                    model_name_or_path,
                    # The fix: 'torch_dtype=' is the kwarg from_pretrained expects.
                    torch_dtype=torch.bfloat16 if device == 'cuda' else torch.float32,
                    device_map=device,
                )
                self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
                self.model.eval()

            # Inherit all other methods from the original implementation.
            infer = soprano_transformers.TransformersModel.infer
            stream_infer = soprano_transformers.TransformersModel.stream_infer

        soprano_transformers.TransformersModel = PatchedTransformersModel
    except Exception as e:
        print(f"Warning: Could not patch soprano transformers backend: {e}")


_patch_soprano_transformers()

try:
    from soprano import SopranoTTS  # type: ignore
except Exception:  # pragma: no cover
    SopranoTTS = None  # type: ignore

try:
    from scipy.io.wavfile import write as wav_write  # type: ignore
except Exception:  # pragma: no cover
    wav_write = None  # type: ignore

# Lazily-populated singleton: initialized flag, device string, and the model.
_SOPRANO_STATE = {"initialized": False, "device": "cpu", "model": None}
SAMPLE_RATE = 32000  # Soprano emits 32 kHz audio.


def _init_soprano() -> None:
    """Initialize the Soprano model lazily. Requires CUDA GPU.

    Raises:
        gr.Error: if soprano is not installed or no CUDA device is available.
    """
    if _SOPRANO_STATE["initialized"]:
        return
    if SopranoTTS is None:
        raise gr.Error(
            "Soprano is not installed. Please run: "
            "pip install soprano-tts --no-deps && pip install transformers unidecode"
        )
    if not torch or not torch.cuda.is_available():
        raise gr.Error(
            "Currently running on CPU. Soprano requires a GPU."
        )
    device = "cuda"
    print(f"Using device: {device}")
    # Use 'auto' backend: uses lmdeploy if available (faster), falls back to transformers.
    model = SopranoTTS(
        backend="auto",
        device=device,
    )
    _SOPRANO_STATE.update({"initialized": True, "device": device, "model": model})


def soprano_tts(
    text: str,
    temperature: float,
    top_p: float,
    repetition_penalty: float,
) -> tuple[int, np.ndarray] | None:
    """Generate speech from text using Soprano.

    Args:
        text: Text to synthesize; must be non-empty.
        temperature: Sampling temperature (lower = more deterministic).
        top_p: Nucleus sampling threshold.
        repetition_penalty: Penalty applied to repeated tokens.

    Returns:
        A (sample_rate, waveform) tuple suitable for gr.Audio.

    Raises:
        gr.Error: on empty input, missing dependencies, or generation failure.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter text to synthesize.")
    _init_soprano()
    model = _SOPRANO_STATE["model"]
    try:
        audio = model.infer(
            text,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
        )
        # Model returns a tensor; convert to numpy for Gradio.
        audio_np = audio.cpu().numpy()
        return (SAMPLE_RATE, audio_np)
    except gr.Error:
        # Re-raise user-facing errors unchanged.
        raise
    except Exception as e:
        # Truncate long backend tracebacks to keep the UI message readable.
        raise gr.Error(f"Error during speech generation: {str(e)[:200]}...")


# --- Gradio UI ---
# NOTE: theme belongs on gr.Blocks(), not launch() — launch() has no
# 'theme' parameter, so passing it there silently drops the theme.
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
    # NOTE(review): the original HTML markup was lost in a formatting
    # mangle; reconstructed minimally around the surviving header text.
    gr.HTML(
        "<div style='text-align: center;'>"
        "Powered by Soprano-1.1-80M | 32kHz High-Fidelity Audio"
        "</div>"
    )
    with gr.Row(variant="panel"):
        temperature = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.3,
            step=0.05,
            label="Temperature",
            info="Controls randomness. Lower = more deterministic.",
        )
        top_p = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.95,
            step=0.01,
            label="Top-P",
            info="Nucleus sampling threshold.",
        )
        repetition_penalty = gr.Slider(
            minimum=1.0,
            maximum=2.0,
            value=1.2,
            step=0.05,
            label="Repetition Penalty",
            info="Penalizes repeated tokens.",
        )
    text_input = gr.Textbox(
        label="Input Text",
        placeholder="Enter the text you want to convert to speech here...",
        value="Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed.",
        lines=5,
    )
    generate_btn = gr.Button(
        "Generate Speech",
        variant="primary",
    )
    audio_output = gr.Audio(
        label="Generated Speech",
        autoplay=True,
    )

    generate_inputs = [text_input, temperature, top_p, repetition_penalty]
    generate_btn.click(
        fn=soprano_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech",
    )
    text_input.submit(
        fn=soprano_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech_enter",
    )

if __name__ == "__main__":
    demo.queue().launch(debug=True)