"""Gradio demo for Soprano TTS (Soprano-1.1-80M, 32 kHz high-fidelity audio).

The model is loaded lazily on the first request (CUDA required). On import,
we patch a dtype-kwarg bug in soprano v0.2.0's transformers backend.
"""

import gradio as gr
import io
import tempfile

import numpy as np

# Optional imports for Soprano TTS (lazy load).
try:
    import torch  # type: ignore
except Exception:  # pragma: no cover
    torch = None  # type: ignore


# Patch soprano's TransformersModel to fix a dtype bug in v0.2.0:
# the library incorrectly passes 'dtype=' instead of 'torch_dtype='
# to AutoModelForCausalLM.from_pretrained().
def _patch_soprano_transformers():
    """Replace soprano's TransformersModel with a dtype-corrected subclass.

    Best-effort: on any failure (e.g. soprano/transformers not installed)
    a warning is printed and the original class is left untouched.
    """
    try:
        from transformers import AutoModelForCausalLM, AutoTokenizer
        import soprano.backends.transformers as soprano_transformers

        class PatchedTransformersModel(soprano_transformers.BaseModel):
            def __init__(self, device='cuda', model_path=None, **kwargs):
                self.device = device
                model_name_or_path = model_path if model_path else 'ekwek/Soprano-1.1-80M'
                self.model = AutoModelForCausalLM.from_pretrained(
                    model_name_or_path,
                    # The fix: 'torch_dtype=' is the kwarg from_pretrained expects.
                    torch_dtype=torch.bfloat16 if device == 'cuda' else torch.float32,
                    device_map=device,
                )
                self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
                self.model.eval()

            # Inherit all other methods from the original implementation.
            infer = soprano_transformers.TransformersModel.infer
            stream_infer = soprano_transformers.TransformersModel.stream_infer

        soprano_transformers.TransformersModel = PatchedTransformersModel
    except Exception as e:
        print(f"Warning: Could not patch soprano transformers backend: {e}")


_patch_soprano_transformers()

try:
    from soprano import SopranoTTS  # type: ignore
except Exception:  # pragma: no cover
    SopranoTTS = None  # type: ignore

try:
    from scipy.io.wavfile import write as wav_write  # type: ignore
except Exception:  # pragma: no cover
    wav_write = None  # type: ignore

# Lazily-populated singleton: initialized flag, device string, and the model.
_SOPRANO_STATE = {"initialized": False, "device": "cpu", "model": None}
SAMPLE_RATE = 32000  # Soprano emits 32 kHz audio.


def _init_soprano() -> None:
    """Initialize the Soprano model lazily. Requires CUDA GPU.

    Raises:
        gr.Error: if soprano is not installed or no CUDA device is available.
    """
    if _SOPRANO_STATE["initialized"]:
        return
    if SopranoTTS is None:
        raise gr.Error(
            "Soprano is not installed. Please run: "
            "pip install soprano-tts --no-deps && pip install transformers unidecode"
        )
    if not torch or not torch.cuda.is_available():
        raise gr.Error(
            "Currently running on CPU. Soprano requires a GPU."
        )
    device = "cuda"
    print(f"Using device: {device}")
    # Use 'auto' backend: uses lmdeploy if available (faster), falls back to transformers.
    model = SopranoTTS(
        backend="auto",
        device=device,
    )
    _SOPRANO_STATE.update({"initialized": True, "device": device, "model": model})


def soprano_tts(
    text: str,
    temperature: float,
    top_p: float,
    repetition_penalty: float,
) -> tuple[int, np.ndarray] | None:
    """Generate speech from text using Soprano.

    Args:
        text: Text to synthesize; must be non-empty.
        temperature: Sampling temperature (lower = more deterministic).
        top_p: Nucleus sampling threshold.
        repetition_penalty: Penalty applied to repeated tokens.

    Returns:
        A (sample_rate, waveform) tuple suitable for gr.Audio.

    Raises:
        gr.Error: on empty input, missing dependencies, or generation failure.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter text to synthesize.")
    _init_soprano()
    model = _SOPRANO_STATE["model"]
    try:
        audio = model.infer(
            text,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
        )
        # Model returns a tensor; convert to numpy for Gradio.
        audio_np = audio.cpu().numpy()
        return (SAMPLE_RATE, audio_np)
    except gr.Error:
        # Re-raise user-facing errors unchanged.
        raise
    except Exception as e:
        # Truncate long backend tracebacks to keep the UI message readable.
        raise gr.Error(f"Error during speech generation: {str(e)[:200]}...")


# --- Gradio UI ---
# NOTE: theme belongs on gr.Blocks(), not launch() — launch() has no
# 'theme' parameter, so passing it there silently drops the theme.
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
    # NOTE(review): the original HTML markup was lost in a formatting
    # mangle; reconstructed minimally around the surviving header text.
    gr.HTML(
        "<div style='text-align: center;'>"
        "Powered by Soprano-1.1-80M | 32kHz High-Fidelity Audio"
        "</div>"
    )
    with gr.Row(variant="panel"):
        temperature = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.3,
            step=0.05,
            label="Temperature",
            info="Controls randomness. Lower = more deterministic.",
        )
        top_p = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.95,
            step=0.01,
            label="Top-P",
            info="Nucleus sampling threshold.",
        )
        repetition_penalty = gr.Slider(
            minimum=1.0,
            maximum=2.0,
            value=1.2,
            step=0.05,
            label="Repetition Penalty",
            info="Penalizes repeated tokens.",
        )
    text_input = gr.Textbox(
        label="Input Text",
        placeholder="Enter the text you want to convert to speech here...",
        value="Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed.",
        lines=5,
    )
    generate_btn = gr.Button(
        "Generate Speech",
        variant="primary",
    )
    audio_output = gr.Audio(
        label="Generated Speech",
        autoplay=True,
    )

    generate_inputs = [text_input, temperature, top_p, repetition_penalty]
    generate_btn.click(
        fn=soprano_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech",
    )
    text_input.submit(
        fn=soprano_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech_enter",
    )

if __name__ == "__main__":
    demo.queue().launch(debug=True)