import gradio as gr
from transformers import pipeline
import numpy as np

# Hub identifier of the checkpoint that backs the demo.
_BARK_CHECKPOINT = "suno/bark-small"

# Build the pipeline once at import time; `text_to_speech` reuses this
# module-level object on every call instead of reloading the model.
pipe = pipeline(model=_BARK_CHECKPOINT)

def text_to_speech(text):
    """Synthesize speech for *text* and return it as 16-bit PCM.

    Args:
        text: The text to speak; passed unchanged to the module-level
            ``pipe`` text-to-speech pipeline.

    Returns:
        A ``(sampling_rate, samples)`` tuple. ``samples`` is an ``int16``
        NumPy array whose loudest sample is scaled to the full 16-bit
        range. Silent or empty model output is returned as zeros / an
        empty array rather than being divided by a zero peak.
    """
    # NOTE(review): the text-to-speech pipeline appears to accept tokenizer
    # options only via `preprocess_params`, and already requests an attention
    # mask for Bark itself, so no bare `return_attention_mask` keyword is
    # passed here -- confirm against the installed transformers version.
    output = pipe(text)

    samples = np.asarray(output["audio"], dtype=np.float32)
    # Drop singleton axes (e.g. a leading batch axis of size 1) so mono audio
    # is a flat 1-D signal; `atleast_1d` keeps a one-sample result indexable.
    # Presumably the consumer treats a 2-D array as (samples, channels).
    samples = np.atleast_1d(np.squeeze(samples))

    # Peak-normalize to [-1, 1], skipping the division when there is no
    # signal: dividing by a zero peak would yield NaN/inf and garbage PCM.
    peak = float(np.max(np.abs(samples))) if samples.size else 0.0
    if peak > 0.0:
        samples = samples / peak

    # Scale to the signed 16-bit range expected for PCM playback.
    pcm = (samples * 32767).astype(np.int16)

    return (output["sampling_rate"], pcm)

# Assemble the Gradio UI: one text box in, one audio player out.
_text_input = gr.Textbox(
    label="Text to speak",
    placeholder="Enter the text you want to convert to speech...",
)
_speech_output = gr.Audio(label="Generated Speech")

# Each inner list is one example row; it holds a value for the single input.
_sample_prompts = [
    ["Hey, it's HuggingFace on the phone!"],
    ["Welcome to my text to speech demo."],
]

demo = gr.Interface(
    fn=text_to_speech,
    inputs=_text_input,
    outputs=_speech_output,
    title="Text to Speech with Bark-small",
    description="Convert text to speech using the Suno Bark-small model",
    examples=_sample_prompts,
)

# Start the web app only when this file is executed as a script; importing the
# module builds `demo` but does not launch it.
if __name__ == "__main__":
    demo.launch()