"""Gradio demo that turns text into speech with the Suno Bark-small model."""

import gradio as gr
import numpy as np
from transformers import pipeline

# Initialize the model once at import time so every request reuses it.
pipe = pipeline(model="suno/bark-small")

# Largest magnitude representable by signed 16-bit PCM.
_INT16_PEAK = 32767


def _to_int16_pcm(audio):
    """Peak-normalize a waveform and convert it to 16-bit integer PCM.

    Args:
        audio: Array-like waveform of floating point samples.

    Returns:
        A ``numpy.ndarray`` of dtype ``int16`` scaled so that the loudest
        sample reaches ``+/-32767``. Silent or empty input is returned as
        zeros / an empty array instead of dividing by zero.
    """
    samples = np.asarray(audio, dtype=np.float32)

    # NOTE(review): the pipeline may return the waveform with a leading
    # batch/channel axis of length 1; dropping singleton axes yields the
    # (n_samples,) layout gr.Audio expects for mono audio. This is a no-op
    # for audio that is already 1-D -- confirm against the installed
    # transformers version.
    samples = np.atleast_1d(np.squeeze(samples))

    # Guard the normalization: an all-zero (or empty) waveform has no peak,
    # and dividing by it would produce NaNs that cast to garbage int16.
    peak = float(np.max(np.abs(samples))) if samples.size else 0.0
    if peak > 0.0:
        samples = samples / peak  # Normalize to [-1, 1]

    return (samples * _INT16_PEAK).astype(np.int16)


def text_to_speech(text: str):
    """Generate speech audio for ``text`` using the Bark pipeline.

    Args:
        text: The sentence(s) to synthesize.

    Returns:
        A ``(sampling_rate, audio)`` tuple where ``audio`` is an ``int16``
        NumPy array, the format accepted by ``gr.Audio`` outputs.
    """
    # NOTE(review): ``return_attention_mask`` is forwarded to the pipeline
    # unchanged from the original code; confirm the installed transformers
    # version accepts this keyword.
    output = pipe(
        text,
        return_attention_mask=True,
    )

    return (output["sampling_rate"], _to_int16_pcm(output["audio"]))


# Create Gradio interface
demo = gr.Interface(
    fn=text_to_speech,
    inputs=gr.Textbox(
        label="Text to speak",
        placeholder="Enter the text you want to convert to speech...",
    ),
    outputs=gr.Audio(label="Generated Speech"),
    title="Text to Speech with Bark-small",
    description="Convert text to speech using the Suno Bark-small model",
    examples=[
        ["Hey, it's HuggingFace on the phone!"],
        ["Welcome to my text to speech demo."],
    ],
)

if __name__ == "__main__":
    demo.launch()