# bark / app.py
# latterworks — Update app.py (ba4903d verified)
# NOTE: the two lines above are HuggingFace Spaces page artifacts preserved as
# comments; they were plain text in the scraped file and broke Python parsing.
import gradio as gr
from huggingface_hub import InferenceClient
from transformers import pipeline
import numpy as np
"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
# Initialize Bark TTS model.
# Loading can fail (missing weights, insufficient memory, etc.), so degrade
# gracefully: the chat side of the app keeps working with TTS disabled, and
# `tts_available` / `synthesizer` record whether audio generation is possible.
try:
    synthesizer = pipeline("text-to-speech", "suno/bark")
    tts_available = True
except Exception as e:
    print(f"TTS model failed to load: {e}")
    tts_available = False
    synthesizer = None
def generate_speech(text):
    """Synthesize *text* with the Bark TTS pipeline.

    Returns ``(sample_rate, audio_array)`` on success, or
    ``(None, error_message)`` when TTS is unavailable or synthesis fails.
    """
    if not tts_available or not synthesizer:
        return None, "TTS not available"
    try:
        result = synthesizer(text, forward_params={"do_sample": True})
        # Flatten to a 1-D array in the shape Gradio's Audio component expects.
        rate = result["sampling_rate"]
        samples = result["audio"].flatten()
    except Exception as err:
        return None, f"TTS Error: {str(err)}"
    return rate, samples
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream a chat completion from the Zephyr model.

    Args:
        message: Latest user message.
        history: Prior (user, assistant) turn pairs; falsy entries are skipped.
        system_message: System prompt prepended to the conversation.
        max_tokens: Maximum tokens to generate.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.

    Yields:
        The accumulated response text after each streamed token.
    """
    messages = [{"role": "system", "content": system_message}]
    for user_turn, assistant_turn in history:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": message})

    response = ""
    # FIX: the original iterated with `for message in ...`, shadowing the
    # `message` parameter; a distinct name keeps the parameter intact.
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content
        if token:
            response += token
            yield response
def respond_with_audio(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    enable_tts
):
    """Stream the chat reply, then optionally attach Bark audio.

    Yields ``(text, audio)`` pairs: ``audio`` is None while the text streams,
    and a final yield carries the synthesized audio tuple when TTS succeeds.
    """
    final_response = ""
    for partial in respond(message, history, system_message, max_tokens, temperature, top_p):
        final_response = partial
        yield partial, None  # text streams first; audio arrives afterwards

    # Guard clause: nothing to synthesize — emit the final text and stop.
    if not (enable_tts and tts_available and final_response.strip()):
        yield final_response, None
        return

    try:
        # Strip markdown punctuation that would otherwise be read aloud.
        spoken = final_response.replace("*", "").replace("#", "").replace("`", "")
        # Bark works best on shorter inputs, so cap the synthesized text.
        if len(spoken) > 500:
            spoken = spoken[:500] + "..."
        sample_rate, audio_data = generate_speech(spoken)
        if sample_rate:
            yield final_response, (sample_rate, audio_data)
        else:
            yield final_response, None
    except Exception as err:
        print(f"TTS generation failed: {err}")
        yield final_response, None
# Create the main chat interface with TTS option.
with gr.Blocks(title="Chat + TTS Bot") as demo:
    gr.Markdown("# 🤖 Chat Bot with Text-to-Speech")
    gr.Markdown("Chat with Zephyr-7B and optionally hear responses with Bark TTS")

    with gr.Row():
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(height=400)
            msg = gr.Textbox(
                placeholder="Type your message here...",
                label="Message",
                lines=2
            )
            with gr.Row():
                submit = gr.Button("💬 Send", variant="primary")
                clear = gr.Button("🗑️ Clear")
        with gr.Column(scale=1):
            # TTS Controls
            gr.Markdown("### 🔊 Text-to-Speech")
            enable_tts = gr.Checkbox(
                label="Enable TTS for responses",
                value=False,
                info="Generate audio for bot responses"
            )
            audio_output = gr.Audio(
                label="Response Audio",
                autoplay=False,
                visible=True
            )
            # Manual TTS
            gr.Markdown("### 🎤 Manual TTS")
            tts_input = gr.Textbox(
                placeholder="Enter text to convert to speech...",
                label="Text for TTS",
                lines=2
            )
            tts_button = gr.Button("🗣️ Generate Speech")

    # Chat Settings (Collapsible)
    with gr.Accordion("⚙️ Chat Settings", open=False):
        system_message = gr.Textbox(
            value="You are a friendly and helpful AI assistant.",
            label="System Message",
            lines=2
        )
        with gr.Row():
            max_tokens = gr.Slider(
                minimum=1,
                maximum=2048,
                value=512,
                step=1,
                label="Max tokens"
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=4.0,
                value=0.7,
                step=0.1,
                label="Temperature"
            )
            top_p = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.95,
                step=0.05,
                label="Top-p"
            )

    # NOTE: the original declared an unused `chat_history = gr.State([])`;
    # history lives in the Chatbot component itself, so it was removed.

    def user_message(message, history):
        """Clear the textbox and append the user turn (assistant reply pending)."""
        return "", history + [[message, None]]

    def bot_response(history, system_msg, max_tok, temp, top_p, tts_enabled):
        """Stream the assistant reply into the last history row, with optional audio."""
        if not history or not history[-1][0]:
            # FIX: `return value` inside a generator never reaches Gradio;
            # yield the unchanged state instead so the outputs stay consistent.
            yield history, None
            return
        user_msg = history[-1][0]
        # Stream partial responses into the pending assistant slot.
        for response, audio in respond_with_audio(
            user_msg,
            history[:-1],
            system_msg,
            max_tok,
            temp,
            top_p,
            tts_enabled
        ):
            history[-1][1] = response
            yield history, audio

    def manual_tts(text):
        """Generate TTS for manually entered text; return None when unusable."""
        if not text or not text.strip():
            return None
        sample_rate, audio_data = generate_speech(text)
        if sample_rate is None:
            # FIX: generate_speech signals failure as (None, error_message);
            # gr.Audio cannot render that tuple, so log it and return None.
            print(f"Manual TTS failed: {audio_data}")
            return None
        return sample_rate, audio_data

    # Event handlers
    msg.submit(
        user_message,
        [msg, chatbot],
        [msg, chatbot],
        queue=False
    ).then(
        bot_response,
        [chatbot, system_message, max_tokens, temperature, top_p, enable_tts],
        [chatbot, audio_output]
    )
    submit.click(
        user_message,
        [msg, chatbot],
        [msg, chatbot],
        queue=False
    ).then(
        bot_response,
        [chatbot, system_message, max_tokens, temperature, top_p, enable_tts],
        [chatbot, audio_output]
    )
    clear.click(lambda: ([], None), outputs=[chatbot, audio_output])
    tts_button.click(
        manual_tts,
        inputs=[tts_input],
        outputs=[audio_output]
    )

    # Add examples
    gr.Examples(
        examples=[
            ["Hello! How are you today?"],
            ["Tell me a short joke [laughs]"],
            ["Explain quantum physics in simple terms"],
            ["What's the weather like? [sighs]"]
        ],
        inputs=[msg],
        label="Example messages (try the ones with [laughs] or [sighs] for TTS effects!)"
    )

if __name__ == "__main__":
    demo.launch()