# bark / app.py
# latterworks — Update app.py (ba4903d verified)
# NOTE: the two lines above are HuggingFace Spaces page artifacts preserved as
# comments; they were plain text in the scraped file and broke Python parsing.
import gradio as gr
from huggingface_hub import InferenceClient
from transformers import pipeline
import numpy as np
"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
# Initialize Bark TTS model.
# Loading can fail (missing weights, insufficient memory, etc.), so degrade
# gracefully: the chat side of the app keeps working with TTS disabled, and
# `tts_available` / `synthesizer` record whether audio generation is possible.
try:
    synthesizer = pipeline("text-to-speech", "suno/bark")
    tts_available = True
except Exception as e:
    print(f"TTS model failed to load: {e}")
    tts_available = False
    synthesizer = None
def generate_speech(text):
    """Synthesize *text* with the Bark TTS pipeline.

    Returns ``(sample_rate, audio_array)`` on success, or
    ``(None, error_message)`` when TTS is unavailable or synthesis fails.
    """
    if not tts_available or not synthesizer:
        return None, "TTS not available"
    try:
        result = synthesizer(text, forward_params={"do_sample": True})
        # Flatten to a 1-D array in the shape Gradio's Audio component expects.
        rate = result["sampling_rate"]
        samples = result["audio"].flatten()
    except Exception as err:
        return None, f"TTS Error: {str(err)}"
    return rate, samples
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream a chat completion from the Zephyr model.

    Args:
        message: Latest user message.
        history: Prior (user, assistant) turn pairs; falsy entries are skipped.
        system_message: System prompt prepended to the conversation.
        max_tokens: Maximum tokens to generate.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.

    Yields:
        The accumulated response text after each streamed token.
    """
    messages = [{"role": "system", "content": system_message}]
    for user_turn, assistant_turn in history:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": message})

    response = ""
    # FIX: the original iterated with `for message in ...`, shadowing the
    # `message` parameter; a distinct name keeps the parameter intact.
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content
        if token:
            response += token
            yield response
def respond_with_audio(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    enable_tts
):
    """Stream the chat reply, then optionally attach Bark audio.

    Yields ``(text, audio)`` pairs: ``audio`` is None while the text streams,
    and a final yield carries the synthesized audio tuple when TTS succeeds.
    """
    final_response = ""
    for partial in respond(message, history, system_message, max_tokens, temperature, top_p):
        final_response = partial
        yield partial, None  # text streams first; audio arrives afterwards

    # Guard clause: nothing to synthesize — emit the final text and stop.
    if not (enable_tts and tts_available and final_response.strip()):
        yield final_response, None
        return

    try:
        # Strip markdown punctuation that would otherwise be read aloud.
        spoken = final_response.replace("*", "").replace("#", "").replace("`", "")
        # Bark works best on shorter inputs, so cap the synthesized text.
        if len(spoken) > 500:
            spoken = spoken[:500] + "..."
        sample_rate, audio_data = generate_speech(spoken)
        if sample_rate:
            yield final_response, (sample_rate, audio_data)
        else:
            yield final_response, None
    except Exception as err:
        print(f"TTS generation failed: {err}")
        yield final_response, None
# Create the main chat interface with TTS option.
with gr.Blocks(title="Chat + TTS Bot") as demo:
    gr.Markdown("# 🤖 Chat Bot with Text-to-Speech")
    gr.Markdown("Chat with Zephyr-7B and optionally hear responses with Bark TTS")

    with gr.Row():
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(height=400)
            msg = gr.Textbox(
                placeholder="Type your message here...",
                label="Message",
                lines=2
            )
            with gr.Row():
                submit = gr.Button("💬 Send", variant="primary")
                clear = gr.Button("🗑️ Clear")
        with gr.Column(scale=1):
            # TTS Controls
            gr.Markdown("### 🔊 Text-to-Speech")
            enable_tts = gr.Checkbox(
                label="Enable TTS for responses",
                value=False,
                info="Generate audio for bot responses"
            )
            audio_output = gr.Audio(
                label="Response Audio",
                autoplay=False,
                visible=True
            )
            # Manual TTS
            gr.Markdown("### 🎤 Manual TTS")
            tts_input = gr.Textbox(
                placeholder="Enter text to convert to speech...",
                label="Text for TTS",
                lines=2
            )
            tts_button = gr.Button("🗣️ Generate Speech")

    # Chat Settings (Collapsible)
    with gr.Accordion("⚙️ Chat Settings", open=False):
        system_message = gr.Textbox(
            value="You are a friendly and helpful AI assistant.",
            label="System Message",
            lines=2
        )
        with gr.Row():
            max_tokens = gr.Slider(
                minimum=1,
                maximum=2048,
                value=512,
                step=1,
                label="Max tokens"
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=4.0,
                value=0.7,
                step=0.1,
                label="Temperature"
            )
            top_p = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.95,
                step=0.05,
                label="Top-p"
            )

    # NOTE: the original declared an unused `chat_history = gr.State([])`;
    # history lives in the Chatbot component itself, so it was removed.

    def user_message(message, history):
        """Clear the textbox and append the user turn (assistant reply pending)."""
        return "", history + [[message, None]]

    def bot_response(history, system_msg, max_tok, temp, top_p, tts_enabled):
        """Stream the assistant reply into the last history row, with optional audio."""
        if not history or not history[-1][0]:
            # FIX: `return value` inside a generator never reaches Gradio;
            # yield the unchanged state instead so the outputs stay consistent.
            yield history, None
            return
        user_msg = history[-1][0]
        # Stream partial responses into the pending assistant slot.
        for response, audio in respond_with_audio(
            user_msg,
            history[:-1],
            system_msg,
            max_tok,
            temp,
            top_p,
            tts_enabled
        ):
            history[-1][1] = response
            yield history, audio

    def manual_tts(text):
        """Generate TTS for manually entered text; return None when unusable."""
        if not text or not text.strip():
            return None
        sample_rate, audio_data = generate_speech(text)
        if sample_rate is None:
            # FIX: generate_speech signals failure as (None, error_message);
            # gr.Audio cannot render that tuple, so log it and return None.
            print(f"Manual TTS failed: {audio_data}")
            return None
        return sample_rate, audio_data

    # Event handlers
    msg.submit(
        user_message,
        [msg, chatbot],
        [msg, chatbot],
        queue=False
    ).then(
        bot_response,
        [chatbot, system_message, max_tokens, temperature, top_p, enable_tts],
        [chatbot, audio_output]
    )
    submit.click(
        user_message,
        [msg, chatbot],
        [msg, chatbot],
        queue=False
    ).then(
        bot_response,
        [chatbot, system_message, max_tokens, temperature, top_p, enable_tts],
        [chatbot, audio_output]
    )
    clear.click(lambda: ([], None), outputs=[chatbot, audio_output])
    tts_button.click(
        manual_tts,
        inputs=[tts_input],
        outputs=[audio_output]
    )

    # Add examples
    gr.Examples(
        examples=[
            ["Hello! How are you today?"],
            ["Tell me a short joke [laughs]"],
            ["Explain quantum physics in simple terms"],
            ["What's the weather like? [sighs]"]
        ],
        inputs=[msg],
        label="Example messages (try the ones with [laughs] or [sighs] for TTS effects!)"
    )

if __name__ == "__main__":
    demo.launch()