# Source: Hugging Face Space "latterworks/bark" (status: Sleeping; file size: 7,630 bytes)

import gradio as gr
from huggingface_hub import InferenceClient
from transformers import pipeline
import numpy as np

"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")

# Initialize Bark TTS model
# Begin in the "unavailable" state so both globals are defined however the
# load attempt ends; only a successful load flips the flag.
synthesizer = None
tts_available = False
try:
    synthesizer = pipeline("text-to-speech", "suno/bark")
except Exception as e:
    print(f"TTS model failed to load: {e}")
else:
    tts_available = True

def generate_speech(text):
    """Synthesize ``text`` with the module-level Bark pipeline.

    Always returns a 2-tuple:

    * on success, ``(sampling_rate, samples)`` where ``samples`` is the
      pipeline's ``"audio"`` array flattened to one dimension;
    * on failure, ``(None, reason)`` where ``reason`` is a human-readable
      string. Callers must check the first element before using the second.
    """
    if not (tts_available and synthesizer):
        return None, "TTS not available"

    try:
        result = synthesizer(text, forward_params={"do_sample": True})
        # Gradio's audio component takes a (rate, 1-D samples) pair.
        samples = result["audio"].flatten()
        return result["sampling_rate"], samples
    except Exception as err:
        return None, f"TTS Error: {err!s}"

def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream a chat completion for ``message``.

    Builds the conversation from ``system_message``, the prior
    ``(user, assistant)`` pairs in ``history`` (empty/None halves are
    skipped) and the new ``message``, then streams the model's reply.

    Yields:
        The reply accumulated so far, once per received non-empty token, so
        the last yielded value is the complete reply. Nothing is yielded if
        the model produces no tokens.
    """
    messages = [{"role": "system", "content": system_message}]
    for turn in history:
        user_turn, assistant_turn = turn[0], turn[1]
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": message})

    response = ""
    # The loop variable is deliberately NOT called `message`, so the
    # function's `message` parameter is not rebound while streaming.
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        # Defensive: skip chunks that carry no choices instead of raising
        # IndexError in the middle of a stream.
        if not chunk.choices:
            continue
        token = chunk.choices[0].delta.content
        if token:
            response += token
            yield response

def respond_with_audio(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    enable_tts
):
    """Stream a chat reply, then optionally attach synthesized speech.

    Yields ``(text, audio)`` pairs. While the reply is streaming, ``audio``
    is always ``None``. One final pair is always yielded after streaming
    ends: its ``audio`` is ``(sample_rate, samples)`` when ``enable_tts`` is
    set, TTS is available, the reply is non-blank and synthesis succeeded;
    otherwise it is ``None``. TTS failures are logged, never raised.
    """
    # Get text response
    final_response = ""
    for response in respond(message, history, system_message, max_tokens, temperature, top_p):
        final_response = response
        yield response, None  # Yield text first, audio comes later

    audio = None
    # Generate audio if TTS is enabled
    if enable_tts and tts_available and final_response.strip():
        try:
            # Clean response for TTS (remove markdown, keep essential punctuation)
            clean_text = final_response.translate(str.maketrans("", "", "*#`"))
            # Limit length for TTS (Bark works best with shorter texts)
            if len(clean_text) > 500:
                clean_text = clean_text[:500] + "..."

            sample_rate, audio_data = generate_speech(clean_text)
            if sample_rate:
                audio = (sample_rate, audio_data)
            else:
                # On failure generate_speech returns (None, <reason>); log the
                # reason instead of dropping it silently.
                print(f"TTS generation failed: {audio_data}")
        except Exception as e:
            print(f"TTS generation failed: {e}")

    yield final_response, audio

# Create the main chat interface with TTS option
# Build the UI: a row with two columns (first: chat widgets, second: TTS
# widgets), followed by an accordion with the generation settings. The
# components created here are referenced by the handlers and event wiring
# further down in this same `with` block.
with gr.Blocks(title="Chat + TTS Bot") as demo:
    gr.Markdown("# 🤖 Chat Bot with Text-to-Speech")
    gr.Markdown("Chat with Zephyr-7B and optionally hear responses with Bark TTS")
    
    with gr.Row():
        # First column (scale=2): conversation view, input box and buttons.
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(height=400)
            msg = gr.Textbox(
                placeholder="Type your message here...",
                label="Message",
                lines=2
            )
            
            with gr.Row():
                submit = gr.Button("💬 Send", variant="primary")
                clear = gr.Button("🗑️ Clear")
        
        # Second column (scale=1): TTS toggle, shared audio player, manual TTS.
        with gr.Column(scale=1):
            # TTS Controls
            gr.Markdown("### 🔊 Text-to-Speech")
            # Off by default; its value is passed to the bot-response handler.
            enable_tts = gr.Checkbox(
                label="Enable TTS for responses",
                value=False,
                info="Generate audio for bot responses"
            )
            
            # Single audio player: both the chat handler and the manual TTS
            # button write their output here.
            audio_output = gr.Audio(
                label="Response Audio",
                autoplay=False,
                visible=True
            )
            
            # Manual TTS
            gr.Markdown("### 🎤 Manual TTS")
            tts_input = gr.Textbox(
                placeholder="Enter text to convert to speech...",
                label="Text for TTS",
                lines=2
            )
            tts_button = gr.Button("🗣️ Generate Speech")
    
    # Chat Settings (Collapsible)
    # Starts collapsed (open=False); these values are read on every request.
    with gr.Accordion("⚙️ Chat Settings", open=False):
        system_message = gr.Textbox(
            value="You are a friendly and helpful AI assistant.",
            label="System Message",
            lines=2
        )
        with gr.Row():
            max_tokens = gr.Slider(
                minimum=1,
                maximum=2048,
                value=512,
                step=1,
                label="Max tokens"
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=4.0,
                value=0.7,
                step=0.1,
                label="Temperature"
            )
            top_p = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.95,
                step=0.05,
                label="Top-p"
            )
    
    # State for chat history
    # NOTE(review): no handler visible in this file reads or writes
    # `chat_history`; the handlers pass the `chatbot` component's value
    # around instead. Confirm whether this state is still needed.
    chat_history = gr.State([])
    
    def user_message(message, history):
        """Append the user's turn to the chat and clear the input box.

        Returns ``("", new_history)``: the empty string resets the textbox,
        and ``new_history`` is a new list ending in ``[message, None]`` (the
        ``None`` is the reply slot, filled in later). ``history`` itself is
        not modified.
        """
        pending_turn = [message, None]
        updated_history = history + [pending_turn]
        return "", updated_history
    
    def bot_response(history, system_msg, max_tok, temp, top_p, tts_enabled):
        """Stream the bot's reply to the last user turn in ``history``.

        This is a generator. Each yielded item is ``(history, audio)``:
        ``history`` with the last turn's reply slot updated to the text
        received so far, and ``audio`` as produced by ``respond_with_audio``
        (``None`` while text is streaming).

        If ``history`` is empty or the last user message is empty, a single
        ``(history, None)`` is yielded and no model call is made.
        """
        if not history or not history[-1][0]:
            # In a generator, `return value` is discarded by the consumer, so
            # the "nothing to do" result must be yielded explicitly.
            yield history, None
            return

        user_msg = history[-1][0]

        # Generate response; earlier turns (everything but the pending one)
        # are passed as conversation context.
        for response, audio in respond_with_audio(
            user_msg,
            history[:-1],
            system_msg,
            max_tok,
            temp,
            top_p,
            tts_enabled
        ):
            history[-1][1] = response
            yield history, audio
    
    def manual_tts(text):
        """Synthesize speech for text typed into the manual TTS box.

        Returns ``None`` (no audio) for missing or blank input, otherwise the
        ``(sample_rate, samples)`` pair for the audio component.

        Raises:
            gr.Error: if synthesis failed. ``generate_speech`` signals failure
                as ``(None, reason)``; that pair is not valid audio and must
                not be handed to the audio component, so the reason is shown
                to the user instead.
        """
        if not text or not text.strip():
            return None

        sample_rate, payload = generate_speech(text)
        if not sample_rate:
            raise gr.Error(str(payload))
        return sample_rate, payload
    
    # Event handlers
    # Pressing Enter in the textbox and clicking "Send" run the same two-step
    # chain: first record the user's turn (unqueued), then produce the bot
    # reply. Registration order matches the original: submit, then click.
    for send_trigger in (msg.submit, submit.click):
        send_trigger(
            user_message,
            [msg, chatbot],
            [msg, chatbot],
            queue=False,
        ).then(
            bot_response,
            [chatbot, system_message, max_tokens, temperature, top_p, enable_tts],
            [chatbot, audio_output],
        )

    # Reset both the conversation and the audio player.
    clear.click(lambda: ([], None), outputs=[chatbot, audio_output])

    # Manual TTS writes to the same audio player as the chat responses.
    tts_button.click(manual_tts, inputs=[tts_input], outputs=[audio_output])

    # Add examples
    example_prompts = [
        ["Hello! How are you today?"],
        ["Tell me a short joke [laughs]"],
        ["Explain quantum physics in simple terms"],
        ["What's the weather like? [sighs]"],
    ]
    gr.Examples(
        examples=example_prompts,
        inputs=[msg],
        label="Example messages (try the ones with [laughs] or [sighs] for TTS effects!)",
    )

# Start the Gradio server only when this file is executed as a script;
# importing the module builds `demo` but does not launch it.
if __name__ == "__main__":
    demo.launch()