"""Gradio chat app: Zephyr-7B chat (streaming) with optional Bark text-to-speech.

For more information on `huggingface_hub` Inference API support, please check the docs:
https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""

import gradio as gr
from huggingface_hub import InferenceClient
from transformers import pipeline
import numpy as np

client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")

# Initialize Bark TTS model. Loading can fail (no weights, no memory, etc.),
# so degrade gracefully: the chat still works, only audio is disabled.
try:
    synthesizer = pipeline("text-to-speech", "suno/bark")
    tts_available = True
except Exception as e:
    print(f"TTS model failed to load: {e}")
    tts_available = False
    synthesizer = None


def generate_speech(text):
    """Generate speech from text using Bark TTS.

    Returns:
        (sample_rate, audio_data) on success, or (None, error_message) on
        failure — callers must check that the first element is not None.
    """
    if not tts_available or not synthesizer:
        return None, "TTS not available"
    try:
        speech = synthesizer(text, forward_params={"do_sample": True})
        # Convert to the (sample_rate, 1-D ndarray) format Gradio expects.
        audio_data = speech["audio"].flatten()
        sample_rate = speech["sampling_rate"]
        return sample_rate, audio_data
    except Exception as e:
        return None, f"TTS Error: {str(e)}"


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream a chat response from the model, yielding the growing text.

    Args:
        message: the new user message.
        history: prior (user, assistant) turn pairs.
        system_message: system prompt prepended to the conversation.
        max_tokens / temperature / top_p: generation parameters.
    """
    messages = [{"role": "system", "content": system_message}]

    for user_turn, assistant_turn in history:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})

    messages.append({"role": "user", "content": message})

    response = ""
    # NOTE: the loop variable must not shadow the `message` parameter
    # (the original code reused the name `message` here).
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content
        if token:
            response += token
            yield response


def respond_with_audio(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    enable_tts,
):
    """Generate chat response and optionally convert it to speech.

    Yields (text, audio) pairs: text streams first with audio=None, then a
    final pair carries the audio (or None if TTS is off/unavailable/failed).
    """
    # Stream the text response first; audio comes later.
    final_response = ""
    for response in respond(message, history, system_message, max_tokens, temperature, top_p):
        final_response = response
        yield response, None

    # Generate audio if TTS is enabled.
    if enable_tts and tts_available and final_response.strip():
        try:
            # Clean response for TTS (remove markdown, keep essential punctuation).
            clean_text = final_response.replace("*", "").replace("#", "").replace("`", "")

            # Limit length for TTS (Bark works best with shorter texts).
            if len(clean_text) > 500:
                clean_text = clean_text[:500] + "..."

            sample_rate, audio_data = generate_speech(clean_text)
            if sample_rate:
                yield final_response, (sample_rate, audio_data)
            else:
                yield final_response, None
        except Exception as e:
            print(f"TTS generation failed: {e}")
            yield final_response, None
    else:
        yield final_response, None


# Create the main chat interface with TTS option
with gr.Blocks(title="Chat + TTS Bot") as demo:
    gr.Markdown("# 🤖 Chat Bot with Text-to-Speech")
    gr.Markdown("Chat with Zephyr-7B and optionally hear responses with Bark TTS")

    with gr.Row():
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(height=400)
            msg = gr.Textbox(
                placeholder="Type your message here...",
                label="Message",
                lines=2,
            )
            with gr.Row():
                submit = gr.Button("💬 Send", variant="primary")
                clear = gr.Button("🗑️ Clear")

        with gr.Column(scale=1):
            # TTS Controls
            gr.Markdown("### 🔊 Text-to-Speech")
            enable_tts = gr.Checkbox(
                label="Enable TTS for responses",
                value=False,
                info="Generate audio for bot responses",
            )
            audio_output = gr.Audio(
                label="Response Audio",
                autoplay=False,
                visible=True,
            )

            # Manual TTS
            gr.Markdown("### 🎤 Manual TTS")
            tts_input = gr.Textbox(
                placeholder="Enter text to convert to speech...",
                label="Text for TTS",
                lines=2,
            )
            tts_button = gr.Button("🗣️ Generate Speech")

    # Chat Settings (Collapsible)
    with gr.Accordion("⚙️ Chat Settings", open=False):
        system_message = gr.Textbox(
            value="You are a friendly and helpful AI assistant.",
            label="System Message",
            lines=2,
        )
        with gr.Row():
            max_tokens = gr.Slider(
                minimum=1, maximum=2048, value=512, step=1, label="Max tokens"
            )
            temperature = gr.Slider(
                minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"
            )
            top_p = gr.Slider(
                minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"
            )

    # State for chat history (currently unused by the handlers, which read
    # the Chatbot component directly; kept for interface compatibility).
    chat_history = gr.State([])

    def user_message(message, history):
        """Add user message to chat and clear the input box."""
        return "", history + [[message, None]]

    def bot_response(history, system_msg, max_tok, temp, top_p, tts_enabled):
        """Generate bot response with optional TTS.

        This is a generator consumed by Gradio; every code path must yield
        (a bare `return value` in a generator would emit nothing).
        """
        if not history or not history[-1][0]:
            # FIX: the original used `return history, None`, which in a
            # generator raises StopIteration without yielding any update.
            yield history, None
            return

        user_msg = history[-1][0]

        # Stream the response into the last history slot.
        for response, audio in respond_with_audio(
            user_msg, history[:-1], system_msg, max_tok, temp, top_p, tts_enabled
        ):
            history[-1][1] = response
            yield history, audio

    def manual_tts(text):
        """Generate TTS for manually entered text; None if empty or failed."""
        if not text.strip():
            return None
        sample_rate, audio_data = generate_speech(text)
        # FIX: on failure generate_speech returns (None, error_string),
        # which is not a valid gr.Audio value — return None instead.
        if sample_rate is None:
            return None
        return sample_rate, audio_data

    # Event handlers
    msg.submit(
        user_message,
        [msg, chatbot],
        [msg, chatbot],
        queue=False,
    ).then(
        bot_response,
        [chatbot, system_message, max_tokens, temperature, top_p, enable_tts],
        [chatbot, audio_output],
    )

    submit.click(
        user_message,
        [msg, chatbot],
        [msg, chatbot],
        queue=False,
    ).then(
        bot_response,
        [chatbot, system_message, max_tokens, temperature, top_p, enable_tts],
        [chatbot, audio_output],
    )

    clear.click(lambda: ([], None), outputs=[chatbot, audio_output])

    tts_button.click(
        manual_tts,
        inputs=[tts_input],
        outputs=[audio_output],
    )

    # Add examples
    gr.Examples(
        examples=[
            ["Hello! How are you today?"],
            ["Tell me a short joke [laughs]"],
            ["Explain quantum physics in simple terms"],
            ["What's the weather like? [sighs]"],
        ],
        inputs=[msg],
        label="Example messages (try the ones with [laughs] or [sighs] for TTS effects!)",
    )


if __name__ == "__main__":
    demo.launch()