File size: 7,630 Bytes
b94a974
 
ba4903d
 
b94a974
 
 
 
 
 
ba4903d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b94a974
 
 
 
 
 
 
 
 
ba4903d
b94a974
 
 
 
 
 
 
ba4903d
b94a974
 
 
 
 
 
 
 
 
ba4903d
 
 
b94a974
ba4903d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b94a974
ba4903d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b94a974
 
ba4903d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
import gradio as gr
from huggingface_hub import InferenceClient
from transformers import pipeline
import numpy as np

"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")

# Bark TTS is optional: start from the "unavailable" state and only flip the
# flag once the pipeline has actually been constructed.
synthesizer = None
tts_available = False
try:
    synthesizer = pipeline("text-to-speech", "suno/bark")
    tts_available = True
except Exception as e:
    # Keep the app usable as a text-only chat bot when the model cannot load.
    print(f"TTS model failed to load: {e}")

def generate_speech(text):
    """Synthesize `text` with the module-level Bark pipeline.

    Returns a 2-tuple. On success it is ``(sampling_rate, audio)``, where
    ``audio`` is the pipeline's audio output flattened to one dimension.
    On failure the first element is ``None`` and the second is a
    human-readable error message.
    """
    if tts_available and synthesizer:
        try:
            result = synthesizer(text, forward_params={"do_sample": True})
            # Flatten into the (rate, samples) pair the Gradio audio output is fed.
            waveform = result["audio"].flatten()
            return result["sampling_rate"], waveform
        except Exception as e:
            return None, f"TTS Error: {str(e)}"

    return None, "TTS not available"

def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream a chat completion for `message` given the prior conversation.

    Args:
        message: The new user message to answer.
        history: Earlier turns as ``(user_text, assistant_text)`` pairs.
            Empty/None halves of a pair are skipped.
        system_message: Content of the leading system message.
        max_tokens: Forwarded to ``client.chat_completion``.
        temperature: Forwarded to ``client.chat_completion``.
        top_p: Forwarded to ``client.chat_completion``.

    Yields:
        The accumulated response text after each non-empty streamed token,
        so every yielded value is a growing prefix of the final answer.
    """
    messages = [{"role": "system", "content": system_message}]
    for user_text, assistant_text in history:
        if user_text:
            messages.append({"role": "user", "content": user_text})
        if assistant_text:
            messages.append({"role": "assistant", "content": assistant_text})
    messages.append({"role": "user", "content": message})

    response = ""
    # The loop variable is deliberately not called `message`: reusing that
    # name would shadow the parameter holding the user's text.
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        # Skip chunks that carry no choices instead of failing on `[0]`.
        if not chunk.choices:
            continue
        token = chunk.choices[0].delta.content
        if token:
            response += token
            yield response

def respond_with_audio(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    enable_tts
):
    """Stream the text reply, then emit one final item with optional audio.

    Every yielded item is a ``(text, audio)`` pair. While the reply is
    streaming, ``audio`` is always ``None``. After the stream ends exactly one
    more pair is yielded: the full text together with ``(sample_rate, data)``
    when speech was produced, or ``None`` otherwise.
    """
    final_response = ""
    for partial in respond(message, history, system_message, max_tokens, temperature, top_p):
        final_response = partial
        yield partial, None  # text only; audio is attached to the last item

    # Nothing to speak: TTS is off, unavailable, or the reply is blank.
    if not (enable_tts and tts_available and final_response.strip()):
        yield final_response, None
        return

    try:
        # Drop markdown markers (*, #, `) in a single pass before synthesis.
        speakable = final_response.translate(str.maketrans("", "", "*#`"))
        # Limit length for TTS (Bark works best with shorter texts)
        if len(speakable) > 500:
            speakable = speakable[:500] + "..."

        sample_rate, audio_data = generate_speech(speakable)
        audio = (sample_rate, audio_data) if sample_rate else None
        yield final_response, audio
    except Exception as e:
        print(f"TTS generation failed: {e}")
        yield final_response, None

# Create the main chat interface with TTS option
with gr.Blocks(title="Chat + TTS Bot") as demo:
    # Page header: title and a one-line description of the app.
    gr.Markdown("# 🤖 Chat Bot with Text-to-Speech")
    gr.Markdown("Chat with Zephyr-7B and optionally hear responses with Bark TTS")
    
    # Main area: one row holding two columns. The first (scale=2) is the chat,
    # the second (scale=1) holds the TTS controls.
    with gr.Row():
        with gr.Column(scale=2):
            # Conversation display and the message input box.
            chatbot = gr.Chatbot(height=400)
            msg = gr.Textbox(
                placeholder="Type your message here...",
                label="Message",
                lines=2
            )
            
            # Action buttons shown side by side under the input box.
            with gr.Row():
                submit = gr.Button("💬 Send", variant="primary")
                clear = gr.Button("🗑️ Clear")
        
        with gr.Column(scale=1):
            # TTS Controls
            gr.Markdown("### 🔊 Text-to-Speech")
            # Off by default; its value is passed to the chat handler as the
            # TTS on/off flag.
            enable_tts = gr.Checkbox(
                label="Enable TTS for responses",
                value=False,
                info="Generate audio for bot responses"
            )
            
            # Single audio player; both the chat handler and the manual TTS
            # button write their result into this component.
            audio_output = gr.Audio(
                label="Response Audio",
                autoplay=False,
                visible=True
            )
            
            # Manual TTS
            # Free-form text box plus a button to synthesize it on demand.
            gr.Markdown("### 🎤 Manual TTS")
            tts_input = gr.Textbox(
                placeholder="Enter text to convert to speech...",
                label="Text for TTS",
                lines=2
            )
            tts_button = gr.Button("🗣️ Generate Speech")
    
    # Chat Settings (Collapsible)
    # Collapsed by default (open=False). These values are passed as inputs to
    # the chat handler on every message.
    with gr.Accordion("⚙️ Chat Settings", open=False):
        system_message = gr.Textbox(
            value="You are a friendly and helpful AI assistant.",
            label="System Message",
            lines=2
        )
        # Generation parameters, laid out in one row.
        with gr.Row():
            max_tokens = gr.Slider(
                minimum=1,
                maximum=2048,
                value=512,
                step=1,
                label="Max tokens"
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=4.0,
                value=0.7,
                step=0.1,
                label="Temperature"
            )
            top_p = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.95,
                step=0.05,
                label="Top-p"
            )
    
    # State for chat history
    # NOTE(review): no event handler in this file reads or writes this state;
    # the handlers pass the `chatbot` component's value as the history instead.
    chat_history = gr.State([])
    
    def user_message(message, history):
        """Queue the user's text as a new, not-yet-answered chat turn.

        Returns the new textbox value (an empty string, which clears the
        input) and a new history list with ``[message, None]`` appended.
        The list passed in is not modified.
        """
        pending_turn = [message, None]
        updated_history = history + [pending_turn]
        cleared_textbox = ""
        return cleared_textbox, updated_history
    
    def bot_response(history, system_msg, max_tok, temp, top_p, tts_enabled):
        """Stream the assistant's reply into the last chat turn.

        Args:
            history: Chat turns as ``[user_text, bot_text]`` lists; the last
                entry is the pending turn whose reply is filled in here.
            system_msg: System prompt forwarded to the chat model.
            max_tok: Max tokens forwarded to the chat model.
            temp: Sampling temperature forwarded to the chat model.
            top_p: Nucleus-sampling value forwarded to the chat model.
            tts_enabled: Whether audio should be generated for the reply.

        Yields:
            ``(history, audio)`` pairs. ``history`` is the same list object,
            updated in place with the reply so far. ``audio`` is whatever
            ``respond_with_audio`` yields alongside the text.
        """
        # Nothing to answer (no turns, or the pending turn has no user text).
        # This function is a generator, so a `return history, None` would be
        # silently discarded; the unchanged state has to be yielded.
        if not history or not history[-1][0]:
            yield history, None
            return

        user_msg = history[-1][0]

        # Earlier turns are the context; the pending turn is passed as the message.
        for response, audio in respond_with_audio(
            user_msg,
            history[:-1],
            system_msg,
            max_tok,
            temp,
            top_p,
            tts_enabled,
        ):
            history[-1][1] = response
            yield history, audio
    
    def manual_tts(text):
        """Synthesize speech for text typed into the manual TTS box.

        Args:
            text: Text to speak. ``None``, empty and whitespace-only values
                are treated as "nothing to do".

        Returns:
            ``(sample_rate, audio_data)`` on success, otherwise ``None`` so
            the audio output is simply left empty.
        """
        if not text or not text.strip():
            return None

        # On failure generate_speech returns (None, "<error message>"). That
        # pair must not be handed to the audio component as if it were
        # (sample_rate, samples), so unpack it and check first.
        sample_rate, audio_or_error = generate_speech(text)
        if not sample_rate:
            print(f"Manual TTS failed: {audio_or_error}")
            return None
        return sample_rate, audio_or_error
    
    # Event wiring
    # Pressing Enter in the textbox and clicking Send run the same two-step
    # chain: add the user's turn to the chat (unqueued), then stream the
    # bot reply and optional audio.
    for trigger in (msg.submit, submit.click):
        trigger(
            user_message,
            [msg, chatbot],
            [msg, chatbot],
            queue=False,
        ).then(
            bot_response,
            [chatbot, system_message, max_tokens, temperature, top_p, enable_tts],
            [chatbot, audio_output],
        )

    # Reset both the conversation and the audio player.
    clear.click(lambda: ([], None), outputs=[chatbot, audio_output])

    # Manual TTS writes into the same audio player as chat replies.
    tts_button.click(manual_tts, inputs=[tts_input], outputs=[audio_output])

    # Clickable sample prompts that fill the message box.
    example_prompts = [
        ["Hello! How are you today?"],
        ["Tell me a short joke [laughs]"],
        ["Explain quantum physics in simple terms"],
        ["What's the weather like? [sighs]"],
    ]
    gr.Examples(
        examples=example_prompts,
        inputs=[msg],
        label="Example messages (try the ones with [laughs] or [sighs] for TTS effects!)",
    )

# Start the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()