File size: 12,064 Bytes
adb6707
 
563d6b1
 
adb6707
563d6b1
 
af5ca25
c79d862
 
 
af5ca25
563d6b1
7c2a0f5
6558d3e
 
 
563d6b1
6558d3e
c79d862
 
bbc2a92
c79d862
af5ca25
563d6b1
af5ca25
 
7c2a0f5
 
 
 
 
af5ca25
 
 
 
 
 
 
 
6558d3e
 
 
af5ca25
6558d3e
af5ca25
6558d3e
 
 
c79d862
 
 
6558d3e
af5ca25
c79d862
 
 
 
6558d3e
c79d862
 
 
6558d3e
af5ca25
 
c79d862
 
7c2a0f5
bbc2a92
563d6b1
c79d862
bbc2a92
adb6707
563d6b1
af5ca25
c79d862
 
563d6b1
 
af5ca25
c79d862
563d6b1
af5ca25
 
563d6b1
c79d862
563d6b1
c79d862
563d6b1
 
c79d862
563d6b1
af5ca25
 
c79d862
 
af5ca25
c79d862
6558d3e
 
af5ca25
 
 
563d6b1
af5ca25
 
563d6b1
c79d862
 
 
 
 
6558d3e
 
 
af5ca25
 
 
c79d862
af5ca25
c79d862
 
af5ca25
 
 
 
c79d862
af5ca25
563d6b1
 
af5ca25
563d6b1
 
 
 
bbc2a92
af5ca25
563d6b1
 
adb6707
af5ca25
 
 
6558d3e
af5ca25
6558d3e
c79d862
 
af5ca25
 
adb6707
af5ca25
563d6b1
af5ca25
 
6558d3e
af5ca25
6558d3e
af5ca25
 
 
 
6558d3e
af5ca25
 
6558d3e
 
 
af5ca25
 
563d6b1
af5ca25
 
adb6707
 
 
 
 
c79d862
bbc2a92
 
c79d862
 
bbc2a92
563d6b1
af5ca25
adb6707
bbc2a92
 
 
af5ca25
 
bbc2a92
 
adb6707
bbc2a92
7c2a0f5
af5ca25
bbc2a92
 
af5ca25
6558d3e
bbc2a92
 
 
 
 
 
 
 
 
563d6b1
bbc2a92
c79d862
bbc2a92
 
 
 
 
 
c79d862
af5ca25
 
 
 
c79d862
af5ca25
563d6b1
 
bbc2a92
 
 
 
 
 
 
 
 
adb6707
bbc2a92
 
 
 
 
 
 
 
 
563d6b1
bbc2a92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
563d6b1
bbc2a92
 
 
af5ca25
 
bbc2a92
 
 
 
563d6b1
 
 
bbc2a92
563d6b1
 
 
 
 
bbc2a92
563d6b1
 
adb6707
563d6b1
adb6707
 
563d6b1
 
 
c79d862
563d6b1
 
c79d862
563d6b1
c79d862
563d6b1
 
bbc2a92
 
 
c79d862
bbc2a92
 
adb6707
 
 
78f6180
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
import os
import time
from typing import Dict, Iterator, List

import gradio as gr

class ChatbotHandler:
    """Wraps a quantized OPT-6.7B causal-LM pipeline for streaming chat.

    The model is loaded eagerly in ``__init__``. If transformers/torch are
    missing or the load fails, errors are printed (not raised) so the UI can
    still start; ``model_loaded`` stays False and ``get_response`` yields a
    friendly error message instead of crashing.
    """

    def __init__(self):
        self.model_name = "facebook/opt-6.7b"  # Smaller, faster 6.7B model instead of 13B
        self.tokenizer = None
        self.model = None
        self.chat_pipeline = None
        self.max_length = 512  # Reduced for speed
        self.temperature = 0.7
        self.model_loaded = False
        self.system_prompt = """You are a helpful, friendly, and knowledgeable AI assistant. 
        You provide clear, accurate, and thoughtful responses. You are engaging and try to be 
        helpful while being honest about your limitations. Always maintain a positive and 
        supportive tone in your conversations."""

        # Load the model immediately; see initialize_model for failure handling.
        self.initialize_model()

    def initialize_model(self) -> bool:
        """Initialize the Hugging Face model with 8-bit quantization.

        Returns:
            True on success, False if dependencies are missing or loading
            fails. Failures are printed rather than raised on purpose.
        """
        try:
            from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
            import torch
        except ImportError:
            print("Transformers library not available. Please install the required dependencies.")
            return False

        try:
            print("Loading OPT-6.7B model with 8-bit quantization... This should be faster.")

            # Configure 8-bit quantization for speed
            quantization_config = BitsAndBytesConfig(
                load_in_8bit=True,
                llm_int8_enable_fp32_cpu_offload=True
            )

            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True)
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                quantization_config=quantization_config,
                device_map="auto",  # Automatically distribute across available GPUs
                torch_dtype=torch.float16,
                low_cpu_mem_usage=True
            )

            # OPT tokenizers may ship without a pad token; fall back to EOS.
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            # Create pipeline for text generation with optimized settings
            self.chat_pipeline = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
                device_map="auto",
                max_length=self.max_length,
                temperature=self.temperature,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                truncation=True,
                use_fast=True
            )
            print("Model loaded successfully!")
            self.model_loaded = True
            return True
        except Exception as e:
            print(f"Error loading model: {str(e)}")
            return False

    def get_response(self, message: str, history: List[Dict]) -> Iterator[str]:
        """Generate the assistant reply for *message*, streamed in chunks.

        This is a generator: it yields progressively longer prefixes of the
        final response so the UI can render a typing effect.

        Args:
            message: The current user message.
            history: Prior messages as {"role", "content"} dicts; only the
                last 2 are included in the prompt for speed.
        """
        # BUG FIX: this function is a generator (it yields below), so the old
        # ``return "Model not loaded..."`` put the string on StopIteration and
        # callers iterating the generator received nothing. It must be yielded.
        if not self.chat_pipeline:
            yield "Model not loaded. Please try again later."
            return

        try:
            # Prepare conversation history as a single string (limit to last 2 exchanges for speed)
            conversation = self.system_prompt + "\n"

            # Add recent history (limit to last 2 messages for speed)
            for msg in history[-2:]:
                if msg["role"] == "user":
                    conversation += f"User: {msg['content']}\n"
                elif msg["role"] == "assistant":
                    conversation += f"Assistant: {msg['content']}\n"

            # Add current message
            conversation += f"User: {message}\nAssistant:"

            # Generate response with settings tuned for latency over length.
            start_time = time.time()
            outputs = self.chat_pipeline(
                conversation,
                max_new_tokens=50,  # Shorter responses for speed
                num_return_sequences=1,
                return_full_text=False,
                do_sample=True,
                temperature=self.temperature,
                top_p=0.9,  # Add top_p for better quality
                repetition_penalty=1.1  # Reduce repetition
            )
            end_time = time.time()
            print(f"Response generated in {end_time - start_time:.2f} seconds")

            response = outputs[0]['generated_text'].strip()

            # Clean up response (strip a leaked role prefix; replace magic 10
            # with the prefix length so the two stay in sync).
            if response.startswith("Assistant:"):
                response = response[len("Assistant:"):].strip()
            elif response.startswith("User:"):
                response = "I apologize, but I seem to have gotten confused. How can I help you?"

            # Limit response length for speed
            if len(response) > 200:
                response = response[:200] + "..."

            # Stream the reply a few words at a time for a fast typing effect.
            words = response.split()
            current_response = ""
            chunk_size = 3  # Yield every 3 words for faster streaming
            for i in range(0, len(words), chunk_size):
                chunk = words[i:i + chunk_size]
                current_response += " ".join(chunk) + " "
                yield current_response.strip()
                time.sleep(0.01)  # Very short delay for smooth streaming

        except Exception as e:
            yield f"I apologize, but I encountered an error. Please try again. Error: {str(e)}"

# Initialize chatbot handler.
# NOTE: constructed at import time, so __init__ triggers the (potentially
# slow) model load as a module side effect before the UI is built below.
chat_handler = ChatbotHandler()

def respond_stream(message: str, history: List[Dict]):
    """Stream the chatbot's reply to Gradio as (textbox, history) updates.

    Yields a cleared textbox value plus the updated messages-format history
    after each streamed chunk, so the chat window fills in incrementally.

    Args:
        message: Text from the input box.
        history: Current chatbot history ({"role", "content"} dicts).
    """
    # BUG FIX: this function is a generator (it yields below), so a bare
    # ``return value`` never reaches Gradio -- the value ends up on
    # StopIteration and the UI gets no update. Early exits must yield their
    # UI state first, then return.
    if not message.strip():
        yield "", history
        return

    # Create a copy of history to avoid mutation issues
    current_history = history.copy()

    # Always add user message first to prevent disappearing chats
    current_history.append({"role": "user", "content": message})

    # Check if model is initialized
    if not chat_handler.chat_pipeline:
        current_history.append({"role": "assistant", "content": "The chatbot model is still loading. Please wait a moment and try again."})
        yield "", current_history
        return

    # Get streaming response with error handling
    full_response = ""
    assistant_added = False

    try:
        for chunk in chat_handler.get_response(message, current_history[:-1]):  # Don't include current user message in context
            full_response = chunk
            # Append the assistant message on the first chunk, then rewrite
            # its content in place for each subsequent chunk.
            if not assistant_added:
                current_history.append({"role": "assistant", "content": full_response})
                assistant_added = True
            else:
                current_history[-1]["content"] = full_response
            yield "", current_history
    except Exception:
        # If streaming fails, surface a fallback response instead of crashing.
        error_msg = "I apologize, but I encountered an error. Please try again."
        if not assistant_added:
            current_history.append({"role": "assistant", "content": error_msg})
        else:
            current_history[-1]["content"] = error_msg
        yield "", current_history

def clear_history():
    """Reset the conversation by handing Gradio a fresh, empty message list."""
    fresh_history: List[Dict] = []
    return fresh_history

def update_model_settings(temp, max_len):
    """Apply slider values to the shared ChatbotHandler and report them.

    Args:
        temp: New sampling temperature (read by get_response at generation time).
        max_len: New max_length value.
            NOTE(review): the pipeline was built with the original
            self.max_length, so this assignment may not affect generation --
            confirm whether the pipeline needs rebuilding.

    Returns:
        A human-readable status string describing the applied settings.
    """
    chat_handler.temperature = temp
    chat_handler.max_length = max_len
    # BUG FIX: the status string previously hard-coded "max_length=29,608";
    # report the value that was actually applied.
    return f"Settings updated: temp={temp}, max_length={max_len}"

# Create the interface.
# The entire UI is built at import time inside one gr.Blocks context; the
# event wiring at the bottom connects widgets to the handlers defined above.
with gr.Blocks(theme=gr.themes.Soft(), title="Fast AI Chatbot with OPT-6.7B") as demo:

    # Header
    gr.HTML("""
    <div style='text-align: center; padding: 20px;'>
        <h1>⚡ Fast AI Chatbot</h1>
        <p style='color: #666;'>Powered by OPT-6.7B with 8-bit quantization • Built with <a href='https://huggingface.co/spaces/akhaliq/anycoder' target='_blank' style='color: #007bff; text-decoration: none;'>anycoder</a></p>
    </div>
    """)

    # Status indicator -- evaluated once at build time, so it reflects the
    # load state at startup, not live loading progress.
    if chat_handler.model_loaded:
        status_msg = "✅ Chatbot is ready! Responses should take 1-3 seconds."
        status_color = "#28a745"
    else:
        status_msg = "⏳ Loading OPT-6.7B model with quantization... Should be faster than before."
        status_color = "#ffc107"

    gr.HTML(f"""
    <div style='text-align: center; padding: 10px; background-color: {status_color}15; border: 1px solid {status_color}30; border-radius: 5px; margin: 10px 0;'>
        <p style='color: {status_color}; margin: 0;'>{status_msg}</p>
    </div>
    """)

    # Model settings (collapsed by default; wired to update_model_settings below)
    with gr.Accordion("Settings", open=False):
        with gr.Row():
            temperature = gr.Slider(
                minimum=0.1,
                maximum=2.0,
                value=0.7,
                step=0.1,
                label="Temperature",
                info="Higher values make responses more creative"
            )
            max_length = gr.Slider(
                minimum=256,
                maximum=1024,
                value=512,
                step=64,
                label="Max Length",
                info="Maximum context length (lower = faster)"
            )

    # Chatbot component using the messages format ({"role", "content"} dicts),
    # matching what respond_stream produces.
    chatbot = gr.Chatbot(
        type="messages",
        label="Conversation",
        height=500,
        show_copy_button=True,
        bubble_full_width=False,
        avatar_images=(None, "https://huggingface.co/datasets/huggingface/avatars/resolve/main/bot-avatar.png")
    )

    # Input section
    with gr.Row():
        msg = gr.Textbox(
            label="Your Message",
            placeholder="Type your message here and press Enter...",
            container=False,
            scale=4
        )
        submit_btn = gr.Button("Send", variant="primary", scale=1)

    # Control buttons
    with gr.Row():
        clear_btn = gr.Button("Clear Chat", variant="secondary")
        refresh_btn = gr.Button("Refresh Settings", variant="secondary")

    # Example questions (clicking one fills the message textbox)
    with gr.Accordion("Example Questions", open=False):
        gr.Examples(
            examples=[
                "What's the difference between AI and machine learning?",
                "Can you explain quantum computing in simple terms?",
                "Help me write a professional email.",
                "What are some good books to learn programming?",
                "Can you help me brainstorm ideas for a project?",
                "Explain the concept of blockchain technology."
            ],
            inputs=msg,
            label="Click an example to start chatting"
        )

    # Footer
    gr.HTML("""
    <div style='text-align: center; padding: 10px; color: #888; font-size: 0.9em;'>
        <p>This chatbot uses Meta's OPT-6.7B model with 8-bit quantization for fast responses (1-3 seconds). It's completely free to use!</p>
        <p><strong>Speed optimizations:</strong> Smaller model, quantization, shorter responses, optimized parameters.</p>
    </div>
    """)

    # Event handlers
    # Chat functionality: respond_stream streams (textbox, history) updates,
    # clearing the textbox and growing the chat as chunks arrive.
    msg.submit(
        respond_stream,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot]
    )

    submit_btn.click(
        respond_stream,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot]
    )

    # Clear chat
    clear_btn.click(clear_history, outputs=chatbot)

    # Update model settings.
    # NOTE(review): update_model_settings returns a status string but
    # outputs=[] here, so the return value is discarded -- confirm intended.
    temperature.change(
        update_model_settings,
        inputs=[temperature, max_length],
        outputs=[]
    )
    max_length.change(
        update_model_settings,
        inputs=[temperature, max_length],
        outputs=[]
    )

    # Refresh settings (useful for debugging).
    # NOTE(review): the lambda's return value is also dropped (outputs=[]),
    # so this button has no visible effect in the UI -- confirm intended.
    refresh_btn.click(
        lambda: f"Settings: temp={chat_handler.temperature}, max_length={chat_handler.max_length}",
        outputs=[]
    )

# Launch the app
if __name__ == "__main__":
    demo.launch(share=True)  # share=True requests a public Gradio share link