import gradio as gr
import spaces
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import time
from typing import List, Tuple

# Model configuration
MODEL_PATH = "microsoft/UserLM-8b"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Global variables for model and tokenizer
model = None
tokenizer = None

def load_model():
    """Load the model and tokenizer."""
    global model, tokenizer
    
    print(f"Loading model {MODEL_PATH}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH, 
        trust_remote_code=True,
        torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
        low_cpu_mem_usage=True
    ).to(DEVICE)
    print(f"Model loaded successfully on {DEVICE}")
    return model, tokenizer
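
# Hypothetical alternative loader, not wired into the app: a minimal sketch of
# loading the same checkpoint in 8-bit with bitsandbytes to reduce GPU memory.
# Assumes `bitsandbytes` and `accelerate` are installed; the name
# `load_model_8bit` is illustrative and not part of the original app.
def load_model_8bit():
    """Sketch: load UserLM-8b in 8-bit precision, letting accelerate place layers."""
    from transformers import BitsAndBytesConfig

    tok = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
    mdl = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        trust_remote_code=True,
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        device_map="auto",  # place layers across available devices automatically
    )
    return mdl, tok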

@spaces.GPU(duration=120)
def generate_response(
    message: str,
    chat_history: List[Tuple[str, str]],
    system_prompt: str,
    temperature: float,
    top_p: float,
    max_new_tokens: int,
) -> str:
    """Generate a response from the model."""
    global model, tokenizer
    
    # Load model if not already loaded
    if model is None or tokenizer is None:
        model, tokenizer = load_model()
    
    # Build conversation history
    messages = []
    
    # Add system prompt if provided
    if system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt})
    
    # Add chat history
    for user_msg, assistant_msg in chat_history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    
    # Add current message
    messages.append({"role": "user", "content": message})
    
    # Tokenize input
    inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(DEVICE)
    
    # Special tokens: stop at the end-of-turn marker, and collect ids for the
    # end-of-conversation marker so it can be banned during generation (the
    # model should always produce another user turn in this demo).
    end_token = "<|eot_id|>"
    end_token_id = tokenizer.encode(end_token, add_special_tokens=False)
    
    end_conv_token = "<|endconversation|>"
    end_conv_token_id = tokenizer.encode(end_conv_token, add_special_tokens=False)
    
    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs,
            do_sample=True,
            top_p=top_p,
            temperature=temperature,
            max_new_tokens=max_new_tokens,
            eos_token_id=end_token_id,
            pad_token_id=tokenizer.eos_token_id,
            bad_words_ids=[[token_id] for token_id in end_conv_token_id]  # never emit <|endconversation|>
        )
    
    # Decode response
    response = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
    return response
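
# Hypothetical streaming variant, not wired into the app: respond() below
# generates the whole reply first and then replays it character by character.
# This sketch shows true token streaming with transformers' TextIteratorStreamer:
# generate() runs in a background thread while the streamer yields decoded text
# as tokens arrive. On ZeroGPU Spaces it would also need the @spaces.GPU decorator.
def generate_response_stream(messages, temperature=1.0, top_p=0.8, max_new_tokens=100):
    from threading import Thread
    from transformers import TextIteratorStreamer

    inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(DEVICE)
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    Thread(
        target=model.generate,
        kwargs=dict(
            input_ids=inputs,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            max_new_tokens=max_new_tokens,
            streamer=streamer,
        ),
    ).start()
    for text_chunk in streamer:  # blocks until the next decoded chunk is ready
        yield text_chunk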

def respond(
    message: str,
    chat_history: List[Tuple[str, str]],
    system_prompt: str,
    temperature: float,
    top_p: float,
    max_new_tokens: int,
):
    """Stream response to the chatbot."""
    # Generate complete response
    bot_message = generate_response(
        message, 
        chat_history, 
        system_prompt, 
        temperature, 
        top_p, 
        max_new_tokens
    )
    
    # Add to chat history
    chat_history.append((message, bot_message))
    
    # Stream the response character by character for better UX
    partial_message = ""
    for char in bot_message:
        partial_message += char
        time.sleep(0.01)  # Small delay for streaming effect
        yield chat_history[:-1] + [(message, partial_message)]
    
    yield chat_history
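
# Hypothetical handler, illustrative only: how a Gradio callback could consume
# generate_response_stream() above instead of replaying a finished string. It
# assumes the same (user, assistant) tuple history format used elsewhere here.
def respond_streaming(message, chat_history, system_prompt, temperature, top_p, max_new_tokens):
    messages = []
    if system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt})
    for user_msg, assistant_msg in chat_history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    partial = ""
    for chunk in generate_response_stream(messages, temperature, top_p, max_new_tokens):
        partial += chunk
        yield chat_history + [(message, partial)]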


# Create the Gradio interface
with gr.Blocks(title="UserLM-8b Chat", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # πŸ€– UserLM-8b Chat Interface
        
        Chat with Microsoft's UserLM-8b. Unlike a typical assistant model, UserLM-8b is trained to play the **user** role in a conversation, so its replies read like user turns (requests, follow-ups, clarifications) rather than assistant answers.
        
        [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
        """
    )
    
    with gr.Row():
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(
                height=500,
                show_copy_button=True,
                bubble_full_width=False,
                avatar_images=(None, "πŸ€–"),
                render_markdown=True,
            )
            
            with gr.Row():
                msg = gr.Textbox(
                    label="Message",
                    placeholder="Type your message here and press Enter...",
                    lines=2,
                    scale=4,
                    autofocus=True,
                )
                submit_btn = gr.Button("Send", variant="primary", scale=1)
            
            with gr.Row():
                clear_btn = gr.ClearButton(
                    [chatbot, msg], 
                    value="πŸ—‘οΈ Clear Chat"
                )
                retry_btn = gr.Button("πŸ”„ Retry Last")
                undo_btn = gr.Button("↩️ Undo Last")
        
        with gr.Column(scale=1):
            gr.Markdown("### βš™οΈ Settings")
            
            system_prompt = gr.Textbox(
                label="System Prompt",
                placeholder="Set the behavior of the model...",
                value="You are a user who wants to implement a special type of sequence. The sequence sums up the two previous numbers in the sequence and adds 1 to the result. The first two numbers in the sequence are 1 and 1.",
                lines=4,
            )
            
            temperature = gr.Slider(
                minimum=0.1,
                maximum=2.0,
                value=1.0,
                step=0.1,
                label="Temperature",
                info="Higher values make output more random"
            )
            
            top_p = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.8,
                step=0.05,
                label="Top-p (nucleus sampling)",
                info="Lower values focus on more likely tokens"
            )
            
            max_new_tokens = gr.Slider(
                minimum=10,
                maximum=512,
                value=100,
                step=10,
                label="Max New Tokens",
                info="Maximum number of tokens to generate"
            )
            
            gr.Markdown(
                """
                ### πŸ“Š Model Info
                - **Model**: microsoft/UserLM-8b
                - **Parameters**: 8 billion
                - **Device**: """ + DEVICE.upper() + """
                - **Precision**: FP16 (CUDA) / FP32 (CPU)
                """
            )
    
    
    # Event handlers
    def user_submit(message, history):
        return "", history + [(message, None)]
    
    def bot_respond(history, system, temp, top_p, max_tokens):
        # This is a generator: yield the unchanged history instead of returning
        # it, otherwise Gradio never receives an update for the pending turn.
        if not history or history[-1][1] is not None:
            yield history
            return
        
        message = history[-1][0]
        history_without_last = history[:-1]
        
        for new_history in respond(message, history_without_last, system, temp, top_p, max_tokens):
            yield new_history
    
    def retry_last(history, system, temp, top_p, max_tokens):
        if not history:
            yield history  # generator: yield, don't return, the unchanged history
            return
        
        # Remove last exchange and regenerate
        last_user_msg = history[-1][0]
        history = history[:-1]
        
        for new_history in respond(last_user_msg, history, system, temp, top_p, max_tokens):
            yield new_history
    
    def undo_last(history):
        if history:
            return history[:-1]
        return history
    
    # Connect events
    msg.submit(
        user_submit, 
        [msg, chatbot], 
        [msg, chatbot],
        queue=False
    ).then(
        bot_respond,
        [chatbot, system_prompt, temperature, top_p, max_new_tokens],
        chatbot
    )
    
    submit_btn.click(
        user_submit,
        [msg, chatbot],
        [msg, chatbot],
        queue=False
    ).then(
        bot_respond,
        [chatbot, system_prompt, temperature, top_p, max_new_tokens],
        chatbot
    )
    
    retry_btn.click(
        retry_last,
        [chatbot, system_prompt, temperature, top_p, max_new_tokens],
        chatbot
    )
    
    undo_btn.click(
        undo_last,
        chatbot,
        chatbot
    )
    
    # Show a startup notice; the model itself loads lazily on the first call
    # to generate_response(), not here.
    demo.load(
        fn=lambda: gr.Info("The model loads on the first message and may take a moment."),
        inputs=None,
        outputs=None
    )
    
    # Examples
    gr.Examples(
        examples=[
            ["Can you help me understand how this sequence works?"],
            ["What would be the next 5 numbers in the sequence?"],
            ["Let's implement this sequence in Python together."],
            ["Can you explain the pattern: 1, 1, 3, 5, 9, 15...?"],
        ],
        inputs=msg,
        label="Example Messages",
    )

if __name__ == "__main__":
    demo.launch(
        share=False,
        show_error=True,
        server_name="0.0.0.0",
        server_port=7860,
    )