TobDeBer committed on
Commit
ee7dcb0
·
1 Parent(s): b393dd6

multiturn 1

Browse files
Files changed (1) hide show
  1. app.py +47 -140
app.py CHANGED
@@ -7,43 +7,29 @@ from threading import Thread
7
  import sys
8
  import os
9
 
10
- # Model configuration - using SmolLM2 for efficient inference
11
- # Check for command line argument for local model path
12
  if len(sys.argv) > 1 and os.path.exists(sys.argv[1]):
13
  MODEL_NAME = sys.argv[1]
14
  print(f"Using local model from: {MODEL_NAME}")
15
  else:
16
  MODEL_NAME = "HuggingFaceTB/SmolLM2-135M-Instruct"
17
 
18
- # Global variables for model components
19
  tokenizer = None
20
  model = None
21
- text_generator = None
22
 
23
  def load_model():
24
  """Load the Smol LLM model and tokenizer"""
25
- global tokenizer, model, text_generator
26
  try:
27
  print(f"Loading model: {MODEL_NAME}")
28
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
29
  model = AutoModelForCausalLM.from_pretrained(
30
  MODEL_NAME,
31
- dtype=torch.float32, # Use float32 for CPU
32
  device_map="auto"
33
  )
34
 
35
- # Create text generation pipeline (still useful for non-streaming checks if needed, but we use model.generate for streaming)
36
- text_generator = pipeline(
37
- "text-generation",
38
- model=model,
39
- tokenizer=tokenizer,
40
- max_new_tokens=512,
41
- temperature=0.7,
42
- top_p=0.95,
43
- do_sample=True
44
- )
45
-
46
- # Set pad token if not present
47
  if tokenizer.pad_token is None:
48
  tokenizer.pad_token = tokenizer.eos_token
49
 
@@ -51,36 +37,32 @@ def load_model():
51
  except Exception as e:
52
  return f"❌ Error loading model: {str(e)}"
53
 
54
- def format_prompt(prompt, system_prompt=None):
55
- """Format the prompt for chat-style models using tokenizer's template"""
56
- messages = []
57
- if system_prompt:
58
- messages.append({"role": "system", "content": system_prompt})
59
- messages.append({"role": "user", "content": prompt})
60
- return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
61
-
62
- def generate_text(
63
- prompt,
64
- max_length=200,
65
- temperature=0.7,
66
- top_p=0.95,
67
- repetition_penalty=1.1,
68
- system_prompt="You are a helpful AI assistant. Provide clear and concise answers."
69
- ):
70
- """Generate text using the loaded model with streaming"""
71
  global model, tokenizer
72
 
73
  if model is None or tokenizer is None:
74
  yield "⚠️ Please wait for the model to finish loading..."
75
  return
76
 
77
- if not prompt.strip():
78
  yield "⚠️ Please enter a prompt."
79
  return
80
 
81
  try:
 
 
 
 
 
 
 
 
 
 
 
82
  # Format the prompt
83
- formatted_prompt = format_prompt(prompt, system_prompt)
84
  inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
85
 
86
  # Setup streamer
@@ -111,25 +93,18 @@ def generate_text(
111
  for new_text in streamer:
112
  generated_text += new_text
113
  token_count += 1
114
- current_time = time.time()
115
- elapsed_time = current_time - start_time
116
-
117
- # Avoid division by zero
118
- if elapsed_time > 0:
119
- tps = token_count / elapsed_time
120
- stats = f"\n\n---\n*Generated {token_count} tokens in {elapsed_time:.2f} seconds ({tps:.2f} tokens/s)*"
121
- else:
122
- stats = "\n\n---\n*Starting generation...*"
123
-
124
- yield f"**Response:**\n{generated_text}{stats}"
125
 
126
  except Exception as e:
127
  yield f"❌ Error during generation: {str(e)}"
128
 
129
- def clear_chat():
130
- """Clear the chat interface"""
131
- return "", "*Response will appear here...*"
132
-
133
  # Create custom theme
134
  custom_theme = gr.themes.Soft(
135
  primary_hue="blue",
@@ -149,119 +124,51 @@ custom_theme = gr.themes.Soft(
149
  with gr.Blocks(theme=custom_theme) as demo:
150
  gr.Markdown(
151
  """
152
- # 🤖 Smol LLM Inference GUI
153
 
154
- Efficient text generation using SmolLM2-135M
155
-
156
- This application runs a compact language model locally for text generation.
157
- Perfect for chat, completion tasks, and creative writing.
158
  """
159
  )
160
 
161
- # Main interface
162
- with gr.Group():
163
- gr.Markdown("### 💬 Text Generation")
164
-
165
- prompt_input = gr.Textbox(
166
- label="Enter your prompt",
167
- placeholder="Type your message here...",
168
- lines=4,
169
- autofocus=True
170
- )
171
-
172
- with gr.Row():
173
- generate_btn = gr.Button(
174
- "🚀 Generate",
175
- variant="primary",
176
- size="lg"
177
- )
178
- clear_btn = gr.Button(
179
- "🗑️ Clear",
180
- variant="secondary"
181
- )
182
-
183
- output_text = gr.Markdown(
184
- label="Generated Response",
185
- value="*Response will appear here...*"
186
- )
187
-
188
- # Settings
189
- with gr.Accordion("⚙️ Settings", open=False):
190
- # Generation parameters
191
- gr.Markdown("### ⚙️ Generation Parameters")
192
-
193
- with gr.Row():
194
- max_length = gr.Slider(
195
  minimum=50,
196
  maximum=1024,
197
  value=200,
198
  step=50,
199
  label="Max Tokens"
200
- )
201
- temperature = gr.Slider(
202
  minimum=0.1,
203
  maximum=2.0,
204
  value=0.7,
205
  step=0.1,
206
  label="Temperature"
207
- )
208
-
209
- with gr.Row():
210
- top_p = gr.Slider(
211
  minimum=0.1,
212
  maximum=1.0,
213
  value=0.95,
214
  step=0.05,
215
  label="Top-p"
216
- )
217
- repetition_penalty = gr.Slider(
218
  minimum=1.0,
219
  maximum=2.0,
220
  value=1.1,
221
  step=0.1,
222
  label="Repetition Penalty"
 
 
 
 
 
223
  )
224
-
225
- system_prompt = gr.Textbox(
226
- label="System Prompt",
227
- value="You are a helpful AI assistant. Provide clear and concise answers.",
228
- lines=3,
229
- placeholder="Enter a system prompt to guide the model's behavior..."
230
- )
231
-
232
- # Event handlers
233
-
234
-
235
- generate_btn.click(
236
- fn=generate_text,
237
- inputs=[
238
- prompt_input,
239
- max_length,
240
- temperature,
241
- top_p,
242
- repetition_penalty,
243
- system_prompt
244
- ],
245
- outputs=[output_text]
246
- )
247
-
248
- clear_btn.click(
249
- fn=clear_chat,
250
- outputs=[prompt_input, output_text]
251
- )
252
-
253
- # Allow Enter key to generate
254
- prompt_input.submit(
255
- fn=generate_text,
256
- inputs=[
257
- prompt_input,
258
- max_length,
259
- temperature,
260
- top_p,
261
- repetition_penalty,
262
- system_prompt
263
  ],
264
- outputs=[output_text]
265
  )
266
 
267
  # Auto-load the model at startup
@@ -272,4 +179,4 @@ print(f"Startup load status: {load_status}")
272
  demo.launch(
273
  share=False,
274
  show_error=True
275
- )
 
7
  import sys
8
  import os
9
 
10
+ # Model configuration
 
11
  if len(sys.argv) > 1 and os.path.exists(sys.argv[1]):
12
  MODEL_NAME = sys.argv[1]
13
  print(f"Using local model from: {MODEL_NAME}")
14
  else:
15
  MODEL_NAME = "HuggingFaceTB/SmolLM2-135M-Instruct"
16
 
17
+ # Global variables
18
  tokenizer = None
19
  model = None
 
20
 
21
  def load_model():
22
  """Load the Smol LLM model and tokenizer"""
23
+ global tokenizer, model
24
  try:
25
  print(f"Loading model: {MODEL_NAME}")
26
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
27
  model = AutoModelForCausalLM.from_pretrained(
28
  MODEL_NAME,
29
+ dtype=torch.float32,
30
  device_map="auto"
31
  )
32
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  if tokenizer.pad_token is None:
34
  tokenizer.pad_token = tokenizer.eos_token
35
 
 
37
  except Exception as e:
38
  return f"❌ Error loading model: {str(e)}"
39
 
40
+ def chat_predict(message, history, max_length, temperature, top_p, repetition_penalty, system_prompt):
41
+ """Generate text using the loaded model with streaming and history"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  global model, tokenizer
43
 
44
  if model is None or tokenizer is None:
45
  yield "⚠️ Please wait for the model to finish loading..."
46
  return
47
 
48
+ if not message.strip():
49
  yield "⚠️ Please enter a prompt."
50
  return
51
 
52
  try:
53
+ # Build conversation history
54
+ messages = []
55
+ if system_prompt:
56
+ messages.append({"role": "system", "content": system_prompt})
57
+
58
+ for user_msg, assistant_msg in history:
59
+ messages.append({"role": "user", "content": user_msg})
60
+ messages.append({"role": "assistant", "content": assistant_msg})
61
+
62
+ messages.append({"role": "user", "content": message})
63
+
64
  # Format the prompt
65
+ formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
66
  inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
67
 
68
  # Setup streamer
 
93
  for new_text in streamer:
94
  generated_text += new_text
95
  token_count += 1
96
+ yield generated_text
97
+
98
+ # Append stats after generation is complete
99
+ elapsed_time = time.time() - start_time
100
+ if elapsed_time > 0:
101
+ tps = token_count / elapsed_time
102
+ stats = f"\n\n---\n*Generated {token_count} tokens in {elapsed_time:.2f}s ({tps:.2f} t/s)*"
103
+ yield generated_text + stats
 
 
 
104
 
105
  except Exception as e:
106
  yield f"❌ Error during generation: {str(e)}"
107
 
 
 
 
 
108
  # Create custom theme
109
  custom_theme = gr.themes.Soft(
110
  primary_hue="blue",
 
124
  with gr.Blocks(theme=custom_theme) as demo:
125
  gr.Markdown(
126
  """
127
+ # 🤖 Smol LLM Chat
128
 
129
+ Multi-turn chat with SmolLM2-135M.
 
 
 
130
  """
131
  )
132
 
133
+ # Chat Interface
134
+ chat_interface = gr.ChatInterface(
135
+ fn=chat_predict,
136
+ additional_inputs=[
137
+ gr.Slider(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  minimum=50,
139
  maximum=1024,
140
  value=200,
141
  step=50,
142
  label="Max Tokens"
143
+ ),
144
+ gr.Slider(
145
  minimum=0.1,
146
  maximum=2.0,
147
  value=0.7,
148
  step=0.1,
149
  label="Temperature"
150
+ ),
151
+ gr.Slider(
 
 
152
  minimum=0.1,
153
  maximum=1.0,
154
  value=0.95,
155
  step=0.05,
156
  label="Top-p"
157
+ ),
158
+ gr.Slider(
159
  minimum=1.0,
160
  maximum=2.0,
161
  value=1.1,
162
  step=0.1,
163
  label="Repetition Penalty"
164
+ ),
165
+ gr.Textbox(
166
+ label="System Prompt",
167
+ value="You are a helpful AI assistant. Provide clear and concise answers.",
168
+ lines=2
169
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  ],
171
+ additional_inputs_accordion=gr.Accordion("⚙️ Generation Parameters", open=False),
172
  )
173
 
174
  # Auto-load the model at startup
 
179
  demo.launch(
180
  share=False,
181
  show_error=True
182
+ )