NeoPy committed
Commit 32d5c15 · verified · 1 Parent(s): 25e5623

Upload folder using huggingface_hub

Files changed (2)
  1. app.py +120 -96
  2. requirements.txt +6 -6
app.py CHANGED
@@ -1,12 +1,33 @@
 
  import gradio as gr
- from huggingface_hub import InferenceClient
- import os
- 
- # Initialize the client
- client = InferenceClient(
-     model="deepseek-ai/DeepSeek-R1-0528-Qwen3-8B",
-     token=os.getenv("HF_TOKEN")
- )
  # Default system prompts
  SYSTEM_PROMPTS = {
@@ -18,19 +39,27 @@ SYSTEM_PROMPTS = {
      "Custom": ""
  }

- def format_thinking(content):
-     """Format thinking tags for display"""
-     if "<think>" in content:
-         parts = content.split("<think>")
-         formatted = parts[0]
-         for part in parts[1:]:
-             if "</think>" in part:
-                 think_content, rest = part.split("</think>", 1)
-                 formatted += f"\n\n<details><summary>💭 Thinking Process</summary>\n\n{think_content.strip()}\n\n</details>\n\n{rest}"
-             else:
-                 formatted += part
-         return formatted
-     return content

- def chat(message, history, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p, show_thinking):
      """Main chat function with streaming support"""

      # Determine system prompt
      if system_prompt_choice == "Custom":
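
For illustration, a minimal sketch of how the removed format_thinking helper folds a reasoning block into a collapsible section (hypothetical input string; output shown as comments):

    sample = "<think>3x + 7 = 22, so 3x = 15 and x = 5.</think>The answer is x = 5."
    print(format_thinking(sample))
    # <details><summary>💭 Thinking Process</summary>
    #
    # 3x + 7 = 22, so 3x = 15 and x = 5.
    #
    # </details>
    #
    # The answer is x = 5.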
@@ -38,54 +67,53 @@ def chat(message, history, system_prompt_choice, custom_system_prompt, temperatu
      else:
          system_content = SYSTEM_PROMPTS.get(system_prompt_choice, SYSTEM_PROMPTS["Default Assistant"])

-     # Build messages
-     messages = [{"role": "system", "content": system_content}]
- 
-     # Add history
      for msg in history:
-         if msg["role"] == "user":
-             messages.append({"role": "user", "content": msg["content"]})
-         elif msg["role"] == "assistant":
-             # Clean up thinking tags from history
-             content = msg["content"]
-             if "<details>" in content:
-                 # Remove the formatted thinking for API calls
-                 import re
-                 content = re.sub(r'<details>.*?</details>', '', content, flags=re.DOTALL)
-             messages.append({"role": "assistant", "content": content.strip()})

-     # Add current message
      messages.append({"role": "user", "content": message})

      try:
          response = ""
-         stream = client.chat_completion(
-             messages=messages,
-             max_tokens=max_tokens,
-             temperature=temperature,
-             top_p=top_p,
-             stream=True
-         )

-         for chunk in stream:
-             if chunk.choices[0].delta.content:
-                 response += chunk.choices[0].delta.content
-                 # Format thinking if enabled
-                 if show_thinking:
-                     yield format_thinking(response)
-                 else:
-                     # Hide thinking content
-                     display_response = response
-                     if "</think>" in display_response:
-                         import re
-                         display_response = re.sub(r'<think>.*?</think>', '', display_response, flags=re.DOTALL)
-                     else:
-                         # Still thinking, show placeholder
-                         display_response = "🤔 *Thinking...*"
-                     yield display_response.strip()
- 
      except Exception as e:
-         yield f"❌ Error: {str(e)}\n\nPlease check your HF_TOKEN and try again."

  def clear_chat():
      """Clear the chat history"""
@@ -129,44 +157,46 @@ css = """
  .header-container a:hover {
      text-decoration: underline;
  }
- .parameter-box {
      background: var(--background-fill-secondary);
-     padding: 15px;
      border-radius: 8px;
-     margin-top: 10px;
  }
  .chatbot-container {
      min-height: 500px;
  }
- footer {
-     text-align: center;
-     margin-top: 20px;
-     padding: 10px;
-     color: var(--body-text-color-subdued);
- }
  """

  # Build the interface
  with gr.Blocks(
-     title="DeepSeek R1 Chatbot",
      theme=gr.themes.Soft(),
      css=css,
      fill_height=True,
      footer_links=[
          {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"},
-         {"label": "Model", "url": "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528-Qwen3-8B"}
      ]
  ) as demo:

      # Header
      gr.HTML("""
      <div class="header-container">
-         <h1>🧠 DeepSeek R1 Chatbot</h1>
-         <p>Powered by DeepSeek-R1-0528-Qwen3-8B with reasoning capabilities</p>
          <p><a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank">Built with anycoder</a></p>
      </div>
      """)

      with gr.Row():
          # Main chat column
          with gr.Column(scale=3):
@@ -175,7 +205,6 @@ with gr.Blocks(
              height=500,
              type="messages",
              show_copy_button=True,
-             avatar_images=(None, "https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo.svg"),
              render_markdown=True,
              elem_classes=["chatbot-container"]
          )
@@ -226,12 +255,12 @@ with gr.Blocks(
          )

          max_tokens = gr.Slider(
-             minimum=64,
-             maximum=4096,
-             value=1024,
-             step=64,
              label="Max Tokens",
-             info="Maximum response length"
          )

          top_p = gr.Slider(
@@ -243,13 +272,6 @@ with gr.Blocks(
              info="Nucleus sampling parameter"
          )

-         with gr.Accordion("Display Options", open=False):
-             show_thinking = gr.Checkbox(
-                 value=True,
-                 label="Show Thinking Process",
-                 info="Display the model's reasoning steps"
-             )
- 
          # Export output
          export_output = gr.Textbox(
              label="Exported Chat",
@@ -262,11 +284,11 @@ with gr.Blocks(
          gr.Markdown("### 💡 Example Prompts")
          gr.Examples(
              examples=[
-                 ["Explain quantum computing in simple terms"],
-                 ["Write a haiku about artificial intelligence"],
-                 ["What's the time complexity of quicksort and why?"],
-                 ["Help me brainstorm ideas for a sustainable business"],
-                 ["Solve this step by step: If 3x + 7 = 22, what is x?"],
              ],
              inputs=msg,
              label=""
@@ -287,7 +309,7 @@ with gr.Blocks(
          history.append({"role": "user", "content": message})
          return "", history

-     def bot_response(history, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p, show_thinking):
          if not history:
              yield history
              return
@@ -297,11 +319,11 @@ with gr.Blocks(

          history.append({"role": "assistant", "content": ""})

-         for response in chat(user_msg, history_for_api, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p, show_thinking):
              history[-1]["content"] = response
              yield history

-     def regenerate(history, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p, show_thinking):
          if len(history) >= 2:
              # Remove last assistant message
              history = history[:-1]
@@ -311,7 +333,7 @@ with gr.Blocks(

          history.append({"role": "assistant", "content": ""})

-         for response in chat(user_msg, history_for_api, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p, show_thinking):
              history[-1]["content"] = response
              yield history
          else:
@@ -329,7 +351,7 @@ with gr.Blocks(
          queue=False
      ).then(
          bot_response,
-         inputs=[chatbot, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p, show_thinking],
          outputs=[chatbot]
      )

@@ -340,7 +362,7 @@ with gr.Blocks(
          queue=False
      ).then(
          bot_response,
-         inputs=[chatbot, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p, show_thinking],
          outputs=[chatbot]
      )

@@ -351,7 +373,7 @@ with gr.Blocks(

      regenerate_btn.click(
          regenerate,
-         inputs=[chatbot, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p, show_thinking],
          outputs=[chatbot]
      )

@@ -362,4 +384,6 @@ with gr.Blocks(
      )

  if __name__ == "__main__":
      demo.launch()
 
+ =%= app.py =%=
  import gradio as gr
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+ import torch
+ from threading import Thread
+ import re

+ # Model configuration - using a smaller model that works well on CPU
+ MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+ 
+ # Global variables for model and tokenizer
+ model = None
+ tokenizer = None
+ 
+ def load_model():
+     """Load the model and tokenizer"""
+     global model, tokenizer
+ 
+     if model is None:
+         print("Loading model... This may take a moment on CPU.")
+         tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+         model = AutoModelForCausalLM.from_pretrained(
+             MODEL_NAME,
+             torch_dtype=torch.float32,
+             device_map="cpu",
+             low_cpu_mem_usage=True
+         )
+         print("Model loaded successfully!")
+ 
+     return model, tokenizer

  # Default system prompts
  SYSTEM_PROMPTS = {
 
      "Custom": ""
  }

+ def format_chat_prompt(messages, system_prompt):
+     """Format messages for TinyLlama chat format"""
+     formatted = f"<|system|>\n{system_prompt}</s>\n"
+ 
+     for msg in messages:
+         if msg["role"] == "user":
+             formatted += f"<|user|>\n{msg['content']}</s>\n"
+         elif msg["role"] == "assistant":
+             formatted += f"<|assistant|>\n{msg['content']}</s>\n"
+ 
+     formatted += "<|assistant|>\n"
+     return formatted
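
For a concrete picture, the new helper renders a short hypothetical history into TinyLlama's Zephyr-style tag layout:

    msgs = [
        {"role": "user", "content": "Hi!"},
        {"role": "assistant", "content": "Hello! How can I help?"},
        {"role": "user", "content": "Tell me a joke."},
    ]
    print(format_chat_prompt(msgs, "You are a helpful assistant."))
    # <|system|>
    # You are a helpful assistant.</s>
    # <|user|>
    # Hi!</s>
    # <|assistant|>
    # Hello! How can I help?</s>
    # <|user|>
    # Tell me a joke.</s>
    # <|assistant|>

tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True) should derive essentially the same layout from the checkpoint's bundled chat template, which is the sturdier choice if the model is ever swapped.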
 
+ def chat(message, history, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p):
      """Main chat function with streaming support"""
+     global model, tokenizer
+ 
+     # Load model if not loaded
+     if model is None:
+         yield "⏳ Loading model for the first time... Please wait (this may take 1-2 minutes on CPU)..."
+         load_model()

      # Determine system prompt
      if system_prompt_choice == "Custom":

      else:
          system_content = SYSTEM_PROMPTS.get(system_prompt_choice, SYSTEM_PROMPTS["Default Assistant"])

+     # Build messages list
+     messages = []

      for msg in history:
+         if msg["role"] in ["user", "assistant"]:
+             messages.append({"role": msg["role"], "content": msg["content"]})

      messages.append({"role": "user", "content": message})

      try:
+         # Format the prompt
+         prompt = format_chat_prompt(messages, system_content)
+ 
+         # Tokenize
+         inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
+ 
+         # Set up streamer
+         streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+ 
+         # Generation parameters
+         generation_kwargs = {
+             "input_ids": inputs["input_ids"],
+             "attention_mask": inputs["attention_mask"],
+             "max_new_tokens": max_tokens,
+             "temperature": temperature if temperature > 0 else 0.1,
+             "top_p": top_p,
+             "do_sample": temperature > 0,
+             "streamer": streamer,
+             "pad_token_id": tokenizer.eos_token_id,
+             "eos_token_id": tokenizer.eos_token_id,
+         }
+ 
+         # Run generation in a separate thread
+         thread = Thread(target=model.generate, kwargs=generation_kwargs)
+         thread.start()
+ 
+         # Stream the response
          response = ""
+         for new_text in streamer:
+             response += new_text
+             # Clean up any remaining special tokens
+             clean_response = response.replace("</s>", "").strip()
+             yield clean_response
+ 
+         thread.join()

      except Exception as e:
+         yield f"❌ Error: {str(e)}\n\nPlease try again with a shorter message or lower max tokens."
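
Distilled from the chat function above, the thread-plus-streamer pattern in isolation (a runnable sketch reusing the same TinyLlama checkpoint):

    from threading import Thread
    from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

    tok = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    lm = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    prompt = "<|user|>\nTell me a fun fact about space.</s>\n<|assistant|>\n"
    inputs = tok(prompt, return_tensors="pt")

    # generate() blocks until completion, so it runs in a worker thread
    # while the main thread drains the streamer queue piece by piece
    streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
    thread = Thread(target=lm.generate, kwargs={**inputs, "max_new_tokens": 64, "streamer": streamer})
    thread.start()
    for piece in streamer:
        print(piece, end="", flush=True)
    thread.join()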
 
  def clear_chat():
      """Clear the chat history"""
 
  .header-container a:hover {
      text-decoration: underline;
  }
+ .info-box {
      background: var(--background-fill-secondary);
+     padding: 10px 15px;
      border-radius: 8px;
+     margin: 10px 0;
+     border-left: 4px solid #667eea;
  }
  .chatbot-container {
      min-height: 500px;
  }
  """

  # Build the interface
  with gr.Blocks(
+     title="TinyLlama Chatbot (CPU)",
      theme=gr.themes.Soft(),
      css=css,
      fill_height=True,
      footer_links=[
          {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"},
+         {"label": "Model", "url": "https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0"}
      ]
  ) as demo:

      # Header
      gr.HTML("""
      <div class="header-container">
+         <h1>🦙 TinyLlama Chatbot</h1>
+         <p>Powered by TinyLlama-1.1B-Chat - Running locally on CPU</p>
          <p><a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank">Built with anycoder</a></p>
      </div>
      """)

+     gr.HTML("""
+     <div class="info-box">
+         ℹ️ <strong>CPU Mode:</strong> This chatbot runs entirely on CPU without any API calls.
+         First response may take longer as the model loads. Responses are generated locally.
+     </div>
+     """)
+ 
      with gr.Row():
          # Main chat column
          with gr.Column(scale=3):

              height=500,
              type="messages",
              show_copy_button=True,
              render_markdown=True,
              elem_classes=["chatbot-container"]
          )
 
          )

          max_tokens = gr.Slider(
+             minimum=32,
+             maximum=512,
+             value=256,
+             step=32,
              label="Max Tokens",
+             info="Maximum response length (lower = faster on CPU)"
          )

          top_p = gr.Slider(

              info="Nucleus sampling parameter"
          )

          # Export output
          export_output = gr.Textbox(
              label="Exported Chat",

          gr.Markdown("### 💡 Example Prompts")
          gr.Examples(
              examples=[
+                 ["Explain what machine learning is in simple terms"],
+                 ["Write a short poem about the ocean"],
+                 ["What are three tips for staying productive?"],
+                 ["Tell me a fun fact about space"],
+                 ["How do I make a simple pasta dish?"],
              ],
              inputs=msg,
              label=""
 
          history.append({"role": "user", "content": message})
          return "", history

+     def bot_response(history, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p):
          if not history:
              yield history
              return

          history.append({"role": "assistant", "content": ""})

+         for response in chat(user_msg, history_for_api, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p):
              history[-1]["content"] = response
              yield history

+     def regenerate(history, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p):
          if len(history) >= 2:
              # Remove last assistant message
              history = history[:-1]

          history.append({"role": "assistant", "content": ""})

+         for response in chat(user_msg, history_for_api, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p):
              history[-1]["content"] = response
              yield history
          else:
 
          queue=False
      ).then(
          bot_response,
+         inputs=[chatbot, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p],
          outputs=[chatbot]
      )

          queue=False
      ).then(
          bot_response,
+         inputs=[chatbot, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p],
          outputs=[chatbot]
      )

      regenerate_btn.click(
          regenerate,
+         inputs=[chatbot, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p],
          outputs=[chatbot]
      )

      )

  if __name__ == "__main__":
+     # Pre-load model on startup (optional - can be commented out for faster startup)
+     print("Starting TinyLlama Chatbot...")
      demo.launch()
requirements.txt CHANGED
@@ -1,12 +1,12 @@
- huggingface_hub
  gradio
- requests
- Pillow
  git+https://github.com/huggingface/transformers
  torch
- tokenizers
  accelerate
  numpy
- pandas
  sentencepiece
- datasets

  gradio
  git+https://github.com/huggingface/transformers
  torch
+ requests
+ Pillow
  accelerate
+ tokenizers
+ datasets
+ torchvision
+ torchaudio
  numpy
  sentencepiece