Ultronprime committed
Commit 37162ec · verified · 1 Parent(s): 7818f69

Update app.py

Files changed (1):
  1. app.py +89 -82
app.py CHANGED
@@ -2,129 +2,136 @@ import gradio as gr
 import spaces
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
+import re
 
 css = """
 footer {visibility: hidden}
-.message-wrap {padding: 10px}
-.assistant-message pre {background-color: #f6f8fa; padding: 12px; border-radius: 8px}
+.message-wrap {max-width: 900px}
+.bot {background-color: #f7f7f8}
+.user {background-color: white}
+.message {padding: 20px; margin: 10px}
+.thinking {color: #666; font-style: italic; border-left: 3px solid #666; padding-left: 10px; margin: 10px 0}
+.answer {margin-top: 10px}
 """
 
 model_name = "ngxson/MiniThinky-v2-1B-Llama-3.2"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-# Initialize tokenizer and model
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    torch_dtype=torch.float16,
-    device_map="auto"
-)
-
-EXAMPLES = [
-    "Solve the equation x^2 - 3x + 2 = 0",
-    "Lily is three times older than her son. In 15 years, she will be twice as old as him. How old is she now?",
-    "Write python code to compute the nth fibonacci number."
-]
-
-def format_message(message, history):
-    base_prompt = "You are MiniThinky, a helpful AI assistant. You always think before giving the answer. Use <|thinking|> before thinking and <|answer|> before giving the answer."
-
-    # Format conversation history
-    messages = [{"role": "system", "content": base_prompt}]
-
-    # Add conversation history
-    for human, assistant in history:
-        messages.append({"role": "user", "content": human})
-        messages.append({"role": "assistant", "content": assistant})
-
-    # Add current message
+try:
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        torch_dtype=torch.float16,
+        device_map="auto"
+    )
+except Exception as e:
+    print(f"Error loading model: {e}")
+    raise gr.Error("Failed to load model. Please try again later.")
+
+SYSTEM_MESSAGE = "You are MiniThinky, a helpful AI assistant. You always think before giving the answer. Use <|thinking|> before thinking and <|answer|> before giving the answer."
+
+def parse_response(text):
+    """Parse thinking and answer from response"""
+    # Extract thinking part
+    thinking_match = re.search(r'<\|thinking\|>(.*?)(?=<\|answer\|>|$)', text, re.DOTALL)
+    thinking = thinking_match.group(1).strip() if thinking_match else ""
+
+    # Extract answer part
+    answer_match = re.search(r'<\|answer\|>(.*?)$', text, re.DOTALL)
+    answer = answer_match.group(1).strip() if answer_match else text.strip()
+
+    return thinking, answer
+
+def format_message(text):
+    """Format message with thinking and answer sections"""
+    thinking, answer = parse_response(text)
+    formatted = []
+    if thinking:
+        formatted.append(f'<div class="thinking">{thinking}</div>')
+    if answer:
+        formatted.append(f'<div class="answer">{answer}</div>')
+    return "\n".join(formatted)
+
+@spaces.GPU(duration=60)
+def generate_response(message, history):
+    messages = [{"role": "system", "content": SYSTEM_MESSAGE}]
+
+    # Add history to context
+    for user_msg, bot_msg in history:
+        messages.append({"role": "user", "content": user_msg})
+        messages.append({"role": "assistant", "content": bot_msg})
     messages.append({"role": "user", "content": message})
 
-    return tokenizer.apply_chat_template(messages, tokenize=False)
-
-@spaces.GPU(duration=60)
-def generate(message, history):
+    # Format prompt
+    prompt = tokenizer.apply_chat_template(messages, tokenize=False)
+
     try:
-        # Format prompt with history
-        prompt = format_message(message, history)
-
-        # Encode prompt
-        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
-        inputs = inputs.to(device)
+        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(device)
 
-        # Generate response
         outputs = model.generate(
             **inputs,
             max_new_tokens=512,
-            do_sample=True,
             temperature=0.7,
-            top_p=0.9,
-            repetition_penalty=1.2,
-            pad_token_id=tokenizer.eos_token_id
+            do_sample=True,
+            top_p=0.95,
+            pad_token_id=tokenizer.eos_token_id,
         )
 
-        # Decode response
-        response = tokenizer.decode(outputs[0], skip_special_tokens=False)
+        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
         response = response.split(message)[-1].strip()
 
-        # Split thinking and answer parts
-        thinking = ""
-        answer = response
-
-        if "<|thinking|>" in response:
-            parts = response.split("<|thinking|>", 1)
-            if len(parts) > 1:
-                thinking = parts[1].split("<|answer|>")[0].strip()
-                answer = parts[1].split("<|answer|>")[1].strip()
-
-        # Format final response
-        final_response = f"🤔 Thinking:\n{thinking}\n\n✨ Answer:\n{answer}"
-
-        return final_response
+        # Format response for display
+        formatted_response = format_message(response)
+
+        torch.cuda.empty_cache()
+        return formatted_response
 
     except Exception as e:
-        return f"Error: {str(e)}"
+        print(f"Error: {e}")
+        return "[Error occurred during generation]"
 
 with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange", css=css) as demo:
-    gr.HTML(
-        """
-        <div style='text-align: center'>
-            <h1>MiniThinky Chat Assistant</h1>
-            <p>A helpful AI assistant that thinks before answering.</p>
-        </div>
-        """
-    )
+    gr.HTML("""
+        <h1 style="text-align: center; margin-bottom: 1rem">
+            MiniThinky Chat Assistant
+        </h1>
+    """)
 
     chatbot = gr.Chatbot(
-        label="Conversation",
-        height=500,
+        bubble=True,
+        height=600,
+        container=True,
+        show_copy_button=True
     )
 
     with gr.Row():
         txt = gr.Textbox(
             placeholder="Type your message here...",
-            show_label=False,
+            container=False,
             scale=4
         )
-        btn = gr.Button("Send", scale=1)
+        submit_btn = gr.Button("Send", scale=1, variant="primary")
 
-    clear = gr.ClearButton([txt, chatbot])
-
-    # Example buttons
     with gr.Row():
-        for example in EXAMPLES:
-            gr.Button(example).click(
-                lambda msg: gr.update(value=msg),
-                [example],
-                [txt]
-            )
+        clear_btn = gr.ClearButton([txt, chatbot], value="Clear chat")
+
+    with gr.Accordion("Examples", open=False):
+        gr.Examples(
+            examples=[
+                "Solve the equation x^2 - 3x + 2 = 0",
+                "Lily is three times older than her son. In 15 years, she will be twice as old as him. How old is she now?",
+                "Write python code to compute the nth fibonacci number.",
+            ],
+            inputs=txt
+        )
 
-    txt.submit(generate, [txt, chatbot], [chatbot]).then(
-        lambda: "", None, [txt]
-    )
-    btn.click(generate, [txt, chatbot], [chatbot]).then(
-        lambda: "", None, [txt]
-    )
+    def respond(message, chat_history):
+        bot_message = generate_response(message, chat_history)
+        chat_history.append((message, bot_message))
+        return "", chat_history
+
+    txt.submit(respond, [txt, chatbot], [txt, chatbot])
+    submit_btn.click(respond, [txt, chatbot], [txt, chatbot])
 
 if __name__ == "__main__":
-    demo.queue(max_size=20, api_open=False).launch()
+    demo.queue(max_size=20).launch()
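
Reviewer note: the commit replaces the old string-split parsing (which raised IndexError whenever the model omitted the <|answer|> tag) with the two regexes in parse_response. A minimal, self-contained sketch of how those patterns behave; the regexes are copied from the diff above, while the sample string is invented for illustration:

import re

# Invented sample output; the tag format follows SYSTEM_MESSAGE in app.py.
text = "<|thinking|>Factor: (x - 1)(x - 2) = 0, so the roots are 1 and 2.<|answer|>x = 1 or x = 2"

# Patterns copied verbatim from parse_response.
thinking_match = re.search(r'<\|thinking\|>(.*?)(?=<\|answer\|>|$)', text, re.DOTALL)
answer_match = re.search(r'<\|answer\|>(.*?)$', text, re.DOTALL)

thinking = thinking_match.group(1).strip() if thinking_match else ""
answer = answer_match.group(1).strip() if answer_match else text.strip()

print(thinking)  # Factor: (x - 1)(x - 2) = 0, so the roots are 1 and 2.
print(answer)    # x = 1 or x = 2

Note that the lookahead makes the thinking pattern tolerant of a missing <|answer|> tag, and the fallback branches keep both values defined either way.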
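The new respond() wrapper also pins down the history contract: a list of (user, assistant) tuples, the pair format the default gr.Chatbot accepts. A self-contained sketch of that round trip, using a hypothetical stand-in for generate_response so it runs without loading the model:

# fake_generate is a hypothetical stand-in; the real app calls generate_response.
def fake_generate(message, history):
    return f"echo: {message}"

def respond(message, chat_history):
    bot_message = fake_generate(message, chat_history)
    chat_history.append((message, bot_message))
    return "", chat_history  # empty string clears the textbox; list updates the chatbot

history = []
_, history = respond("hello", history)
print(history)  # [('hello', 'echo: hello')]

Returning ("", chat_history) into [txt, chatbot] is what lets one callback both clear the input and refresh the conversation, replacing the old two-step .then() chains.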