Ultronprime committed on
Commit 7818f69 · verified · 1 parent: 5cfc354

Update app.py

Files changed (1)
  1. app.py +86 -84
app.py CHANGED
@@ -3,126 +3,128 @@ import spaces
  from transformers import AutoTokenizer, AutoModelForCausalLM
  import torch

- # Add CSS for footer hiding and styling
  css = """
- footer {
-     visibility: hidden;
- }
- .container {max-width: 850px; margin: auto; padding: 20px}
- .title {text-align: center; margin-bottom: 20px}
  """
 
- # Model initialization
  model_name = "ngxson/MiniThinky-v2-1B-Llama-3.2"
  device = "cuda" if torch.cuda.is_available() else "cpu"

- try:
-     tokenizer = AutoTokenizer.from_pretrained(model_name)
-     model = AutoModelForCausalLM.from_pretrained(
-         model_name,
-         torch_dtype=torch.float16,
-         device_map="auto"
-     )
- except Exception as e:
-     print(f"Error loading model: {e}")
-     raise gr.Error("Failed to load model. Please try again later.")
 
- SYSTEM_MESSAGE = "You are MiniThinky, a helpful AI assistant. You always think before giving the answer. Use <|thinking|> before thinking and <|answer|> before giving the answer."

- def format_chat_prompt(messages):
-     formatted_messages = [{"role": "system", "content": SYSTEM_MESSAGE}]
-     formatted_messages.extend(messages)
-     return tokenizer.apply_chat_template(
-         formatted_messages,
-         tokenize=False,
-         add_generation_prompt=True
-     )
 
  @spaces.GPU(duration=60)
- def generate_response(message, history, progress=gr.Progress(track_tqdm=True)):
-     if not message.strip():
-         return "", history
-
      try:
-         # Format messages including history
-         messages = []
-         for user_msg, assistant_msg in history:
-             messages.append({"role": "user", "content": user_msg})
-             messages.append({"role": "assistant", "content": assistant_msg})
-         messages.append({"role": "user", "content": message})
-
-         # Format prompt
-         prompt = format_chat_prompt(messages)
-
-         # Tokenize
-         inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(device)
-
-         # Generate
          outputs = model.generate(
              **inputs,
              max_new_tokens=512,
-             temperature=0.7,
              do_sample=True,
-             pad_token_id=tokenizer.eos_token_id,
          )
-
-         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-         # Extract response after the last user message
          response = response.split(message)[-1].strip()
-
-         # Clear GPU memory
-         torch.cuda.empty_cache()
-
-         return response
-
-     except Exception as e:
-         print(f"Error during generation: {e}")
-         return "[Error: Generation failed. Please try again.]", history
 
- def respond(message, chat_history):
-     try:
-         bot_message = generate_response(message, chat_history)
-         chat_history.append((message, bot_message))
-         return "", chat_history
      except Exception as e:
-         raise gr.Error(str(e))
 
- # Gradio Interface
  with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange", css=css) as demo:
      gr.HTML(
          """
-         <div class="title">
              <h1>MiniThinky Chat Assistant</h1>
-             <p>A helpful AI assistant that thinks before answering</p>
          </div>
          """
      )

-     with gr.Column(elem_id="col-container"):
-         chatbot = gr.Chatbot(height=400)
-         with gr.Row():
-             msg = gr.Textbox(
-                 placeholder="Type your message here...",
-                 container=False,
-                 scale=4
-             )
-             submit = gr.Button("Submit", scale=1)
-
-         clear = gr.ClearButton([msg, chatbot], value="🗑️ Clear Chat")
-
-         with gr.Accordion("Examples", open=False):
-             gr.Examples(
-                 examples=[
-                     "What is the capital of France?",
-                     "Explain quantum computing in simple terms",
-                     "Write a short poem about AI",
-                 ],
-                 inputs=msg
              )

-     msg.submit(respond, [msg, chatbot], [msg, chatbot], queue=True)
-     submit.click(respond, [msg, chatbot], [msg, chatbot], queue=True)

  if __name__ == "__main__":
      demo.queue(max_size=20, api_open=False).launch()
 
  from transformers import AutoTokenizer, AutoModelForCausalLM
  import torch

  css = """
+ footer {visibility: hidden}
+ .message-wrap {padding: 10px}
+ .assistant-message pre {background-color: #f6f8fa; padding: 12px; border-radius: 8px}
  """
 
 
  model_name = "ngxson/MiniThinky-v2-1B-Llama-3.2"
  device = "cuda" if torch.cuda.is_available() else "cpu"

+ # Initialize tokenizer and model
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     torch_dtype=torch.float16,
+     device_map="auto"
+ )
 
+ EXAMPLES = [
+     "Solve the equation x^2 - 3x + 2 = 0",
+     "Lily is three times as old as her son. In 15 years, she will be twice as old as him. How old is she now?",
+     "Write Python code to compute the nth Fibonacci number."
+ ]
 
+ def format_message(message, history):
+     base_prompt = "You are MiniThinky, a helpful AI assistant. You always think before giving the answer. Use <|thinking|> before thinking and <|answer|> before giving the answer."
+
+     # Start with the system prompt, then replay the conversation history
+     messages = [{"role": "system", "content": base_prompt}]
+     for human, assistant in history:
+         messages.append({"role": "user", "content": human})
+         messages.append({"role": "assistant", "content": assistant})
+
+     # Add the current message; keep add_generation_prompt=True (as in the
+     # previous version) so the rendered prompt ends with the assistant
+     # header the model is expected to complete
+     messages.append({"role": "user", "content": message})
+     return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
  @spaces.GPU(duration=60)
+ def generate(message, history):
      try:
+         # Format prompt with history
+         prompt = format_message(message, history)
+
+         # Encode prompt
+         inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
+         inputs = inputs.to(device)
+
+         # Generate response
          outputs = model.generate(
              **inputs,
              max_new_tokens=512,
              do_sample=True,
+             temperature=0.7,
+             top_p=0.9,
+             repetition_penalty=1.2,
+             pad_token_id=tokenizer.eos_token_id
          )

+         # Decode with skip_special_tokens=False so the <|thinking|> and
+         # <|answer|> markers survive for the split below
+         response = tokenizer.decode(outputs[0], skip_special_tokens=False)
          response = response.split(message)[-1].strip()

+         # Split thinking and answer parts, tolerating a missing
+         # <|answer|> marker so the split cannot raise an IndexError
+         thinking = ""
+         answer = response
+         if "<|thinking|>" in response:
+             after = response.split("<|thinking|>", 1)[1]
+             if "<|answer|>" in after:
+                 thinking, answer = after.split("<|answer|>", 1)
+                 thinking, answer = thinking.strip(), answer.strip()
+             else:
+                 thinking, answer = after.strip(), ""
+
+         # Format the final response and return the updated history, since
+         # this function's output is wired directly to the Chatbot component
+         final_response = f"🤔 Thinking:\n{thinking}\n\n✨ Answer:\n{answer}"
+         return history + [(message, final_response)]

      except Exception as e:
+         return history + [(message, f"Error: {str(e)}")]
 
 
  with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange", css=css) as demo:
      gr.HTML(
          """
+         <div style='text-align: center'>
              <h1>MiniThinky Chat Assistant</h1>
+             <p>A helpful AI assistant that thinks before answering.</p>
          </div>
          """
      )

+     chatbot = gr.Chatbot(
+         label="Conversation",
+         height=500,
+     )
+
+     with gr.Row():
+         txt = gr.Textbox(
+             placeholder="Type your message here...",
+             show_label=False,
+             scale=4
+         )
+         btn = gr.Button("Send", scale=1)
+
+     clear = gr.ClearButton([txt, chatbot])
+
+     # Example buttons: bind each example string as a lambda default so
+     # every button fills the textbox with its own text (a plain Python
+     # string cannot be passed as a Gradio input component)
+     with gr.Row():
+         for example in EXAMPLES:
+             gr.Button(example).click(
+                 lambda ex=example: ex,
+                 None,
+                 [txt]
              )
 
+     txt.submit(generate, [txt, chatbot], [chatbot]).then(
+         lambda: "", None, [txt]
+     )
+     btn.click(generate, [txt, chatbot], [chatbot]).then(
+         lambda: "", None, [txt]
+     )

  if __name__ == "__main__":
      demo.queue(max_size=20, api_open=False).launch()
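
For quick sanity-checking, here is a minimal standalone sketch of the marker-splitting logic the new generate() applies; the helper name split_thinking and the sample string are illustrative, not part of the commit:

# Illustrative helper (not in the commit): split a raw MiniThinky
# completion into its thinking and answer parts, tolerating a
# missing <|answer|> marker instead of raising an IndexError.
def split_thinking(response):
    if "<|thinking|>" not in response:
        return "", response.strip()
    after = response.split("<|thinking|>", 1)[1]
    if "<|answer|>" not in after:
        return after.strip(), ""
    thinking, answer = after.split("<|answer|>", 1)
    return thinking.strip(), answer.strip()

print(split_thinking("<|thinking|> 2 + 2 = 4 <|answer|> The answer is 4."))
# -> ('2 + 2 = 4', 'The answer is 4.')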
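
One caveat both versions share: response.split(message)[-1] strips the prompt by string matching, so the reply gets truncated whenever the user's text recurs inside the completion. A common alternative, sketched here under the same tokenizer/model.generate flow (the helper name decode_new_tokens is ours, not the commit's), is to decode only the tokens produced after the prompt:

# Sketch (not in the commit): decode only the newly generated tokens by
# slicing at the prompt length, instead of string-matching on the message.
def decode_new_tokens(tokenizer, input_ids, output_ids):
    prompt_len = input_ids.shape[1]          # number of prompt tokens fed in
    new_tokens = output_ids[0][prompt_len:]  # everything generated after them
    return tokenizer.decode(new_tokens, skip_special_tokens=False)

# Inside generate() this would replace the split:
# response = decode_new_tokens(tokenizer, inputs["input_ids"], outputs)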