Ksjsjjdj committed on
Commit
5ae3817
·
verified ·
1 Parent(s): 2593844

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -227
app.py CHANGED
@@ -1,241 +1,69 @@
1
- import re
2
- import threading
3
-
4
  import gradio as gr
5
  import spaces
6
- import transformers
7
- from transformers import pipeline
8
-
9
- # loading model and tokenizer
10
- model_name = "Ksjsjjdj/nucleus-model-v10142"
11
-
12
- if gr.NO_RELOAD:
13
- pipe = pipeline(
14
- "text-generation",
15
- model=model_name,
16
- device_map="auto",
17
- torch_dtype="auto",
18
- )
19
-
20
- # --- FIX START: Manually set a chat template if one is missing ---
21
- # This uses a standard ChatML format (User: ... Assistant: ...)
22
- if pipe.tokenizer.chat_template is None:
23
- pipe.tokenizer.chat_template = (
24
- "{% for message in messages %}"
25
- "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
26
- "{% endfor %}"
27
- "{% if add_generation_prompt %}"
28
- "{{ '<|im_start|>assistant\n' }}"
29
- "{% endif %}"
30
- )
31
- # --- FIX END ---
32
-
33
- # the answer marker to detect final answer
34
- ANSWER_MARKER = "**ANSWER**"
35
-
36
- # the sentences starting the reasoning step by step
37
- rethink_prepends = [
38
- "OK, I need to figure out ",
39
- "I think ",
40
- "Wait, I think ",
41
- "Let me check if ",
42
- "I should also remember that ",
43
- "Another thing to note is that ",
44
- "I also recall that ",
45
- "I think I have a good grasp ",
46
- "Now, using all the above information, I can answer the question using the original language used for the question:"
47
- "\n{question}\n"
48
- f"\n{ANSWER_MARKER}\n",
49
- ]
50
-
51
 
52
- # to fix some problems with math display
53
- latex_delimiters = [
54
- {"left": "$$", "right": "$$", "display": True},
55
- {"left": "$", "right": "$", "display": False},
56
- ]
57
 
 
58
 
59
- def reformat_math(text):
60
- """Fix MathJax delimiters to use the Gradio syntax (Katex).
61
-
62
- This is a workaround to display math formulas in Gradio. For now, I havn't found a way to
63
- make it work as expected using others latex_delimiters...
64
- """
65
- text = re.sub(r"\\\[\s*(.*?)\s*\\\]", r"$$\1$$", text, flags=re.DOTALL)
66
- text = re.sub(r"\\\(\s*(.*?)\s*\\\)", r"$\1$", text, flags=re.DOTALL)
67
- return text
68
-
69
-
70
- def user_input(message, history: list):
71
- """Append the user input in the history and clean the input textbox"""
72
- return "", history + [
73
- gr.ChatMessage(role="user", content=message.replace(ANSWER_MARKER, ""))
74
- ]
75
-
76
-
77
- def rebuild_messages(history: list):
78
- """Rebuid the messages from the history to be used by the model without the intermediate thoughs"""
79
- messages = []
80
- for h in history:
81
- if isinstance(h, dict) and not h.get("metadata", {}).get("title", False):
82
- messages.append(h)
83
- elif (
84
- isinstance(h, gr.ChatMessage)
85
- and h.metadata.get("title")
86
- and isinstance(h.content, str)
87
- ):
88
- messages.append({"role": h.role, "content": h.content})
89
- return messages
90
-
91
 
92
  @spaces.GPU
93
- def bot(
94
- history: list,
95
- max_num_tokens: int,
96
- final_num_tokens: int,
97
- do_sample: bool,
98
- temperature: float,
99
- ):
100
- """Make the model answering the question"""
 
 
 
 
101
 
102
- # to get token as a stream, later in a thread
103
- streamer = transformers.TextIteratorStreamer(
104
- pipe.tokenizer, # pyright: ignore
105
- skip_special_tokens=True,
106
- skip_prompt=True,
107
  )
108
 
109
- # to reinsert the question in the reasoning if needed
110
- question = history[-1]["content"]
111
-
112
- # prepare the assistant message
113
- history.append(
114
- gr.ChatMessage(
115
- role="assistant",
116
- content=str(""),
117
- metadata={"title": "🧠 Thinking...", "status": "pending"},
118
- )
119
  )
120
 
121
- # for the moment, make the reasoning to be displayed in the chat
122
- messages = rebuild_messages(history)
123
- for i, prepend in enumerate(rethink_prepends):
124
- if i > 0:
125
- messages[-1]["content"] += "\n\n"
126
- messages[-1]["content"] += prepend.format(question=question)
127
-
128
- num_tokens = int(
129
- max_num_tokens if ANSWER_MARKER not in prepend else final_num_tokens
130
- )
131
- t = threading.Thread(
132
- target=pipe,
133
- args=(messages,),
134
- kwargs=dict(
135
- max_new_tokens=num_tokens,
136
- streamer=streamer,
137
- do_sample=do_sample,
138
- temperature=temperature,
139
- ),
140
- )
141
- t.start()
142
-
143
- # rebuild the history with the new content
144
- history[-1].content += prepend.format(question=question)
145
- if ANSWER_MARKER in prepend:
146
- history[-1].metadata = {"title": "💭 Thoughs", "status": "done"}
147
- # stop thinking, this is the answer now (no metadata for intermediate steps)
148
- history.append(gr.ChatMessage(role="assistant", content=""))
149
- for token in streamer:
150
- history[-1].content += token
151
- history[-1].content = reformat_math(history[-1].content)
152
- yield history
153
- t.join()
154
-
155
- yield history
156
-
157
-
158
- with gr.Blocks(fill_height=True, title="Making any LLM model reasoning") as demo:
159
- with gr.Row(scale=1):
160
- with gr.Column(scale=5):
161
- gr.Markdown(f"""
162
- # Force reasoning for any LLM
163
-
164
- This is a simple proof-of-concept to get any LLM (Large language Model) to reason ahead of its response.
165
- This interface uses *{model_name}* model **which is not a reasoning model**. The used method
166
- is only to force some "reasoning" steps with prefixes to help the model to enhance the answer.
167
-
168
- See my related article here: [Make any model reasoning](https://huggingface.co/blog/Metal3d/making-any-model-reasoning)
169
- """)
170
- chatbot = gr.Chatbot(
171
- scale=1,
172
- type="messages",
173
- latex_delimiters=latex_delimiters,
174
- )
175
- msg = gr.Textbox(
176
- submit_btn=True,
177
- label="",
178
- show_label=False,
179
- placeholder="Type your question here.",
180
- autofocus=True,
181
- )
182
- with gr.Column(scale=1):
183
- gr.Markdown("""## Tweaking""")
184
- num_tokens = gr.Slider(
185
- 50,
186
- 1024,
187
- 100,
188
- step=1,
189
- label="Max tokens per reasoning step",
190
- interactive=True,
191
- )
192
- final_num_tokens = gr.Slider(
193
- 50,
194
- 1024,
195
- 512,
196
- step=1,
197
- label="Max token for the final answer",
198
- interactive=True,
199
- )
200
- do_sample = gr.Checkbox(True, label="Do sample")
201
- temperature = gr.Slider(0.1, 1.0, 0.7, step=0.1, label="Temperature")
202
- gr.Markdown("""
203
- Using smaller number of tokens in the reasoning steps will make the model
204
- faster to answer, but it may not be able to go deep enough in its reasoning.
205
- A good value is 100 to 512.
206
-
207
- Using smaller number of tokens for the final answer will make the model
208
- to be less verbose, but it may not be able to give a complete answer.
209
- A good value is 512 to 1024.
210
-
211
- **Do sample** uses another strategie to select the next token to complete the
212
- answer. It's commonly better to leave it checked.
213
-
214
- **Temperature** indicates how much the model could be "creative". 0.7 is a common value.
215
- If you set a too high value (like 1.0) the model could be incoherent. With a low value
216
- (like 0.3), the model will produce very predictives answers.
217
- """)
218
- gr.Markdown("""
219
- This interface can work on personal computer with 6Go VRAM (e.g. NVidia 3050/3060 on laptop).
220
- Feel free to fork the application and try others instruct models.
221
- """)
222
-
223
- # when the user submit a message, the bot will answer
224
- msg.submit(
225
- user_input,
226
- [msg, chatbot], # inputs
227
- [msg, chatbot], # outputs
228
- ).then(
229
- bot,
230
- [
231
- chatbot,
232
- num_tokens,
233
- final_num_tokens,
234
- do_sample,
235
- temperature,
236
- ], # actually, the "history" input
237
- chatbot, # to store the new history from the output
238
- )
239
 
240
  if __name__ == "__main__":
241
- demo.queue().launch()
 
1
+ import os
2
+ from threading import Thread
 
3
  import gradio as gr
4
  import spaces
5
+ import torch
6
+ from dotenv import load_dotenv
7
+ from transformers import pipeline, TextIteratorStreamer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
+ load_dotenv()
 
 
 
 
10
 
11
+ model_id = "facebook/MobileLLM-R1.5-950M"
12
 
13
+ pipe = pipeline(
14
+ "text-generation",
15
+ model=model_id,
16
+ torch_dtype="auto",
17
+ device_map="auto",
18
+ token=os.getenv("HF_TOKEN")
19
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  @spaces.GPU
22
+ def chat(message, history):
23
+ messages = []
24
+ messages.append({
25
+ "role": "system",
26
+ "content": "Please reason step by step, and put your final answer within \\boxed{}."
27
+ })
28
+
29
+ for user_msg, assistant_msg in history:
30
+ messages.append({"role": "user", "content": user_msg})
31
+ messages.append({"role": "assistant", "content": assistant_msg})
32
+
33
+ messages.append({"role": "user", "content": message})
34
 
35
+ streamer = TextIteratorStreamer(
36
+ pipe.tokenizer,
37
+ skip_prompt=True,
38
+ skip_special_tokens=True
 
39
  )
40
 
41
+ generation_kwargs = dict(
42
+ text_inputs=messages,
43
+ streamer=streamer,
44
+ max_new_tokens=8192,
45
+ do_sample=True,
46
+ temperature=0.7,
 
 
 
 
47
  )
48
 
49
+ thread = Thread(target=pipe, kwargs=generation_kwargs)
50
+ thread.start()
51
+
52
+ response = ""
53
+ for new_text in streamer:
54
+ response += new_text
55
+ yield response
56
+
57
+ demo = gr.ChatInterface(
58
+ fn=chat,
59
+ title="MobileLLM-R1.5-950M Chat",
60
+ description="Reasoning model running on GPU.",
61
+ examples=[
62
+ "Compute: $1-2+3-4+5- \\dots +99-100$.",
63
+ "Write a Python function that returns the square of a number.",
64
+ "Explain the theory of relativity in simple terms."
65
+ ],
66
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
  if __name__ == "__main__":
69
+ demo.launch()