|
|
import os
import re
import threading

import gradio as gr
import spaces
import transformers
from dotenv import load_dotenv
from transformers import pipeline

# Read HF_TOKEN (and any other secrets) from a local .env file, if present.
load_dotenv()

model_name = "Ksjsjjdj/nucleus-bitnet-v2"

# Build the pipeline once, outside Gradio's auto-reload loop, so the model is
# not reloaded on every source change during development.
if gr.NO_RELOAD:
    pipe = pipeline(
        "text-generation",
        model=model_name,
        device_map="auto",
        torch_dtype="auto",
        token=os.getenv("HF_TOKEN"),
    )

# Marker that tells the model where its final answer must start.
ANSWER_MARKER = "**ANSWER**"

# Sentence openers appended one at a time to the assistant message: the model
# is forced to continue each of them, which produces successive "reasoning"
# steps. The last entry re-injects the question and the ANSWER_MARKER so the
# model writes its final answer (its three string literals are joined by
# implicit concatenation into a single list item).
rethink_prepends = [
    "OK, I need to figure out ",
    "I think ",
    "Wait, I think ",
    "Let me check if ",
    "I should also remember that ",
    "Another thing to note is that ",
    "I also recall that ",
    "I think I have a good grasp ",
    "Now, using all the above information, I can answer the question using the original language used for the question:"
    "\n{question}\n"
    f"\n{ANSWER_MARKER}\n",
]
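
# Each entry is later passed through .format(question=question); only the
# last one contains the {question} placeholder, the others pass through
# unchanged (str.format ignores unused keyword arguments).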

# Delimiters Gradio's Chatbot uses to render LaTeX (via KaTeX).
latex_delimiters = [
    {"left": "$$", "right": "$$", "display": True},
    {"left": "$", "right": "$", "display": False},
]
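
# With these settings, "$$E=mc^2$$" renders as a centered display formula and
# "$E=mc^2$" renders inline.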


def reformat_math(text):
    """Fix MathJax delimiters to use the Gradio syntax (KaTeX).

    This is a workaround to display math formulas in Gradio. For now, I
    haven't found a way to make it work as expected with other
    latex_delimiters...
    """
    text = re.sub(r"\\\[\s*(.*?)\s*\\\]", r"$$\1$$", text, flags=re.DOTALL)
    text = re.sub(r"\\\(\s*(.*?)\s*\\\)", r"$\1$", text, flags=re.DOTALL)
    return text
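
# For example (hypothetical inputs, just to illustrate the rewrite):
#   reformat_math(r"\[ x^2 \]")  ->  "$$x^2$$"
#   reformat_math(r"\(a+b\)")    ->  "$a+b$"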


def user_input(message, history: list):
    """Append the user input to the history and clear the input textbox."""
    # The ANSWER_MARKER is stripped from the user text so it cannot interfere
    # with the reasoning flow.
    return "", history + [
        gr.ChatMessage(role="user", content=message.replace(ANSWER_MARKER, ""))
    ]


def rebuild_messages(history: list):
    """Rebuild the messages from the history for the model, dropping the
    intermediate thoughts."""
    messages = []
    for h in history:
        # Dict entries come from previous turns: keep only those without a
        # "title" in their metadata, i.e. skip the old thinking bubbles.
        if isinstance(h, dict) and not h.get("metadata", {}).get("title", False):
            messages.append(h)
        # ChatMessage entries come from the current turn: keep the titled
        # thinking message so the reasoning prefixes can be appended to it.
        elif (
            isinstance(h, gr.ChatMessage)
            and h.metadata.get("title")
            and isinstance(h.content, str)
        ):
            messages.append({"role": h.role, "content": h.content})
    return messages
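
# For example (hypothetical history, for illustration): a dict entry such as
# {"role": "assistant", "content": "...", "metadata": {"title": "💭 Thoughts"}}
# is dropped, while plain user/assistant dicts are kept as-is.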


@spaces.GPU
def bot(
    history: list,
    max_num_tokens: int,
    final_num_tokens: int,
    do_sample: bool,
    temperature: float,
):
    """Make the model answer the question, streaming the reasoning steps."""

    # The streamer yields decoded tokens as soon as the generation thread
    # produces them, so the chat can be updated incrementally.
    streamer = transformers.TextIteratorStreamer(
        pipe.tokenizer,
        skip_special_tokens=True,
        skip_prompt=True,
    )

    # The question is the last user message of the history.
    question = history[-1]["content"]

    # Add a pending "Thinking" bubble that will collect the reasoning steps.
    history.append(
        gr.ChatMessage(
            role="assistant",
            content="",
            metadata={"title": "🧠 Thinking...", "status": "pending"},
        )
    )

    messages = rebuild_messages(history)
    for i, prepend in enumerate(rethink_prepends):
        if i > 0:
            messages[-1]["content"] += "\n\n"
        # Force the next reasoning step by appending its opening words; the
        # model will have to continue from there.
        messages[-1]["content"] += prepend.format(question=question)

        # The final answer gets its own token budget; the intermediate steps
        # share the (usually smaller) per-step budget.
        num_tokens = int(
            max_num_tokens if ANSWER_MARKER not in prepend else final_num_tokens
        )
        # Run the generation in a background thread so this one can consume
        # the streamer and yield chat updates as tokens arrive.
        t = threading.Thread(
            target=pipe,
            args=(messages,),
            kwargs=dict(
                max_new_tokens=num_tokens,
                streamer=streamer,
                do_sample=do_sample,
                temperature=temperature,
            ),
        )
        t.start()

        # Mirror the forced prefix in the chat, then stream the new tokens.
        history[-1].content += prepend.format(question=question)
        if ANSWER_MARKER in prepend:
            # The thinking phase is over: close the thoughts bubble and start
            # a fresh message to receive the final answer.
            history[-1].metadata = {"title": "💭 Thoughts", "status": "done"}
            history.append(gr.ChatMessage(role="assistant", content=""))
        for token in streamer:
            history[-1].content += token
            history[-1].content = reformat_math(history[-1].content)
            yield history
        t.join()

    yield history


with gr.Blocks(fill_height=True, title="Making any LLM model reason") as demo:
    with gr.Row(scale=1):
        with gr.Column(scale=5):
            gr.Markdown(f"""
# Force reasoning for any LLM

This is a simple proof of concept to get any LLM (Large Language Model) to
reason ahead of its response. This interface uses the *{model_name}* model,
**which is not a reasoning model**. The method used simply forces some
"reasoning" steps with prefixes that help the model enhance its answer.
See my related article here: [Make any model reasoning](https://huggingface.co/blog/Metal3d/making-any-model-reasoning)
            """)
            chatbot = gr.Chatbot(
                scale=1,
                type="messages",
                latex_delimiters=latex_delimiters,
            )
            msg = gr.Textbox(
                submit_btn=True,
                label="",
                show_label=False,
                placeholder="Type your question here.",
                autofocus=True,
            )
        with gr.Column(scale=1):
            gr.Markdown("""## Tweaking""")
            num_tokens = gr.Slider(
                50,
                1024,
                100,
                step=1,
                label="Max tokens per reasoning step",
                interactive=True,
            )
            final_num_tokens = gr.Slider(
                50,
                1024,
                512,
                step=1,
                label="Max tokens for the final answer",
                interactive=True,
            )
            do_sample = gr.Checkbox(True, label="Do sample")
            temperature = gr.Slider(0.1, 1.0, 0.7, step=0.1, label="Temperature")
gr.Markdown(""" |
|
|
Using smaller number of tokens in the reasoning steps will make the model |
|
|
faster to answer, but it may not be able to go deep enough in its reasoning. |
|
|
A good value is 100 to 512. |
|
|
Using smaller number of tokens for the final answer will make the model |
|
|
to be less verbose, but it may not be able to give a complete answer. |
|
|
A good value is 512 to 1024. |
|
|
**Do sample** uses another strategie to select the next token to complete the |
|
|
answer. It's commonly better to leave it checked. |
|
|
**Temperature** indicates how much the model could be "creative". 0.7 is a common value. |
|
|
If you set a too high value (like 1.0) the model could be incoherent. With a low value |
|
|
(like 0.3), the model will produce very predictives answers. |
|
|
""") |
|
|
gr.Markdown(""" |
|
|
This interface can work on personal computer with 6Go VRAM (e.g. NVidia 3050/3060 on laptop). |
|
|
Feel free to fork the application and try others instruct models. |
|
|
""") |
|
|
|
|
|
|
|
|

    # On submit: store the user message and clear the textbox, then let the
    # bot stream its reasoning steps and final answer into the chatbot.
    msg.submit(
        user_input,
        [msg, chatbot],
        [msg, chatbot],
    ).then(
        bot,
        [
            chatbot,
            num_tokens,
            final_num_tokens,
            do_sample,
            temperature,
        ],
        chatbot,
    )
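
# The queue lets the generator-based `bot` handler stream its intermediate
# `yield`s to the browser.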

if __name__ == "__main__":
    demo.queue().launch()