Ksjsjjdj committed on
Commit
5ae3817
·
verified ·
1 Parent(s): 2593844

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -227
app.py CHANGED
@@ -1,241 +1,69 @@
1
- import re
2
- import threading
3
-
4
  import gradio as gr
5
  import spaces
6
- import transformers
7
- from transformers import pipeline
8
-
9
- # loading model and tokenizer
10
- model_name = "Ksjsjjdj/nucleus-model-v10142"
11
-
12
- if gr.NO_RELOAD:
13
- pipe = pipeline(
14
- "text-generation",
15
- model=model_name,
16
- device_map="auto",
17
- torch_dtype="auto",
18
- )
19
-
20
- # --- FIX START: Manually set a chat template if one is missing ---
21
- # This uses a standard ChatML format (User: ... Assistant: ...)
22
- if pipe.tokenizer.chat_template is None:
23
- pipe.tokenizer.chat_template = (
24
- "{% for message in messages %}"
25
- "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
26
- "{% endfor %}"
27
- "{% if add_generation_prompt %}"
28
- "{{ '<|im_start|>assistant\n' }}"
29
- "{% endif %}"
30
- )
31
- # --- FIX END ---
32
-
33
- # the answer marker to detect final answer
34
- ANSWER_MARKER = "**ANSWER**"
35
-
36
- # the sentences starting the reasoning step by step
37
- rethink_prepends = [
38
- "OK, I need to figure out ",
39
- "I think ",
40
- "Wait, I think ",
41
- "Let me check if ",
42
- "I should also remember that ",
43
- "Another thing to note is that ",
44
- "I also recall that ",
45
- "I think I have a good grasp ",
46
- "Now, using all the above information, I can answer the question using the original language used for the question:"
47
- "\n{question}\n"
48
- f"\n{ANSWER_MARKER}\n",
49
- ]
50
-
51
 
52
- # to fix some problems with math display
53
- latex_delimiters = [
54
- {"left": "$$", "right": "$$", "display": True},
55
- {"left": "$", "right": "$", "display": False},
56
- ]
57
 
 
58
 
59
- def reformat_math(text):
60
- """Fix MathJax delimiters to use the Gradio syntax (Katex).
61
-
62
- This is a workaround to display math formulas in Gradio. For now, I havn't found a way to
63
- make it work as expected using others latex_delimiters...
64
- """
65
- text = re.sub(r"\\\[\s*(.*?)\s*\\\]", r"$$\1$$", text, flags=re.DOTALL)
66
- text = re.sub(r"\\\(\s*(.*?)\s*\\\)", r"$\1$", text, flags=re.DOTALL)
67
- return text
68
-
69
-
70
- def user_input(message, history: list):
71
- """Append the user input in the history and clean the input textbox"""
72
- return "", history + [
73
- gr.ChatMessage(role="user", content=message.replace(ANSWER_MARKER, ""))
74
- ]
75
-
76
-
77
- def rebuild_messages(history: list):
78
- """Rebuid the messages from the history to be used by the model without the intermediate thoughs"""
79
- messages = []
80
- for h in history:
81
- if isinstance(h, dict) and not h.get("metadata", {}).get("title", False):
82
- messages.append(h)
83
- elif (
84
- isinstance(h, gr.ChatMessage)
85
- and h.metadata.get("title")
86
- and isinstance(h.content, str)
87
- ):
88
- messages.append({"role": h.role, "content": h.content})
89
- return messages
90
-
91
 
92
  @spaces.GPU
93
- def bot(
94
- history: list,
95
- max_num_tokens: int,
96
- final_num_tokens: int,
97
- do_sample: bool,
98
- temperature: float,
99
- ):
100
- """Make the model answering the question"""
 
 
 
 
101
 
102
- # to get token as a stream, later in a thread
103
- streamer = transformers.TextIteratorStreamer(
104
- pipe.tokenizer, # pyright: ignore
105
- skip_special_tokens=True,
106
- skip_prompt=True,
107
  )
108
 
109
- # to reinsert the question in the reasoning if needed
110
- question = history[-1]["content"]
111
-
112
- # prepare the assistant message
113
- history.append(
114
- gr.ChatMessage(
115
- role="assistant",
116
- content=str(""),
117
- metadata={"title": "🧠 Thinking...", "status": "pending"},
118
- )
119
  )
120
 
121
- # for the moment, make the reasoning to be displayed in the chat
122
- messages = rebuild_messages(history)
123
- for i, prepend in enumerate(rethink_prepends):
124
- if i > 0:
125
- messages[-1]["content"] += "\n\n"
126
- messages[-1]["content"] += prepend.format(question=question)
127
-
128
- num_tokens = int(
129
- max_num_tokens if ANSWER_MARKER not in prepend else final_num_tokens
130
- )
131
- t = threading.Thread(
132
- target=pipe,
133
- args=(messages,),
134
- kwargs=dict(
135
- max_new_tokens=num_tokens,
136
- streamer=streamer,
137
- do_sample=do_sample,
138
- temperature=temperature,
139
- ),
140
- )
141
- t.start()
142
-
143
- # rebuild the history with the new content
144
- history[-1].content += prepend.format(question=question)
145
- if ANSWER_MARKER in prepend:
146
- history[-1].metadata = {"title": "💭 Thoughs", "status": "done"}
147
- # stop thinking, this is the answer now (no metadata for intermediate steps)
148
- history.append(gr.ChatMessage(role="assistant", content=""))
149
- for token in streamer:
150
- history[-1].content += token
151
- history[-1].content = reformat_math(history[-1].content)
152
- yield history
153
- t.join()
154
-
155
- yield history
156
-
157
-
158
- with gr.Blocks(fill_height=True, title="Making any LLM model reasoning") as demo:
159
- with gr.Row(scale=1):
160
- with gr.Column(scale=5):
161
- gr.Markdown(f"""
162
- # Force reasoning for any LLM
163
-
164
- This is a simple proof-of-concept to get any LLM (Large language Model) to reason ahead of its response.
165
- This interface uses *{model_name}* model **which is not a reasoning model**. The used method
166
- is only to force some "reasoning" steps with prefixes to help the model to enhance the answer.
167
-
168
- See my related article here: [Make any model reasoning](https://huggingface.co/blog/Metal3d/making-any-model-reasoning)
169
- """)
170
- chatbot = gr.Chatbot(
171
- scale=1,
172
- type="messages",
173
- latex_delimiters=latex_delimiters,
174
- )
175
- msg = gr.Textbox(
176
- submit_btn=True,
177
- label="",
178
- show_label=False,
179
- placeholder="Type your question here.",
180
- autofocus=True,
181
- )
182
- with gr.Column(scale=1):
183
- gr.Markdown("""## Tweaking""")
184
- num_tokens = gr.Slider(
185
- 50,
186
- 1024,
187
- 100,
188
- step=1,
189
- label="Max tokens per reasoning step",
190
- interactive=True,
191
- )
192
- final_num_tokens = gr.Slider(
193
- 50,
194
- 1024,
195
- 512,
196
- step=1,
197
- label="Max token for the final answer",
198
- interactive=True,
199
- )
200
- do_sample = gr.Checkbox(True, label="Do sample")
201
- temperature = gr.Slider(0.1, 1.0, 0.7, step=0.1, label="Temperature")
202
- gr.Markdown("""
203
- Using smaller number of tokens in the reasoning steps will make the model
204
- faster to answer, but it may not be able to go deep enough in its reasoning.
205
- A good value is 100 to 512.
206
-
207
- Using smaller number of tokens for the final answer will make the model
208
- to be less verbose, but it may not be able to give a complete answer.
209
- A good value is 512 to 1024.
210
-
211
- **Do sample** uses another strategie to select the next token to complete the
212
- answer. It's commonly better to leave it checked.
213
-
214
- **Temperature** indicates how much the model could be "creative". 0.7 is a common value.
215
- If you set a too high value (like 1.0) the model could be incoherent. With a low value
216
- (like 0.3), the model will produce very predictives answers.
217
- """)
218
- gr.Markdown("""
219
- This interface can work on personal computer with 6Go VRAM (e.g. NVidia 3050/3060 on laptop).
220
- Feel free to fork the application and try others instruct models.
221
- """)
222
-
223
- # when the user submit a message, the bot will answer
224
- msg.submit(
225
- user_input,
226
- [msg, chatbot], # inputs
227
- [msg, chatbot], # outputs
228
- ).then(
229
- bot,
230
- [
231
- chatbot,
232
- num_tokens,
233
- final_num_tokens,
234
- do_sample,
235
- temperature,
236
- ], # actually, the "history" input
237
- chatbot, # to store the new history from the output
238
- )
239
 
240
  if __name__ == "__main__":
241
- demo.queue().launch()
 
1
+ import os
2
+ from threading import Thread
 
3
  import gradio as gr
4
  import spaces
5
+ import torch
6
+ from dotenv import load_dotenv
7
+ from transformers import pipeline, TextIteratorStreamer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
+ load_dotenv()
 
 
 
 
10
 
11
+ model_id = "facebook/MobileLLM-R1.5-950M"
12
 
13
+ pipe = pipeline(
14
+ "text-generation",
15
+ model=model_id,
16
+ torch_dtype="auto",
17
+ device_map="auto",
18
+ token=os.getenv("HF_TOKEN")
19
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  @spaces.GPU
22
+ def chat(message, history):
23
+ messages = []
24
+ messages.append({
25
+ "role": "system",
26
+ "content": "Please reason step by step, and put your final answer within \\boxed{}."
27
+ })
28
+
29
+ for user_msg, assistant_msg in history:
30
+ messages.append({"role": "user", "content": user_msg})
31
+ messages.append({"role": "assistant", "content": assistant_msg})
32
+
33
+ messages.append({"role": "user", "content": message})
34
 
35
+ streamer = TextIteratorStreamer(
36
+ pipe.tokenizer,
37
+ skip_prompt=True,
38
+ skip_special_tokens=True
 
39
  )
40
 
41
+ generation_kwargs = dict(
42
+ text_inputs=messages,
43
+ streamer=streamer,
44
+ max_new_tokens=8192,
45
+ do_sample=True,
46
+ temperature=0.7,
 
 
 
 
47
  )
48
 
49
+ thread = Thread(target=pipe, kwargs=generation_kwargs)
50
+ thread.start()
51
+
52
+ response = ""
53
+ for new_text in streamer:
54
+ response += new_text
55
+ yield response
56
+
57
+ demo = gr.ChatInterface(
58
+ fn=chat,
59
+ title="MobileLLM-R1.5-950M Chat",
60
+ description="Reasoning model running on GPU.",
61
+ examples=[
62
+ "Compute: $1-2+3-4+5- \\dots +99-100$.",
63
+ "Write a Python function that returns the square of a number.",
64
+ "Explain the theory of relativity in simple terms."
65
+ ],
66
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
  if __name__ == "__main__":
69
+ demo.launch()