Spaces:

basmala12
/

smollm_interface5

Sleeping

App Files Files Community

basmala12 committed on Nov 21, 2025

Commit

af83bc6

verified ·

1 Parent(s): 0a70310

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -65

app.py CHANGED Viewed

@@ -1,40 +1,21 @@
-import re
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 MODEL_NAME = "basmala12/smollm_finetuning5"
-# Load model & tokenizer once
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
 pipe = pipeline(
     "text-generation",
     model=model,
     tokenizer=tokenizer,
 )
-def truncate_to_n_sentences(text: str, n: int = 2) -> str:
-    """Force output to a maximum of N sentences."""
-    parts = re.split(r'([.!?])', text)
-    sentences = []
-    current = ""
-    for chunk in parts:
-        current += chunk
-        if chunk in [".", "!", "?"]:
-            sentences.append(current.strip())
-            current = ""
-        if len(sentences) >= n:
-            break
-    if not sentences:
-        return text.strip()
-    return " ".join(sentences).strip()
 def respond(message, history, system_message, max_tokens, temperature, top_p):
     """
     ChatInterface (type='messages') passes:
@@ -44,61 +25,30 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
     We return a plain string: the assistant reply.
     """
-    # Few-shot prompt to enforce behavior
-    few_shot_prompt = """
-You are a concise reasoning assistant.
-Rules:
-1. ALWAYS answer the user's LAST question only.
-2. Give exactly 1–2 short sentences.
-3. Provide brief, correct reasoning.
-4. Never repeat earlier answers.
-5. Never invent scientific facts.
-Examples:
-User: Why do we sweat?
-Assistant: We sweat to cool the body because evaporation removes heat from the skin. This helps regulate temperature.
-User: Why does metal feel colder than wood?
-Assistant: Metal pulls heat from your skin faster because it conducts heat better than wood. This faster heat transfer makes it feel colder.
-User: Why do birds fly in a V formation?
-Assistant: Birds fly in a V to save energy because each bird rides the lift from the bird ahead. This reduces effort for the whole group.
-""".strip()
-    # Build messages with few-shot + user-configurable system message
-    messages = [
-        {"role": "system", "content": few_shot_prompt},
-        {"role": "system", "content": system_message},
-    ]
     messages.extend(history)
     messages.append({"role": "user", "content": message})
-    # Apply chat template
     prompt = tokenizer.apply_chat_template(
         messages,
         tokenize=False,
         add_generation_prompt=True,
     )
-    # Generate
     out = pipe(
         prompt,
-        max_new_tokens=int(max_tokens),
-        temperature=float(temperature),
-        top_p=float(top_p),
         do_sample=True,
     )[0]["generated_text"]
-    # Extract assistant part
     if "<|im_start|>assistant" in out:
         out = out.split("<|im_start|>assistant", 1)[-1]
     out = out.replace("<|im_end|>", "").strip()
-    # Enforce 1–2 sentence max
-    out = truncate_to_n_sentences(out, n=2)
     return out
@@ -107,15 +57,13 @@ chatbot = gr.ChatInterface(
     type="messages",
     additional_inputs=[
         gr.Textbox(
-            value="Answer in 1–2 short sentences with brief logical reasoning. Do not exceed 2 sentences.",
             label="System message",
         ),
-        gr.Slider(1, 128, value=64, step=1, label="Max new tokens"),
-        gr.Slider(0.1, 2.0, value=0.3, step=0.1, label="Temperature"),
         gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p"),
     ],
-    title="SmolLM2 – Short Reasoning Chat",
-    description="Fine-tuned SmolLM2 (basmala12/smollm_finetuning5) that answers with 1–2 short sentences and brief reasoning.",
 )
 if __name__ == "__main__":

 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 MODEL_NAME = "basmala12/smollm_finetuning5"
 # Load model & tokenizer once at startup so every request reuses them.
 # The imported AutoTokenizer / AutoModelForCausalLM classes must stay bound:
 # rebinding either name (e.g. to None) before these calls makes
 # from_pretrained() fail with AttributeError.
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
 # Text-generation pipeline built from the already-loaded model and tokenizer
 # (no device argument is passed, so the library default placement is used).
 pipe = pipeline(
     "text-generation",
     model=model,
     tokenizer=tokenizer,
 )
 def respond(message, history, system_message, max_tokens, temperature, top_p):
     """
     Generate the assistant reply for the latest user message.

     Args:
         message: Text of the newest user turn.
         history: Earlier turns, appended to the prompt as-is. Presumably a
             list of {"role": ..., "content": ...} dicts as passed by
             ChatInterface with type='messages' -- confirm against the
             Gradio version in use.
         system_message: Content of the system turn placed first in the prompt.
         max_tokens: Upper bound on the number of newly generated tokens.
         temperature: Sampling temperature forwarded to the pipeline.
         top_p: Nucleus-sampling threshold forwarded to the pipeline.

     Returns:
         A plain string: the assistant reply.
     """
     # Build full chat messages for the chat template
     messages = [{"role": "system", "content": system_message}]
     messages.extend(history)
     messages.append({"role": "user", "content": message})
     prompt = tokenizer.apply_chat_template(
         messages,
         tokenize=False,
         add_generation_prompt=True,
     )
     out = pipe(
         prompt,
         # UI slider values may arrive as floats; coerce them so the token
         # budget is always an integer and the sampling knobs are floats.
         max_new_tokens=int(max_tokens),
         temperature=float(temperature),
         top_p=float(top_p),
         do_sample=True,
     )[0]["generated_text"]
     # With the pipeline's default settings the generated text starts with the
     # prompt, and the prompt holds one assistant marker per past assistant
     # turn plus the final generation marker. Split on the LAST marker so that
     # earlier turns from the history are not returned as part of the reply.
     marker = "<|im_start|>assistant"
     if marker in out:
         out = out.rsplit(marker, 1)[-1]
     out = out.replace("<|im_end|>", "").strip()
     return out
     type="messages",
     additional_inputs=[
         gr.Textbox(
+            value="Give short answers with brief logical reasoning.",
             label="System message",
         ),
+        gr.Slider(1, 512, value=256, step=1, label="Max new tokens"),
+        gr.Slider(0.1, 4.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p"),
     ],
 )
 if __name__ == "__main__":