DylanZimmer committed on
Commit
4ecd9e6
·
1 Parent(s): 4a043ab

SmolLM suggested way

Browse files
Files changed (1) hide show
  1. app.py +49 -42
app.py CHANGED
@@ -1,56 +1,63 @@
1
  import gradio as gr
2
- from transformers import pipeline
3
-
4
- pipe = pipeline(
5
- "text-generation",
6
- model="HuggingFaceTB/SmolLM3-3B-Base",
7
- device_map="auto"
8
- )
9
-
10
- MAX_HISTORY = 10 # keep last 10 exchanges to avoid huge payloads
11
-
 
 
 
 
 
 
12
  def respond(message, history):
13
- # Initialize history if empty
14
- if history is None:
15
- history = []
16
-
17
- # Convert to OpenAI-style messages
18
- messages = []
19
- for user_msg, bot_msg in history[-MAX_HISTORY:]:
20
- messages.append({"role": "user", "content": user_msg})
21
- messages.append({"role": "assistant", "content": bot_msg})
22
-
23
- messages.append({"role": "user", "content": message})
24
-
25
- # Build prompt
26
- conversation_text = ""
27
- for m in messages:
28
- conversation_text += f"{m['role'].capitalize()}: {m['content']}\n"
29
- conversation_text += "Assistant:"
30
-
31
- # Generate reply
32
- outputs = pipe(
33
- conversation_text,
34
- max_new_tokens=256,
35
  temperature=0.7,
36
  top_p=0.95,
37
  do_sample=True
38
  )
39
- reply = outputs[0]["generated_text"].replace(conversation_text, "").strip()
40
 
41
- # Update history and trim
42
- history.append([message, reply])
43
- history = history[-MAX_HISTORY:]
 
 
 
44
 
45
  return reply, history
46
 
47
- # Chatbot component
48
- chatbot = gr.Chatbot(height=400)
49
- iface = gr.Interface(
 
50
  fn=respond,
51
- inputs=[gr.Textbox(placeholder="Type a message..."), gr.State([])],
52
- outputs=[chatbot, gr.State()]
 
 
53
  )
54
 
 
 
 
55
  if __name__ == "__main__":
56
- iface.launch(share=True)
 
1
  import gradio as gr
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer
3
+ import torch
4
+
5
# ------------------------
# Model Setup
# ------------------------
# Load the model and tokenizer once at import time so every chat request
# reuses the same weights instead of reloading them per call.
model_name = "HuggingFaceTB/SmolLM3-3B"
# Prefer the GPU when one is available; otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
13
+
14
+ # ------------------------
15
+ # Chat Function
16
+ # ------------------------
17
+ # `history` is a list of dicts: {"role": "user"/"assistant", "content": str}
18
def respond(message, history):
    """Generate one assistant reply for a Gradio ChatInterface turn.

    Args:
        message: The latest user message (str).
        history: Prior conversation as a list of
            {"role": ..., "content": ...} dicts (Chatbot type="messages").

    Returns:
        The assistant's reply as a plain string. ChatInterface appends the
        reply to the displayed history itself, so only the reply is returned.
    """
    # Add the current user turn; `history + [...]` builds a new list so the
    # caller's history list is never mutated.
    conversation = history + [{"role": "user", "content": message}]

    # Render the conversation with the model's own chat template, leaving
    # the prompt open for the assistant's answer.
    text = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Sampled generation; temperature/top_p keep replies varied but coherent.
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.95,
        do_sample=True,
    )

    # Strip the prompt tokens so only the newly generated text is decoded.
    prompt_len = model_inputs.input_ids.shape[1]
    output_ids = generated_ids[0][prompt_len:]
    reply = tokenizer.decode(output_ids, skip_special_tokens=True).strip()

    # Fix: the previous version returned (reply, history) and appended the
    # reply to history by hand. gr.ChatInterface expects `fn` to return just
    # the response (str/dict/ChatMessage), and it manages conversation state
    # itself — returning a tuple breaks message rendering.
    return reply
47
 
48
# ------------------------
# Gradio Interface
# ------------------------
# ChatInterface owns the conversation state and passes it to `fn` as
# `history`; with type="messages" that history is a list of
# {"role": ..., "content": ...} dicts (OpenAI-style).
demo = gr.ChatInterface(
    fn=respond,
    chatbot=gr.Chatbot(type="messages", height=400),
    textbox=gr.Textbox(placeholder="Type a message..."),
    title="SmallChat with History",
    description="Persistent chat history using OpenAI-style messages"
)
58
 
59
# ------------------------
# Launch
# ------------------------
# Start the Gradio server only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()