Smllm2

Sleeping

App Files Files Community

ghosthets committed on Dec 12, 2025

Commit

58b71d4

verified ·

1 Parent(s): aa4e8c5

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -18

app.py CHANGED Viewed

@@ -6,23 +6,20 @@ import torch
 app = flask.Flask(__name__)
 # ---------------------------
-# SMALL LLM MODEL (1–2 GB)
 # ---------------------------
-# Best small model: SmolLM-1.7B-Chat
-model_id = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
 print(f"🔄 Loading {model_id} model...")
-# Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model_id)
-# Load model (auto dtype to avoid errors)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
-    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
 )
-# Device setup
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
@@ -31,7 +28,7 @@ print(f"✅ {model_id} loaded successfully!")
 # ---------------------------
 # Chat Endpoint
 # ---------------------------
-@app.route('/chat', methods=['POST'])
 def chat():
     try:
         data = request.get_json()
@@ -40,25 +37,23 @@ def chat():
         if not msg:
             return jsonify({"error": "No message sent"}), 400
-        # SmolLM uses normal text prompt (no ChatML)
-        prompt = f"<|user|>\n{msg}\n<|assistant|>\n"
         inputs = tokenizer(prompt, return_tensors="pt").to(device)
         output = model.generate(
             **inputs,
-            max_new_tokens=256,
-            do_sample=True,
-            temperature=0.6,
-            top_p=0.8,
             pad_token_id=tokenizer.eos_token_id,
         )
         reply = tokenizer.decode(output[0], skip_special_tokens=True)
-        # Extract only assistant part
-        if "<|assistant|>" in reply:
-            reply = reply.split("<|assistant|>")[-1].strip()
         return jsonify({"reply": reply})
@@ -67,4 +62,4 @@ def chat():
 if __name__ == "__main__":
-    app.run(host='0.0.0.0', port=7860)

 app = flask.Flask(__name__)
 # ---------------------------
+# SUPER FAST SMALL MODEL
 # ---------------------------
+model_id = "HuggingFaceTB/SmolLM2-360M-Instruct"
 print(f"🔄 Loading {model_id} model...")
 tokenizer = AutoTokenizer.from_pretrained(model_id)
+# Use bfloat16 when CUDA is available; fall back to float32 on CPU
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
+    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
 )
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
 # ---------------------------
 # Chat Endpoint
 # ---------------------------
+@app.route("/chat", methods=["POST"])
 def chat():
     try:
         data = request.get_json()
         if not msg:
             return jsonify({"error": "No message sent"}), 400
+        # Plain "User:/Assistant:" text prompt; no chat special tokens are used here
+        prompt = f"User: {msg}\nAssistant:"
         inputs = tokenizer(prompt, return_tensors="pt").to(device)
         output = model.generate(
             **inputs,
+            max_new_tokens=128,   # fast
+            do_sample=False,      # FASTEST
             pad_token_id=tokenizer.eos_token_id,
         )
         reply = tokenizer.decode(output[0], skip_special_tokens=True)
+        # Remove prompt text from output
+        if "Assistant:" in reply:
+            reply = reply.split("Assistant:")[-1].strip()
         return jsonify({"reply": reply})
 if __name__ == "__main__":
+    app.run(host="0.0.0.0", port=7860)