# app.py — SmolLM2-360M-Instruct Flask chat server
# (removed non-Python Hugging Face page residue that headed this file:
#  "Smllm2 / app.py", "ghosthets's picture", "Update app.py", "58b71d4 verified")
import flask
from flask import request, jsonify
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

app = flask.Flask(__name__)

# ---------------------------
# SUPER FAST SMALL MODEL
# ---------------------------
# Small instruction-tuned checkpoint, loaded once at import time.
model_id = "HuggingFaceTB/SmolLM2-360M-Instruct"
print(f"🔄 Loading {model_id} model...")

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Pick bfloat16 only when a CUDA GPU is available, otherwise float32.
# NOTE(review): the original comment said "Best dtype for CPU speed =
# bfloat16", but this expression selects bf16 only under CUDA and
# float32 on CPU — confirm which dtype policy was intended.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
)

# Place the model on GPU when present; fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"✅ {model_id} loaded successfully!")
# ---------------------------
# Chat Endpoint
# ---------------------------
@app.route("/chat", methods=["POST"])
def chat():
    """Generate a single-turn reply to the JSON-POSTed ``message``.

    Expects a JSON body ``{"message": "..."}``; returns
    ``{"reply": "..."}`` on success, ``{"error": "..."}`` with 400 on a
    missing/empty message, or 500 on an unexpected failure.
    """
    try:
        # silent=True returns None instead of raising on a missing or
        # malformed JSON body, so bad requests get a 400, not a 500.
        data = request.get_json(silent=True) or {}
        msg = (data.get("message") or "").strip()
        if not msg:
            return jsonify({"error": "No message sent"}), 400

        # SmolLM2 format: no chat special tokens needed
        prompt = f"User: {msg}\nAssistant:"
        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        # inference_mode disables autograd bookkeeping during generation.
        with torch.inference_mode():
            output = model.generate(
                **inputs,
                max_new_tokens=128,   # fast
                do_sample=False,      # greedy decoding: fastest, deterministic
                pad_token_id=tokenizer.eos_token_id,
            )

        # Decode only the newly generated tokens (everything past the
        # prompt length). This is more robust than splitting the decoded
        # string on "Assistant:", which breaks if the model itself emits
        # that marker in its reply.
        prompt_len = inputs["input_ids"].shape[1]
        reply = tokenizer.decode(
            output[0][prompt_len:], skip_special_tokens=True
        ).strip()
        return jsonify({"reply": reply})
    except Exception as e:
        # Top-level request boundary: report the failure as JSON.
        return jsonify({"error": str(e)}), 500
if __name__ == "__main__":
    # 0.0.0.0 exposes the server externally; 7860 is the port that
    # Hugging Face Spaces expects an app to listen on.
    app.run(host="0.0.0.0", port=7860)