import torch
from flask import Flask, request, jsonify
from transformers import AutoTokenizer, AutoModelForCausalLM

app = Flask(__name__)

# ---------------------------
# SUPER FAST SMALL MODEL
# ---------------------------
model_id = "HuggingFaceTB/SmolLM2-360M-Instruct"

print(f"🔄 Loading {model_id} model...")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# bfloat16 on GPU for speed; float32 on CPU for broad compatibility
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print(f"✅ {model_id} loaded successfully!")

# ---------------------------
# Chat Endpoint
# ---------------------------
@app.route("/chat", methods=["POST"])
def chat():
    try:
        # silent=True returns None instead of raising on a non-JSON body
        data = request.get_json(silent=True) or {}
        msg = data.get("message", "")
        if not msg:
            return jsonify({"error": "No message sent"}), 400

        # Simple plain-text prompt; SmolLM2-Instruct also ships a chat
        # template usable via tokenizer.apply_chat_template if needed
        prompt = f"User: {msg}\nAssistant:"
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        output = model.generate(
            **inputs,
            max_new_tokens=128,                   # short replies keep latency low
            do_sample=False,                      # greedy decoding: fastest, deterministic
            pad_token_id=tokenizer.eos_token_id,  # silence the missing-pad-token warning
        )
        reply = tokenizer.decode(output[0], skip_special_tokens=True)

        # The decoded text includes the prompt; keep only the assistant's part
        if "Assistant:" in reply:
            reply = reply.split("Assistant:")[-1].strip()

        return jsonify({"reply": reply})

    except Exception as e:
        return jsonify({"error": str(e)}), 500


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)
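
# ---------------------------
# Example request (a minimal sketch; assumes the server is running
# locally on port 7860, matching the app.run() call above)
# ---------------------------
#
#   curl -X POST http://localhost:7860/chat \
#        -H "Content-Type: application/json" \
#        -d '{"message": "What is the capital of France?"}'
#
# Expected response shape on success: {"reply": "..."}
# On a missing "message" field, the endpoint returns HTTP 400 with
# {"error": "No message sent"}.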