from flask import Flask, request, jsonify
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

app = Flask(__name__)

MODEL_ID = "newtechdevng/math-tutor-smollm2-360M"
SYSTEM_PROMPT = "You are a helpful math assistant."

# Load the tokenizer and model once at startup so requests don't pay the cost.
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,  # renamed to `dtype` in newer transformers releases
    device_map="auto",          # requires the `accelerate` package
)
model.eval()
print("✅ Model ready!")


@app.route("/", methods=["GET"])
def home():
    # Simple health check so you can verify the server is up.
    return jsonify({"status": "ok", "message": "Math model API is running!"})


@app.route("/generate", methods=["POST"])
def generate():
    data = request.get_json()
    if not data or "question" not in data:
        return jsonify({"error": "Send JSON with 'question' key"}), 400

    question = data["question"].strip()
    max_new_tokens = data.get("max_new_tokens", 256)

    # Hand-rolled ChatML prompt, the format SmolLM2 instruct models expect;
    # tokenizer.apply_chat_template would build the same structure.
    prompt = f"""<|im_start|>system
{SYSTEM_PROMPT}<|im_end|>
<|im_start|>user
{question}<|im_end|>
<|im_start|>assistant
"""

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # greedy decoding for reproducible answers
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens, dropping the echoed prompt.
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    answer = tokenizer.decode(new_tokens, skip_special_tokens=True)

    return jsonify({"question": question, "answer": answer})


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)
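
# A quick way to exercise the /generate endpoint once the server is running
# (a sketch; the question text and token budget below are arbitrary examples):
#
#   curl -X POST http://localhost:7860/generate \
#        -H "Content-Type: application/json" \
#        -d '{"question": "What is 15% of 240?", "max_new_tokens": 128}'
#
# Expected response shape:
#
#   {"question": "What is 15% of 240?", "answer": "..."}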