from flask import Flask, request, jsonify
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

app = Flask(__name__)

MODEL_NAME = "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
    revision="gptq-4bit-32g-actorder_True",  # must match the quantization branch
)

generator = pipeline("text-generation", model=model, tokenizer=tokenizer)


@app.route("/chat", methods=["POST"])
def chat():
    # Tolerate a missing or non-JSON body instead of raising a 500.
    data = request.get_json(silent=True) or {}
    user_input = data.get("message", "")
    if not user_input:
        return jsonify({"error": "Empty message"}), 400

    # Mistral-Instruct prompt format: wrap the turn in [INST] ... [/INST].
    prompt = f"[INST] You are a helpful assistant for food ordering.\n{user_input} [/INST]"

    # return_full_text=False strips the prompt from the pipeline output,
    # so only the model's reply is sent back to the client.
    result = generator(
        prompt,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        return_full_text=False,
    )[0]["generated_text"]

    return jsonify({"response": result.strip()})


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)
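
# A quick smoke test for the /chat endpoint (assumes the server is running
# locally on port 7860, as configured above; the example message is hypothetical):
#
#   curl -X POST http://localhost:7860/chat \
#        -H "Content-Type: application/json" \
#        -d '{"message": "I want to order a large pepperoni pizza"}'
#
# A non-empty "message" field returns {"response": "..."}; an empty or missing
# one returns a 400 with {"error": "Empty message"}.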