from flask import Flask, request, jsonify
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

app = Flask(__name__)

model_path = "./llama"
tokenizer = AutoTokenizer.from_pretrained(model_path)

# device_map="auto" lets Accelerate place the weights on the available
# GPU(s), falling back to CPU. Do not call model.to(device) afterwards:
# the model is already dispatched, and moving it again can fail or
# silently undo the placement.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    device_map="auto",
)

@app.route('/generate', methods=['POST'])
def generate_response():
    # silent=True returns None instead of raising on a missing or
    # malformed JSON body, so we can answer with a clean 400.
    input_data = request.get_json(silent=True) or {}
    prompt = input_data.get("prompt", "")
    if not prompt:
        return jsonify({"error": "No prompt provided"}), 400

    # Tokenize the prompt and move the tensors to the model's device.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # inference_mode disables autograd bookkeeping during generation.
    with torch.inference_mode():
        outputs = model.generate(**inputs, max_new_tokens=50)

    # Note: decoding outputs[0] returns the prompt plus the new tokens.
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return jsonify({"response": response})

if __name__ == '__main__':
    # Flask's built-in server is for development only; use a WSGI
    # server (e.g. gunicorn) to serve this in production.
    app.run(host='0.0.0.0', port=5000)
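
# Example request for local testing. This is a sketch that assumes the
# defaults defined above (host 0.0.0.0, port 5000, the /generate route)
# and a server reachable at localhost:
#
#   curl -X POST http://localhost:5000/generate \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Explain attention in one sentence."}'
#
# Expected shape of the reply: {"response": "..."}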