"""Minimal local LLM chat server.

Serves a tiny HTML front page and an OpenAI-compatible
``/v1/chat/completions`` endpoint backed by a locally loaded
Phi-3-mini model running on CPU.
"""

import time

from flask import Flask, jsonify, request
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

app = Flask(__name__)

# Single source of truth for the model id (was hard-coded twice).
MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"

# Load the model once at startup (slow; downloads weights on first run).
torch.random.manual_seed(0)  # pin any random ops for reproducibility
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="cpu",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Greedy decoding: with do_sample=False the temperature value is inert,
# but it is kept explicit to document the intent of deterministic output.
generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "temperature": 0.0,
    "do_sample": False,
}


# -----------------------
# Front page (HTML)
# -----------------------
@app.route("/")
def index():
    """Serve the chat front page.

    NOTE(review): the HTML markup appears to have been stripped from this
    string (only the visible page text survives) — restore the original
    template; the literal below is preserved byte-for-byte as found.
    """
    return """ Local LLM Chat

Local Phi-3 Chat

"""


# -----------------------
# OpenAI-compatible API
# -----------------------
@app.route("/v1/chat/completions", methods=["POST"])
def chat_completions():
    """Run one non-streamed chat completion.

    Expects a JSON body with a ``messages`` list in OpenAI chat format
    and returns a single ``chat.completion`` object.
    """
    # silent=True avoids Flask's automatic 400 abort on a missing or
    # non-JSON body; fall back to an empty payload and report the real
    # problem (no messages) explicitly instead.
    data = request.get_json(silent=True) or {}
    messages = data.get("messages", [])
    if not messages:
        return (
            jsonify({"error": {"message": "messages must be a non-empty list"}}),
            400,
        )

    result = pipe(messages, **generation_args)
    # pipeline returns a list of dicts; with return_full_text=False the
    # generated_text field holds only the newly generated continuation.
    text = result[0]["generated_text"]

    response = {
        "id": "chatcmpl-local",
        "object": "chat.completion",
        "created": int(time.time()),  # OpenAI clients expect this field
        "model": MODEL_NAME,
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": text},
                "finish_reason": "stop",
            }
        ],
    }
    return jsonify(response)


if __name__ == "__main__":
    # Bind on all interfaces so the UI is reachable from other hosts.
    app.run(host="0.0.0.0", port=7860)