import json
import multiprocessing
import os
import sys
import traceback

from flask import Flask, Response, request, stream_with_context
from waitress import serve
# --- 1. SETUP LOGGING ---
def log(msg):
    """Print *msg* to stdout with an [ENGINE] tag, flushing immediately."""
    tagged = f"[ENGINE] {msg}"
    print(tagged, flush=True)
# --- 2. PATH SETUP ---
# A frozen (PyInstaller-style) build resolves paths next to the executable;
# a plain source run resolves them next to this file.
BASE_DIR = (
    os.path.dirname(sys.executable)
    if getattr(sys, 'frozen', False)
    else os.path.dirname(os.path.abspath(__file__))
)
MODEL_PATH = os.path.join(BASE_DIR, "model.gguf")
log(f"Base Directory: {BASE_DIR}")

app = Flask(__name__)
# --- 3. THE "MONKEY PATCH" (CRITICAL FIX) ---
# Replace the library's log-callback installer with a no-op BEFORE the model
# is constructed, so Llama() never wires up the native-side logger.
try:
    import llama_cpp

    # A do-nothing stand-in with the same (callback, user_data) signature.
    llama_cpp.llama_log_set = lambda callback, user_data: None
    log("Successfully patched Llama logging.")
except Exception as e:
    log(f"Patch warning: {e}")
# --- 4. LOAD MODEL ---
# `llm` stays None when loading fails; the routes below check it.
llm = None
try:
    from llama_cpp import Llama

    # Use roughly half the machine's cores (at least one) for inference.
    core_count = multiprocessing.cpu_count()
    inference_threads = max(1, int(core_count * 0.5))

    if not os.path.exists(MODEL_PATH):
        log("CRITICAL ERROR: model.gguf is missing!")
    else:
        log("Loading Model...")
        llm = Llama(
            model_path=MODEL_PATH,
            n_ctx=4096,
            n_threads=inference_threads,
            n_gpu_layers=0,       # CPU-only
            verbose=False,
            chat_format="gemma",
            use_mmap=False,
        )
        log("Model Loaded Successfully!")
except Exception as e:
    log(f"CRITICAL EXCEPTION during load: {e}")
    log(traceback.format_exc())
@app.route('/', methods=['GET'])
def health_check():
    """Report 200 when the model loaded, 500 otherwise."""
    return ("OK", 200) if llm else ("MODEL_FAILED", 500)
@app.route('/chat_stream', methods=['POST'])
def chat_stream():
    """Stream the model's reply as Server-Sent Events.

    Expects a JSON body with a "message" key; each SSE frame carries a
    JSON object {"chunk": <text>}.
    """
    if not llm:
        return Response("data: " + json.dumps({'chunk': "Error: Brain failed initialization."}) + "\n\n", mimetype='text/event-stream')
    # silent=True: tolerate a missing body or wrong Content-Type instead of
    # raising (request.json would abort with 415/400 on non-JSON posts).
    data = request.get_json(silent=True) or {}
    messages = [{"role": "user", "content": data.get('message', '')}]

    def generate():
        try:
            stream = llm.create_chat_completion(messages=messages, max_tokens=1000, stream=True)
            for chunk in stream:
                delta = chunk['choices'][0]['delta']
                if 'content' in delta:
                    yield f"data: {json.dumps({'chunk': delta['content']})}\n\n"
        except Exception as e:
            log(f"Gen Error: {e}")
            yield f"data: {json.dumps({'chunk': ' Error.'})}\n\n"

    # BUG FIX: stream_with_context was used here but never imported, so every
    # request raised NameError. It is now imported from flask at the top of
    # the file; it keeps the request context alive while the generator runs.
    return Response(stream_with_context(generate()), mimetype='text/event-stream')
if __name__ == '__main__':
    BIND_HOST = '127.0.0.1'
    BIND_PORT = 5000
    log("Starting Waitress Server on Port 5000...")
    try:
        serve(app, host=BIND_HOST, port=BIND_PORT, threads=6)
    except Exception as e:
        log(f"Server Crash: {e}")