# Source: Hugging Face upload by Roshan1162003
# Commit 7843c42 — "Fresh clean upload (history reset)" (2.95 kB)
# NOTE(review): the lines above were web-page scrape residue (raw/history/blame
# chrome), not code; converted to comments so the module parses.
import json
import multiprocessing
import os
import sys
import traceback

from flask import Flask, Response, request, stream_with_context
from waitress import serve
# --- 1. SETUP LOGGING ---
def log(msg):
    """Write an ``[ENGINE]``-tagged line to stdout and flush immediately.

    Flushing matters here: the process may run frozen (PyInstaller) or with
    piped output, where buffered prints would otherwise appear late or not
    at all on a crash.
    """
    sys.stdout.write(f"[ENGINE] {msg}\n")
    sys.stdout.flush()
# --- 2. PATH SETUP ---
# When bundled by PyInstaller, sys.frozen is set and resources live next to
# the executable; otherwise they live next to this source file.
if getattr(sys, 'frozen', False):
    BASE_DIR = os.path.dirname(sys.executable)
else:
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# The GGUF model file is expected to sit alongside the app.
MODEL_PATH = os.path.join(BASE_DIR, "model.gguf")
log(f"Base Directory: {BASE_DIR}")

app = Flask(__name__)
# --- 3. THE "MONKEY PATCH" (CRITICAL FIX) ---
# We intercept the library's attempt to set up logging and stop it.
# NOTE(review): this must run BEFORE Llama() is constructed in section 4 —
# presumably the native log callback misbehaves in this environment; confirm.
try:
    import llama_cpp

    # Create a dummy function that does NOTHING.
    def dummy_log_set(callback, user_data):
        return

    # Overwrite the library's internal function with our dummy.
    # Now, when Llama() runs, it CALLS this instead of the C function.
    llama_cpp.llama_log_set = dummy_log_set
    log("Successfully patched Llama logging.")
except Exception as e:
    # Best-effort: if patching fails we still try to load the model below.
    log(f"Patch warning: {e}")
# --- 4. LOAD MODEL ---
# llm stays None on any failure; the routes below use it as a health flag.
llm = None
try:
    from llama_cpp import Llama

    # Use at most half the cores (minimum 1) so the host stays responsive.
    total_cores = multiprocessing.cpu_count()
    safe_threads = max(1, int(total_cores * 0.5))

    if not os.path.exists(MODEL_PATH):
        log("CRITICAL ERROR: model.gguf is missing!")
    else:
        log("Loading Model...")
        llm = Llama(
            model_path=MODEL_PATH,
            n_ctx=4096,              # context window size in tokens
            n_threads=safe_threads,
            n_gpu_layers=0,          # CPU-only inference
            verbose=False,
            chat_format="gemma",     # model-specific chat template
            use_mmap=False           # disable memory-mapping of the model file
        )
        log("Model Loaded Successfully!")
except Exception as e:
    # Log the full traceback — startup failures here are otherwise invisible.
    log(f"CRITICAL EXCEPTION during load: {e}")
    log(traceback.format_exc())
@app.route('/', methods=['GET'])
def health_check():
    """Liveness probe: 200 while the model is loaded, 500 if loading failed."""
    return ("OK", 200) if llm else ("MODEL_FAILED", 500)
@app.route('/chat_stream', methods=['POST'])
def chat_stream():
    """Stream a chat completion as Server-Sent Events.

    Expects a JSON body with a "message" field. Responds with a
    text/event-stream whose events are ``data: {"chunk": <text>}`` lines.
    If the model failed to initialize, a single error event is returned.
    """
    if not llm:
        return Response(
            "data: " + json.dumps({'chunk': "Error: Brain failed initialization."}) + "\n\n",
            mimetype='text/event-stream',
        )

    # BUG FIX: request.json raises (or yields None) on a missing/invalid JSON
    # body; get_json(silent=True) plus the `or {}` guard makes a bad payload
    # degrade to an empty message instead of a 500 before streaming starts.
    data = request.get_json(silent=True) or {}
    messages = [{"role": "user", "content": data.get('message', '')}]

    def generate():
        # Generator body runs after the response starts, so errors must be
        # reported in-band as a final SSE chunk rather than an HTTP status.
        try:
            stream = llm.create_chat_completion(messages=messages, max_tokens=1000, stream=True)
            for chunk in stream:
                delta = chunk['choices'][0]['delta']  # hoist the repeated lookup
                if 'content' in delta:
                    yield f"data: {json.dumps({'chunk': delta['content']})}\n\n"
        except Exception as e:
            log(f"Gen Error: {e}")
            yield f"data: {json.dumps({'chunk': ' Error.'})}\n\n"

    # BUG FIX: stream_with_context was used here but never imported at module
    # level (NameError on every request); it is now imported from flask at the
    # top of the file. It keeps the request context alive while streaming.
    return Response(stream_with_context(generate()), mimetype='text/event-stream')
if __name__ == '__main__':
    log("Starting Waitress Server on Port 5000...")
    try:
        # Bind to loopback only — this engine is meant to be consumed by a
        # local frontend process, not exposed on the network.
        serve(app, host='127.0.0.1', port=5000, threads=6)
    except Exception as e:
        # Last-chance log so a bind failure (e.g. port in use) is visible.
        log(f"Server Crash: {e}")