# Page header residue: Roshan1162003 - "Fresh clean upload (history reset)" - 7843c42
import json
import multiprocessing
import os
import sys
import traceback

from flask import Flask, Response, request, stream_with_context
from waitress import serve
# --- 1. SETUP LOGGING ---
def log(msg):
    """Print *msg* to stdout with an ``[ENGINE]`` prefix, flushing immediately."""
    line = "[ENGINE] {}".format(msg)
    print(line, flush=True)
# --- 2. PATH SETUP ---
# When sys.frozen is truthy (presumably a bundled executable -- confirm which
# freezer is used), resolve files next to the executable; otherwise resolve
# them next to this source file.
BASE_DIR = os.path.dirname(
    sys.executable if getattr(sys, 'frozen', False) else os.path.abspath(__file__)
)

# The model file is expected to sit directly in the base directory.
MODEL_PATH = os.path.join(BASE_DIR, "model.gguf")

log(f"Base Directory: {BASE_DIR}")

app = Flask(__name__)
# --- 3. THE "MONKEY PATCH" (CRITICAL FIX) ---
# We intercept the library's attempt to set up logging and stop it.
try:
    import llama_cpp

    def dummy_log_set(callback, user_data):
        """No-op stand-in for ``llama_cpp.llama_log_set``; ignores both arguments."""

    # Rebind the attribute on the llama_cpp module so that later lookups of
    # llama_cpp.llama_log_set resolve to the no-op above instead of the
    # library's own function.
    setattr(llama_cpp, "llama_log_set", dummy_log_set)
    log("Successfully patched Llama logging.")
except Exception as patch_error:
    # Best effort: any failure here (including a failed import) is only logged.
    log(f"Patch warning: {patch_error}")
# --- 4. LOAD MODEL ---
# Stays None unless the model loads successfully; the route handlers check it.
llm = None
try:
    from llama_cpp import Llama

    # Use half of the reported cores, but never fewer than one thread.
    total_cores = multiprocessing.cpu_count()
    safe_threads = max(1, total_cores // 2)

    if os.path.exists(MODEL_PATH):
        log("Loading Model...")
        llm = Llama(
            model_path=MODEL_PATH,
            chat_format="gemma",
            n_ctx=4096,
            n_threads=safe_threads,
            n_gpu_layers=0,
            use_mmap=False,
            verbose=False,
        )
        log("Model Loaded Successfully!")
    else:
        log("CRITICAL ERROR: model.gguf is missing!")
except Exception as load_error:
    # Any failure (import, thread detection, model construction) leaves llm
    # as None and is reported with its full traceback.
    log(f"CRITICAL EXCEPTION during load: {load_error}")
    log(traceback.format_exc())
@app.route('/', methods=['GET'])
def health_check():
    """Report whether the model is loaded: 200 ``OK`` or 500 ``MODEL_FAILED``."""
    return ("OK", 200) if llm else ("MODEL_FAILED", 500)
@app.route('/chat_stream', methods=['POST'])
def chat_stream():
    """Stream a chat completion for one user message as Server-Sent Events.

    Reads the request's JSON body and uses its ``message`` value (empty
    string if absent) as a single user turn. Every event sent back has the
    form ``data: {"chunk": "<text>"}``.

    Returns:
        A ``text/event-stream`` response. If the model is not loaded, a
        single error event is returned. If the JSON body is not an object,
        a single error event is returned with status 400.
    """
    if not llm:
        return Response("data: " + json.dumps({'chunk': "Error: Brain failed initialization."}) + "\n\n", mimetype='text/event-stream')

    data = request.json
    # A body that parses to something other than an object (null, list,
    # string, ...) has no .get(); reject it explicitly instead of failing
    # with AttributeError and an opaque HTTP 500.
    if not isinstance(data, dict):
        return Response(
            "data: " + json.dumps({'chunk': "Error: Expected a JSON object."}) + "\n\n",
            status=400,
            mimetype='text/event-stream',
        )

    messages = [{"role": "user", "content": data.get('message', '')}]

    def generate():
        """Yield one SSE event per streamed delta that carries content."""
        try:
            stream = llm.create_chat_completion(messages=messages, max_tokens=1000, stream=True)
            for chunk in stream:
                delta = chunk['choices'][0]['delta']
                # Deltas without a 'content' key are skipped.
                if 'content' in delta:
                    yield f"data: {json.dumps({'chunk': delta['content']})}\n\n"
        except Exception as e:
            # The response has already started, so report the failure in-band.
            log(f"Gen Error: {e}")
            yield f"data: {json.dumps({'chunk': ' Error.'})}\n\n"

    # stream_with_context keeps the request context alive while the generator
    # is consumed; it must be imported from flask at the top of the file.
    return Response(stream_with_context(generate()), mimetype='text/event-stream')
if __name__ == '__main__':
    # Script entry point: serve the Flask app with Waitress on the loopback
    # interface only. A server failure is logged rather than re-raised.
    log("Starting Waitress Server on Port 5000...")
    try:
        serve(app, threads=6, port=5000, host='127.0.0.1')
    except Exception as server_error:
        log("Server Crash: {}".format(server_error))