"""Local LLM inference engine: loads a GGUF model via llama-cpp-python and
serves it over HTTP (waitress) with a Server-Sent-Events streaming chat route.

Routes:
    GET  /            -- health check ("OK" iff the model loaded).
    POST /chat_stream -- SSE stream of completion chunks for a user message.
"""

import sys
import os
import multiprocessing
import json
import traceback

# FIX: stream_with_context was used in chat_stream() but never imported,
# causing a NameError on every request to /chat_stream.
from flask import Flask, request, Response, stream_with_context
from waitress import serve


# --- 1. SETUP LOGGING --- #
def log(msg):
    """Print a tagged, immediately-flushed engine log line."""
    print(f"[ENGINE] {msg}", flush=True)


# --- 2. PATH SETUP --- #
# When frozen (e.g. PyInstaller), resources live next to the executable,
# not next to this source file.
if getattr(sys, 'frozen', False):
    BASE_DIR = os.path.dirname(sys.executable)
else:
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))

MODEL_PATH = os.path.join(BASE_DIR, "model.gguf")
log(f"Base Directory: {BASE_DIR}")

app = Flask(__name__)

# --- 3. THE "MONKEY PATCH" (CRITICAL FIX) ---
# We intercept the library's attempt to set up logging and stop it.
try:
    import llama_cpp

    # Dummy replacement that does NOTHING: when Llama() runs, it calls
    # this instead of the native C logging hook.
    def dummy_log_set(callback, user_data):
        return

    llama_cpp.llama_log_set = dummy_log_set
    log("Successfully patched Llama logging.")
except Exception as e:
    log(f"Patch warning: {e}")

# --- 4. LOAD MODEL ---
llm = None
try:
    from llama_cpp import Llama

    # Use half the cores (min 1) so the host stays responsive.
    total_cores = multiprocessing.cpu_count()
    safe_threads = max(1, int(total_cores * 0.5))

    if not os.path.exists(MODEL_PATH):
        log("CRITICAL ERROR: model.gguf is missing!")
    else:
        log("Loading Model...")
        llm = Llama(
            model_path=MODEL_PATH,
            n_ctx=4096,
            n_threads=safe_threads,
            n_gpu_layers=0,      # CPU-only inference
            verbose=False,
            chat_format="gemma",
            use_mmap=False,
        )
        log("Model Loaded Successfully!")
except Exception as e:
    log(f"CRITICAL EXCEPTION during load: {e}")
    log(traceback.format_exc())


@app.route('/', methods=['GET'])
def health_check():
    """Return 200 "OK" when the model is loaded, else 500."""
    if llm:
        return "OK", 200
    return "MODEL_FAILED", 500


@app.route('/chat_stream', methods=['POST'])
def chat_stream():
    """Stream a chat completion for the posted message as SSE `data:` events.

    Expects a JSON body of the form {"message": "..."}; each SSE event
    payload is {"chunk": "<token text>"}.
    """
    if not llm:
        return Response(
            "data: " + json.dumps({'chunk': "Error: Brain failed initialization."}) + "\n\n",
            mimetype='text/event-stream',
        )

    # silent=True: tolerate a missing/invalid JSON body instead of raising.
    data = request.get_json(silent=True) or {}
    messages = [{"role": "user", "content": data.get('message', '')}]

    def generate():
        """Yield SSE-framed content deltas from the model stream."""
        try:
            stream = llm.create_chat_completion(
                messages=messages,
                max_tokens=1000,
                stream=True,
            )
            for chunk in stream:
                # .get guards against chunks without a 'delta' key
                # (e.g. the final stop chunk).
                delta = chunk['choices'][0].get('delta', {})
                if 'content' in delta:
                    yield f"data: {json.dumps({'chunk': delta['content']})}\n\n"
        except Exception as e:
            log(f"Gen Error: {e}")
            yield f"data: {json.dumps({'chunk': ' Error.'})}\n\n"

    # stream_with_context keeps the request context alive while the
    # generator is consumed.
    return Response(stream_with_context(generate()), mimetype='text/event-stream')


if __name__ == '__main__':
    log("Starting Waitress Server on Port 5000...")
    try:
        serve(app, host='127.0.0.1', port=5000, threads=6)
    except Exception as e:
        log(f"Server Crash: {e}")