File size: 2,947 Bytes
7843c42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import json
import multiprocessing
import os
import sys
import traceback

from flask import Flask, Response, request, stream_with_context
from waitress import serve

# --- 1. SETUP LOGGING ---
def log(msg):
    """Emit a tagged engine message to stdout, flushing immediately.

    Flushing matters because this process is typically run with its stdout
    captured by a parent process (frozen/packaged deployment).
    """
    print("[ENGINE] " + str(msg), flush=True)

# --- 2. PATH SETUP ---
# Resolve the directory the application lives in. When frozen by a bundler
# (PyInstaller sets sys.frozen), resources sit next to the executable;
# otherwise they sit next to this source file.
if getattr(sys, 'frozen', False):
    BASE_DIR = os.path.dirname(sys.executable)
else:
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# The GGUF model is expected to ship alongside the binary/script.
MODEL_PATH = os.path.join(BASE_DIR, "model.gguf")
log(f"Base Directory: {BASE_DIR}")

# Flask application instance; routes are registered on it below.
app = Flask(__name__)

# --- 3. THE "MONKEY PATCH" (CRITICAL FIX) ---
# We intercept the library's attempt to set up logging and stop it.
# NOTE(review): this replaces llama_cpp.llama_log_set with a no-op BEFORE
# Llama() is constructed below, so the native log-callback registration is
# skipped — presumably to avoid crashes/noise when stdout/stderr are
# redirected in a frozen build. Confirm against the llama-cpp-python version
# in use: the patch only works if Llama() resolves llama_log_set via the
# module attribute rather than a direct C binding.
try:
    import llama_cpp
    
    # Create a dummy function that does NOTHING
    def dummy_log_set(callback, user_data):
        return
    
    # Overwrite the library's internal function with our dummy
    # Now, when Llama() runs, it CALLS this instead of the C function.
    llama_cpp.llama_log_set = dummy_log_set
    
    log("Successfully patched Llama logging.")
except Exception as e:
    # Best-effort: if llama_cpp is absent or the attribute moved, we log and
    # continue — model loading below will surface any real failure.
    log(f"Patch warning: {e}")

# --- 4. LOAD MODEL ---
# Module-level model handle; None signals "failed to load" to the routes
# below (health_check returns 500, chat_stream returns an error event).
llm = None
try:
    from llama_cpp import Llama
    
    # Use half the logical cores (at least 1) so the host stays responsive.
    total_cores = multiprocessing.cpu_count()
    safe_threads = max(1, int(total_cores * 0.5))

    if not os.path.exists(MODEL_PATH):
        # Deliberately does not raise: server still starts and reports
        # MODEL_FAILED via the health endpoint instead of crashing.
        log("CRITICAL ERROR: model.gguf is missing!")
    else:
        log("Loading Model...")
        llm = Llama(
            model_path=MODEL_PATH, 
            n_ctx=4096,             
            n_threads=safe_threads,
            n_gpu_layers=0,
            verbose=False,    
            chat_format="gemma",
            use_mmap=False    
        )
        log("Model Loaded Successfully!")

except Exception as e:
    # Any loader failure (bad file, OOM, ABI mismatch) is logged with a full
    # traceback; llm stays None so the HTTP layer degrades gracefully.
    log(f"CRITICAL EXCEPTION during load: {e}")
    log(traceback.format_exc())

@app.route('/', methods=['GET'])
def health_check():
    """Liveness probe: 200 'OK' when the model loaded, 500 'MODEL_FAILED' otherwise."""
    return ("OK", 200) if llm else ("MODEL_FAILED", 500)

@app.route('/chat_stream', methods=['POST'])
def chat_stream():
    """Stream a chat completion as Server-Sent Events.

    Expects a JSON body {"message": "..."}; each generated token is emitted
    as an SSE line: `data: {"chunk": "<text>"}`. If the model failed to load,
    a single error event is returned instead.
    """
    if not llm:
        return Response("data: " + json.dumps({'chunk': "Error: Brain failed initialization."}) + "\n\n", mimetype='text/event-stream')

    # request.json raises / returns None when the body is missing or not
    # declared as JSON; get_json(silent=True) + fallback keeps us robust.
    data = request.get_json(silent=True) or {}
    messages = [{"role": "user", "content": data.get('message', '')}]

    def generate():
        # Generator yielding SSE-framed token chunks from the model stream.
        try:
            stream = llm.create_chat_completion(messages=messages, max_tokens=1000, stream=True)
            for chunk in stream:
                delta = chunk['choices'][0]['delta']
                # Skip role-only deltas; only forward actual content tokens.
                if 'content' in delta:
                    yield f"data: {json.dumps({'chunk': delta['content']})}\n\n"
        except Exception as e:
            # Surface generation failures to the client as a final chunk
            # rather than silently closing the stream.
            log(f"Gen Error: {e}")
            yield f"data: {json.dumps({'chunk': ' Error.'})}\n\n"

    # FIX: stream_with_context was previously referenced without being
    # imported (NameError on every request); it is now imported from flask.
    # It keeps the request context alive while the generator is consumed.
    return Response(stream_with_context(generate()), mimetype='text/event-stream')

if __name__ == '__main__':
    # Serve only on loopback: this engine is a local sidecar, not a public
    # endpoint. waitress handles concurrency with a small thread pool.
    log("Starting Waitress Server on Port 5000...")
    try:
        serve(app, host='127.0.0.1', port=5000, threads=6)
    except Exception as e:
        # e.g. port 5000 already in use — log instead of a bare traceback so
        # the parent process capturing stdout sees a tagged message.
        log(f"Server Crash: {e}")