import json
import multiprocessing
import os
import sys
import traceback

from flask import Flask, Response, request, stream_with_context
from waitress import serve
| |
|
| | |
def log(msg):
    """Write *msg* to stdout with an [ENGINE] prefix and flush immediately.

    Flushing matters because this process is typically a child whose stdout
    is consumed by a parent; unflushed output would be lost or delayed.
    """
    sys.stdout.write(f"[ENGINE] {msg}\n")
    sys.stdout.flush()
| |
|
| | |
# Resolve the resource directory: next to the executable when running as a
# frozen (PyInstaller-style) binary, otherwise next to this source file.
_is_frozen = bool(getattr(sys, 'frozen', False))
BASE_DIR = os.path.dirname(
    sys.executable if _is_frozen else os.path.abspath(__file__)
)

# The GGUF model file is expected to sit beside the executable/script.
MODEL_PATH = os.path.join(BASE_DIR, "model.gguf")
log(f"Base Directory: {BASE_DIR}")
| |
|
| | app = Flask(__name__) |
| |
|
| | |
| | |
# Best-effort: replace llama.cpp's native log-callback installer with a
# no-op so the C library stays quiet. Failure here is non-fatal — we only
# lose log suppression, not functionality.
try:
    import llama_cpp

    def _silent_log_set(callback, user_data):
        """No-op stand-in for llama_cpp.llama_log_set; discards the callback."""
        return

    llama_cpp.llama_log_set = _silent_log_set
    log("Successfully patched Llama logging.")
except Exception as e:
    log(f"Patch warning: {e}")
| |
|
| | |
# Global model handle; stays None when loading fails so the HTTP handlers
# can report the failure instead of crashing.
llm = None
try:
    from llama_cpp import Llama

    # Use half the machine's cores (at least one) so other processes on the
    # host stay responsive while the model runs.
    safe_threads = max(1, multiprocessing.cpu_count() // 2)

    if os.path.exists(MODEL_PATH):
        log("Loading Model...")
        llm = Llama(
            model_path=MODEL_PATH,
            n_ctx=4096,
            n_threads=safe_threads,
            n_gpu_layers=0,       # CPU-only inference
            verbose=False,
            chat_format="gemma",
            use_mmap=False,
        )
        log("Model Loaded Successfully!")
    else:
        log("CRITICAL ERROR: model.gguf is missing!")

except Exception as e:
    log(f"CRITICAL EXCEPTION during load: {e}")
    log(traceback.format_exc())
| |
|
@app.route('/', methods=['GET'])
def health_check():
    """Liveness probe: 200 "OK" when the model loaded, 500 otherwise."""
    return ("OK", 200) if llm else ("MODEL_FAILED", 500)
| |
|
@app.route('/chat_stream', methods=['POST'])
def chat_stream():
    """Stream a chat completion to the client as Server-Sent Events.

    Expects a JSON body like ``{"message": "..."}`` and emits
    ``data: {"chunk": "..."}\\n\\n`` events containing incremental
    model output. Errors are reported in-band as a final chunk.
    """
    if not llm:
        return Response("data: " + json.dumps({'chunk': "Error: Brain failed initialization."}) + "\n\n", mimetype='text/event-stream')

    # Tolerate a missing or malformed JSON body instead of letting Flask
    # raise a 400 from request.json.
    data = request.get_json(silent=True) or {}
    messages = [{"role": "user", "content": data.get('message', '')}]

    def generate():
        try:
            stream = llm.create_chat_completion(messages=messages, max_tokens=1000, stream=True)
            for chunk in stream:
                delta = chunk['choices'][0]['delta']
                # The first delta carries only the role; emit only once
                # actual content appears.
                if 'content' in delta:
                    yield f"data: {json.dumps({'chunk': delta['content']})}\n\n"
        except Exception as e:
            log(f"Gen Error: {e}")
            yield f"data: {json.dumps({'chunk': ' Error.'})}\n\n"

    # stream_with_context keeps the request context alive while the generator
    # runs. Bug fix: this name was previously used without being imported,
    # so every streaming request died with a NameError (now imported at top).
    return Response(stream_with_context(generate()), mimetype='text/event-stream')
| |
|
if __name__ == '__main__':
    # Waitress is used instead of Flask's dev server for production stability;
    # bind to loopback only since this engine is a local backend.
    log("Starting Waitress Server on Port 5000...")
    try:
        serve(app, host='127.0.0.1', port=5000, threads=6)
    except Exception as exc:
        log(f"Server Crash: {exc}")