import json
import multiprocessing
import os
import sys
import traceback

from flask import Flask, Response, request, stream_with_context
from waitress import serve
| |
|
| | |
def log(msg):
    """Write *msg* to stdout with an [ENGINE] prefix and flush immediately.

    Flushing matters because this process is typically a child whose stdout
    is consumed by a parent; unflushed output would be lost or delayed.
    """
    sys.stdout.write(f"[ENGINE] {msg}\n")
    sys.stdout.flush()
| |
|
| | |
# Resolve the resource directory: next to the executable when running as a
# frozen (PyInstaller-style) binary, otherwise next to this source file.
_is_frozen = bool(getattr(sys, 'frozen', False))
BASE_DIR = os.path.dirname(
    sys.executable if _is_frozen else os.path.abspath(__file__)
)

# The GGUF model file is expected to sit beside the executable/script.
MODEL_PATH = os.path.join(BASE_DIR, "model.gguf")
log(f"Base Directory: {BASE_DIR}")
| |
|
| | app = Flask(__name__) |
| |
|
| | |
| | |
# Best-effort: replace llama.cpp's native log-callback installer with a
# no-op so the C library stays quiet. Failure here is non-fatal — we only
# lose log suppression, not functionality.
try:
    import llama_cpp

    def _silent_log_set(callback, user_data):
        """No-op stand-in for llama_cpp.llama_log_set; discards the callback."""
        return

    llama_cpp.llama_log_set = _silent_log_set
    log("Successfully patched Llama logging.")
except Exception as e:
    log(f"Patch warning: {e}")
| |
|
| | |
# Global model handle; stays None when loading fails so the HTTP handlers
# can report the failure instead of crashing.
llm = None
try:
    from llama_cpp import Llama

    # Use half the machine's cores (at least one) so other processes on the
    # host stay responsive while the model runs.
    safe_threads = max(1, multiprocessing.cpu_count() // 2)

    if os.path.exists(MODEL_PATH):
        log("Loading Model...")
        llm = Llama(
            model_path=MODEL_PATH,
            n_ctx=4096,
            n_threads=safe_threads,
            n_gpu_layers=0,       # CPU-only inference
            verbose=False,
            chat_format="gemma",
            use_mmap=False,
        )
        log("Model Loaded Successfully!")
    else:
        log("CRITICAL ERROR: model.gguf is missing!")

except Exception as e:
    log(f"CRITICAL EXCEPTION during load: {e}")
    log(traceback.format_exc())
| |
|
@app.route('/', methods=['GET'])
def health_check():
    """Liveness probe: 200 "OK" when the model loaded, 500 otherwise."""
    return ("OK", 200) if llm else ("MODEL_FAILED", 500)
| |
|
@app.route('/chat_stream', methods=['POST'])
def chat_stream():
    """Stream a chat completion to the client as Server-Sent Events.

    Expects a JSON body like ``{"message": "..."}`` and emits
    ``data: {"chunk": "..."}\\n\\n`` events containing incremental
    model output. Errors are reported in-band as a final chunk.
    """
    if not llm:
        return Response("data: " + json.dumps({'chunk': "Error: Brain failed initialization."}) + "\n\n", mimetype='text/event-stream')

    # Tolerate a missing or malformed JSON body instead of letting Flask
    # raise a 400 from request.json.
    data = request.get_json(silent=True) or {}
    messages = [{"role": "user", "content": data.get('message', '')}]

    def generate():
        try:
            stream = llm.create_chat_completion(messages=messages, max_tokens=1000, stream=True)
            for chunk in stream:
                delta = chunk['choices'][0]['delta']
                # The first delta carries only the role; emit only once
                # actual content appears.
                if 'content' in delta:
                    yield f"data: {json.dumps({'chunk': delta['content']})}\n\n"
        except Exception as e:
            log(f"Gen Error: {e}")
            yield f"data: {json.dumps({'chunk': ' Error.'})}\n\n"

    # stream_with_context keeps the request context alive while the generator
    # runs. Bug fix: this name was previously used without being imported,
    # so every streaming request died with a NameError (now imported at top).
    return Response(stream_with_context(generate()), mimetype='text/event-stream')
| |
|
if __name__ == '__main__':
    # Waitress is used instead of Flask's dev server for production stability;
    # bind to loopback only since this engine is a local backend.
    log("Starting Waitress Server on Port 5000...")
    try:
        serve(app, host='127.0.0.1', port=5000, threads=6)
    except Exception as exc:
        log(f"Server Crash: {exc}")