"""Serve the Ministral 3B HMC chat model over a local HTTP server.

Endpoints:
    GET  /        -> minimal HTML chat page
    GET  /health  -> JSON status probe
    POST /chat    -> {"message": ...} in, {"reply": ...} out

Messages that are pure arithmetic are evaluated exactly with a restricted
AST walker instead of being sent to the language model.
"""

import argparse
import ast
import json
import operator
import re
import threading
from http import HTTPStatus
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
from urllib.parse import urlparse

from ministral_3b_hmc_chat import (
    DEFAULT_ADAPTER_DIR,
    DEFAULT_MODEL_ID,
    SYSTEM_PROMPT,
    build_prompt,
    generate_reply,
    load_model,
)

SERVER_VERSION = "ministral-hmc-server-2026-03-22-v1"

# System prompt swapped in when a request looks like a math question.
MATH_SYSTEM_PROMPT = "You are RubiNet. Solve math problems carefully and step by step. Verify arithmetic before answering. Keep the reasoning concise but clear, and end with 'Final answer: ...'."

# Keywords that mark a message as math-related even without digits/operators.
MATH_KEYWORDS = (
    "calculate",
    "compute",
    "evaluate",
    "solve",
    "equation",
    "math",
    "algebra",
    "geometry",
    "probability",
    "percentage",
    "percent",
    "sum",
    "product",
    "difference",
    "quotient",
)

# Whitelist of AST node types the safe calculator accepts; anything else
# (names, calls, attributes, ...) is rejected before evaluation.
ALLOWED_CALC_NODES = {
    ast.Expression,
    ast.BinOp,
    ast.UnaryOp,
    ast.Constant,
    ast.Add,
    ast.Sub,
    ast.Mult,
    ast.Div,
    ast.FloorDiv,
    ast.Mod,
    ast.Pow,
    ast.USub,
    ast.UAdd,
}

CALC_BIN_OPS = {
    ast.Add: operator.add,
    ast.Sub: operator.sub,
    ast.Mult: operator.mul,
    ast.Div: operator.truediv,
    ast.FloorDiv: operator.floordiv,
    ast.Mod: operator.mod,
    ast.Pow: operator.pow,
}

CALC_UNARY_OPS = {
    ast.UAdd: operator.pos,
    ast.USub: operator.neg,
}

# NOTE(review): the HTML markup appears truncated in this copy of the file —
# only the text content of the page survived. Preserved byte-for-byte;
# restore the full template from version control if available.
HTML_PAGE = """ RubiNet Chat

RubiNet Local Chat

Version: __VERSION__
Model: __MODEL__
Adapter: __ADAPTER__
"""


def looks_like_math_query(message: str) -> bool:
    """Return True if *message* heuristically looks like a math question."""
    normalized = message.strip().lower()
    if not normalized:
        return False
    # A digit combined with an arithmetic symbol is decisive on its own.
    if re.search(r"\d", normalized) and re.search(r"[+\-*/=^×÷%]", normalized):
        return True
    return any(keyword in normalized for keyword in MATH_KEYWORDS)


def extract_simple_expression(message: str) -> tuple[str, str] | None:
    """Extract a plain arithmetic expression from *message*, if there is one.

    Returns a ``(display, compact)`` pair — the cleaned expression as shown
    to the user, and a whitespace-free version for parsing — or ``None``
    when the message is not a pure arithmetic expression.
    """
    normalized = message.strip()
    # Strip common question prefixes in one case-insensitive pass
    # (equivalent to stripping each word separately: \b cannot match
    # between two letters, so removals never create new matches).
    normalized = re.sub(
        r"(?i)\b(?:what is|calculate|compute|evaluate|solve)\b", "", normalized
    )
    # Normalize unicode operators and caret exponentiation to Python syntax.
    normalized = normalized.replace("×", "*").replace("÷", "/").replace("^", "**")
    normalized = normalized.replace("=?", "").replace("= ?", "").replace("=", "")
    normalized = normalized.replace("?", "").strip()
    if not normalized:
        return None
    # Only digits, whitespace, and arithmetic punctuation may remain.
    if not re.fullmatch(r"[0-9\s\.+\-*/()%]*", normalized):
        return None
    # Require at least one digit and one operator to call it an expression.
    if not re.search(r"\d", normalized) or not re.search(r"[+\-*/%()]", normalized):
        return None
    compact = re.sub(r"\s+", "", normalized)
    return normalized, compact


def _eval_calc_node(node):
    """Recursively evaluate a whitelisted arithmetic AST node.

    Raises ValueError for any node, operator, or constant outside the
    whitelist, so untrusted input cannot trigger arbitrary evaluation.
    """
    if type(node) not in ALLOWED_CALC_NODES:
        raise ValueError("Unsupported expression.")
    if isinstance(node, ast.Expression):
        return _eval_calc_node(node.body)
    if isinstance(node, ast.Constant):
        if not isinstance(node.value, (int, float)):
            raise ValueError("Unsupported constant.")
        # Evaluate in float so huge powers overflow (OverflowError) instead
        # of building arbitrarily large ints.
        return float(node.value)
    if isinstance(node, ast.UnaryOp):
        op_type = type(node.op)
        if op_type not in CALC_UNARY_OPS:
            raise ValueError("Unsupported unary operator.")
        return CALC_UNARY_OPS[op_type](_eval_calc_node(node.operand))
    if isinstance(node, ast.BinOp):
        op_type = type(node.op)
        if op_type not in CALC_BIN_OPS:
            raise ValueError("Unsupported binary operator.")
        left = _eval_calc_node(node.left)
        right = _eval_calc_node(node.right)
        return CALC_BIN_OPS[op_type](left, right)
    raise ValueError("Unsupported expression.")


def evaluate_simple_expression(expression: str) -> str:
    """Parse and evaluate *expression*, returning a formatted result string.

    Integral results are rendered without a decimal point; everything else
    uses 12 significant digits. May raise SyntaxError (malformed input),
    ValueError (disallowed node), ZeroDivisionError, or OverflowError.
    """
    parsed = ast.parse(expression, mode="eval")
    value = _eval_calc_node(parsed)
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return f"{value:.12g}"


class MinistralHMCService:
    """Holds the model plus generation settings behind a thread-safe reply()."""

    def __init__(self, model_id: str, adapter_dir: str, system_prompt: str,
                 max_new_tokens: int, temperature: float, top_p: float,
                 use_4bit: bool, cpu_dtype: str, offload_folder: str):
        self.model_id = model_id
        self.adapter_dir = adapter_dir
        self.system_prompt = system_prompt
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature
        self.top_p = top_p
        self.use_4bit = use_4bit
        self.cpu_dtype = cpu_dtype
        self.offload_folder = offload_folder
        self.tokenizer = None
        self.model = None
        # Serializes model generation across ThreadingHTTPServer threads.
        self._generation_lock = threading.Lock()

    def load(self):
        """Load tokenizer and model (slow; call once before serving)."""
        self.tokenizer, self.model = load_model(
            self.model_id,
            self.adapter_dir,
            self.use_4bit,
            self.cpu_dtype,
            self.offload_folder,
        )

    def reply(self, message: str) -> str:
        """Answer *message*: exact arithmetic when possible, else the model."""
        # Fast path: pure arithmetic is answered exactly. This path does not
        # touch the model, so it runs OUTSIDE the generation lock — math
        # queries no longer queue behind slow generations.
        simple_expression = extract_simple_expression(message)
        if simple_expression is not None:
            original_expression, compact_expression = simple_expression
            try:
                exact_answer = evaluate_simple_expression(compact_expression)
            except (SyntaxError, ValueError, ZeroDivisionError, OverflowError):
                # Malformed or unevaluable (e.g. "(3+" or "1/0"): fall
                # through to the model instead of surfacing a 500.
                pass
            else:
                return f"Expression: {original_expression}\nVerified result: {exact_answer}\nFinal answer: {exact_answer}"
        system_prompt = MATH_SYSTEM_PROMPT if looks_like_math_query(message) else self.system_prompt
        prompt = build_prompt(message, system_prompt)
        with self._generation_lock:
            return generate_reply(
                self.tokenizer,
                self.model,
                prompt,
                self.max_new_tokens,
                self.temperature,
                self.top_p,
            )


class ChatHandler(BaseHTTPRequestHandler):
    """Request handler: GET / (page), GET /health (probe), POST /chat."""

    # Shared MinistralHMCService instance, assigned in main() before serving.
    service = None

    def _send_json(self, payload, status=HTTPStatus.OK):
        """Serialize *payload* as UTF-8 JSON and send it with *status*."""
        body = json.dumps(payload, ensure_ascii=False).encode("utf-8")
        self.send_response(status)
        self.send_header("Content-Type", "application/json; charset=utf-8")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def _send_html(self, html: str):
        """Send *html* as a UTF-8 200 response."""
        body = html.encode("utf-8")
        self.send_response(HTTPStatus.OK)
        self.send_header("Content-Type", "text/html; charset=utf-8")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def do_GET(self):
        """Serve the health probe or the chat page; 404 anything else."""
        path = urlparse(self.path).path
        if path == "/health":
            self._send_json({
                "status": "ok",
                "model": self.service.model_id,
                "adapter": self.service.adapter_dir,
                "version": SERVER_VERSION,
            })
            return
        if path != "/":
            self.send_error(HTTPStatus.NOT_FOUND)
            return
        page = HTML_PAGE.replace("__VERSION__", SERVER_VERSION)
        page = page.replace("__MODEL__", self.service.model_id)
        page = page.replace("__ADAPTER__", self.service.adapter_dir)
        self._send_html(page)

    def do_POST(self):
        """Handle POST /chat with a JSON body {"message": "..."}."""
        if urlparse(self.path).path != "/chat":
            self.send_error(HTTPStatus.NOT_FOUND)
            return
        try:
            content_length = int(self.headers.get("Content-Length", "0"))
            body = self.rfile.read(content_length)
            data = json.loads(body.decode("utf-8"))
        except (ValueError, UnicodeDecodeError) as exc:
            # Bad Content-Length, bad encoding, or malformed JSON is a
            # client error: 400, not 500. (JSONDecodeError is a ValueError.)
            self._send_json({"error": f"Invalid request body: {exc}"},
                            status=HTTPStatus.BAD_REQUEST)
            return
        if not isinstance(data, dict):
            # A JSON array/scalar would previously crash with AttributeError.
            self._send_json({"error": "Request body must be a JSON object."},
                            status=HTTPStatus.BAD_REQUEST)
            return
        message = str(data.get("message", "")).strip()
        if not message:
            self._send_json({"error": "Message cannot be empty."},
                            status=HTTPStatus.BAD_REQUEST)
            return
        try:
            reply = self.service.reply(message)
        except Exception as exc:
            # Generation failure is a genuine server-side error.
            self._send_json({"error": str(exc)},
                            status=HTTPStatus.INTERNAL_SERVER_ERROR)
            return
        self._send_json({"reply": reply})

    def log_message(self, format, *args):
        """Silence BaseHTTPRequestHandler's per-request stderr logging."""
        return


def main():
    """Parse CLI arguments, load the model, and serve until interrupted."""
    parser = argparse.ArgumentParser(description="Serve Ministral 3B HMC on a local web server")
    parser.add_argument("--host", default="127.0.0.1")
    parser.add_argument("--port", type=int, default=8036)
    parser.add_argument("--model-id", default=DEFAULT_MODEL_ID)
    parser.add_argument("--adapter-dir", default=DEFAULT_ADAPTER_DIR)
    parser.add_argument("--system-prompt", default=SYSTEM_PROMPT)
    parser.add_argument("--max-new-tokens", type=int, default=32)
    parser.add_argument("--temperature", type=float, default=0.0)
    parser.add_argument("--top-p", type=float, default=1.0)
    parser.add_argument("--use-4bit", action="store_true")
    parser.add_argument("--cpu-dtype", choices=["float32", "float16", "bfloat16"], default="bfloat16")
    parser.add_argument("--offload-folder", default=r"C:\Users\ASUS\CascadeProjects\.hf-offload")
    args = parser.parse_args()
    service = MinistralHMCService(
        model_id=args.model_id,
        adapter_dir=args.adapter_dir,
        system_prompt=args.system_prompt,
        max_new_tokens=args.max_new_tokens,
        temperature=args.temperature,
        top_p=args.top_p,
        use_4bit=args.use_4bit,
        cpu_dtype=args.cpu_dtype,
        offload_folder=args.offload_folder,
    )
    print("Loading Ministral 3B HMC model...")
    service.load()
    print(f"Ministral 3B HMC server ready at http://{args.host}:{args.port}")
    ChatHandler.service = service
    server = ThreadingHTTPServer((args.host, args.port), ChatHandler)
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        pass
    finally:
        server.server_close()


if __name__ == "__main__":
    main()