import argparse
import ast
import json
import operator
import re
import threading
from http import HTTPStatus
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
from urllib.parse import urlparse
from ministral_3b_hmc_chat import (
DEFAULT_ADAPTER_DIR,
DEFAULT_MODEL_ID,
SYSTEM_PROMPT,
build_prompt,
generate_reply,
load_model,
)
# Version string reported by /health and substituted into the index page.
SERVER_VERSION = "ministral-hmc-server-2026-03-22-v1"

# System prompt swapped in for the configured default when a message looks math-like.
MATH_SYSTEM_PROMPT = "You are RubiNet. Solve math problems carefully and step by step. Verify arithmetic before answering. Keep the reasoning concise but clear, and end with 'Final answer: ...'."

# Keywords that flag a message as math-related (consumed by looks_like_math_query).
MATH_KEYWORDS = (
    "calculate",
    "compute",
    "evaluate",
    "solve",
    "equation",
    "math",
    "algebra",
    "geometry",
    "probability",
    "percentage",
    "percent",
    "sum",
    "product",
    "difference",
    "quotient",
)

# Whitelist of AST node types the safe calculator accepts; anything outside
# this set makes _eval_calc_node raise ValueError, so no arbitrary Python
# (names, calls, attributes) can ever be evaluated.
ALLOWED_CALC_NODES = {
    ast.Expression,
    ast.BinOp,
    ast.UnaryOp,
    ast.Constant,
    ast.Add,
    ast.Sub,
    ast.Mult,
    ast.Div,
    ast.FloorDiv,
    ast.Mod,
    ast.Pow,
    ast.USub,
    ast.UAdd,
}

# Dispatch table: AST binary-operator node type -> arithmetic function.
CALC_BIN_OPS = {
    ast.Add: operator.add,
    ast.Sub: operator.sub,
    ast.Mult: operator.mul,
    ast.Div: operator.truediv,
    ast.FloorDiv: operator.floordiv,
    ast.Mod: operator.mod,
    ast.Pow: operator.pow,
}

# Dispatch table: AST unary-operator node type -> function.
CALC_UNARY_OPS = {
    ast.UAdd: operator.pos,
    ast.USub: operator.neg,
}

# Index page served at "/". The __VERSION__/__MODEL__/__ADAPTER__ tokens are
# replaced in ChatHandler.do_GET. NOTE(review): the body appears stripped of
# its original HTML markup — confirm against the intended template.
HTML_PAGE = """
RubiNet Chat
RubiNet Local Chat
Version: __VERSION__
Model: __MODEL__
Adapter: __ADAPTER__
"""
def looks_like_math_query(message: str) -> bool:
    """Heuristically decide whether *message* is a math question.

    Returns True when the message contains both a digit and an arithmetic
    symbol, or when any math keyword appears as a whole word. Used only to
    pick the system prompt, so false negatives are harmless.
    """
    normalized = message.strip().lower()
    if not normalized:
        return False
    # A digit together with an arithmetic symbol is a strong signal on its own.
    if re.search(r"\d", normalized) and re.search(r"[+\-*/=^×÷%]", normalized):
        return True
    # Match keywords on word boundaries: a plain substring test fired on
    # unrelated words ("sum" in "summer"/"assume", "math" in "aftermath").
    return any(
        re.search(rf"\b{re.escape(keyword)}\b", normalized)
        for keyword in MATH_KEYWORDS
    )
def extract_simple_expression(message: str) -> tuple[str, str] | None:
normalized = message.strip()
normalized = re.sub(r"(?i)\bwhat is\b", "", normalized)
normalized = re.sub(r"(?i)\bcalculate\b", "", normalized)
normalized = re.sub(r"(?i)\bcompute\b", "", normalized)
normalized = re.sub(r"(?i)\bevaluate\b", "", normalized)
normalized = re.sub(r"(?i)\bsolve\b", "", normalized)
normalized = normalized.replace("×", "*").replace("÷", "/").replace("^", "**")
normalized = normalized.replace("=?", "").replace("= ?", "").replace("=", "")
normalized = normalized.replace("?", "").strip()
if not normalized:
return None
if not re.fullmatch(r"[0-9\s\.+\-*/()%]*", normalized):
return None
if not re.search(r"\d", normalized) or not re.search(r"[+\-*/%()]", normalized):
return None
compact = re.sub(r"\s+", "", normalized)
return normalized, compact
def _eval_calc_node(node):
    """Recursively evaluate one node of a vetted arithmetic AST.

    Supports numeric constants, unary +/-, and the binary operators listed
    in CALC_BIN_OPS. Raises ValueError for any node type outside
    ALLOWED_CALC_NODES, any non-numeric constant, or any unlisted operator,
    so only plain arithmetic can ever execute. Returns a float.
    """
    if type(node) not in ALLOWED_CALC_NODES:
        raise ValueError("Unsupported expression.")
    if isinstance(node, ast.Expression):
        return _eval_calc_node(node.body)
    if isinstance(node, ast.Constant):
        # bool is a subclass of int, so check it explicitly: True/False must
        # not slip through as 1.0/0.0 in a calculator.
        if isinstance(node.value, bool) or not isinstance(node.value, (int, float)):
            raise ValueError("Unsupported constant.")
        return float(node.value)
    if isinstance(node, ast.UnaryOp):
        op_type = type(node.op)
        if op_type not in CALC_UNARY_OPS:
            raise ValueError("Unsupported unary operator.")
        return CALC_UNARY_OPS[op_type](_eval_calc_node(node.operand))
    if isinstance(node, ast.BinOp):
        op_type = type(node.op)
        if op_type not in CALC_BIN_OPS:
            raise ValueError("Unsupported binary operator.")
        left = _eval_calc_node(node.left)
        right = _eval_calc_node(node.right)
        return CALC_BIN_OPS[op_type](left, right)
    raise ValueError("Unsupported expression.")
def evaluate_simple_expression(expression: str) -> str:
    """Evaluate an arithmetic *expression* and return the result as text.

    The expression is parsed with ast and walked by _eval_calc_node, so only
    whitelisted arithmetic is executed. Whole-valued floats are rendered as
    integers ("4" rather than "4.0"); everything else uses 12 significant
    digits.
    """
    result = _eval_calc_node(ast.parse(expression, mode="eval"))
    is_whole = isinstance(result, float) and result.is_integer()
    return str(int(result)) if is_whole else f"{result:.12g}"
class MinistralHMCService:
    """Wrapper around the Ministral 3B HMC model and its generation settings.

    The tokenizer/model are loaded lazily via load(); reply() serializes all
    work behind a lock so concurrent HTTP requests never run generation
    simultaneously.
    """

    def __init__(self, model_id: str, adapter_dir: str, system_prompt: str, max_new_tokens: int, temperature: float, top_p: float, use_4bit: bool, cpu_dtype: str, offload_folder: str):
        self.model_id = model_id
        self.adapter_dir = adapter_dir
        self.system_prompt = system_prompt
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature
        self.top_p = top_p
        self.use_4bit = use_4bit
        self.cpu_dtype = cpu_dtype
        self.offload_folder = offload_folder
        # Populated by load(); None until then.
        self.tokenizer = None
        self.model = None
        # ThreadingHTTPServer handles each request on its own thread; this
        # lock ensures only one generation runs at a time.
        self._generation_lock = threading.Lock()

    def load(self) -> None:
        """Load the tokenizer and (adapter-augmented) model into memory."""
        self.tokenizer, self.model = load_model(
            self.model_id,
            self.adapter_dir,
            self.use_4bit,
            self.cpu_dtype,
            self.offload_folder,
        )

    def reply(self, message: str) -> str:
        """Produce a reply for *message*.

        Pure-arithmetic messages are answered exactly by the safe calculator
        without touching the model; everything else goes through the LLM,
        with the math system prompt substituted when the message looks like
        a math question.
        """
        with self._generation_lock:
            # Fast path: deterministic arithmetic, no model call.
            simple_expression = extract_simple_expression(message)
            if simple_expression is not None:
                original_expression, compact_expression = simple_expression
                exact_answer = evaluate_simple_expression(compact_expression)
                return f"Expression: {original_expression}\nVerified result: {exact_answer}\nFinal answer: {exact_answer}"
            system_prompt = MATH_SYSTEM_PROMPT if looks_like_math_query(message) else self.system_prompt
            prompt = build_prompt(message, system_prompt)
            return generate_reply(
                self.tokenizer,
                self.model,
                prompt,
                self.max_new_tokens,
                self.temperature,
                self.top_p,
            )
class ChatHandler(BaseHTTPRequestHandler):
    """HTTP handler exposing GET / (page), GET /health, and POST /chat."""

    # Shared MinistralHMCService instance, injected by main() before serving.
    service = None

    def _send_json(self, payload, status=HTTPStatus.OK):
        """Serialize *payload* as UTF-8 JSON and write a complete response."""
        body = json.dumps(payload, ensure_ascii=False).encode("utf-8")
        self.send_response(status)
        self.send_header("Content-Type", "application/json; charset=utf-8")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def _send_html(self, html: str):
        """Write *html* as a 200 text/html response."""
        body = html.encode("utf-8")
        self.send_response(HTTPStatus.OK)
        self.send_header("Content-Type", "text/html; charset=utf-8")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def do_GET(self):
        """Serve /health (JSON status) and / (the chat page); 404 otherwise."""
        path = urlparse(self.path).path
        if path == "/health":
            self._send_json({"status": "ok", "model": self.service.model_id, "adapter": self.service.adapter_dir, "version": SERVER_VERSION})
            return
        if path != "/":
            self.send_error(HTTPStatus.NOT_FOUND)
            return
        # Fill in the template placeholders before serving the page.
        page = HTML_PAGE.replace("__VERSION__", SERVER_VERSION)
        page = page.replace("__MODEL__", self.service.model_id)
        page = page.replace("__ADAPTER__", self.service.adapter_dir)
        self._send_html(page)

    def do_POST(self):
        """Handle POST /chat: {"message": ...} -> {"reply": ...}.

        Client-side problems (bad framing, malformed JSON, non-object body,
        empty message) return 400; only failures inside the model service
        return 500.
        """
        if urlparse(self.path).path != "/chat":
            self.send_error(HTTPStatus.NOT_FOUND)
            return
        try:
            content_length = int(self.headers.get("Content-Length", "0"))
            body = self.rfile.read(content_length)
            data = json.loads(body.decode("utf-8"))
        except (ValueError, UnicodeDecodeError) as exc:
            # Malformed request body is the client's fault: 400, not 500.
            # (json.JSONDecodeError is a ValueError subclass.)
            self._send_json({"error": f"Invalid request body: {exc}"}, status=HTTPStatus.BAD_REQUEST)
            return
        if not isinstance(data, dict):
            # e.g. a bare JSON string/number; .get would raise AttributeError.
            self._send_json({"error": "Request body must be a JSON object."}, status=HTTPStatus.BAD_REQUEST)
            return
        message = str(data.get("message", "")).strip()
        if not message:
            self._send_json({"error": "Message cannot be empty."}, status=HTTPStatus.BAD_REQUEST)
            return
        try:
            reply = self.service.reply(message)
        except Exception as exc:
            # Model/service failure is a genuine server error.
            self._send_json({"error": str(exc)}, status=HTTPStatus.INTERNAL_SERVER_ERROR)
            return
        self._send_json({"reply": reply})

    def log_message(self, format, *args):
        # Silence per-request logging to keep stdout clean.
        return
def main():
    """CLI entry point: parse arguments, load the model, serve HTTP forever."""
    arg_parser = argparse.ArgumentParser(description="Serve Ministral 3B HMC on a local web server")
    arg_parser.add_argument("--host", default="127.0.0.1")
    arg_parser.add_argument("--port", type=int, default=8036)
    arg_parser.add_argument("--model-id", default=DEFAULT_MODEL_ID)
    arg_parser.add_argument("--adapter-dir", default=DEFAULT_ADAPTER_DIR)
    arg_parser.add_argument("--system-prompt", default=SYSTEM_PROMPT)
    arg_parser.add_argument("--max-new-tokens", type=int, default=32)
    arg_parser.add_argument("--temperature", type=float, default=0.0)
    arg_parser.add_argument("--top-p", type=float, default=1.0)
    arg_parser.add_argument("--use-4bit", action="store_true")
    arg_parser.add_argument("--cpu-dtype", choices=["float32", "float16", "bfloat16"], default="bfloat16")
    arg_parser.add_argument("--offload-folder", default=r"C:\Users\ASUS\CascadeProjects\.hf-offload")
    opts = arg_parser.parse_args()

    chat_service = MinistralHMCService(
        model_id=opts.model_id,
        adapter_dir=opts.adapter_dir,
        system_prompt=opts.system_prompt,
        max_new_tokens=opts.max_new_tokens,
        temperature=opts.temperature,
        top_p=opts.top_p,
        use_4bit=opts.use_4bit,
        cpu_dtype=opts.cpu_dtype,
        offload_folder=opts.offload_folder,
    )
    print("Loading Ministral 3B HMC model...")
    chat_service.load()
    print(f"Ministral 3B HMC server ready at http://{opts.host}:{opts.port}")

    # Hand the loaded service to the handler class, then serve until Ctrl-C.
    ChatHandler.service = chat_service
    httpd = ThreadingHTTPServer((opts.host, opts.port), ChatHandler)
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        pass
    finally:
        # Always release the listening socket, even on interrupt.
        httpd.server_close()
if __name__ == "__main__":
    # Run the server only when executed as a script, not when imported.
    main()