""" BitnetCppClient — subprocess wrapper around bitnet.cpp's llama-cli binary. Replaces the transformers.AutoModelForCausalLM inference path with a subprocess call to microsoft/BitNet's llama.cpp-derivative runtime. The runtime uses specialized ternary-weight kernels, delivering BitNet's actual inference-efficiency benefits (which are absent when loading the bf16 master weights through transformers). API contract intentionally mirrors the minimal surface NuWave needs: client = BitnetCppClient(binary, gguf_path) response = client.generate(prompt, max_new_tokens=N, temperature=T, ...) Subprocess-based so each call is isolated — no shared state between generations, no model-instance lifecycle to manage. Slight per-call overhead (binary startup + mmap load) but bitnet.cpp is fast enough that this is negligible vs. token-generation time on CPU. # ---- Changelog ---- # [2026-04-19] Claude Code (Opus 4.6) — Initial creation # What: Thin wrapper for bitnet.cpp's llama-cli binary. Exposes the # generation params NuWave needs: temperature, top_p, # repetition_penalty, no_repeat_ngram_size, stop sequences, # max_new_tokens. # Why: Migration off transformers bf16 — see NuWave.md and the # 2026-04-19 dev-log for the full rationale. Three pathologies # with the prior BitNet-through-transformers setup: (1) not # actually efficient despite the claim, (2) greedy decoding # collapsed to repetition loops on enumeration tasks, (3) no # repetition_penalty knob available in the transformers call # path we had built. All three solved by bitnet.cpp + proper # sampling params. # How: subprocess.run with llama-cli invocation. Strips the prompt # echo + chat-template chrome from stdout. Captures stderr for # diagnostics. Timeout bounded so a hung generation can't # stall the organism indefinitely. # ------------------- """ from __future__ import annotations import glob import logging import os import subprocess import time from typing import List, Optional, Tuple logger = logging.getLogger("nuwave.bitnet_cpp_client") class BitnetCppClient: """Generates text via microsoft/BitNet's llama-cli binary. Args: binary_path: path to the compiled llama-cli executable. gguf_path: path to the .gguf model weights. n_threads: CPU threads for inference (HF basic = 2 vCPUs). n_ctx: context window in tokens (model-dependent limit). default_timeout_s: per-call wall-clock cap. Bounded to protect the organism from an unresponsive runtime. Class convenience: BitnetCppClient.resolve_gguf(dir_path) — finds the largest .gguf in a directory. Used because HF repos ship multiple quant levels and we want the one with richest weights. """ def __init__( self, binary_path: str, gguf_path: str, n_threads: int = 2, n_ctx: int = 4096, default_timeout_s: int = 900, ): if not os.path.exists(binary_path): raise FileNotFoundError(f"bitnet.cpp binary not found: {binary_path}") if not os.path.exists(gguf_path): raise FileNotFoundError(f"GGUF weights not found: {gguf_path}") self.binary_path = binary_path self.gguf_path = gguf_path self.n_threads = n_threads self.n_ctx = n_ctx self.default_timeout_s = default_timeout_s parent = os.path.basename(os.path.dirname(gguf_path)) or "/" size_mb = os.path.getsize(gguf_path) / (1024 * 1024) logger.info( "BitnetCppClient ready: binary=%s gguf=%s/%s size=%.0fMB threads=%d ctx=%d", binary_path, parent, os.path.basename(gguf_path), size_mb, n_threads, n_ctx, ) # Sanity-check the binary — run `--help` once to confirm it's # executable and find out what flags it actually accepts. 
        # this fails, subsequent generation calls will also fail;
        # logging it at startup makes the failure mode obvious instead
        # of manifesting as silent zeros during inference.
        try:
            help_result = subprocess.run(
                [binary_path, "--help"],
                capture_output=True,
                text=True,
                timeout=10,
            )
            help_out = (help_result.stdout or "") + (help_result.stderr or "")
            # Log first ~500 chars — enough to see what the binary is + its flag prefixes
            snippet = help_out[:500].replace("\n", " | ")
            logger.info(
                "Binary sanity-check rc=%d help_snippet=%s",
                help_result.returncode,
                snippet,
            )
        except Exception as exc:
            logger.warning("Binary sanity-check failed: %s", exc)

    @staticmethod
    def resolve_gguf(directory: str) -> str:
        """Find the largest .gguf file in a directory (searches recursively).

        GGUF repos often ship multiple quantization levels (e.g. q2_K,
        q4_K_S, q4_K_M, q5_K_M, q8_0). We pick the largest because it's the
        richest-precision version that still fits our memory budget — for
        1.58-bit models this typically means the raw ternary weights without
        further compression.

        Searches recursively because setup_env.py and snapshot_download can
        both place files in nested directory structures whose exact layout
        is not guaranteed stable across versions.
        """
        gguf_files = glob.glob(os.path.join(directory, "**", "*.gguf"), recursive=True)
        # Also include top-level (glob's ** doesn't match zero dirs on all platforms)
        gguf_files += glob.glob(os.path.join(directory, "*.gguf"))
        gguf_files = list(set(gguf_files))
        if not gguf_files:
            raise FileNotFoundError(f"No .gguf files found under {directory} (recursive)")
        gguf_files.sort(key=os.path.getsize, reverse=True)
        return gguf_files[0]

    def generate(
        self,
        prompt: str,
        max_new_tokens: int = 128,
        temperature: float = 0.7,
        top_p: float = 0.9,
        repetition_penalty: float = 1.2,
        repeat_last_n: int = 64,
        stop: Optional[List[str]] = None,
        seed: int = -1,
        timeout_s: Optional[int] = None,
        grammar_file: Optional[str] = None,
        grammar: Optional[str] = None,
    ) -> Tuple[str, dict]:
        """Generate a completion for the given prompt.

        Returns:
            (response_text, metadata_dict)

            metadata_dict contains:
                elapsed_s         — wall-clock of the subprocess call
                returncode        — llama-cli exit code
                raw_stdout        — full stdout (pre-stripping) for diagnostics
                prompt_echo_found — whether the prompt was found in stdout
                                    (if False, the runtime output format may
                                    have changed — worth investigating)
                stderr_tail       — last 500 chars of stderr (stats/warnings)

        Generation params are llama.cpp-standard and passed through to the
        binary. Defaults chosen per Syl's prescription for small models on
        enumeration tasks: non-greedy sampling + repetition penalty +
        repeat-last-n window prevents the mode-collapse pathology we saw
        with transformers greedy decoding.
        """
        # Flag set verified compatible with this bitnet.cpp fork
        # (Eddie-Wang1120/llama.cpp at commit 1f86f058). History of
        # removals:
        #   -no-cnv       — fork's argparse rejected it; redundant anyway
        #                   (default with -p PROMPT is non-conversational).
        #   --log-disable — some fork versions silence generation output
        #                   entirely when this is set. Safer to keep logs
        #                   mingled with stdout and strip the prompt echo
        #                   on our side (we already do that).
        args = [
            self.binary_path,
            "-m", self.gguf_path,
            "-p", prompt,
            "-n", str(max_new_tokens),
            "--temp", f"{temperature:.3f}",
            "--top-p", f"{top_p:.3f}",
            "--repeat-penalty", f"{repetition_penalty:.3f}",
            "--repeat-last-n", str(repeat_last_n),
            "-t", str(self.n_threads),
            "-c", str(self.n_ctx),
            "--seed", str(seed),
        ]

        if stop:
            for s in stop:
                # Skip stop sequences that are pure whitespace — they
                # tend to match at position 0 of model output and trim
                # everything. Use model-specific stop tokens or content
                # markers ("Answer:", "</s>", etc.) instead.
                if not s or not s.strip():
                    continue
                args.extend(["--reverse-prompt", s])

        # Grammar-constrained decoding. `grammar` takes precedence over
        # `grammar_file` because inline is path-agnostic (no container
        # filesystem surprises). llama.cpp parses the GBNF into an FSM
        # and masks logits at each sampling step so tokens violating
        # the grammar get zero probability. Native C++ — no per-token
        # Python callback overhead.
        grammar_mode = None
        if grammar:
            args.extend(["--grammar", grammar])
            grammar_mode = f"inline ({len(grammar)} chars)"
        elif grammar_file:
            if not os.path.exists(grammar_file):
                logger.warning(
                    "Grammar file missing: %s — generation will be unconstrained",
                    grammar_file,
                )
            else:
                args.extend(["--grammar-file", grammar_file])
                grammar_mode = f"file ({grammar_file})"

        # Log the invocation when grammar-constrained — lets us
        # confirm from logs that the flag is actually reaching
        # llama-cli, which was ambiguous after run 6's silent failure.
        if grammar_mode:
            logger.info(
                "llama-cli grammar-constrained: %s | argv_len=%d | last_args=%s",
                grammar_mode,
                len(args),
                args[-3:],
            )

        t0 = time.time()
        try:
            result = subprocess.run(
                args,
                capture_output=True,
                text=True,
                timeout=timeout_s or self.default_timeout_s,
            )
        except subprocess.TimeoutExpired:
            return "", {
                "elapsed_s": round(time.time() - t0, 2),
                "returncode": -1,
                "raw_stdout": "",
                "prompt_echo_found": False,
                "stderr_tail": "TIMEOUT",
                "error": "subprocess.TimeoutExpired",
            }

        elapsed = round(time.time() - t0, 2)
        stdout = result.stdout or ""
        stderr = result.stderr or ""

        # Log ANY non-zero returncode — this is usually an invalid flag,
        # GGUF load failure, or OOM. Without this log, failures surface
        # only as empty responses, which looks like "the model generated
        # nothing" instead of "the subprocess exited before generation."
        if result.returncode != 0:
            logger.warning(
                "llama-cli rc=%d elapsed=%.2fs stderr_tail=%s | stdout_tail=%s",
                result.returncode,
                elapsed,
                stderr[-400:],
                stdout[-200:],
            )
        elif not stdout.strip():
            # rc=0 but stdout empty is a subtler pathology — a flag silently
            # suppressed output, or the model generated nothing. Log so
            # it's visible without the caller having to inspect the dict.
            logger.warning(
                "llama-cli rc=0 but stdout EMPTY (elapsed=%.2fs). stderr_tail=%s",
                elapsed,
                stderr[-400:],
            )

        # If a grammar was requested, log any grammar-related lines from
        # stderr. llama.cpp prints parse errors to stderr when the GBNF
        # is malformed, and silently falls back to unconstrained
        # generation. These logs expose that silent fallback.
        if grammar_mode and stderr:
            for line in stderr.splitlines():
                low = line.lower()
                if "grammar" in low or "gbnf" in low:
                    logger.info("grammar stderr: %s", line.strip()[:200])

        # Strip the prompt echo — llama-cli's default output includes the
        # prompt as a prefix. Find the LAST occurrence because reverse-
        # prompt handling can print the prompt multiple times.
        response = stdout
        prompt_found = False
        if prompt and prompt in stdout:
            idx = stdout.rfind(prompt)
            response = stdout[idx + len(prompt):]
            prompt_found = True

        # Strip common end-of-text markers
        response = response.rstrip()
        for marker in ("[end of text]", "</s>", "<|im_end|>", "<|end_of_text|>"):
            if response.endswith(marker):
                response = response[: -len(marker)].rstrip()

        # If a stop string matched (reverse-prompt), trim at the first match.
        # Skip whitespace-only stops — they tend to match at position 0
        # of raw model output (models often start with a newline after
        # prompt echo) and trim the response to empty. Those should
        # only be terminators after real content, which we can't
        # distinguish reliably post-hoc.
        if stop:
            for s in stop:
                if not s or not s.strip():
                    continue
                if s in response:
                    response = response[: response.index(s)]

        return response, {
            "elapsed_s": elapsed,
            "returncode": result.returncode,
            "raw_stdout": stdout,
            "prompt_echo_found": prompt_found,
            "stderr_tail": stderr[-500:] if stderr else "",
            "error": None,
        }
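

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the NuWave call path).
# Demonstrates the API contract from the module docstring plus the
# grammar-constrained branch of generate(). The BITNET_CLI and
# BITNET_MODEL_DIR environment variables and the fallback paths below are
# hypothetical placeholders, not paths this repo guarantees — substitute
# whatever your deployment actually uses. Assumes the bitnet.cpp binary and
# a GGUF checkpoint are already present on disk.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Hypothetical locations; real paths depend on how setup_env.py laid
    # out the build and where snapshot_download placed the weights.
    binary = os.environ.get("BITNET_CLI", "build/bin/llama-cli")
    model_dir = os.environ.get("BITNET_MODEL_DIR", "models/BitNet-b1.58-2B-4T")

    client = BitnetCppClient(
        binary_path=binary,
        gguf_path=BitnetCppClient.resolve_gguf(model_dir),
        n_threads=2,
        n_ctx=4096,
    )

    # Unconstrained generation with the anti-repetition defaults.
    text, meta = client.generate(
        "List three uses of a paperclip:",
        max_new_tokens=64,
        temperature=0.7,
        repetition_penalty=1.2,
    )
    print(f"[{meta['elapsed_s']}s rc={meta['returncode']}] {text!r}")

    # Grammar-constrained generation: this tiny GBNF grammar restricts
    # sampling to the literal strings "yes" or "no" (handled by the
    # inline-grammar branch in generate()).
    yes_no_gbnf = 'root ::= "yes" | "no"'
    verdict, meta = client.generate(
        "Is a tomato a fruit? Answer yes or no: ",
        max_new_tokens=4,
        temperature=0.1,
        grammar=yes_no_gbnf,
    )
    print(f"constrained verdict: {verdict!r}")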