| --- |
| license: mit |
| datasets: |
| - crownelius/Opus-4.6-Reasoning-3300x |
| base_model: |
| - microsoft/phi-2 |
| - venkycs/phi-2-instruct |
| pipeline_tag: text-generation |
| --- |
| **LBNET-2.7B-BASE model card** |
|
|
| We introduce LBNET-2.7B-BASE, the first logic/reasoning-based transformer model built on Phi-2. |
| In February 2026, we created an experimental architecture called LBNets, an attempt to inject reasoning-like layers into a model's architecture. In this case, we experimented with Phi-2. |
|
|
| **Here is the logic behind LBNET-2.7B-BASE:** |
| - Split the base model into two halves: pre-reasoning and post-reasoning layers. |
| - Between these layers, you insert: |
| - learnable latent 'reasoning tokens' |
| - reasoning blocks (cross-attention: latent tokens attend to the main hidden states (the “context”), self-attention: latent tokens attend to each other, MLP) |
| - reasoning injector back into the main stream |
|
|
| To make generation workable: |
| - During prefill (the initial prompt, past_length == 0 and seq_len > 1), the model runs the reasoning loop once. |
| - During token-by-token generation with KV-cache (seq_len == 1), the model skips the reasoning loop (otherwise it gets slow and unstable). |
| |
| LBNET-2.7B-BASE achieves benchmark scores well above average compared to other models of its size: |
| |
| | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr| |
| |-------------|------:|------|-----:|--------|---|-----:|---|-----:| |
| |arc_challenge| 1|none | 0|acc |↑ |0.5324|± |0.0146| |
| | | |none | 0|acc_norm|↑ |0.5478|± |0.0145| |
| |arc_easy | 1|none | 0|acc |↑ |0.8047|± |0.0081| |
| | | |none | 0|acc_norm|↑ |0.7862|± |0.0084| |
| |boolq | 2|none | 0|acc |↑ |0.8346|± |0.0065| |
| |openbookqa | 1|none | 0|acc |↑ |0.4040|± |0.0220| |
| | | |none | 0|acc_norm|↑ |0.5160|± |0.0224| |
| |piqa | 1|none | 0|acc |↑ |0.7889|± |0.0095| |
| | | |none | 0|acc_norm|↑ |0.7949|± |0.0094| |
| |winogrande | 1|none | 0|acc |↑ |0.7577|± |0.0120| |
| |
| We recommend running this model on at least an RTX 3050 with 8 GB of VRAM. |
| FOR FULL MODEL FUNCTIONALITY, YOU MUST USE THE CHAT SCRIPT BELOW: |
| |
| The script is ROCm-friendly; it may need tweaking for CUDA setups. |
| |
| |
| |
| ```python |
| |
| import os |
| import argparse |
| import torch |
| from transformers import AutoTokenizer |
| |
| from configuration import PhiReasoningConfig |
| from modeling import PhiForLogicalReasoning |
| |
# ROCm allocator hint (helps fragmentation on AMD ROCm).
# setdefault() respects a value the user has already exported, so this acts
# as a default, not an override.
os.environ.setdefault("PYTORCH_HIP_ALLOC_CONF", "expandable_segments:True,max_split_size_mb:64")


# Default persona used when --system_prompt is not supplied on the CLI.
DEFAULT_SYSTEM_PROMPT = "You are LBNets, a helpful assistant."
|
|
|
|
def format_prompt(system_prompt: str, user_text: str, history, max_turns: int = 6) -> str:
    """
    Build a single instruction string that includes recent chat history.

    Keeps compatibility with the "### System / ### Instruction / ### Response"
    training template.

    Args:
        system_prompt: System persona text (None is treated as empty).
        user_text: Current user message (None is treated as empty).
        history: List of (user, assistant) tuples, oldest first.
        max_turns: Maximum number of most-recent turns to include.
            Non-positive values include no history. (Previously, 0 accidentally
            included the ENTIRE history, because history[-0:] == history[0:].)

    Returns:
        The fully formatted prompt, ending with the "### Response:" header.
    """
    system_prompt = (system_prompt or "").strip()
    user_text = (user_text or "").strip()

    # Guard against the -0 slicing gotcha: clamp non-positive max_turns to
    # "no history" instead of silently including everything.
    recent = history[-max_turns:] if max_turns > 0 else []

    convo = "".join(f"User: {u}\nAssistant: {a}\n" for u, a in recent)

    instruction = ""
    if convo:
        instruction += "Conversation so far:\n" + convo + "\n"
    instruction += "Current user message:\n" + user_text

    return (
        f"### System:\n{system_prompt}\n\n"
        f"### Instruction:\n{instruction}\n\n"
        f"### Response:\n"
    )
| |
|
|
@torch.inference_mode()
def generate_text(model, tok, prompt_text: str, device: str, max_new_tokens: int = 256) -> str:
    """Run greedy decoding on *prompt_text* and return only the new completion text.

    The prompt is truncated to 768 tokens because the history-bearing template
    can grow long. Decoding is deterministic (no sampling) with mild,
    globally-applied anti-repetition controls.
    """
    encoded = tok(
        prompt_text,
        return_tensors="pt",
        add_special_tokens=False,
        truncation=True,
        max_length=768,  # history makes prompts longer; keep sane
    ).to(device)

    prompt_len = encoded["input_ids"].shape[1]

    generated = model.generate(
        **encoded,
        do_sample=False,  # greedy
        use_cache=True,   # KV cache (fast)
        max_new_tokens=max_new_tokens,
        min_new_tokens=1,
        # General anti-loop controls (not per-problem patching).
        repetition_penalty=1.10,
        no_repeat_ngram_size=3,
        pad_token_id=tok.pad_token_id,
        eos_token_id=tok.eos_token_id,
    )

    # Keep only the continuation past the prompt, then trim leading newline
    # spam (avoids "blank" replies) and trailing whitespace.
    completion = tok.decode(generated[0][prompt_len:], skip_special_tokens=True)
    return completion.lstrip("\n").rstrip()
| |
|
|
def load_model(model_path: str, device: str):
    """Load the LBNET model and tokenizer from *model_path* onto *device*.

    Forces eager attention and enables the KV cache; falls back to EOS as the
    pad token when the tokenizer defines none. Prints a short load summary
    (parameter count, learned reasoning-gate scale, target device).

    Returns:
        (model, tokenizer) tuple, with the model in eval mode.
    """
    config = PhiReasoningConfig.from_pretrained(model_path)
    config.attn_implementation = "eager"
    config.use_cache = True

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    if tokenizer.pad_token is None:
        # Reuse EOS for padding so generate() always has a valid pad id.
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    model = PhiForLogicalReasoning.from_pretrained(
        model_path,
        config=config,
        torch_dtype=torch.float16,  # often faster/more compatible on ROCm than bf16
        low_cpu_mem_usage=True,
    ).to(device)
    model.eval()

    # Surface the learned gate scale of the reasoning injector as a sanity check.
    gate = model.model.reasoning_injector.gate_scale.detach().float().cpu().numpy()
    n_params = sum(p.numel() for p in model.parameters())
    print(f"Loaded: {model_path}")
    print(f"Parameters: {n_params:,}")
    print(f"Gate scale: {gate}")
    print(f"Device: {device}")

    return model, tokenizer
| |
|
|
def main():
    """Interactive chat REPL: parse CLI args, load the model, loop on user input."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", default="Aclevo/LBNET-2.7B-BASE")
    parser.add_argument("--device", default="cuda:0")
    parser.add_argument("--system_prompt", default=DEFAULT_SYSTEM_PROMPT)
    parser.add_argument("--max_new_tokens", type=int, default=256)
    parser.add_argument("--history_turns", type=int, default=6)
    args = parser.parse_args()

    model, tok = load_model(args.model_path, args.device)

    history = []

    print("\n============================================================")
    print("LBNets Chat Ready!")
    print("Commands: 'quit' to exit | 'reset' to clear conversation")
    print("============================================================\n")

    while True:
        user = input("User: ").strip()

        # Ignore empty lines without touching the history.
        if not user:
            continue

        lowered = user.lower()
        if lowered in ("quit", "exit", "q"):
            break
        if lowered in ("reset", "/reset"):
            history.clear()
            print("AI: Conversation reset.\n")
            continue

        prompt = format_prompt(args.system_prompt, user, history, max_turns=args.history_turns)
        resp = generate_text(model, tok, prompt, args.device, max_new_tokens=args.max_new_tokens)

        print(f"AI: {resp}\n")

        # Remember this turn so later prompts carry the conversation forward.
        history.append((user, resp))


if __name__ == "__main__":
    main()
| |
| ``` |
| |
| If you like our work and services, give us a star on Github: https://github.com/Aclevo, or give us a mention in your work! |
|
|
| -Aclevo Team |
|
|
|
|