File size: 6,881 Bytes
7a156f5 40fd46e 9a47171 7a156f5 54cc1f4 7a156f5 10a0239 9a47171 7a156f5 96e2592 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 | ---
license: mit
datasets:
- crownelius/Opus-4.6-Reasoning-3300x
base_model:
- microsoft/phi-2
- venkycs/phi-2-instruct
pipeline_tag: text-generation
---
**LBNET-2.7B-BASE model card**
We introduce the first-ever Logic/Reasoning-based transformer model based on Phi-2.
In February 2026, we created an experimental architecture called LBNets, an attempt to inject reasoning-like layers into a model's architecture. In this case, we experimented with Phi-2.
**Here is the logic behind LBNET-2.7B-BASE:**
- Split the base model into two halves: pre-reasoning and post-reasoning layers.
- Between these layers, you insert:
- learnable latent 'reasoning tokens'
- reasoning blocks (cross-attention: latent tokens attend to the main hidden states (the “context”), self-attention: latent tokens attend to each other, MLP)
- reasoning injector back into the main stream
To make generation workable:
- During prefill (the initial prompt, past_length == 0 and seq_len > 1), the model runs the reasoning loop once.
- During token-by-token generation with KV-cache (seq_len == 1), the model skips the reasoning loop (otherwise it gets slow and unstable).
LBNET-2.7B-BASE achieves benchmark scores well above average for its size compared to other models:
| Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
|-------------|------:|------|-----:|--------|---|-----:|---|-----:|
|arc_challenge| 1|none | 0|acc |↑ |0.5324|± |0.0146|
| | |none | 0|acc_norm|↑ |0.5478|± |0.0145|
|arc_easy | 1|none | 0|acc |↑ |0.8047|± |0.0081|
| | |none | 0|acc_norm|↑ |0.7862|± |0.0084|
|boolq | 2|none | 0|acc |↑ |0.8346|± |0.0065|
|openbookqa | 1|none | 0|acc |↑ |0.4040|± |0.0220|
| | |none | 0|acc_norm|↑ |0.5160|± |0.0224|
|piqa | 1|none | 0|acc |↑ |0.7889|± |0.0095|
| | |none | 0|acc_norm|↑ |0.7949|± |0.0094|
|winogrande | 1|none | 0|acc |↑ |0.7577|± |0.0120|
We recommend running this model on at least an RTX 3050 with 8 GB of VRAM.
FOR FULL MODEL FUNCTIONALITY, YOU MUST USE THE CHAT SCRIPT BELOW:
The script is ROCm-friendly. May need tweaking for CUDA setups.
```python
import os
import argparse
import torch
from transformers import AutoTokenizer
from configuration import PhiReasoningConfig
from modeling import PhiForLogicalReasoning
# ROCm allocator hint (helps fragmentation on AMD ROCm)
os.environ.setdefault("PYTORCH_HIP_ALLOC_CONF", "expandable_segments:True,max_split_size_mb:64")
# Default persona used when --system_prompt is not supplied on the command line.
DEFAULT_SYSTEM_PROMPT = "You are LBNets, a helpful assistant."
def format_prompt(system_prompt: str, user_text: str, history, max_turns: int = 6) -> str:
    """Assemble the full instruction prompt, folding in recent chat turns.

    history is a list of (user, assistant) tuples; only the last
    *max_turns* entries are included so the prompt stays bounded.
    The ### System / ### Instruction / ### Response layout matches
    the training template and must not change.
    """
    sys_text = (system_prompt or "").strip()
    msg_text = (user_text or "").strip()
    # Render the most recent turns as a plain transcript.
    transcript = "".join(
        f"User: {u}\nAssistant: {a}\n" for u, a in history[-max_turns:]
    )
    pieces = []
    if transcript:
        pieces.append("Conversation so far:\n" + transcript + "\n")
    pieces.append("Current user message:\n" + msg_text)
    instruction = "".join(pieces)
    return (
        f"### System:\n{sys_text}\n\n"
        f"### Instruction:\n{instruction}\n\n"
        f"### Response:\n"
    )
@torch.inference_mode()
def generate_text(model, tok, prompt_text: str, device: str, max_new_tokens: int = 256) -> str:
    """Greedy-decode a reply for *prompt_text* and return only the new text.

    Runs under inference mode (no autograd state) and uses the KV cache
    for fast token-by-token generation.
    """
    # Tokenize without special tokens; history makes prompts longer, so
    # truncate to a sane maximum.
    enc = tok(
        prompt_text,
        return_tensors="pt",
        add_special_tokens=False,
        truncation=True,
        max_length=768,
    ).to(device)
    prompt_len = enc["input_ids"].shape[1]
    generated = model.generate(
        **enc,
        do_sample=False,  # greedy
        use_cache=True,  # KV cache (fast)
        max_new_tokens=max_new_tokens,
        min_new_tokens=1,
        # general anti-loop controls (not per-problem patching)
        repetition_penalty=1.10,
        no_repeat_ngram_size=3,
        pad_token_id=tok.pad_token_id,
        eos_token_id=tok.eos_token_id,
    )
    # Keep only the tokens generated after the prompt.
    completion_ids = generated[0][prompt_len:]
    decoded = tok.decode(completion_ids, skip_special_tokens=True)
    # Strip leading newline spam so replies never look blank.
    return decoded.lstrip("\n").rstrip()
def load_model(model_path: str, device: str):
    """Load the LBNet checkpoint and its tokenizer onto *device*.

    Returns a (model, tokenizer) pair. Weights are loaded in fp16,
    which is often faster/more compatible on ROCm than bf16.
    """
    config = PhiReasoningConfig.from_pretrained(model_path)
    config.attn_implementation = "eager"
    config.use_cache = True
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # The checkpoint may ship without a pad token; reuse EOS for padding.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id
    model = PhiForLogicalReasoning.from_pretrained(
        model_path,
        config=config,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
    ).to(device)
    model.eval()
    # Report the learned gate scale of the reasoning injector for sanity-checking.
    gate = model.model.reasoning_injector.gate_scale.detach().float().cpu().numpy()
    param_count = sum(p.numel() for p in model.parameters())
    print(f"Loaded: {model_path}")
    print(f"Parameters: {param_count:,}")
    print(f"Gate scale: {gate}")
    print(f"Device: {device}")
    return model, tokenizer
def main():
    """Interactive chat REPL: parse CLI args, load the model, loop on user input.

    Commands inside the loop: 'quit'/'exit'/'q' ends the session,
    'reset'/'/reset' clears the conversation history.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model_path", default="Aclevo/LBNET-2.7B-BASE")
    ap.add_argument("--device", default="cuda:0")
    ap.add_argument("--system_prompt", default=DEFAULT_SYSTEM_PROMPT)
    ap.add_argument("--max_new_tokens", type=int, default=256)
    ap.add_argument("--history_turns", type=int, default=6)
    args = ap.parse_args()
    model, tok = load_model(args.model_path, args.device)
    history = []  # list of (user, assistant) turns fed back into the prompt
    print("\n============================================================")
    print("LBNets Chat Ready!")
    print("Commands: 'quit' to exit | 'reset' to clear conversation")
    print("============================================================\n")
    while True:
        # Exit cleanly on Ctrl-D (EOFError) / Ctrl-C (KeyboardInterrupt)
        # instead of dumping a traceback.
        try:
            user = input("User: ").strip()
        except (EOFError, KeyboardInterrupt):
            print()
            break
        if not user:
            continue
        if user.lower() in ("quit", "exit", "q"):
            break
        if user.lower() in ("reset", "/reset"):
            history.clear()
            print("AI: Conversation reset.\n")
            continue
        prompt = format_prompt(args.system_prompt, user, history, max_turns=args.history_turns)
        resp = generate_text(model, tok, prompt, args.device, max_new_tokens=args.max_new_tokens)
        print(f"AI: {resp}\n")
        # Remember the turn so later prompts include it as context.
        history.append((user, resp))
if __name__ == "__main__":
    main()
```
If you like our work and services, give us a star on Github: https://github.com/Aclevo, or give us a mention in your work!
-Aclevo Team
|