# fein / infer.py
# kieraisverybored's picture
# Create infer.py
# 0709cbe verified
#!/usr/bin/env python
"""
infer.py – chat with fein
Usage:
    python infer.py             # load from HF repo
    python infer.py --model .   # load from local folder
"""
import os, sys, argparse, torch, readline
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
# ----------------------------------------------------------------------
# 1. CLI args
# ----------------------------------------------------------------------
parser = argparse.ArgumentParser()
parser.add_argument(
"--model",
default="kieraisverybored/fein", # default = Hub repo
help="HF repo ID *or* path to a local model folder",
)
parser.add_argument("--load-8bit", action="store_true",
help="Load in 8-bit (else 4-bit)")
args = parser.parse_args()
MODEL_ID = args.model
SYSTEM_MSG = "You are a helpful assistant. You are the 'fein 14b' model by kieradev, a 14b LLM fine tuned from Qwen3."
# ----------------------------------------------------------------------
# 2. Load tokenizer & model
# ----------------------------------------------------------------------
print(f"Loading model from: {MODEL_ID}")
dtype = torch.bfloat16 # or torch.float16 if your GPU prefers
bnb_cfg = BitsAndBytesConfig(
load_in_4bit=not args.load_8bit,
load_in_8bit=args.load_8bit,
bnb_4bit_compute_dtype=dtype,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
device_map="auto",
torch_dtype=dtype,
quantization_config=bnb_cfg,
)
model.eval()
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
# ----------------------------------------------------------------------
# 3. Prompt builder, chat loop
# ----------------------------------------------------------------------
T_START, T_END = "<|im_start|>", "<|im_end|>"
def build_prompt(history, user_msg):
prompt = f"{T_START}system\n{SYSTEM_MSG}{T_END}\n"
for u, a in history:
prompt += f"{T_START}user\n{u}{T_END}\n"
prompt += f"{T_START}assistant\n{a}{T_END}\n"
prompt += f"{T_START}user\n{user_msg}{T_END}\n"
prompt += f"{T_START}assistant\n"
return prompt
history = []
print("\nChat ready! Type 'exit' or Ctrl-C to quit.\n")
while True:
try:
user_in = input("User: ").strip()
except (KeyboardInterrupt, EOFError):
print("\nBye.")
break
if user_in.lower() in {"exit", "quit"}:
break
if not user_in:
continue
prompt = build_prompt(history, user_in)
input_ids = tokenizer(prompt, return_tensors="pt").to(model.device)
gen_ids = model.generate(
**input_ids,
max_new_tokens=1024,
do_sample=True,
temperature=0.7,
top_p=0.95,
pad_token_id=tokenizer.eos_token_id,
)
full = tokenizer.decode(gen_ids[0], skip_special_tokens=False)
answer = full.split(f"{T_START}assistant\n")[-1].split(T_END)[0].strip()
print(f"Assistant: {answer}\n")
history.append((user_in, answer))