| """ |
| infer.py – chat with fein |
| |
| Usage: |
| python chat_fein.py # load from HF repo |
| python chat_fein.py --model . # load from local folder |
| """ |
| import os, sys, argparse, torch, readline |
| from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig |
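
# NOTE (assumption): the quantized loading below relies on the `bitsandbytes` package
# and a CUDA GPU, and device_map="auto" relies on `accelerate`; install both alongside
# transformers if they are not already present.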
|
|
parser = argparse.ArgumentParser()
parser.add_argument(
    "--model",
    default="kieraisverybored/fein",
    help="HF repo ID *or* path to a local model folder",
)
parser.add_argument("--load-8bit", action="store_true",
                    help="Load in 8-bit (else 4-bit)")
args = parser.parse_args()
|
|
MODEL_ID = args.model
SYSTEM_MSG = "You are a helpful assistant. You are the 'fein 14b' model by kieradev, a 14b LLM fine tuned from Qwen3."
|
|
| print(f"Loading model from: {MODEL_ID}") |
| dtype = torch.bfloat16 |
| bnb_cfg = BitsAndBytesConfig( |
| load_in_4bit=not args.load_8bit, |
| load_in_8bit=args.load_8bit, |
| bnb_4bit_compute_dtype=dtype, |
| ) |
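# Rough guidance, not a benchmark: 4-bit loading (the default here) uses roughly half
# the GPU memory of 8-bit at some quality cost; pass --load-8bit when memory allows.
# bnb_4bit_compute_dtype=bfloat16 only applies on the 4-bit path.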
|
|
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=dtype,
    quantization_config=bnb_cfg,
)
model.eval()
# Some tokenizers ship without a pad token; fall back to EOS so generation can pad.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
|
|
T_START, T_END = "<|im_start|>", "<|im_end|>"
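# These are the ChatML-style markers Qwen chat models are trained on. build_prompt()
# below assembles the conversation with them by hand; an equivalent approach, assuming
# the model repo ships a chat template, would be:
#     messages = [{"role": "system", "content": SYSTEM_MSG},
#                 {"role": "user", "content": user_msg}]
#     prompt = tokenizer.apply_chat_template(
#         messages, tokenize=False, add_generation_prompt=True
#     )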
|
|
def build_prompt(history, user_msg):
    prompt = f"{T_START}system\n{SYSTEM_MSG}{T_END}\n"
    for u, a in history:
        prompt += f"{T_START}user\n{u}{T_END}\n"
        prompt += f"{T_START}assistant\n{a}{T_END}\n"
    prompt += f"{T_START}user\n{user_msg}{T_END}\n"
    prompt += f"{T_START}assistant\n"
    return prompt
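
# For an empty history, build_prompt() yields (markers shown literally):
#     <|im_start|>system\n{SYSTEM_MSG}<|im_end|>\n
#     <|im_start|>user\n{user_msg}<|im_end|>\n
#     <|im_start|>assistant\n
# The trailing open assistant turn is what the model completes.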
|
|
history = []
print("\nChat ready! Type 'exit' or Ctrl-C to quit.\n")
while True:
    try:
        user_in = input("User: ").strip()
    except (KeyboardInterrupt, EOFError):
        print("\nBye.")
        break
    if user_in.lower() in {"exit", "quit"}:
        break
    if not user_in:
        continue
|
|
    prompt = build_prompt(history, user_in)
    # tokenizer() returns a BatchEncoding (input_ids + attention_mask); move it onto
    # the model's device before generation.
    input_ids = tokenizer(prompt, return_tensors="pt").to(model.device)
|
|
    gen_ids = model.generate(
        **input_ids,
        max_new_tokens=1024,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,
    )
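    # An alternative to the string splitting below (a sketch, not what this script does):
    # slice off the prompt tokens and decode only the newly generated ones, e.g.
    #     new_tokens = gen_ids[0][input_ids["input_ids"].shape[1]:]
    #     answer = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()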
|
|
    # Decode with the ChatML markers kept, then extract only the final assistant turn:
    # everything after the last "<|im_start|>assistant" marker, up to its "<|im_end|>".
    full = tokenizer.decode(gen_ids[0], skip_special_tokens=False)
    answer = full.split(f"{T_START}assistant\n")[-1].split(T_END)[0].strip()
|
|
| print(f"Assistant: {answer}\n") |
| history.append((user_in, answer)) |