"""
Stack 2.9 - Local Inference Script
Run the fine-tuned model locally on your machine
"""
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
# Model path (your fine-tuned model)
MODEL_PATH = "/Users/walidsobhi/stack-2-9-final-model"
# Or use HuggingFace Hub version (if you want to test base model)
# MODEL_PATH = "Qwen/Qwen2.5-Coder-1.5B"
def load_model():
    """Load the fine-tuned causal-LM and its tokenizer from MODEL_PATH.

    Returns:
        (model, tokenizer) tuple, with the model placed automatically
        on the best available device.
    """
    print("Loading model...")
    tok = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
    mdl = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        torch_dtype=torch.float16,  # half precision: smaller footprint, faster inference
        device_map="auto",          # auto-place weights on GPU if present, else CPU
        trust_remote_code=True,
    )
    print("Model loaded!")
    return mdl, tok
def generate(prompt, system_prompt="You are a helpful coding assistant.", max_tokens=512, temperature=0.7):
    """Generate a single chat response from the loaded model.

    Relies on the module-level ``model`` and ``tokenizer`` globals.

    Args:
        prompt: The user's message.
        system_prompt: System message prepended to the conversation.
        max_tokens: Maximum number of newly generated tokens.
        temperature: Sampling temperature (sampling is always enabled).

    Returns:
        The assistant's reply as a stripped string, without the prompt.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    # Render the conversation with the model's chat template, ending with
    # the assistant header so the model continues as the assistant.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tokenizer([text], return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
    )
    # BUGFIX: strip the prompt at the *token* level before decoding.
    # The original decoded the whole sequence with skip_special_tokens=True
    # and then sliced by len(text) of the templated string; because special
    # tokens are dropped from the decoded string, those character offsets
    # mis-align and the returned reply keeps/loses the wrong prefix.
    prompt_len = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    return response.strip()
def chat():
    """Run an interactive console loop until the user asks to quit."""
    banner = "=" * 50
    print("\n" + banner)
    print("Stack 2.9 - Local Chat")
    print(banner)
    print("Type 'quit' to exit\n")
    while True:
        user_input = input("You: ")
        # Any of these (case-insensitive) ends the session.
        if user_input.lower() in ("quit", "exit", "q"):
            print("Goodbye!")
            break
        print(f"\nAssistant: {generate(user_input)}\n")
# Load the model and start the chat only when run as a script.
# BUGFIX: the original called load_model() unconditionally at module level,
# so merely importing this file triggered a multi-GB model load as a side
# effect. Assigning inside the guard still creates module-level globals,
# so generate()/chat() keep working unchanged.
if __name__ == "__main__":
    model, tokenizer = load_model()
    chat()