CodeSage / system3_inference.py
Aditya
Use float16 on GPU, float32 fallback on CPU for fine-tuned model
d3e5c89
Raw
History Blame Contribute Delete
2.8 kB
"""
Run inference with the fine-tuned model.
After running system3_finetune_colab.ipynb on Google Colab:
1. Download finetuned_model.zip from Colab
2. Unzip into this project root (e.g. as finetuned_model/) and make sure FINETUNED_PATH below points at the unzipped directory
3. Then run this file
Base model: Qwen/Qwen2.5-1.5B-Instruct (upgraded from TinyLlama)
"""
import os
import time
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
# Model name passed to AutoModelForCausalLM.from_pretrained as the base that
# the fine-tuned adapter is attached to.
BASE_MODEL = "Qwen/Qwen2.5-1.5B-Instruct"
# Local directory read by both AutoTokenizer.from_pretrained and
# PeftModel.from_pretrained; ask_finetuned() refuses to run if it is missing.
# NOTE(review): the module docstring talks about unzipping to finetuned_model/,
# but this points at ./checkpoint-25 -- confirm which directory is intended.
FINETUNED_PATH = "./checkpoint-25"
# Process-wide cache filled in by load_model() so the model and tokenizer are
# loaded at most once; both stay None until the first successful load.
_model = None
_tokenizer = None
def load_model():
    """Load the base model with the fine-tuned adapter attached, once per process.

    The tokenizer is read from FINETUNED_PATH and the base weights from
    BASE_MODEL; the adapter stored in FINETUNED_PATH is then attached with
    PeftModel.from_pretrained. The result is cached in the module-level
    ``_model`` / ``_tokenizer`` globals, so later calls return immediately.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is in eval mode and is placed
        on the GPU in float16 when CUDA is available, otherwise it stays on
        the CPU in float32.
    """
    global _model, _tokenizer
    if _model is not None:
        return _model, _tokenizer

    print("Loading fine-tuned model (this takes ~30s first time)...")
    use_cuda = torch.cuda.is_available()
    tokenizer = AutoTokenizer.from_pretrained(FINETUNED_PATH)
    base = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.float16 if use_cuda else torch.float32,
        low_cpu_mem_usage=True,
    )
    model = PeftModel.from_pretrained(base, FINETUNED_PATH)
    if use_cuda:
        # float16 is only selected for the GPU case, so the weights must
        # actually be moved there; otherwise generation would run in half
        # precision on the CPU and the GPU would never be used.
        model = model.to("cuda")
    model.eval()

    # Publish to the cache only after every step succeeded, so a failed load
    # never leaves a tokenizer cached without its model.
    _model, _tokenizer = model, tokenizer
    print("Fine-tuned model ready.")
    return _model, _tokenizer


def ask_finetuned(question: str) -> dict:
    """Answer *question* with the fine-tuned model using greedy decoding.

    Args:
        question: The user's question, inserted as the ``user`` turn of the
            chat template after a fixed programming-tutor system prompt.

    Returns:
        A dict with the keys ``system`` (always ``"Fine-tuned"``),
        ``question``, ``answer`` and ``response_time``. ``response_time`` is
        the generation time in seconds rounded to two decimals; it is measured
        after ``load_model()`` returns, so model loading is not included.
        If FINETUNED_PATH does not exist, ``answer`` is an explanatory message
        and ``response_time`` is 0.
    """
    if not os.path.exists(FINETUNED_PATH):
        return {
            "system": "Fine-tuned",
            "question": question,
            "answer": "Fine-tuned model not found. Run system3_finetune_colab.ipynb on Google Colab first, then unzip finetuned_model.zip here.",
            "response_time": 0,
        }

    model, tokenizer = load_model()

    # perf_counter is monotonic, so the measured latency cannot be skewed by
    # system clock adjustments the way time.time() can.
    start = time.perf_counter()
    messages = [
        {
            "role": "system",
            "content": (
                "You are a programming tutor specializing in Data Structures, "
                "Algorithms, and Web Development. Answer questions clearly and concisely."
            ),
        },
        {"role": "user", "content": question},
    ]
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # The tokenizer produces CPU tensors; move them to wherever the model's
    # weights live so generate() does not hit a device mismatch on GPU.
    device = next(model.parameters()).device
    inputs = tokenizer([text], return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )

    # generate() returns prompt + continuation; keep only the new tokens.
    prompt_length = inputs["input_ids"].shape[1]
    generated_ids = outputs[0][prompt_length:]
    answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
    elapsed = round(time.perf_counter() - start, 2)

    return {
        "system": "Fine-tuned",
        "question": question,
        "answer": answer,
        "response_time": elapsed,
    }
if __name__ == "__main__":
    # Manual smoke test: ask one fixed question and print the answer together
    # with the response time reported by ask_finetuned().
    sample_question = "What is binary search?"
    print(f"Question: {sample_question}\n")

    reply = ask_finetuned(sample_question)

    print(f"Answer:\n{reply['answer']}")
    print(f"\nResponse time: {reply['response_time']}s")