""" Run inference with the fine-tuned model. After running system3_finetune_colab.ipynb on Google Colab: 1. Download finetuned_model.zip from Colab 2. Unzip into this project root as finetuned_model/ 3. Then run this file Base model: Qwen/Qwen2.5-1.5B-Instruct (upgraded from TinyLlama) """ import os import time import torch from transformers import AutoTokenizer, AutoModelForCausalLM from peft import PeftModel BASE_MODEL = "Qwen/Qwen2.5-1.5B-Instruct" FINETUNED_PATH = "./checkpoint-25" _model = None _tokenizer = None def load_model(): global _model, _tokenizer if _model is not None: return _model, _tokenizer print("Loading fine-tuned model (this takes ~30s first time)...") _tokenizer = AutoTokenizer.from_pretrained(FINETUNED_PATH) base = AutoModelForCausalLM.from_pretrained( BASE_MODEL, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32, low_cpu_mem_usage=True, ) _model = PeftModel.from_pretrained(base, FINETUNED_PATH) _model.eval() print("Fine-tuned model ready.") return _model, _tokenizer def ask_finetuned(question: str) -> dict: if not os.path.exists(FINETUNED_PATH): return { "system": "Fine-tuned", "question": question, "answer": "Fine-tuned model not found. Run system3_finetune_colab.ipynb on Google Colab first, then unzip finetuned_model.zip here.", "response_time": 0, } model, tokenizer = load_model() start = time.time() messages = [ { "role": "system", "content": ( "You are a programming tutor specializing in Data Structures, " "Algorithms, and Web Development. Answer questions clearly and concisely." ), }, {"role": "user", "content": question}, ] text = tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) inputs = tokenizer([text], return_tensors="pt") with torch.no_grad(): outputs = model.generate( **inputs, max_new_tokens=150, do_sample=False, pad_token_id=tokenizer.eos_token_id, ) generated_ids = outputs[0][inputs.input_ids.shape[1]:] answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip() elapsed = round(time.time() - start, 2) return { "system": "Fine-tuned", "question": question, "answer": answer, "response_time": elapsed, } if __name__ == "__main__": test_q = "What is binary search?" print(f"Question: {test_q}\n") result = ask_finetuned(test_q) print(f"Answer:\n{result['answer']}") print(f"\nResponse time: {result['response_time']}s")