# NOTE(review): lines below were HF Spaces page residue ("Spaces: Sleeping"),
# not part of the program; kept only as a comment so the file stays valid Python.
"""
P4: Evaluation - Test zero-shot and fine-tuned model performance.
"""
import json

import torch
from modelscope import snapshot_download
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
def load_finetuned_model(base_model: str, lora_path: str):
    """Load the base checkpoint and attach its trained LoRA adapters.

    Args:
        base_model: Model id or local path of the base checkpoint.
        lora_path: Directory holding the LoRA adapter weights.

    Returns:
        A ``(model, tokenizer)`` pair, with the model in eval mode.
    """
    tok = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
    base = AutoModelForCausalLM.from_pretrained(
        base_model,
        torch_dtype=torch.float16,   # half precision to fit on a single GPU
        device_map="auto",
        trust_remote_code=True,
    )
    adapted = PeftModel.from_pretrained(base, lora_path)
    adapted.eval()  # inference only — disable dropout etc.
    return adapted, tok
def predict(model, tokenizer, item_info: str) -> str:
    """Run greedy inference on a single item and return the model's answer.

    Args:
        model: Causal LM (optionally LoRA-wrapped) used for generation.
        tokenizer: Tokenizer matching ``model``.
        item_info: Item description inserted into the instruction prompt.

    Returns:
        The generated response text only (expected to be "Yes" or "No").
    """
    prompt = (
        "### Instruction:\nBased on the following item information, "
        "predict whether the user would be interested (Yes/No).\n\n"
        f"### Input:\n{item_info}\n\n### Response:\n"
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=10,
            # Greedy decoding: `temperature` is ignored when do_sample=False
            # (and transformers warns about it), so it is not passed.
            do_sample=False,
        )
    # Decode only the newly generated tokens so the prompt can never leak
    # into the returned answer — more robust than splitting the full text
    # on the "### Response:" marker.
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
def _extract_label(text: str) -> str:
    """Normalize a yes/no answer to a canonical label.

    Returns ``"yes"`` or ``"no"`` when the stripped, lowercased text starts
    with that label, otherwise the whole normalized text. This replaces the
    old substring check (``expected.lower() in pred.lower()``), which
    wrongly counted e.g. expected "No" against predictions containing
    "know" or "not" anywhere in the text.
    """
    t = text.strip().lower()
    if t.startswith("yes"):
        return "yes"
    if t.startswith("no"):
        return "no"
    return t


def evaluate(model, tokenizer, test_data: list, predict_fn=None) -> dict:
    """Evaluate the model on a labeled test set.

    Args:
        model: Model forwarded to the prediction callable.
        tokenizer: Tokenizer forwarded to the prediction callable.
        test_data: Samples shaped like ``{"input": str, "output": str}``.
        predict_fn: Optional override of the prediction callable
            ``(model, tokenizer, input_text) -> str``; defaults to
            :func:`predict`. Useful for testing without a real model.

    Returns:
        ``{"accuracy": float, "correct": int, "total": int}``; accuracy is
        0 for an empty test set.
    """
    if predict_fn is None:
        predict_fn = predict
    correct = 0
    total = len(test_data)
    for sample in test_data:
        pred = _extract_label(predict_fn(model, tokenizer, sample["input"]))
        expected = _extract_label(sample["output"])
        if pred == expected:
            correct += 1
    accuracy = correct / total if total > 0 else 0
    return {"accuracy": accuracy, "correct": correct, "total": total}
if __name__ == "__main__":
    # Resolve (and cache) the base checkpoint locally via ModelScope.
    BASE_MODEL = snapshot_download("qwen/Qwen2-1.5B-Instruct")
    LORA_PATH = "./lora_output"

    print("Loading fine-tuned model...")
    model, tokenizer = load_finetuned_model(BASE_MODEL, LORA_PATH)

    # Load test data (last 100 samples from the training data as a
    # pseudo-test split). NOTE(review): these samples were likely seen
    # during training, so accuracy here is optimistic — confirm with a
    # held-out split if available.
    with open("training_data.json", "r", encoding="utf-8") as f:
        all_data = json.load(f)
    test_data = all_data[-100:]

    print(f"Evaluating on {len(test_data)} samples...")
    results = evaluate(model, tokenizer, test_data)
    print(f"Results: Accuracy = {results['accuracy']*100:.1f}% ({results['correct']}/{results['total']})")