# sdlc-agent/src/tools/eval_finetuned.py
import modal

app = modal.App("eval-finetuned-census")

# Volumes
vol_checkpoints = modal.Volume.from_name("model-checkpoints")
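
# NOTE: assumes a prior training run wrote the LoRA adapter into this volume
# (under phi3-census-lora); it is mounted at /data/checkpoints below.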

# Image: same stack as training to ensure checkpoint compatibility
image = modal.Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10") \
    .apt_install("git") \
    .run_commands(
        "pip install --upgrade pip",
        "pip install packaging ninja psutil",
        "pip install unsloth_zoo",  # pulls in a compatible torch/torchvision
        "pip install torchvision",  # ensure torchvision is installed
        # Skip flash-attn: it tends to OOM during the image build and is optional
        "pip install xformers trl peft accelerate bitsandbytes wandb scipy huggingface_hub protobuf sentencepiece einops",
        "pip install --no-deps 'unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git'",
    ) \
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})

@app.function(
    image=image,
    volumes={"/data/checkpoints": vol_checkpoints},
    gpu="A10G",  # inference fits on a smaller GPU than training
    timeout=600,
)
def evaluate_model(questions: list):
    from unsloth import FastLanguageModel
    import torch

    print("🚀 Loading Fine-tuned Model...")

    max_seq_length = 2048
    dtype = None  # None = auto-detect (bfloat16 on Ampere+, float16 otherwise)
    load_in_4bit = True

    # Load model + adapter; training saved it to /data/checkpoints/phi3-census-lora
    model_path = "/data/checkpoints/phi3-census-lora"
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_path,  # load from the local path on the mounted volume
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
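    # Note: pointing from_pretrained at a LoRA save directory loads the base
    # model recorded in adapter_config.json and attaches the adapter weights,
    # so only the adapter itself needs to live on the volume.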

    FastLanguageModel.for_inference(model)  # switch to Unsloth's fast inference mode

    print("✅ Model Loaded. Running Inference...")

    results = []

    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
"""
    for q in questions:
        instruction = q["question"]
        input_context = q.get("context", "")

        inputs = tokenizer(
            [alpaca_prompt.format(instruction, input_context)],
            return_tensors="pt",
        ).to("cuda")

        outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)
        response = tokenizer.batch_decode(outputs)

        # Keep only the generated answer: everything after the response marker
        # and before the end-of-text token.
        full_text = response[0]
        if "### Response:\n" in full_text:
            answer = full_text.split("### Response:\n")[1].split("<|endoftext|>")[0]
        else:
            answer = full_text

        results.append({
            "question": instruction,
            "answer": answer.strip(),
        })

    return results
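
# The local entrypoint runs on your machine; evaluate_model.remote() executes
# in the Modal container defined above.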
@app.local_entrypoint()
def main():
    questions = [
        {
            "question": "What is the population of Tokyo?",
            "context": "Context: Japan Census data.",
        },
        {
            "question": "What is the average income in Osaka?",
            "context": "Context: Japan Economy & Labor data.",
        },
        {
            "question": "How many households are in Hokkaido?",
            "context": "Context: Japan Census data.",
        },
    ]

    answers = evaluate_model.remote(questions)

    for item in answers:
        print(f"\nQ: {item['question']}")
        print(f"A: {item['answer']}")