import modal

app = modal.App("eval-finetuned-census")

# Volumes
vol_checkpoints = modal.Volume.from_name("model-checkpoints")

# Image: Same as training to ensure compatibility
image = modal.Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10") \
    .apt_install("git") \
    .run_commands(
        "pip install --upgrade pip",
        "pip install packaging ninja psutil",
        "pip install unsloth_zoo",   # installs a compatible torch/torchvision
        "pip install torchvision",   # ensure torchvision is installed
        # Skip flash-attn - it causes OOM during build and is optional
        "pip install xformers trl peft accelerate bitsandbytes wandb scipy huggingface_hub protobuf sentencepiece einops",
        "pip install --no-deps 'unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git'",
    ) \
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
# The GPU type and timeout below are assumptions; adjust them to match the
# training setup. The checkpoint volume is mounted at /data, matching the
# model_path used inside the function.
@app.function(
    image=image,
    gpu="A10G",
    volumes={"/data": vol_checkpoints},
    timeout=600,
)
def evaluate_model(questions: list):
    from unsloth import FastLanguageModel
    import torch

    print("Loading Fine-tuned Model...")

    max_seq_length = 2048
    dtype = None          # auto-detect dtype
    load_in_4bit = True

    # Load model + adapter
    # Note: We saved to /data/checkpoints/phi3-census-lora
    model_path = "/data/checkpoints/phi3-census-lora"

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_path,  # Load from local path (volume)
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    FastLanguageModel.for_inference(model)

    print("Model Loaded. Running Inference...")
    results = []

    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
"""
    for q in questions:
        instruction = q["question"]
        input_context = q.get("context", "")

        inputs = tokenizer(
            [alpaca_prompt.format(instruction, input_context)],
            return_tensors="pt",
        ).to("cuda")

        outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)
        response = tokenizer.batch_decode(outputs)

        # Extract the response part
        full_text = response[0]

        # Simple parsing: keep the text between the response marker and EOS
        if "### Response:\n" in full_text:
            answer = full_text.split("### Response:\n")[1].split("<|endoftext|>")[0]
        else:
            answer = full_text

        results.append({
            "question": instruction,
            "answer": answer.strip(),
        })

    return results
@app.local_entrypoint()
def main():
    questions = [
        {
            "question": "What is the population of Tokyo?",
            "context": "Context: Japan Census data.",
        },
        {
            "question": "What is the average income in Osaka?",
            "context": "Context: Japan Economy & Labor data.",
        },
        {
            "question": "How many households are in Hokkaido?",
            "context": "Context: Japan Census data.",
        },
    ]

    answers = evaluate_model.remote(questions)

    for item in answers:
        print(f"\nQ: {item['question']}")
        print(f"A: {item['answer']}")