import modal

app = modal.App("ask-finetuned-model")

vol_checkpoints = modal.Volume.from_name("model-checkpoints")

image = (
    modal.Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10")
    .apt_install("git")
    .run_commands(
        "pip install --upgrade pip",
        "pip install packaging ninja psutil",
        "pip install unsloth_zoo",
        "pip install torchvision",
        "pip install xformers trl peft accelerate bitsandbytes scipy huggingface_hub protobuf sentencepiece einops",
        "pip install --no-deps 'unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git'",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
)


@app.function(
    image=image,
    volumes={"/data/checkpoints": vol_checkpoints},
    gpu="A10G",
    timeout=600,
)
def ask_question(question: str, context: str = "Context: Japan Census data.") -> str:
    from unsloth import FastLanguageModel

    print("Loading model...")
    model_path = "/data/checkpoints/phi3-census-lora"
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_path,
        max_seq_length=2048,
        dtype=None,  # auto-detect (bfloat16 on Ampere GPUs like the A10G)
        load_in_4bit=True,
    )
    FastLanguageModel.for_inference(model)  # enable Unsloth's fast inference path

    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
"""

    inputs = tokenizer(
        [alpaca_prompt.format(question, context)],
        return_tensors="pt",
    ).to("cuda")

    # do_sample=True is needed for temperature to take effect; without it,
    # generation is greedy and the temperature argument is ignored.
    outputs = model.generate(
        **inputs,
        max_new_tokens=150,
        use_cache=True,
        do_sample=True,
        temperature=0.1,
    )
    response = tokenizer.batch_decode(outputs)[0]

    # Extract the answer between the response marker and the end-of-sequence
    # token. (The original snippet was truncated at this split; stopping at
    # tokenizer.eos_token is an assumption consistent with the Alpaca-style
    # prompt above.)
    if "### Response:\n" in response:
        answer = response.split("### Response:\n")[1].split(tokenizer.eos_token)[0].strip()
    else:
        answer = response

    print(answer)
    return answer
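# The original snippet ends inside ask_question, so the following entrypoint is
# an added sketch, not part of the source: a minimal `@app.local_entrypoint()`
# that lets the script be driven with `modal run`. The default question is a
# hypothetical example; Modal maps the parameter to a --question CLI flag.
@app.local_entrypoint()
def main(question: str = "What was the population of Tokyo in the latest census?"):
    # .remote() runs ask_question in the cloud on the A10G container above.
    print(ask_question.remote(question))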