File size: 5,653 Bytes
5aadfb4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import modal
import os  # NOTE(review): appears unused in this file — confirm before removing

# Modal application; the remote function below registers against this app.
app = modal.App("finetune-census-phi3")

# Volumes
# Persistent Modal volumes: the training/validation JSONL lives on
# "finetune-dataset"; LoRA checkpoints are written to "model-checkpoints",
# which is created on first run if it does not yet exist.
vol_dataset = modal.Volume.from_name("finetune-dataset")
vol_checkpoints = modal.Volume.from_name("model-checkpoints", create_if_missing=True)

# Image: Build from CUDA base to ensure compatibility
# (CUDA 12.1 devel base so GPU libraries such as bitsandbytes/xformers can
# build/link against the toolkit; Modal layers Python 3.10 on top.)
image = modal.Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10") \
    .apt_install("git") \
    .run_commands(
        "pip install --upgrade pip",
        "pip install packaging ninja psutil",
        "pip install unsloth_zoo",  # This will install compatible torch/torchvision
        "pip install torchvision",  # Ensure torchvision is installed
        # Skip flash-attn - it causes OOM during build and is optional
        "pip install xformers trl peft accelerate bitsandbytes wandb scipy huggingface_hub protobuf sentencepiece einops",
        "pip install --no-deps 'unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git'"
    ) \
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster HuggingFace downloads

@app.function(
    image=image,
    volumes={
        "/data/dataset": vol_dataset,
        "/data/checkpoints": vol_checkpoints
    },
    gpu="H200",  # Fastest GPU - 3-4x faster than A100
    timeout=86400, # 24 hours
)
def finetune():
    """Fine-tune Phi-3-mini on the census dataset with QLoRA via Unsloth.

    Loads train/val JSONL from the dataset volume, formats each row into the
    Alpaca prompt template, trains LoRA adapters for ``max_steps`` steps, and
    saves the adapter + tokenizer to the checkpoint volume (then commits it).
    """
    from unsloth import FastLanguageModel
    from trl import SFTTrainer
    from transformers import TrainingArguments
    from datasets import load_dataset
    import torch

    print("🚀 Starting Fine-tuning Job...")

    # 1. Configuration
    max_seq_length = 2048  # Can go up to 4096 for Phi-3
    dtype = None           # None = let Unsloth auto-detect (bf16/fp16)
    load_in_4bit = True    # 4-bit quantization (QLoRA) to reduce memory usage

    model_name = "unsloth/Phi-3-mini-4k-instruct"

    # 2. Load Model and Tokenizer
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )

    # 3. Add LoRA Adapters
    model = FastLanguageModel.get_peft_model(
        model,
        r=16,  # LoRA rank
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
        lora_alpha=16,
        lora_dropout=0,   # Supports any, but = 0 is optimized
        bias="none",      # Supports any, but = "none" is optimized
        use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
        random_state=3407,
        use_rslora=False,   # Rank stabilized LoRA disabled
        loftq_config=None,  # No LoftQ initialization
    )

    # 4. Load Dataset
    # JSONL rows with keys: {"instruction": ..., "input": ..., "output": ...}
    dataset = load_dataset(
        "json",
        data_files={
            "train": "/data/dataset/train.jsonl",
            "test": "/data/dataset/val.jsonl",
        },
    )

    # 5. Formatting Function
    # Alpaca format
    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

    EOS_TOKEN = tokenizer.eos_token  # Must terminate every sample, else generation never stops
    def formatting_prompts_func(examples):
        """Map a batch of raw rows to Alpaca-formatted training texts."""
        texts = []
        # `input_text` (not `input`) so the builtin is not shadowed.
        for instruction, input_text, output in zip(
            examples["instruction"], examples["input"], examples["output"]
        ):
            texts.append(alpaca_prompt.format(instruction, input_text, output) + EOS_TOKEN)
        return {"text": texts}

    dataset = dataset.map(formatting_prompts_func, batched=True)

    # 6. Training Arguments (Optimized for H200)
    training_args = TrainingArguments(
        per_device_train_batch_size=4,  # Increased for H200's 141GB memory
        gradient_accumulation_steps=2,  # Effective batch size = 8
        warmup_steps=100,   # Increased for larger dataset
        max_steps=10000,    # ~4% of full epoch, completes in ~90 minutes
        # num_train_epochs=1,  # Full epoch takes ~30 hours with 1.9M samples
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=100,  # Log less frequently
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        # NOTE(review): "outputs" is inside the ephemeral container, so
        # intermediate checkpoints are lost on preemption; consider a path
        # under /data/checkpoints if resumability matters.
        output_dir="outputs",
        report_to="none",  # Disable wandb logging
        save_strategy="steps",
        save_steps=10000,    # Equal to max_steps: effectively one save, at the end
        save_total_limit=2,  # Keep only 2 checkpoints
    )

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        dataset_num_proc=2,
        packing=False,  # packing=True can make training ~5x faster for short sequences
        args=training_args,
    )

    # 7. Train
    print("Training...")
    trainer.train()

    # 8. Save Model (LoRA adapter + tokenizer) to the persistent volume
    print("Saving model to /data/checkpoints/phi3-census-lora...")
    model.save_pretrained("/data/checkpoints/phi3-census-lora")
    tokenizer.save_pretrained("/data/checkpoints/phi3-census-lora")

    # Optional GGUF export (supported by Unsloth):
    # model.save_pretrained_gguf("/data/checkpoints/phi3-census-gguf", tokenizer, quantization_method = "q4_k_m")

    # Commit so the writes are persisted to the Modal volume
    vol_checkpoints.commit()
    print("✅ Fine-tuning Complete!")

@app.local_entrypoint()
def main():
    """Local CLI entrypoint: dispatch the fine-tuning job to Modal's cloud."""
    finetune.remote()