Mr-FineTuner committed
Commit c990de5 · verified
Parent(s): 5163bf4

Update README.md

Files changed (1):
  README.md +68 -3
README.md CHANGED
@@ -6,9 +6,74 @@ tags:

# Model Card for Model ID

- <!-- Provide a quick summary of what the model is/does. -->
-
-
+ Map: 100% | 2920/2920 [00:01<00:00, 1602.09 examples/s]
+ [365/365 4:25:54]
+ Test Loss: 1.0123
+
+ | Step | Training Loss | Validation Loss |
+ |-----:|--------------:|----------------:|
+ |  250 | 0.983800 | 0.957103 |
+ |  500 | 0.937900 | 0.954966 |
+ |  750 | 0.862300 | 0.968044 |
+ | 1000 | 0.800900 | 0.986456 |
+ | 1250 | 0.712600 | 1.017532 |
+ | 1500 | 0.652100 | 1.035168 |
+ | 1750 | 0.600500 | 1.051357 |
+ | 2000 | 0.412800 | 1.152156 |
+ | 2250 | 0.386200 | 1.168790 |
+ | 2500 | 0.377300 | 1.185837 |
+ | 2750 | 0.346600 | 1.223637 |
+ | 3000 | 0.351300 | 1.254214 |
+ | 3250 | 0.321700 | 1.273642 |
+ | 3500 | 0.329900 | 1.280087 |
+
+ from unsloth import is_bfloat16_supported
+ from transformers import TrainingArguments
+ from trl import SFTTrainer
+
+ # Shuffle the training split with a fixed seed for reproducibility
+ train_dataset_transformed = train_dataset_transformed.shuffle(seed=3407)
+
+ trainer = SFTTrainer(
+     model=model,
+     tokenizer=tokenizer,
+     train_dataset=train_dataset_transformed,
+     eval_dataset=val_dataset_transformed,
+     max_seq_length=max_seq_length,
+     dataset_num_proc=2,
+     packing=False,
+     args=TrainingArguments(
+         per_device_train_batch_size=8,  # increased batch size
+         gradient_accumulation_steps=1,  # reduced from 4
+         warmup_ratio=0.05,  # better than a fixed 5 steps for 20K samples
+         num_train_epochs=2,  # compromise between 1 and 3
+         learning_rate=1.5e-4,  # between 1e-4 and 2e-4
+         fp16=not is_bfloat16_supported(),
+         bf16=is_bfloat16_supported(),
+         logging_steps=50,
+         optim="adamw_8bit",
+         weight_decay=0.02,  # increased regularization
+         lr_scheduler_type="cosine_with_restarts",
+         seed=3407,
+         output_dir="outputs",
+         evaluation_strategy="steps",
+         eval_steps=250,  # more frequent validation
+         save_strategy="steps",
+         save_steps=250,
+         load_best_model_at_end=True,
+         metric_for_best_model="eval_loss",  # changed from "loss"
+         greater_is_better=False,
+     ),
+ )
+
+ # Revised LoRA configuration
+ from unsloth import FastLanguageModel
+
+ model = FastLanguageModel.get_peft_model(
+     model,
+     r=32,  # reduced from 64 for better generalization
+     target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
+                     "gate_proj", "up_proj", "down_proj"],
+     lora_alpha=16,  # reduced from 32 (alpha = r/2 is common)
+     lora_dropout=0.1,  # slight regularization
+     bias="none",
+     use_gradient_checkpointing="unsloth",  # Unsloth's memory-efficient checkpointing
+     random_state=3407,
+ )
 
## Model Details
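
The commit shows the LoRA and trainer configuration but not the base-model setup that produces `model` and `tokenizer`. Below is a minimal sketch of the usual Unsloth loading step that would precede `get_peft_model`; the checkpoint name and `max_seq_length` value are placeholders, not values taken from the commit:

```python
from unsloth import FastLanguageModel

max_seq_length = 2048  # placeholder; must match the value passed to SFTTrainer

# Placeholder base model -- the commit does not name the checkpoint used.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=None,          # auto-select (bfloat16 where supported)
    load_in_4bit=True,   # QLoRA-style 4-bit base weights
)
```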
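The loss table shows validation loss bottoming out near step 500 (0.954966) while training loss keeps falling, which is why the configuration leans on `load_best_model_at_end=True` with `metric_for_best_model="eval_loss"`. One optional tightening, not part of this commit, is to stop the run once eval loss stops improving; a sketch using the stock `EarlyStoppingCallback` from transformers:

```python
from transformers import EarlyStoppingCallback

# Compatible with the TrainingArguments above: load_best_model_at_end=True,
# metric_for_best_model="eval_loss", and matching eval/save steps.
trainer.add_callback(
    EarlyStoppingCallback(
        early_stopping_patience=3,     # stop after 3 evaluations without improvement
        early_stopping_threshold=0.0,  # any non-improvement counts
    )
)
```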
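Finally, a usage sketch (assumed, not shown in the commit) for launching the run and persisting the best adapter; with the settings above, the trainer restores the lowest-eval-loss checkpoint at the end of training:

```python
trainer_stats = trainer.train()

# eval_dataset was passed at construction, so this reports eval_loss
# for the restored best checkpoint.
metrics = trainer.evaluate()
print(metrics["eval_loss"])

# Save the LoRA adapter and tokenizer from the best checkpoint.
# "outputs/best_adapter" is an illustrative path, not from the commit.
model.save_pretrained("outputs/best_adapter")
tokenizer.save_pretrained("outputs/best_adapter")
```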