crossroderick committed on
Commit
e3d02e2
·
1 Parent(s): 11632a3

Script and README updates

Browse files
Files changed (2) hide show
  1. README.md +3 -3
  2. src/train_t5.py +6 -6
README.md CHANGED
@@ -125,9 +125,9 @@ The model training process follows a curriculum learning format and is comprised
125
  | 1 | 20000 | 15 | No | Expose the base T5 model to Syriac morphology
126
  | 2 | 40000 | 30 | Yes | Introduce short sentences to AramT5
127
  | 3 | 60000 | 50 | Yes | Introduce medium sentences to AramT5
128
- | 4 | 80000 | 70 | Yes | Introduce longer sentences to AramT5
129
- | 5 | 100000 | 100 | Yes | Reinforce longer sentences to AramT5
130
- | 6 | 120000 | 150 | Yes | Introduce the full practical corpus to AramT5
131
 
132
  To do a stage 1-based training run, just run the script directly from your IDE or use the following command:
133
 
 
125
  | 1 | 20000 | 15 | No | Expose the base T5 model to Syriac morphology
126
  | 2 | 40000 | 30 | Yes | Introduce short sentences to AramT5
127
  | 3 | 60000 | 50 | Yes | Introduce medium sentences to AramT5
128
+ | 4 | 120000 | 70 | Yes | Introduce longer sentences to AramT5
129
+ | 5 | 150000 | 100 | Yes | Reinforce longer sentences to AramT5
130
+ | 6 | 180000 | 150 | Yes | Introduce the full practical corpus to AramT5
131
 
132
  To do a stage 1-based training run, just run the script directly from your IDE or use the following command:
133
 
src/train_t5.py CHANGED
@@ -190,37 +190,37 @@ STAGE_CONFIGS = {
190
  },
191
  4: {
192
  "description": "Extension: longer phrases",
193
- "num_samples": 80_000,
194
  "max_src_length": 70,
195
  "short_mix_ratio": 0.18, # 18% short examples from previous stages (boosted for retention)
196
  "short_threshold": 50, # ≤50 chars (Stage 1+2+3)
197
  "new_range_ratio": 0.45, # 45% from new range (51-70 chars)
198
  "new_range_min": 51,
199
  "num_epochs": 20,
200
- "learning_rate": 6e-5, # Lower LR to prevent forgetting
201
  },
202
  5: {
203
  "description": "Extension: longer sentences",
204
- "num_samples": 100_000,
205
  "max_src_length": 100,
206
  "short_mix_ratio": 0.18, # 18% short examples from previous stages (boosted for retention)
207
  "short_threshold": 70, # ≤70 chars (Stage 1+2+3+4)
208
  "new_range_ratio": 0.45, # 45% from new range (71-100 chars)
209
  "new_range_min": 71,
210
  "num_epochs": 20,
211
- "learning_rate": 4e-5, # Lower LR to prevent forgetting
212
  "repetition_penalty": 1.2,
213
  },
214
  6: {
215
  "description": "Full practical corpus: sentences and short paragraphs",
216
- "num_samples": 120_000,
217
  "max_src_length": 150,
218
  "short_mix_ratio": 0.20, # 20% short examples from previous stages (highest retention)
219
  "short_threshold": 100, # ≤100 chars (Stage 1+2+3+4+5)
220
  "new_range_ratio": 0.40, # 40% from new range (101-150 chars)
221
  "new_range_min": 101,
222
  "num_epochs": 15,
223
- "learning_rate": 3e-5, # Lower LR to prevent forgetting
224
  "repetition_penalty": 1.2,
225
  },
226
  }
 
190
  },
191
  4: {
192
  "description": "Extension: longer phrases",
193
+ "num_samples": 120_000, # Increased to better learn multi-word patterns
194
  "max_src_length": 70,
195
  "short_mix_ratio": 0.18, # 18% short examples from previous stages (boosted for retention)
196
  "short_threshold": 50, # ≤50 chars (Stage 1+2+3)
197
  "new_range_ratio": 0.45, # 45% from new range (51-70 chars)
198
  "new_range_min": 51,
199
  "num_epochs": 20,
200
+ "learning_rate": 8e-5, # Higher LR to unlearn early-stopping bias from imbalanced data
201
  },
202
  5: {
203
  "description": "Extension: longer sentences",
204
+ "num_samples": 150_000, # Increased to better learn multi-word patterns
205
  "max_src_length": 100,
206
  "short_mix_ratio": 0.18, # 18% short examples from previous stages (boosted for retention)
207
  "short_threshold": 70, # ≤70 chars (Stage 1+2+3+4)
208
  "new_range_ratio": 0.45, # 45% from new range (71-100 chars)
209
  "new_range_min": 71,
210
  "num_epochs": 20,
211
+ "learning_rate": 5e-5, # Slightly higher to reinforce multi-word patterns
212
  "repetition_penalty": 1.2,
213
  },
214
  6: {
215
  "description": "Full practical corpus: sentences and short paragraphs",
216
+ "num_samples": 180_000, # Increased to better learn multi-word patterns
217
  "max_src_length": 150,
218
  "short_mix_ratio": 0.20, # 20% short examples from previous stages (highest retention)
219
  "short_threshold": 100, # ≤100 chars (Stage 1+2+3+4+5)
220
  "new_range_ratio": 0.40, # 40% from new range (101-150 chars)
221
  "new_range_min": 101,
222
  "num_epochs": 15,
223
+ "learning_rate": 4e-5, # Fine-tuning polish
224
  "repetition_penalty": 1.2,
225
  },
226
  }