Commit ·
e3d02e2
1
Parent(s): 11632a3
Script and README updates
Browse files- README.md +3 -3
- src/train_t5.py +6 -6
README.md
CHANGED
|
@@ -125,9 +125,9 @@ The model training process follows a curriculum learning format and is comprised
|
|
| 125 |
| 1 | 20000 | 15 | No | Expose the base T5 model to Syriac morphology
|
| 126 |
| 2 | 40000 | 30 | Yes | Introduce short sentences to AramT5
|
| 127 |
| 3 | 60000 | 50 | Yes | Introduce medium sentences to AramT5
|
| 128 |
-
| 4 |
|
| 129 |
-
| 5 |
|
| 130 |
-
| 6 |
|
| 131 |
|
| 132 |
To do a stage 1-based training run, just run the script directly from your IDE or use the following command:
|
| 133 |
|
|
|
|
| 125 |
| 1 | 20000 | 15 | No | Expose the base T5 model to Syriac morphology
|
| 126 |
| 2 | 40000 | 30 | Yes | Introduce short sentences to AramT5
|
| 127 |
| 3 | 60000 | 50 | Yes | Introduce medium sentences to AramT5
|
| 128 |
+
| 4 | 120000 | 70 | Yes | Introduce longer sentences to AramT5
|
| 129 |
+
| 5 | 150000 | 100 | Yes | Reinforce longer sentences in AramT5
|
| 130 |
+
| 6 | 180000 | 150 | Yes | Introduce the full practical corpus to AramT5
|
| 131 |
|
| 132 |
To do a stage 1-based training run, just run the script directly from your IDE or use the following command:
|
| 133 |
|
src/train_t5.py
CHANGED
|
@@ -190,37 +190,37 @@ STAGE_CONFIGS = {
|
|
| 190 |
},
|
| 191 |
4: {
|
| 192 |
"description": "Extension: longer phrases",
|
| 193 |
-
"num_samples":
|
| 194 |
"max_src_length": 70,
|
| 195 |
"short_mix_ratio": 0.18, # 18% short examples from previous stages (boosted for retention)
|
| 196 |
"short_threshold": 50, # ≤50 chars (Stage 1+2+3)
|
| 197 |
"new_range_ratio": 0.45, # 45% from new range (51-70 chars)
|
| 198 |
"new_range_min": 51,
|
| 199 |
"num_epochs": 20,
|
| 200 |
-
"learning_rate":
|
| 201 |
},
|
| 202 |
5: {
|
| 203 |
"description": "Extension: longer sentences",
|
| 204 |
-
"num_samples":
|
| 205 |
"max_src_length": 100,
|
| 206 |
"short_mix_ratio": 0.18, # 18% short examples from previous stages (boosted for retention)
|
| 207 |
"short_threshold": 70, # ≤70 chars (Stage 1+2+3+4)
|
| 208 |
"new_range_ratio": 0.45, # 45% from new range (71-100 chars)
|
| 209 |
"new_range_min": 71,
|
| 210 |
"num_epochs": 20,
|
| 211 |
-
"learning_rate":
|
| 212 |
"repetition_penalty": 1.2,
|
| 213 |
},
|
| 214 |
6: {
|
| 215 |
"description": "Full practical corpus: sentences and short paragraphs",
|
| 216 |
-
"num_samples":
|
| 217 |
"max_src_length": 150,
|
| 218 |
"short_mix_ratio": 0.20, # 20% short examples from previous stages (highest retention)
|
| 219 |
"short_threshold": 100, # ≤100 chars (Stage 1+2+3+4+5)
|
| 220 |
"new_range_ratio": 0.40, # 40% from new range (101-150 chars)
|
| 221 |
"new_range_min": 101,
|
| 222 |
"num_epochs": 15,
|
| 223 |
-
"learning_rate":
|
| 224 |
"repetition_penalty": 1.2,
|
| 225 |
},
|
| 226 |
}
|
|
|
|
| 190 |
},
|
| 191 |
4: {
|
| 192 |
"description": "Extension: longer phrases",
|
| 193 |
+
"num_samples": 120_000, # Increased to better learn multi-word patterns
|
| 194 |
"max_src_length": 70,
|
| 195 |
"short_mix_ratio": 0.18, # 18% short examples from previous stages (boosted for retention)
|
| 196 |
"short_threshold": 50, # ≤50 chars (Stage 1+2+3)
|
| 197 |
"new_range_ratio": 0.45, # 45% from new range (51-70 chars)
|
| 198 |
"new_range_min": 51,
|
| 199 |
"num_epochs": 20,
|
| 200 |
+
"learning_rate": 8e-5, # Higher LR to unlearn early-stopping bias from imbalanced data
|
| 201 |
},
|
| 202 |
5: {
|
| 203 |
"description": "Extension: longer sentences",
|
| 204 |
+
"num_samples": 150_000, # Increased to better learn multi-word patterns
|
| 205 |
"max_src_length": 100,
|
| 206 |
"short_mix_ratio": 0.18, # 18% short examples from previous stages (boosted for retention)
|
| 207 |
"short_threshold": 70, # ≤70 chars (Stage 1+2+3+4)
|
| 208 |
"new_range_ratio": 0.45, # 45% from new range (71-100 chars)
|
| 209 |
"new_range_min": 71,
|
| 210 |
"num_epochs": 20,
|
| 211 |
+
"learning_rate": 5e-5, # Slightly higher to reinforce multi-word patterns
|
| 212 |
"repetition_penalty": 1.2,
|
| 213 |
},
|
| 214 |
6: {
|
| 215 |
"description": "Full practical corpus: sentences and short paragraphs",
|
| 216 |
+
"num_samples": 180_000, # Increased to better learn multi-word patterns
|
| 217 |
"max_src_length": 150,
|
| 218 |
"short_mix_ratio": 0.20, # 20% short examples from previous stages (highest retention)
|
| 219 |
"short_threshold": 100, # ≤100 chars (Stage 1+2+3+4+5)
|
| 220 |
"new_range_ratio": 0.40, # 40% from new range (101-150 chars)
|
| 221 |
"new_range_min": 101,
|
| 222 |
"num_epochs": 15,
|
| 223 |
+
"learning_rate": 4e-5, # Fine-tuning polish
|
| 224 |
"repetition_penalty": 1.2,
|
| 225 |
},
|
| 226 |
}
|