Commit ·
2ff8795
1
Parent(s): 72a17f5
README update
Browse files- README.md +10 -8
- src/train_t5.py +9 -9
README.md
CHANGED
|
@@ -118,14 +118,16 @@ Given the total size of the datasets, they haven't been included in this model's
|
|
| 118 |
3. Run `generate_clean_corpus.sh` to clean the West and East Syriac corpora files and shuffle the datasets
|
| 119 |
4. Run `train_tokeniser.py` to train the tokeniser on the cleaned corpora
|
| 120 |
|
| 121 |
-
The model training process follows
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
|
|
|
|
|
|
| 129 |
|
| 130 |
To do a stage 1-based training run, just run the script directly from your IDE or use the following command:
|
| 131 |
|
|
|
|
| 118 |
3. Run `generate_clean_corpus.sh` to clean the West and East Syriac corpora files and shuffle the datasets
|
| 119 |
4. Run `train_tokeniser.py` to train the tokeniser on the cleaned corpora
|
| 120 |
|
| 121 |
+
The model training process follows a curriculum learning format and consists of 6 stages:
|
| 122 |
+
|
| 123 |
+
| Stage | Samples | Max. sentence length (chars) | Mixes shorter sentences | Objective
|
| 124 |
+
|-------|---------|---------------|--------------------|--------------------------
|
| 125 |
+
| 1 | 20000 | 15 | No | Expose the base T5 model to Syriac morphology
|
| 126 |
+
| 2 | 40000 | 30 | Yes | Introduce short sentences to AramT5
|
| 127 |
+
| 3 | 60000 | 50 | Yes | Introduce medium sentences to AramT5
|
| 128 |
+
| 4 | 80000 | 70 | Yes | Introduce longer sentences to AramT5
|
| 129 |
+
| 5 | 100000 | 100 | Yes | Reinforce longer sentences in AramT5
|
| 130 |
+
| 6 | 120000 | 150 | Yes | Introduce the full practical corpus to AramT5
|
| 131 |
|
| 132 |
To do a stage 1-based training run, just run the script directly from your IDE or use the following command:
|
| 133 |
|
src/train_t5.py
CHANGED
|
@@ -76,35 +76,35 @@ STAGE_CONFIGS = {
|
|
| 76 |
"description": "Extension: longer phrases",
|
| 77 |
"num_samples": 80_000,
|
| 78 |
"max_src_length": 70,
|
| 79 |
-
"short_mix_ratio": 0.
|
| 80 |
"short_threshold": 50, # ≤50 chars (Stage 1+2+3)
|
| 81 |
-
"new_range_ratio": 0.
|
| 82 |
"new_range_min": 51,
|
| 83 |
"num_epochs": 20,
|
| 84 |
-
"learning_rate":
|
| 85 |
},
|
| 86 |
5: {
|
| 87 |
"description": "Extension: longer sentences",
|
| 88 |
"num_samples": 100_000,
|
| 89 |
"max_src_length": 100,
|
| 90 |
-
"short_mix_ratio": 0.
|
| 91 |
"short_threshold": 70, # ≤70 chars (Stage 1+2+3+4)
|
| 92 |
-
"new_range_ratio": 0.
|
| 93 |
"new_range_min": 71,
|
| 94 |
"num_epochs": 20,
|
| 95 |
-
"learning_rate":
|
| 96 |
"repetition_penalty": 1.2,
|
| 97 |
},
|
| 98 |
6: {
|
| 99 |
"description": "Full practical corpus: sentences and short paragraphs",
|
| 100 |
"num_samples": 120_000,
|
| 101 |
"max_src_length": 150,
|
| 102 |
-
"short_mix_ratio": 0.
|
| 103 |
"short_threshold": 100, # ≤100 chars (Stage 1+2+3+4+5)
|
| 104 |
-
"new_range_ratio": 0.
|
| 105 |
"new_range_min": 101,
|
| 106 |
"num_epochs": 15,
|
| 107 |
-
"learning_rate":
|
| 108 |
"repetition_penalty": 1.2,
|
| 109 |
},
|
| 110 |
}
|
|
|
|
| 76 |
"description": "Extension: longer phrases",
|
| 77 |
"num_samples": 80_000,
|
| 78 |
"max_src_length": 70,
|
| 79 |
+
"short_mix_ratio": 0.18, # 18% short examples from previous stages (boosted for retention)
|
| 80 |
"short_threshold": 50, # ≤50 chars (Stage 1+2+3)
|
| 81 |
+
"new_range_ratio": 0.45, # 45% from new range (51-70 chars)
|
| 82 |
"new_range_min": 51,
|
| 83 |
"num_epochs": 20,
|
| 84 |
+
"learning_rate": 6e-5, # Lower LR to prevent forgetting
|
| 85 |
},
|
| 86 |
5: {
|
| 87 |
"description": "Extension: longer sentences",
|
| 88 |
"num_samples": 100_000,
|
| 89 |
"max_src_length": 100,
|
| 90 |
+
"short_mix_ratio": 0.18, # 18% short examples from previous stages (boosted for retention)
|
| 91 |
"short_threshold": 70, # ≤70 chars (Stage 1+2+3+4)
|
| 92 |
+
"new_range_ratio": 0.45, # 45% from new range (71-100 chars)
|
| 93 |
"new_range_min": 71,
|
| 94 |
"num_epochs": 20,
|
| 95 |
+
"learning_rate": 4e-5, # Lower LR to prevent forgetting
|
| 96 |
"repetition_penalty": 1.2,
|
| 97 |
},
|
| 98 |
6: {
|
| 99 |
"description": "Full practical corpus: sentences and short paragraphs",
|
| 100 |
"num_samples": 120_000,
|
| 101 |
"max_src_length": 150,
|
| 102 |
+
"short_mix_ratio": 0.20, # 20% short examples from previous stages (highest retention)
|
| 103 |
"short_threshold": 100, # ≤100 chars (Stage 1+2+3+4+5)
|
| 104 |
+
"new_range_ratio": 0.40, # 40% from new range (101-150 chars)
|
| 105 |
"new_range_min": 101,
|
| 106 |
"num_epochs": 15,
|
| 107 |
+
"learning_rate": 3e-5, # Lower LR to prevent forgetting
|
| 108 |
"repetition_penalty": 1.2,
|
| 109 |
},
|
| 110 |
}
|