v1 update

Browse files

Files changed (5) hide show

README.md +9 -7
generation_config.json +1 -1
model.safetensors +1 -1
src/train_t5.py +123 -41
tokenizer.json +16 -2

README.md CHANGED Viewed

@@ -22,16 +22,16 @@ model-index:
       metrics:
         - name: Training Loss
           type: loss
-          value: 1.4516
         - name: Evaluation Loss
           type: loss
-          value: 3.6643
         - name: CER
           type: cer
-          value: 0.4476
         - name: Exact Match
           type: accuracy
-          value: 0.2880
 ---
 # AramT5 - T5 Fine-Tuned on Syriac-to-Latin Transliteration ♰
@@ -44,8 +44,8 @@ model-index:
 - Less stable on very long or morphologically complex words
 > Development information
-> - 🚧 **Current version:** Baseline (stage 1)
-> - ⏳ **Upcoming release:** v1 (stage 2)
 >
 > **Note:** As of May 19, 2026, AramT5's training process, which was at stage 4, was reset to a baseline level due to inconsistencies found in previous versions of the Serto-Madnḥaya mapping code and a lack of data for individual words, which mostly invalidated prior learning efforts
@@ -149,4 +149,6 @@ uv run python src/train_t5.py --stage 2 --hf-model your-username/model-name
 ## 📋 Version Changelog
-* **AramT5 Baseline (May 20, 2026):** T5 fine-tuned on 20k records, across 30 epochs, leveraging the stage 1 configuration. Baseline version with a surprisingly good initial understanding of how to transliterate properly, shown to capture some roots and Syriac morphology in a limited manner

       metrics:
         - name: Training Loss
           type: loss
+          value: 1.5293
         - name: Evaluation Loss
           type: loss
+          value: 1.8535
         - name: CER
           type: cer
+          value: 0.1863
         - name: Exact Match
           type: accuracy
+          value: 0.5999
 ---
 # AramT5 - T5 Fine-Tuned on Syriac-to-Latin Transliteration ♰
 - Less stable on very long or morphologically complex words
 > Development information
+> - 🚧 **Current version:** v1 (stage 2)
+> - ⏳ **Upcoming release:** v2 (stage 3)
 >
 > **Note:** As of May 19, 2026, AramT5's training process, which was at stage 4, was reset to a baseline level due to inconsistencies found in previous versions of the Serto-Madnḥaya mapping code and a lack of data for individual words, which mostly invalidated prior learning efforts
 ## 📋 Version Changelog
+* **AramT5 Baseline (May 20, 2026):** T5 fine-tuned on 20k records, across 30 epochs, leveraging the stage 1 configuration. Baseline version with a surprisingly good initial understanding of how to transliterate properly, shown to capture some roots and Syriac morphology in a limited manner
+* **AramT5 v1 (May 20, 2026):** AramT5 fine-tuned on 40k records, across 20 epochs, leveraging the stage 2 configuration. A massive upgrade compared to the baseline version, v1 showcases significantly improved morphological handling of not only single words but also sequences with noticeable complexity

generation_config.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "decoder_start_token_id": 0,
   "early_stopping": true,
   "eos_token_id": 3,
-  "length_penalty": 1.05,
   "max_length": 24,
   "min_length": 2,
   "no_repeat_ngram_size": 2,

   "decoder_start_token_id": 0,
   "early_stopping": true,
   "eos_token_id": 3,
+  "length_penalty": 1.12,
   "max_length": 24,
   "min_length": 2,
   "no_repeat_ngram_size": 2,

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:563b4428bcaa67f2f3b8313847698c5978f0ca0d694796706b5c86ca606b2295
 size 209216552

 version https://git-lfs.github.com/spec/v1
+oid sha256:259f4329c111dd8191b255ad65cf5d6fa2aedb0fa90634d005d31ebfca8cee88
 size 209216552

src/train_t5.py CHANGED Viewed

@@ -51,30 +51,61 @@ STAGE_CONFIGS = {
         "learning_rate": 3e-4,
     },
     2: {
-        "description": "Expansion: medium-long sequences",
         "num_samples": 60_000,
         "max_src_length": 50,
-        "short_mix_ratio": 0.12,  # 12% short examples
         "num_epochs": 20,
         "learning_rate": 1e-4,
     },
-    3: {
-        "description": "Extension: medium-long sequences",
         "num_samples": 100_000,
         "max_src_length": 100,
-        "short_mix_ratio": 0.10,  # 10% short examples
         "num_epochs": 20,
         "learning_rate": 6e-5,
-        "repetition_penalty": 1.2,  # Prevent repetitive outputs
     },
-    4: {
         "description": "Full practical corpus: sentences and short paragraphs",
         "num_samples": 120_000,
-        "max_src_length": 150,  # Practical sentence length
-        "short_mix_ratio": 0.10,  # 10% short examples
         "num_epochs": 15,
         "learning_rate": 4e-5,
-        "repetition_penalty": 1.2,  # Prevent repetitive outputs
     },
 }
@@ -89,8 +120,8 @@ def parse_args():
         "--stage",
         type=int,
         default=1,
-        choices=[1, 2, 3, 4],
-        help="Training stage (1=baseline, 2=medium-long, 3=extension, 4=full practical)",
     )
     parser.add_argument(
         "--hf-model",
@@ -307,8 +338,8 @@ def load_and_prepare_data(
         num_middle = int(num_samples * 0.40)  # 40% from middle range (15-100)
         num_main = num_samples - num_short - num_middle
-        # Short examples (≤10 chars) for forgetting mitigation
-        short_threshold = 10
         short_examples = full_dataset.filter(
             lambda x: x["src_length"] <= short_threshold
         )
@@ -343,36 +374,87 @@ def load_and_prepare_data(
         sampled_dataset = sampled_dataset.shuffle(seed=42)
     elif short_mix_ratio > 0 and stage > 1:
         num_short = int(num_samples * short_mix_ratio)
-        num_main = num_samples - num_short
-        # Get short examples (first N after sorting by length)
-        short_threshold = 10  # Characters
-        short_examples = full_dataset.filter(
-            lambda x: x["src_length"] <= short_threshold
-        )
-        short_examples = short_examples.shuffle(seed=42).select(
-            range(min(num_short, len(short_examples)))
-        )
-        print(f"  Short examples (for forgetting mitigation): {len(short_examples)}")
-        # Get main examples from filtered dataset
-        # Apply minimum length filter for main examples in later stages
-        min_len = stage_config.get("min_src_length", 0)
-        if min_len > 0:
-            main_pool = filtered_dataset.filter(lambda x: x["src_length"] >= min_len)
-            print(f"  Main pool after min_length={min_len} filter: {len(main_pool)} examples")
-        else:
-            main_pool = filtered_dataset
-        main_examples = main_pool.shuffle(seed=42).select(
-            range(min(num_main, len(main_pool)))
-        )
-        print(f"  Main examples: {len(main_examples)}")
-        # Combine and shuffle
-        sampled_dataset = concatenate_datasets([short_examples, main_examples])
-        sampled_dataset = sampled_dataset.shuffle(seed=42)
     else:
         sampled_dataset = filtered_dataset.shuffle(seed=42).select(range(num_samples))

         "learning_rate": 3e-4,
     },
     2: {
+        "description": "Expansion: short phrases",
+        "num_samples": 40_000,
+        "max_src_length": 30,
+        "short_mix_ratio": 0.12,  # 12% short examples from previous stages
+        "short_threshold": 15,  # ≤15 chars (Stage 1)
+        "new_range_ratio": 0.50,  # 50% from new range (16-30 chars)
+        "new_range_min": 16,
+        "num_epochs": 20,
+        "learning_rate": 1.2e-4,
+    },
+    3: {
+        "description": "Expansion: medium phrases",
         "num_samples": 60_000,
         "max_src_length": 50,
+        "short_mix_ratio": 0.12,  # 12% short examples from previous stages
+        "short_threshold": 30,  # ≤30 chars (Stage 1+2)
+        "new_range_ratio": 0.50,  # 50% from new range (31-50 chars)
+        "new_range_min": 31,
         "num_epochs": 20,
         "learning_rate": 1e-4,
     },
+    4: {
+        "description": "Extension: longer phrases",
+        "num_samples": 80_000,
+        "max_src_length": 70,
+        "short_mix_ratio": 0.10,  # 10% short examples from previous stages
+        "short_threshold": 50,  # ≤50 chars (Stage 1+2+3)
+        "new_range_ratio": 0.50,  # 50% from new range (51-70 chars)
+        "new_range_min": 51,
+        "num_epochs": 20,
+        "learning_rate": 8e-5,
+    },
+    5: {
+        "description": "Extension: longer sentences",
         "num_samples": 100_000,
         "max_src_length": 100,
+        "short_mix_ratio": 0.10,  # 10% short examples from previous stages
+        "short_threshold": 70,  # ≤70 chars (Stage 1+2+3+4)
+        "new_range_ratio": 0.50,  # 50% from new range (71-100 chars)
+        "new_range_min": 71,
         "num_epochs": 20,
         "learning_rate": 6e-5,
+        "repetition_penalty": 1.2,
     },
+    6: {
         "description": "Full practical corpus: sentences and short paragraphs",
         "num_samples": 120_000,
+        "max_src_length": 150,
+        "short_mix_ratio": 0.10,  # 10% short examples from previous stages
+        "short_threshold": 100,  # ≤100 chars (Stage 1+2+3+4+5)
+        "new_range_ratio": 0.50,  # 50% from new range (101-150 chars)
+        "new_range_min": 101,
         "num_epochs": 15,
         "learning_rate": 4e-5,
+        "repetition_penalty": 1.2,
     },
 }
         "--stage",
         type=int,
         default=1,
+        choices=[1, 2, 3, 4, 5, 6],
+        help="Training stage (1=baseline, 2=medium-long, 3=expansion, 4=extension, 5=longer sentences, 6=full practical)",
     )
     parser.add_argument(
         "--hf-model",
         num_middle = int(num_samples * 0.40)  # 40% from middle range (15-100)
         num_main = num_samples - num_short - num_middle
+        # Short examples (≤15 chars = Stage 1 range) for forgetting mitigation
+        short_threshold = 15
         short_examples = full_dataset.filter(
             lambda x: x["src_length"] <= short_threshold
         )
         sampled_dataset = sampled_dataset.shuffle(seed=42)
     elif short_mix_ratio > 0 and stage > 1:
+        # Stratified sampling: ensure we get examples from the NEW length range
+        new_range_ratio = stage_config.get("new_range_ratio", 0)
+        new_range_min = stage_config.get("new_range_min", 0)
         num_short = int(num_samples * short_mix_ratio)
+        if new_range_ratio > 0 and new_range_min > 0:
+            # Stratified: short + new_range + remainder
+            num_new_range = int(num_samples * new_range_ratio)
+            num_remainder = num_samples - num_short - num_new_range
+            # Short examples = everything from previous stages (for forgetting mitigation)
+            short_threshold = stage_config.get("short_threshold", 15)
+            short_examples = full_dataset.filter(
+                lambda x, thresh=short_threshold: x["src_length"] <= thresh
+            )
+            short_examples = short_examples.shuffle(seed=42).select(
+                range(min(num_short, len(short_examples)))
+            )
+            print(f"  Short examples (≤{short_threshold} chars, previous stages): {len(short_examples)}")
+            # New range examples - these are what the model needs to learn
+            new_range_examples = filtered_dataset.filter(
+                lambda x, min_len=new_range_min: x["src_length"] >= min_len
+            )
+            print(f"  New range pool ({new_range_min}-{max_len} chars): {len(new_range_examples)} available")
+            # Oversample if needed (these are scarce!)
+            if len(new_range_examples) < num_new_range:
+                if len(new_range_examples) > 0:
+                    repeats_needed = (num_new_range // len(new_range_examples)) + 1
+                    new_range_repeated = concatenate_datasets([new_range_examples] * repeats_needed)
+                    new_range_examples = new_range_repeated.shuffle(seed=42).select(range(num_new_range))
+                    print(f"  New range examples (oversampled {repeats_needed}x): {len(new_range_examples)}")
+                else:
+                    print(f"  WARNING: No examples in new range!")
+                    new_range_examples = full_dataset.filter(lambda x: False)  # empty
+            else:
+                new_range_examples = new_range_examples.shuffle(seed=42).select(range(num_new_range))
+                print(f"  New range examples: {len(new_range_examples)}")
+            # Remainder from full filtered set (includes all lengths up to max)
+            remainder_examples = filtered_dataset.shuffle(seed=43).select(
+                range(min(num_remainder, len(filtered_dataset)))
+            )
+            print(f"  Remainder examples: {len(remainder_examples)}")
+            # Combine and shuffle
+            sampled_dataset = concatenate_datasets([short_examples, new_range_examples, remainder_examples])
+            sampled_dataset = sampled_dataset.shuffle(seed=42)
+        else:
+            # Original logic: just short + main
+            num_main = num_samples - num_short
+            # Get short examples = everything from previous stages
+            short_threshold = stage_config.get("short_threshold", 15)
+            short_examples = full_dataset.filter(
+                lambda x, thresh=short_threshold: x["src_length"] <= thresh
+            )
+            short_examples = short_examples.shuffle(seed=42).select(
+                range(min(num_short, len(short_examples)))
+            )
+            print(f"  Short examples (≤{short_threshold} chars, previous stages): {len(short_examples)}")
+            # Get main examples from filtered dataset
+            # Apply minimum length filter for main examples in later stages
+            min_len = stage_config.get("min_src_length", 0)
+            if min_len > 0:
+                main_pool = filtered_dataset.filter(lambda x: x["src_length"] >= min_len)
+                print(f"  Main pool after min_length={min_len} filter: {len(main_pool)} examples")
+            else:
+                main_pool = filtered_dataset
+            main_examples = main_pool.shuffle(seed=42).select(
+                range(min(num_main, len(main_pool)))
+            )
+            print(f"  Main examples: {len(main_examples)}")
+            # Combine and shuffle
+            sampled_dataset = concatenate_datasets([short_examples, main_examples])
+            sampled_dataset = sampled_dataset.shuffle(seed=42)
     else:
         sampled_dataset = filtered_dataset.shuffle(seed=42).select(range(num_samples))

tokenizer.json CHANGED Viewed

@@ -1,7 +1,21 @@
 {
   "version": "1.0",
-  "truncation": null,
-  "padding": null,
   "added_tokens": [
     {
       "id": 0,

 {
   "version": "1.0",
+  "truncation": {
+    "direction": "Right",
+    "max_length": 128,
+    "strategy": "LongestFirst",
+    "stride": 0
+  },
+  "padding": {
+    "strategy": {
+      "Fixed": 128
+    },
+    "direction": "Right",
+    "pad_to_multiple_of": null,
+    "pad_id": 0,
+    "pad_type_id": 0,
+    "pad_token": "<pad>"
+  },
   "added_tokens": [
     {
       "id": 0,