crossroderick committed on
Commit
523af6f
·
1 Parent(s): ea95949

v1 update

Browse files
Files changed (5) hide show
  1. README.md +9 -7
  2. generation_config.json +1 -1
  3. model.safetensors +1 -1
  4. src/train_t5.py +123 -41
  5. tokenizer.json +16 -2
README.md CHANGED
@@ -22,16 +22,16 @@ model-index:
22
  metrics:
23
  - name: Training Loss
24
  type: loss
25
- value: 1.4516
26
  - name: Evaluation Loss
27
  type: loss
28
- value: 3.6643
29
  - name: CER
30
  type: cer
31
- value: 0.4476
32
  - name: Exact Match
33
  type: accuracy
34
- value: 0.2880
35
  ---
36
  # AramT5 - T5 Fine-Tuned on Syriac-to-Latin Transliteration ♰
37
 
@@ -44,8 +44,8 @@ model-index:
44
  - Less stable on very long or morphologically complex words
45
 
46
  > Development information
47
- > - 🚧 **Current version:** Baseline (stage 1)
48
- > - ⏳ **Upcoming release:** v1 (stage 2)
49
  >
50
  > **Note:** As of May 19, 2026, AramT5's training process, which was at stage 4, was reset to a baseline level due to inconsistencies found in previous versions of the Serto-Madnḥaya mapping code and lack of data for individual words, which mostly invalidated prior learning efforts
51
 
@@ -149,4 +149,6 @@ uv run python src/train_t5.py --stage 2 --hf-model your-username/model-name
149
 
150
  ## 📋 Version Changelog
151
 
152
- * **AramT5 Baseline (May 20, 2026):** T5 fine-tuned on 20k records, across 30 epochs, leveraging the stage 1 configuration. Baseline version with a surprisingly good initial understanding of how to transliterate properly, shown to capture some roots and Syriac morphology in a limited manner
 
 
 
22
  metrics:
23
  - name: Training Loss
24
  type: loss
25
+ value: 1.5293
26
  - name: Evaluation Loss
27
  type: loss
28
+ value: 1.8535
29
  - name: CER
30
  type: cer
31
+ value: 0.1863
32
  - name: Exact Match
33
  type: accuracy
34
+ value: 0.5999
35
  ---
36
  # AramT5 - T5 Fine-Tuned on Syriac-to-Latin Transliteration ♰
37
 
 
44
  - Less stable on very long or morphologically complex words
45
 
46
  > Development information
47
+ > - 🚧 **Current version:** v1 (stage 2)
48
+ > - ⏳ **Upcoming release:** v2 (stage 3)
49
  >
50
  > **Note:** As of May 19, 2026, AramT5's training process, which was at stage 4, was reset to a baseline level due to inconsistencies found in previous versions of the Serto-Madnḥaya mapping code and lack of data for individual words, which mostly invalidated prior learning efforts
51
 
 
149
 
150
  ## 📋 Version Changelog
151
 
152
+ * **AramT5 Baseline (May 20, 2026):** T5 fine-tuned on 20k records, across 30 epochs, leveraging the stage 1 configuration. Baseline version with a surprisingly good initial understanding of how to transliterate properly, shown to capture some roots and Syriac morphology in a limited manner
153
+
154
+ * **AramT5 v1 (May 20, 2026):** AramT5 fine-tuned on 40k records, across 20 epochs, leveraging the stage 2 configuration. A massive upgrade compared to the baseline version, v1 showcases significantly improved morphological handling of not only single words but also sequences with noticeable complexity
generation_config.json CHANGED
@@ -2,7 +2,7 @@
2
  "decoder_start_token_id": 0,
3
  "early_stopping": true,
4
  "eos_token_id": 3,
5
- "length_penalty": 1.05,
6
  "max_length": 24,
7
  "min_length": 2,
8
  "no_repeat_ngram_size": 2,
 
2
  "decoder_start_token_id": 0,
3
  "early_stopping": true,
4
  "eos_token_id": 3,
5
+ "length_penalty": 1.12,
6
  "max_length": 24,
7
  "min_length": 2,
8
  "no_repeat_ngram_size": 2,
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:563b4428bcaa67f2f3b8313847698c5978f0ca0d694796706b5c86ca606b2295
3
  size 209216552
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:259f4329c111dd8191b255ad65cf5d6fa2aedb0fa90634d005d31ebfca8cee88
3
  size 209216552
src/train_t5.py CHANGED
@@ -51,30 +51,61 @@ STAGE_CONFIGS = {
51
  "learning_rate": 3e-4,
52
  },
53
  2: {
54
- "description": "Expansion: medium-long sequences",
 
 
 
 
 
 
 
 
 
 
 
55
  "num_samples": 60_000,
56
  "max_src_length": 50,
57
- "short_mix_ratio": 0.12, # 12% short examples
 
 
 
58
  "num_epochs": 20,
59
  "learning_rate": 1e-4,
60
  },
61
- 3: {
62
- "description": "Extension: medium-long sequences",
 
 
 
 
 
 
 
 
 
 
 
63
  "num_samples": 100_000,
64
  "max_src_length": 100,
65
- "short_mix_ratio": 0.10, # 10% short examples
 
 
 
66
  "num_epochs": 20,
67
  "learning_rate": 6e-5,
68
- "repetition_penalty": 1.2, # Prevent repetitive outputs
69
  },
70
- 4: {
71
  "description": "Full practical corpus: sentences and short paragraphs",
72
  "num_samples": 120_000,
73
- "max_src_length": 150, # Practical sentence length
74
- "short_mix_ratio": 0.10, # 10% short examples
 
 
 
75
  "num_epochs": 15,
76
  "learning_rate": 4e-5,
77
- "repetition_penalty": 1.2, # Prevent repetitive outputs
78
  },
79
  }
80
 
@@ -89,8 +120,8 @@ def parse_args():
89
  "--stage",
90
  type=int,
91
  default=1,
92
- choices=[1, 2, 3, 4],
93
- help="Training stage (1=baseline, 2=medium-long, 3=extension, 4=full practical)",
94
  )
95
  parser.add_argument(
96
  "--hf-model",
@@ -307,8 +338,8 @@ def load_and_prepare_data(
307
  num_middle = int(num_samples * 0.40) # 40% from middle range (15-100)
308
  num_main = num_samples - num_short - num_middle
309
 
310
- # Short examples (≤10 chars) for forgetting mitigation
311
- short_threshold = 10
312
  short_examples = full_dataset.filter(
313
  lambda x: x["src_length"] <= short_threshold
314
  )
@@ -343,36 +374,87 @@ def load_and_prepare_data(
343
  sampled_dataset = sampled_dataset.shuffle(seed=42)
344
 
345
  elif short_mix_ratio > 0 and stage > 1:
 
 
 
 
346
  num_short = int(num_samples * short_mix_ratio)
347
- num_main = num_samples - num_short
348
-
349
- # Get short examples (first N after sorting by length)
350
- short_threshold = 10 # Characters
351
- short_examples = full_dataset.filter(
352
- lambda x: x["src_length"] <= short_threshold
353
- )
354
- short_examples = short_examples.shuffle(seed=42).select(
355
- range(min(num_short, len(short_examples)))
356
- )
357
- print(f" Short examples (for forgetting mitigation): {len(short_examples)}")
358
-
359
- # Get main examples from filtered dataset
360
- # Apply minimum length filter for main examples in later stages
361
- min_len = stage_config.get("min_src_length", 0)
362
- if min_len > 0:
363
- main_pool = filtered_dataset.filter(lambda x: x["src_length"] >= min_len)
364
- print(f" Main pool after min_length={min_len} filter: {len(main_pool)} examples")
365
- else:
366
- main_pool = filtered_dataset
367
 
368
- main_examples = main_pool.shuffle(seed=42).select(
369
- range(min(num_main, len(main_pool)))
370
- )
371
- print(f" Main examples: {len(main_examples)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
372
 
373
- # Combine and shuffle
374
- sampled_dataset = concatenate_datasets([short_examples, main_examples])
375
- sampled_dataset = sampled_dataset.shuffle(seed=42)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
376
  else:
377
  sampled_dataset = filtered_dataset.shuffle(seed=42).select(range(num_samples))
378
 
 
51
  "learning_rate": 3e-4,
52
  },
53
  2: {
54
+ "description": "Expansion: short phrases",
55
+ "num_samples": 40_000,
56
+ "max_src_length": 30,
57
+ "short_mix_ratio": 0.12, # 12% short examples from previous stages
58
+ "short_threshold": 15, # ≤15 chars (Stage 1)
59
+ "new_range_ratio": 0.50, # 50% from new range (16-30 chars)
60
+ "new_range_min": 16,
61
+ "num_epochs": 20,
62
+ "learning_rate": 1.2e-4,
63
+ },
64
+ 3: {
65
+ "description": "Expansion: medium phrases",
66
  "num_samples": 60_000,
67
  "max_src_length": 50,
68
+ "short_mix_ratio": 0.12, # 12% short examples from previous stages
69
+ "short_threshold": 30, # ≤30 chars (Stage 1+2)
70
+ "new_range_ratio": 0.50, # 50% from new range (31-50 chars)
71
+ "new_range_min": 31,
72
  "num_epochs": 20,
73
  "learning_rate": 1e-4,
74
  },
75
+ 4: {
76
+ "description": "Extension: longer phrases",
77
+ "num_samples": 80_000,
78
+ "max_src_length": 70,
79
+ "short_mix_ratio": 0.10, # 10% short examples from previous stages
80
+ "short_threshold": 50, # ≤50 chars (Stage 1+2+3)
81
+ "new_range_ratio": 0.50, # 50% from new range (51-70 chars)
82
+ "new_range_min": 51,
83
+ "num_epochs": 20,
84
+ "learning_rate": 8e-5,
85
+ },
86
+ 5: {
87
+ "description": "Extension: longer sentences",
88
  "num_samples": 100_000,
89
  "max_src_length": 100,
90
+ "short_mix_ratio": 0.10, # 10% short examples from previous stages
91
+ "short_threshold": 70, # ≤70 chars (Stage 1+2+3+4)
92
+ "new_range_ratio": 0.50, # 50% from new range (71-100 chars)
93
+ "new_range_min": 71,
94
  "num_epochs": 20,
95
  "learning_rate": 6e-5,
96
+ "repetition_penalty": 1.2,
97
  },
98
+ 6: {
99
  "description": "Full practical corpus: sentences and short paragraphs",
100
  "num_samples": 120_000,
101
+ "max_src_length": 150,
102
+ "short_mix_ratio": 0.10, # 10% short examples from previous stages
103
+ "short_threshold": 100, # ≤100 chars (Stage 1+2+3+4+5)
104
+ "new_range_ratio": 0.50, # 50% from new range (101-150 chars)
105
+ "new_range_min": 101,
106
  "num_epochs": 15,
107
  "learning_rate": 4e-5,
108
+ "repetition_penalty": 1.2,
109
  },
110
  }
111
 
 
120
  "--stage",
121
  type=int,
122
  default=1,
123
+ choices=[1, 2, 3, 4, 5, 6],
124
+ help="Training stage (1=baseline, 2=medium-long, 3=expansion, 4=extension, 5=longer sentences, 6=full practical)",
125
  )
126
  parser.add_argument(
127
  "--hf-model",
 
338
  num_middle = int(num_samples * 0.40) # 40% from middle range (15-100)
339
  num_main = num_samples - num_short - num_middle
340
 
341
+ # Short examples (≤15 chars = Stage 1 range) for forgetting mitigation
342
+ short_threshold = 15
343
  short_examples = full_dataset.filter(
344
  lambda x: x["src_length"] <= short_threshold
345
  )
 
374
  sampled_dataset = sampled_dataset.shuffle(seed=42)
375
 
376
  elif short_mix_ratio > 0 and stage > 1:
377
+ # Stratified sampling: ensure we get examples from the NEW length range
378
+ new_range_ratio = stage_config.get("new_range_ratio", 0)
379
+ new_range_min = stage_config.get("new_range_min", 0)
380
+
381
  num_short = int(num_samples * short_mix_ratio)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
382
 
383
+ if new_range_ratio > 0 and new_range_min > 0:
384
+ # Stratified: short + new_range + remainder
385
+ num_new_range = int(num_samples * new_range_ratio)
386
+ num_remainder = num_samples - num_short - num_new_range
387
+
388
+ # Short examples = everything from previous stages (for forgetting mitigation)
389
+ short_threshold = stage_config.get("short_threshold", 15)
390
+ short_examples = full_dataset.filter(
391
+ lambda x, thresh=short_threshold: x["src_length"] <= thresh
392
+ )
393
+ short_examples = short_examples.shuffle(seed=42).select(
394
+ range(min(num_short, len(short_examples)))
395
+ )
396
+ print(f" Short examples (≤{short_threshold} chars, previous stages): {len(short_examples)}")
397
+
398
+ # New range examples - these are what the model needs to learn
399
+ new_range_examples = filtered_dataset.filter(
400
+ lambda x, min_len=new_range_min: x["src_length"] >= min_len
401
+ )
402
+ print(f" New range pool ({new_range_min}-{max_len} chars): {len(new_range_examples)} available")
403
+
404
+ # Oversample if needed (these are scarce!)
405
+ if len(new_range_examples) < num_new_range:
406
+ if len(new_range_examples) > 0:
407
+ repeats_needed = (num_new_range // len(new_range_examples)) + 1
408
+ new_range_repeated = concatenate_datasets([new_range_examples] * repeats_needed)
409
+ new_range_examples = new_range_repeated.shuffle(seed=42).select(range(num_new_range))
410
+ print(f" New range examples (oversampled {repeats_needed}x): {len(new_range_examples)}")
411
+ else:
412
+ print(f" WARNING: No examples in new range!")
413
+ new_range_examples = full_dataset.filter(lambda x: False) # empty
414
+ else:
415
+ new_range_examples = new_range_examples.shuffle(seed=42).select(range(num_new_range))
416
+ print(f" New range examples: {len(new_range_examples)}")
417
+
418
+ # Remainder from full filtered set (includes all lengths up to max)
419
+ remainder_examples = filtered_dataset.shuffle(seed=43).select(
420
+ range(min(num_remainder, len(filtered_dataset)))
421
+ )
422
+ print(f" Remainder examples: {len(remainder_examples)}")
423
+
424
+ # Combine and shuffle
425
+ sampled_dataset = concatenate_datasets([short_examples, new_range_examples, remainder_examples])
426
+ sampled_dataset = sampled_dataset.shuffle(seed=42)
427
+ else:
428
+ # Original logic: just short + main
429
+ num_main = num_samples - num_short
430
 
431
+ # Get short examples = everything from previous stages
432
+ short_threshold = stage_config.get("short_threshold", 15)
433
+ short_examples = full_dataset.filter(
434
+ lambda x, thresh=short_threshold: x["src_length"] <= thresh
435
+ )
436
+ short_examples = short_examples.shuffle(seed=42).select(
437
+ range(min(num_short, len(short_examples)))
438
+ )
439
+ print(f" Short examples (≤{short_threshold} chars, previous stages): {len(short_examples)}")
440
+
441
+ # Get main examples from filtered dataset
442
+ # Apply minimum length filter for main examples in later stages
443
+ min_len = stage_config.get("min_src_length", 0)
444
+ if min_len > 0:
445
+ main_pool = filtered_dataset.filter(lambda x: x["src_length"] >= min_len)
446
+ print(f" Main pool after min_length={min_len} filter: {len(main_pool)} examples")
447
+ else:
448
+ main_pool = filtered_dataset
449
+
450
+ main_examples = main_pool.shuffle(seed=42).select(
451
+ range(min(num_main, len(main_pool)))
452
+ )
453
+ print(f" Main examples: {len(main_examples)}")
454
+
455
+ # Combine and shuffle
456
+ sampled_dataset = concatenate_datasets([short_examples, main_examples])
457
+ sampled_dataset = sampled_dataset.shuffle(seed=42)
458
  else:
459
  sampled_dataset = filtered_dataset.shuffle(seed=42).select(range(num_samples))
460
 
tokenizer.json CHANGED
@@ -1,7 +1,21 @@
1
  {
2
  "version": "1.0",
3
- "truncation": null,
4
- "padding": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  "added_tokens": [
6
  {
7
  "id": 0,
 
1
  {
2
  "version": "1.0",
3
+ "truncation": {
4
+ "direction": "Right",
5
+ "max_length": 128,
6
+ "strategy": "LongestFirst",
7
+ "stride": 0
8
+ },
9
+ "padding": {
10
+ "strategy": {
11
+ "Fixed": 128
12
+ },
13
+ "direction": "Right",
14
+ "pad_to_multiple_of": null,
15
+ "pad_id": 0,
16
+ "pad_type_id": 0,
17
+ "pad_token": "<pad>"
18
+ },
19
  "added_tokens": [
20
  {
21
  "id": 0,