Commit ·
523af6f
1
Parent(s): ea95949
v1 update
Browse files- README.md +9 -7
- generation_config.json +1 -1
- model.safetensors +1 -1
- src/train_t5.py +123 -41
- tokenizer.json +16 -2
README.md
CHANGED
|
@@ -22,16 +22,16 @@ model-index:
|
|
| 22 |
metrics:
|
| 23 |
- name: Training Loss
|
| 24 |
type: loss
|
| 25 |
-
value: 1.
|
| 26 |
- name: Evaluation Loss
|
| 27 |
type: loss
|
| 28 |
-
value:
|
| 29 |
- name: CER
|
| 30 |
type: cer
|
| 31 |
-
value: 0.
|
| 32 |
- name: Exact Match
|
| 33 |
type: accuracy
|
| 34 |
-
value: 0.
|
| 35 |
---
|
| 36 |
# AramT5 - T5 Fine-Tuned on Syriac-to-Latin Transliteration ♰
|
| 37 |
|
|
@@ -44,8 +44,8 @@ model-index:
|
|
| 44 |
- Less stable on very long or morphologically complex words
|
| 45 |
|
| 46 |
> Development information
|
| 47 |
-
> - 🚧 **Current version:**
|
| 48 |
-
> - ⏳ **Upcoming release:**
|
| 49 |
>
|
| 50 |
> **Note:** As of May 19, 2026, AramT5's training process, which was at stage 4, was reset to a baseline level due to inconsistencies found in previous versions of the Serto-Madnḥaya mapping code and a lack of data for individual words, which mostly invalidated prior learning efforts.
|
| 51 |
|
|
@@ -149,4 +149,6 @@ uv run python src/train_t5.py --stage 2 --hf-model your-username/model-name
|
|
| 149 |
|
| 150 |
## 📋 Version Changelog
|
| 151 |
|
| 152 |
-
* **AramT5 Baseline (May 20, 2026):** T5 fine-tuned on 20k records, across 30 epochs, leveraging the stage 1 configuration. Baseline version with a surprisingly good initial understanding of how to transliterate properly, shown to capture some roots and Syriac morphology in a limited manner
|
|
|
|
|
|
|
|
|
| 22 |
metrics:
|
| 23 |
- name: Training Loss
|
| 24 |
type: loss
|
| 25 |
+
value: 1.5293
|
| 26 |
- name: Evaluation Loss
|
| 27 |
type: loss
|
| 28 |
+
value: 1.8535
|
| 29 |
- name: CER
|
| 30 |
type: cer
|
| 31 |
+
value: 0.1863
|
| 32 |
- name: Exact Match
|
| 33 |
type: accuracy
|
| 34 |
+
value: 0.5999
|
| 35 |
---
|
| 36 |
# AramT5 - T5 Fine-Tuned on Syriac-to-Latin Transliteration ♰
|
| 37 |
|
|
|
|
| 44 |
- Less stable on very long or morphologically complex words
|
| 45 |
|
| 46 |
> Development information
|
| 47 |
+
> - 🚧 **Current version:** v1 (stage 2)
|
| 48 |
+
> - ⏳ **Upcoming release:** v2 (stage 3)
|
| 49 |
>
|
| 50 |
> **Note:** As of May 19, 2026, AramT5's training process, which was at stage 4, was reset to a baseline level due to inconsistencies found in previous versions of the Serto-Madnḥaya mapping code and a lack of data for individual words, which mostly invalidated prior learning efforts.
|
| 51 |
|
|
|
|
| 149 |
|
| 150 |
## 📋 Version Changelog
|
| 151 |
|
| 152 |
+
* **AramT5 Baseline (May 20, 2026):** T5 fine-tuned on 20k records, across 30 epochs, leveraging the stage 1 configuration. Baseline version with a surprisingly good initial understanding of how to transliterate properly, shown to capture some roots and Syriac morphology in a limited manner
|
| 153 |
+
|
| 154 |
+
* **AramT5 v1 (May 20, 2026):** AramT5 fine-tuned on 40k records, across 20 epochs, leveraging the stage 2 configuration. A massive upgrade compared to the baseline version, v1 showcases significantly improved morphological handling of not only single words but also sequences with noticeable complexity
|
generation_config.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
"decoder_start_token_id": 0,
|
| 3 |
"early_stopping": true,
|
| 4 |
"eos_token_id": 3,
|
| 5 |
-
"length_penalty": 1.
|
| 6 |
"max_length": 24,
|
| 7 |
"min_length": 2,
|
| 8 |
"no_repeat_ngram_size": 2,
|
|
|
|
| 2 |
"decoder_start_token_id": 0,
|
| 3 |
"early_stopping": true,
|
| 4 |
"eos_token_id": 3,
|
| 5 |
+
"length_penalty": 1.12,
|
| 6 |
"max_length": 24,
|
| 7 |
"min_length": 2,
|
| 8 |
"no_repeat_ngram_size": 2,
|
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 209216552
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:259f4329c111dd8191b255ad65cf5d6fa2aedb0fa90634d005d31ebfca8cee88
|
| 3 |
size 209216552
|
src/train_t5.py
CHANGED
|
@@ -51,30 +51,61 @@ STAGE_CONFIGS = {
|
|
| 51 |
"learning_rate": 3e-4,
|
| 52 |
},
|
| 53 |
2: {
|
| 54 |
-
"description": "Expansion:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
"num_samples": 60_000,
|
| 56 |
"max_src_length": 50,
|
| 57 |
-
"short_mix_ratio": 0.12, # 12% short examples
|
|
|
|
|
|
|
|
|
|
| 58 |
"num_epochs": 20,
|
| 59 |
"learning_rate": 1e-4,
|
| 60 |
},
|
| 61 |
-
|
| 62 |
-
"description": "Extension:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
"num_samples": 100_000,
|
| 64 |
"max_src_length": 100,
|
| 65 |
-
"short_mix_ratio": 0.10, # 10% short examples
|
|
|
|
|
|
|
|
|
|
| 66 |
"num_epochs": 20,
|
| 67 |
"learning_rate": 6e-5,
|
| 68 |
-
"repetition_penalty": 1.2,
|
| 69 |
},
|
| 70 |
-
|
| 71 |
"description": "Full practical corpus: sentences and short paragraphs",
|
| 72 |
"num_samples": 120_000,
|
| 73 |
-
"max_src_length": 150,
|
| 74 |
-
"short_mix_ratio": 0.10, # 10% short examples
|
|
|
|
|
|
|
|
|
|
| 75 |
"num_epochs": 15,
|
| 76 |
"learning_rate": 4e-5,
|
| 77 |
-
"repetition_penalty": 1.2,
|
| 78 |
},
|
| 79 |
}
|
| 80 |
|
|
@@ -89,8 +120,8 @@ def parse_args():
|
|
| 89 |
"--stage",
|
| 90 |
type=int,
|
| 91 |
default=1,
|
| 92 |
-
choices=[1, 2, 3, 4],
|
| 93 |
-
help="Training stage (1=baseline, 2=medium-long, 3=
|
| 94 |
)
|
| 95 |
parser.add_argument(
|
| 96 |
"--hf-model",
|
|
@@ -307,8 +338,8 @@ def load_and_prepare_data(
|
|
| 307 |
num_middle = int(num_samples * 0.40) # 40% from middle range (15-100)
|
| 308 |
num_main = num_samples - num_short - num_middle
|
| 309 |
|
| 310 |
-
# Short examples (≤
|
| 311 |
-
short_threshold =
|
| 312 |
short_examples = full_dataset.filter(
|
| 313 |
lambda x: x["src_length"] <= short_threshold
|
| 314 |
)
|
|
@@ -343,36 +374,87 @@ def load_and_prepare_data(
|
|
| 343 |
sampled_dataset = sampled_dataset.shuffle(seed=42)
|
| 344 |
|
| 345 |
elif short_mix_ratio > 0 and stage > 1:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 346 |
num_short = int(num_samples * short_mix_ratio)
|
| 347 |
-
num_main = num_samples - num_short
|
| 348 |
-
|
| 349 |
-
# Get short examples (first N after sorting by length)
|
| 350 |
-
short_threshold = 10 # Characters
|
| 351 |
-
short_examples = full_dataset.filter(
|
| 352 |
-
lambda x: x["src_length"] <= short_threshold
|
| 353 |
-
)
|
| 354 |
-
short_examples = short_examples.shuffle(seed=42).select(
|
| 355 |
-
range(min(num_short, len(short_examples)))
|
| 356 |
-
)
|
| 357 |
-
print(f" Short examples (for forgetting mitigation): {len(short_examples)}")
|
| 358 |
-
|
| 359 |
-
# Get main examples from filtered dataset
|
| 360 |
-
# Apply minimum length filter for main examples in later stages
|
| 361 |
-
min_len = stage_config.get("min_src_length", 0)
|
| 362 |
-
if min_len > 0:
|
| 363 |
-
main_pool = filtered_dataset.filter(lambda x: x["src_length"] >= min_len)
|
| 364 |
-
print(f" Main pool after min_length={min_len} filter: {len(main_pool)} examples")
|
| 365 |
-
else:
|
| 366 |
-
main_pool = filtered_dataset
|
| 367 |
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 376 |
else:
|
| 377 |
sampled_dataset = filtered_dataset.shuffle(seed=42).select(range(num_samples))
|
| 378 |
|
|
|
|
| 51 |
"learning_rate": 3e-4,
|
| 52 |
},
|
| 53 |
2: {
|
| 54 |
+
"description": "Expansion: short phrases",
|
| 55 |
+
"num_samples": 40_000,
|
| 56 |
+
"max_src_length": 30,
|
| 57 |
+
"short_mix_ratio": 0.12, # 12% short examples from previous stages
|
| 58 |
+
"short_threshold": 15, # ≤15 chars (Stage 1)
|
| 59 |
+
"new_range_ratio": 0.50, # 50% from new range (16-30 chars)
|
| 60 |
+
"new_range_min": 16,
|
| 61 |
+
"num_epochs": 20,
|
| 62 |
+
"learning_rate": 1.2e-4,
|
| 63 |
+
},
|
| 64 |
+
3: {
|
| 65 |
+
"description": "Expansion: medium phrases",
|
| 66 |
"num_samples": 60_000,
|
| 67 |
"max_src_length": 50,
|
| 68 |
+
"short_mix_ratio": 0.12, # 12% short examples from previous stages
|
| 69 |
+
"short_threshold": 30, # ≤30 chars (Stage 1+2)
|
| 70 |
+
"new_range_ratio": 0.50, # 50% from new range (31-50 chars)
|
| 71 |
+
"new_range_min": 31,
|
| 72 |
"num_epochs": 20,
|
| 73 |
"learning_rate": 1e-4,
|
| 74 |
},
|
| 75 |
+
4: {
|
| 76 |
+
"description": "Extension: longer phrases",
|
| 77 |
+
"num_samples": 80_000,
|
| 78 |
+
"max_src_length": 70,
|
| 79 |
+
"short_mix_ratio": 0.10, # 10% short examples from previous stages
|
| 80 |
+
"short_threshold": 50, # ≤50 chars (Stage 1+2+3)
|
| 81 |
+
"new_range_ratio": 0.50, # 50% from new range (51-70 chars)
|
| 82 |
+
"new_range_min": 51,
|
| 83 |
+
"num_epochs": 20,
|
| 84 |
+
"learning_rate": 8e-5,
|
| 85 |
+
},
|
| 86 |
+
5: {
|
| 87 |
+
"description": "Extension: longer sentences",
|
| 88 |
"num_samples": 100_000,
|
| 89 |
"max_src_length": 100,
|
| 90 |
+
"short_mix_ratio": 0.10, # 10% short examples from previous stages
|
| 91 |
+
"short_threshold": 70, # ≤70 chars (Stage 1+2+3+4)
|
| 92 |
+
"new_range_ratio": 0.50, # 50% from new range (71-100 chars)
|
| 93 |
+
"new_range_min": 71,
|
| 94 |
"num_epochs": 20,
|
| 95 |
"learning_rate": 6e-5,
|
| 96 |
+
"repetition_penalty": 1.2,
|
| 97 |
},
|
| 98 |
+
6: {
|
| 99 |
"description": "Full practical corpus: sentences and short paragraphs",
|
| 100 |
"num_samples": 120_000,
|
| 101 |
+
"max_src_length": 150,
|
| 102 |
+
"short_mix_ratio": 0.10, # 10% short examples from previous stages
|
| 103 |
+
"short_threshold": 100, # ≤100 chars (Stage 1+2+3+4+5)
|
| 104 |
+
"new_range_ratio": 0.50, # 50% from new range (101-150 chars)
|
| 105 |
+
"new_range_min": 101,
|
| 106 |
"num_epochs": 15,
|
| 107 |
"learning_rate": 4e-5,
|
| 108 |
+
"repetition_penalty": 1.2,
|
| 109 |
},
|
| 110 |
}
|
| 111 |
|
|
|
|
| 120 |
"--stage",
|
| 121 |
type=int,
|
| 122 |
default=1,
|
| 123 |
+
choices=[1, 2, 3, 4, 5, 6],
|
| 124 |
+
help="Training stage (1=baseline, 2=medium-long, 3=expansion, 4=extension, 5=longer sentences, 6=full practical)",
|
| 125 |
)
|
| 126 |
parser.add_argument(
|
| 127 |
"--hf-model",
|
|
|
|
| 338 |
num_middle = int(num_samples * 0.40) # 40% from middle range (15-100)
|
| 339 |
num_main = num_samples - num_short - num_middle
|
| 340 |
|
| 341 |
+
# Short examples (≤15 chars = Stage 1 range) for forgetting mitigation
|
| 342 |
+
short_threshold = 15
|
| 343 |
short_examples = full_dataset.filter(
|
| 344 |
lambda x: x["src_length"] <= short_threshold
|
| 345 |
)
|
|
|
|
| 374 |
sampled_dataset = sampled_dataset.shuffle(seed=42)
|
| 375 |
|
| 376 |
elif short_mix_ratio > 0 and stage > 1:
|
| 377 |
+
# Stratified sampling: ensure we get examples from the NEW length range
|
| 378 |
+
new_range_ratio = stage_config.get("new_range_ratio", 0)
|
| 379 |
+
new_range_min = stage_config.get("new_range_min", 0)
|
| 380 |
+
|
| 381 |
num_short = int(num_samples * short_mix_ratio)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 382 |
|
| 383 |
+
if new_range_ratio > 0 and new_range_min > 0:
|
| 384 |
+
# Stratified: short + new_range + remainder
|
| 385 |
+
num_new_range = int(num_samples * new_range_ratio)
|
| 386 |
+
num_remainder = num_samples - num_short - num_new_range
|
| 387 |
+
|
| 388 |
+
# Short examples = everything from previous stages (for forgetting mitigation)
|
| 389 |
+
short_threshold = stage_config.get("short_threshold", 15)
|
| 390 |
+
short_examples = full_dataset.filter(
|
| 391 |
+
lambda x, thresh=short_threshold: x["src_length"] <= thresh
|
| 392 |
+
)
|
| 393 |
+
short_examples = short_examples.shuffle(seed=42).select(
|
| 394 |
+
range(min(num_short, len(short_examples)))
|
| 395 |
+
)
|
| 396 |
+
print(f" Short examples (≤{short_threshold} chars, previous stages): {len(short_examples)}")
|
| 397 |
+
|
| 398 |
+
# New range examples - these are what the model needs to learn
|
| 399 |
+
new_range_examples = filtered_dataset.filter(
|
| 400 |
+
lambda x, min_len=new_range_min: x["src_length"] >= min_len
|
| 401 |
+
)
|
| 402 |
+
print(f" New range pool ({new_range_min}-{max_len} chars): {len(new_range_examples)} available")
|
| 403 |
+
|
| 404 |
+
# Oversample if needed (these are scarce!)
|
| 405 |
+
if len(new_range_examples) < num_new_range:
|
| 406 |
+
if len(new_range_examples) > 0:
|
| 407 |
+
repeats_needed = (num_new_range // len(new_range_examples)) + 1
|
| 408 |
+
new_range_repeated = concatenate_datasets([new_range_examples] * repeats_needed)
|
| 409 |
+
new_range_examples = new_range_repeated.shuffle(seed=42).select(range(num_new_range))
|
| 410 |
+
print(f" New range examples (oversampled {repeats_needed}x): {len(new_range_examples)}")
|
| 411 |
+
else:
|
| 412 |
+
print(f" WARNING: No examples in new range!")
|
| 413 |
+
new_range_examples = full_dataset.filter(lambda x: False) # empty
|
| 414 |
+
else:
|
| 415 |
+
new_range_examples = new_range_examples.shuffle(seed=42).select(range(num_new_range))
|
| 416 |
+
print(f" New range examples: {len(new_range_examples)}")
|
| 417 |
+
|
| 418 |
+
# Remainder from full filtered set (includes all lengths up to max)
|
| 419 |
+
remainder_examples = filtered_dataset.shuffle(seed=43).select(
|
| 420 |
+
range(min(num_remainder, len(filtered_dataset)))
|
| 421 |
+
)
|
| 422 |
+
print(f" Remainder examples: {len(remainder_examples)}")
|
| 423 |
+
|
| 424 |
+
# Combine and shuffle
|
| 425 |
+
sampled_dataset = concatenate_datasets([short_examples, new_range_examples, remainder_examples])
|
| 426 |
+
sampled_dataset = sampled_dataset.shuffle(seed=42)
|
| 427 |
+
else:
|
| 428 |
+
# Original logic: just short + main
|
| 429 |
+
num_main = num_samples - num_short
|
| 430 |
|
| 431 |
+
# Get short examples = everything from previous stages
|
| 432 |
+
short_threshold = stage_config.get("short_threshold", 15)
|
| 433 |
+
short_examples = full_dataset.filter(
|
| 434 |
+
lambda x, thresh=short_threshold: x["src_length"] <= thresh
|
| 435 |
+
)
|
| 436 |
+
short_examples = short_examples.shuffle(seed=42).select(
|
| 437 |
+
range(min(num_short, len(short_examples)))
|
| 438 |
+
)
|
| 439 |
+
print(f" Short examples (≤{short_threshold} chars, previous stages): {len(short_examples)}")
|
| 440 |
+
|
| 441 |
+
# Get main examples from filtered dataset
|
| 442 |
+
# Apply minimum length filter for main examples in later stages
|
| 443 |
+
min_len = stage_config.get("min_src_length", 0)
|
| 444 |
+
if min_len > 0:
|
| 445 |
+
main_pool = filtered_dataset.filter(lambda x: x["src_length"] >= min_len)
|
| 446 |
+
print(f" Main pool after min_length={min_len} filter: {len(main_pool)} examples")
|
| 447 |
+
else:
|
| 448 |
+
main_pool = filtered_dataset
|
| 449 |
+
|
| 450 |
+
main_examples = main_pool.shuffle(seed=42).select(
|
| 451 |
+
range(min(num_main, len(main_pool)))
|
| 452 |
+
)
|
| 453 |
+
print(f" Main examples: {len(main_examples)}")
|
| 454 |
+
|
| 455 |
+
# Combine and shuffle
|
| 456 |
+
sampled_dataset = concatenate_datasets([short_examples, main_examples])
|
| 457 |
+
sampled_dataset = sampled_dataset.shuffle(seed=42)
|
| 458 |
else:
|
| 459 |
sampled_dataset = filtered_dataset.shuffle(seed=42).select(range(num_samples))
|
| 460 |
|
tokenizer.json
CHANGED
|
@@ -1,7 +1,21 @@
|
|
| 1 |
{
|
| 2 |
"version": "1.0",
|
| 3 |
-
"truncation":
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
"added_tokens": [
|
| 6 |
{
|
| 7 |
"id": 0,
|
|
|
|
| 1 |
{
|
| 2 |
"version": "1.0",
|
| 3 |
+
"truncation": {
|
| 4 |
+
"direction": "Right",
|
| 5 |
+
"max_length": 128,
|
| 6 |
+
"strategy": "LongestFirst",
|
| 7 |
+
"stride": 0
|
| 8 |
+
},
|
| 9 |
+
"padding": {
|
| 10 |
+
"strategy": {
|
| 11 |
+
"Fixed": 128
|
| 12 |
+
},
|
| 13 |
+
"direction": "Right",
|
| 14 |
+
"pad_to_multiple_of": null,
|
| 15 |
+
"pad_id": 0,
|
| 16 |
+
"pad_type_id": 0,
|
| 17 |
+
"pad_token": "<pad>"
|
| 18 |
+
},
|
| 19 |
"added_tokens": [
|
| 20 |
{
|
| 21 |
"id": 0,
|