Translation
Persian
English
Eval Results
radinplaid committed on
Commit
1ce1362
·
verified ·
1 Parent(s): 4bba198

Update eole-config.yaml

Browse files

Show huggingface datasets rather than local files

Files changed (1) hide show
  1. eole-config.yaml +15 -14
eole-config.yaml CHANGED
@@ -8,8 +8,8 @@ tensorboard: true
8
  tensorboard_log_dir: tensorboard
9
 
10
  ### Vocab
11
- src_vocab: faen/fa.eole.vocab
12
- tgt_vocab: faen/en.eole.vocab
13
  src_vocab_size: 32000
14
  tgt_vocab_size: 32000
15
  vocab_size_multiple: 8
@@ -18,26 +18,29 @@ n_sample: 0
18
 
19
  data:
20
  corpus_1:
21
- path_src: faen/train.cleaned.filtered.fa
22
- path_tgt: faen/train.cleaned.filtered.en
 
23
  weight: 2
24
  corpus_2:
25
- path_src: /home/mark/mt/data/newscrawl.backtrans.fa
26
- path_tgt: /home/mark/mt/data/newscrawl.2024.en
 
27
  weight: 1
28
  corpus_3:
29
- path_src: /home/mark/mt/data/madlad.backtrans.fa
30
- path_tgt: /home/mark/mt/data/madlad.en
 
31
  weight: 2
32
  valid:
33
- path_src: faen/dev.fa
34
- path_tgt: faen/dev.en
35
 
36
  transforms: [sentencepiece, filtertoolong]
37
  transforms_configs:
38
  sentencepiece:
39
- src_subword_model: "faen/fa.spm.model"
40
- tgt_subword_model: "faen/en.spm.model"
41
  filtertoolong:
42
  src_seq_length: 256
43
  tgt_seq_length: 256
@@ -55,7 +58,6 @@ training:
55
  gpu_ranks: [0]
56
 
57
  # Batching 120,000 tokens
58
- # For RTX 5090, 15000 batch size, accum_count 8
59
  batch_type: "tokens"
60
  batch_size: 6000
61
  valid_batch_size: 2048
@@ -66,7 +68,6 @@ training:
66
  # Optimizer & Compute
67
  compute_dtype: "fp16"
68
  optim: "adamw"
69
- #use_amp: True
70
  learning_rate: 3.0
71
  warmup_steps: 5000
72
  decay_method: "noam"
 
8
  tensorboard_log_dir: tensorboard
9
 
10
  ### Vocab
11
+ src_vocab: fa.eole.vocab
12
+ tgt_vocab: en.eole.vocab
13
  src_vocab_size: 32000
14
  tgt_vocab_size: 32000
15
  vocab_size_multiple: 8
 
18
 
19
  data:
20
  corpus_1:
21
+ path_src: hf://quickmt/quickmt-train.fa-en/fa
22
+ path_tgt: hf://quickmt/quickmt-train.fa-en/en
23
+ path_sco: hf://quickmt/quickmt-train.fa-en/sco
24
  weight: 2
25
  corpus_2:
26
+ path_src: hf://quickmt/newscrawl2024-en-backtranslated-fa/fa
27
+ path_tgt: hf://quickmt/newscrawl2024-en-backtranslated-fa/en
28
+ path_sco: hf://quickmt/newscrawl2024-en-backtranslated-fa/sco
29
  weight: 1
30
  corpus_3:
31
+ path_src: hf://quickmt/madlad400-en-backtranslated-fa/fa
32
+ path_tgt: hf://quickmt/madlad400-en-backtranslated-fa/en
33
+ path_sco: hf://quickmt/madlad400-en-backtranslated-fa/sco
34
  weight: 2
35
  valid:
36
+ path_src: dev.fa
37
+ path_tgt: dev.en
38
 
39
  transforms: [sentencepiece, filtertoolong]
40
  transforms_configs:
41
  sentencepiece:
42
+ src_subword_model: "fa.spm.model"
43
+ tgt_subword_model: "en.spm.model"
44
  filtertoolong:
45
  src_seq_length: 256
46
  tgt_seq_length: 256
 
58
  gpu_ranks: [0]
59
 
60
  # Batching 120,000 tokens
 
61
  batch_type: "tokens"
62
  batch_size: 6000
63
  valid_batch_size: 2048
 
68
  # Optimizer & Compute
69
  compute_dtype: "fp16"
70
  optim: "adamw"
 
71
  learning_rate: 3.0
72
  warmup_steps: 5000
73
  decay_method: "noam"