nicholasKluge committed (verified)
Commit e7d1b07 · 1 parent: 0759a45

Upload training_config_sft.yaml with huggingface_hub

Files changed (1)
  1. training_config_sft.yaml +110 -0
training_config_sft.yaml ADDED
@@ -0,0 +1,110 @@
+ # Directory settings
+ checkpoint_dir: "/polyglot/portuguese/checkpoints/models/Tucano2-qwen-0.5B-Instruct-SFT"
+ train_dataset_dir:
+ # Total: ~874 million tokens (x5 epochs)
+ # Coding: ~2.3 million tokens
+ - /polyglot/portuguese/gigaverbo-v2-sft/code
+ # Function Calling: ~17.5 million tokens
+ - /polyglot/portuguese/gigaverbo-v2-sft/function_call
+ # General Instruction Following: ~700 million tokens
+ - /polyglot/portuguese/gigaverbo-v2-sft/general
+ # Math and CoT: ~27 million tokens
+ - /polyglot/portuguese/gigaverbo-v2-sft/math_cot
+ # Retrieval Augmented Generation: ~2.2 million tokens
+ - /polyglot/portuguese/gigaverbo-v2-sft/retrieval
+ # Structured Outputs: ~35 million tokens
+ - /polyglot/portuguese/gigaverbo-v2-sft/structured
+ # Summarization: ~290 thousand tokens
+ - /polyglot/portuguese/gigaverbo-v2-sft/summarization
+ # Translation: ~5.7 million tokens
+ - /polyglot/portuguese/gigaverbo-v2-sft/translation
+ # Chosen Data from Preference Dataset: ~14 million tokens
+ - /polyglot/portuguese/gigaverbo-v2-sft/dpo
+ val_dataset_dir: null
+ dataset_type: "jsonl"
+ cache_dir: "/lustre/mlnvme/data/polyglot/.cache"
+
+ # Data loading settings
+ pin_memory: true
+ num_workers_for_dataloader: 16
+ shuffle_dataset: true
+ mask_eos_token: false
+ mask_pad_token: true
+
+ # Model architecture settings
+ vocab_size: 49152
+ num_hidden_layers: 28
+ num_attention_heads: 16
+ num_key_value_heads: 8
+ head_dim: 128
+ hidden_size: 1024
+ intermediate_size: 3072
+ max_position_embeddings: 4096
+ tie_word_embeddings: true
+ hidden_act: "silu"
+ output_hidden_states: false
+ attn_implementation: "flash_attention_2"
+ use_cache: false
+ no_rope_layer_interval: null
+ rope_theta: 1000000.0
+ rope_scale_factor: null
+ rms_norm_eps: 0.000001
+
+ # Training settings
+ total_batch_size: 524288
+ micro_batch_size: 4
+ gradient_accumulation_steps: 4
+ eval_micro_batch_size: null
+ num_train_epochs: 5
+ warmup_ratio: 0.1
+ max_learning_rate: 0.000085
+ min_learning_rate: 0.0
+ muon_learning_rate: null
+ weight_decay: 0.0
+ beta1: 0.9
+ beta2: 0.95
+ eps: 0.00000001
+ lr_decay_type: "cosine"
+ use_sqrt: false
+ lr_decay_iters_coef: 1.
+ seed: 42
+ max_steps: 68635
+ max_grad_norm: 1.0
+
+ # SFT settings
+ packing: false
+ assistant_only_loss: true
+
+ # Precision and optimization settings
+ torch_compile: false
+ mat_mul_precision: "highest"
+ tf32: true
+ bf16: true
+ gradient_checkpointing: false
+ use_liger_kernel: true
+ static_graph: false
+
+ # Hub settings
+ push_to_hub: false
+ hub_token: null
+ hub_model_id: null
+
+ # Tokenizer and Reference model
+ tokenizer_name_or_path: "Polygl0t/Tucano2-qwen-0.5B-Base"
+ chat_template_path: null
+ reference_model: "Polygl0t/Tucano2-qwen-0.5B-Base"
+ continual_pretraining: true
+
+ # Checkpoint settings
+ resume_from_checkpoint: null
+ checkpointing_steps: 1000
+ begin_new_stage: true
+ stage_name: "single_cosine"
+
+ # Miscellaneous settings
+ sanity_check: false
+ sanity_check_num_samples: 100000
+ wandb_token: null
+ wandb_id: "tucano2-qwen-0.5b-instruct-sft"
+ wandb_project: "Polyglot"
+ wandb_desc: "Developing LLMs for low-resource languages"
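
For orientation, a minimal sketch of how a config like this could be consumed. The repository's actual training script is not part of this commit, so this only loads the YAML with PyYAML and checks the batch-size arithmetic, assuming that total_batch_size is counted in tokens and that sequences are filled to max_position_embeddings; the derived "implied world size" is an inference from those assumptions, not a stated fact.

# sketch.py — illustrative only; field names are taken from training_config_sft.yaml above
import yaml

with open("training_config_sft.yaml") as f:
    cfg = yaml.safe_load(f)

# Tokens processed in one micro-batch on one rank (assumes full-length sequences).
tokens_per_micro_batch = cfg["micro_batch_size"] * cfg["max_position_embeddings"]  # 4 * 4096 = 16384

# Tokens accumulated on one rank before each optimizer step.
tokens_per_rank_per_step = tokens_per_micro_batch * cfg["gradient_accumulation_steps"]  # 16384 * 4 = 65536

# If total_batch_size is the global token budget per optimizer step, the number of
# data-parallel ranks this config implies would be:
implied_world_size = cfg["total_batch_size"] // tokens_per_rank_per_step  # 524288 // 65536 = 8

print(f"tokens per optimizer step per rank: {tokens_per_rank_per_step}")
print(f"implied data-parallel ranks: {implied_world_size}")

Under these assumptions the config describes roughly half a million tokens per optimizer step, with 68,635 optimizer steps covering the ~874 million training tokens for 5 epochs.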