MTSmash committed on
Commit 1377f79 · verified · 1 Parent(s): bfb0e83

Upload 9 files

chat_template.jinja ADDED
@@ -0,0 +1,13 @@
+ {{ bos_token }}
+ {% if system %}<|System|>
+ {{ system }}
+ {% endif %}
+ {% for message in messages %}
+ {% if message.role == 'user' %}<|Benutzer|>
+ {{ message.content }}
+ {% elif message.role == 'assistant' %}<|Assistentin|>
+ {{ message.content }}
+ {% endif %}
+ {% endfor %}
+ {% if add_generation_prompt %}<|Assistentin|>
+ {% else %}</s>{% endif %}
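
For context, a minimal sketch of how this template could be rendered with plain Jinja2 outside of transformers; the token string, the system text and the example messages below are illustrative assumptions, not part of this commit.

from jinja2 import Template

# Render the uploaded chat template directly with Jinja2.
# bos_token, the system text and the messages are illustrative assumptions.
template = Template(open("chat_template.jinja", encoding="utf-8").read())
print(template.render(
    bos_token="<s>",
    system="Du bist eine hilfreiche Assistentin.",
    messages=[
        {"role": "user", "content": "Hallo!"},
        {"role": "assistant", "content": "Hallo, wie kann ich helfen?"},
    ],
    add_generation_prompt=True,
))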
config.json ADDED
@@ -0,0 +1,81 @@
+ {
+ "vocab_size": 35303,
+ "hidden_size": 1792,
+ "intermediate_size": 3584,
+ "num_hidden_layers": 18,
+ "num_attention_heads": 14,
+ "num_key_value_heads": 14,
+ "head_dim": 128,
+ "sliding_window": 4096,
+ "hidden_act": "silu",
+ "attention_dropout": 0.0,
+ "rms_norm_eps": 1e-05,
+ "initializer_range": 0.02,
+ "is_decoder": false,
+ "add_cross_attention": false,
+ "use_cache": false,
+ "attention_bias": true,
+ "num_local_experts": 1,
+ "num_experts_per_tok": 1,
+ "router_aux_loss_coef": 0.0,
+ "output_router_logits": false,
+ "layer_types": [
+ "sliding_attention",
+ "full_attention",
+ "sliding_attention",
+ "full_attention",
+ "sliding_attention",
+ "full_attention",
+ "sliding_attention",
+ "full_attention",
+ "sliding_attention",
+ "full_attention",
+ "sliding_attention",
+ "full_attention",
+ "sliding_attention",
+ "full_attention",
+ "sliding_attention",
+ "full_attention",
+ "sliding_attention",
+ "full_attention"
+ ],
+ "max_position_embeddings": 8192,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "pad_token_id": 3,
+ "return_dict": true,
+ "output_hidden_states": false,
+ "dtype": "bfloat16",
+ "tie_word_embeddings": false,
+ "chunk_size_feed_forward": 0,
+ "is_encoder_decoder": false,
+ "cross_attention_hidden_size": null,
+ "tie_encoder_decoder": false,
+ "architectures": [
+ "EvaGptForCausalLM"
+ ],
+ "finetuning_task": null,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1"
+ },
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1
+ },
+ "task_specific_params": null,
+ "problem_type": null,
+ "tokenizer_class": null,
+ "prefix": null,
+ "sep_token_id": null,
+ "decoder_start_token_id": null,
+ "_name_or_path": "./training_outputs/EvaGPT-German-0.7B_2026_7_2026-01-08_17-33-14_2026-01-08_19-10-49",
+ "transformers_version": "5.0.0.dev0",
+ "model_type": "eva_gpt",
+ "output_attentions": false,
+ "rope_scaling": {
+ "type": "yarn",
+ "factor": 2.0,
+ "original_max_position_embeddings": 4096
+ }
+ }
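
"eva_gpt" is a custom model_type, so loading with AutoModel would likely require the repository's custom modeling code (e.g. trust_remote_code=True). The sketch below only reads the JSON to inspect the architecture fields; the local file path is an assumption.

import json

# Inspect the uploaded config without loading the model itself.
with open("config.json", encoding="utf-8") as f:
    cfg = json.load(f)

print(cfg["architectures"])                                   # ['EvaGptForCausalLM']
print(cfg["hidden_size"], cfg["num_hidden_layers"], cfg["num_attention_heads"])
print(cfg["layer_types"][:4])                                 # alternating sliding/full attention
print(cfg["rope_scaling"])                                    # YaRN scaling: 4096 -> 8192 positions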
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "pad_token_id": 3,
+ "transformers_version": "5.0.0.dev0"
+ }
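
These defaults match the token ids in config.json (bos=1, eos=2, pad=3). A small, hedged example of picking them up via transformers; the local path is an assumption.

from transformers import GenerationConfig

# Reads generation_config.json from the current directory (assumed local checkout).
gen_cfg = GenerationConfig.from_pretrained(".")
print(gen_cfg.bos_token_id, gen_cfg.eos_token_id, gen_cfg.pad_token_id)  # 1 2 3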
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f27fd1030fcd576515fc7084bbdef8c97a855a685f12db604ebad0140f4bcfef
+ size 1410052204
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dddf9d35fc8790d29daa471f4721338f6136df1b7543a05457425f005ff581fe
+ size 753494
tokenizer_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+ "add_prefix_space": null,
+ "backend": "tokenizers",
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "cls_token": "</s>",
+ "eos_token": "</s>",
+ "is_local": true,
+ "model_max_length": 1000000000000000019884624838656,
+ "model_specific_special_tokens": {},
+ "pad_token": "<pad>",
+ "sep_token": "</s>",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false
+ }
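
Assuming the tokenizer.json / tokenizer.model files in this commit are consistent with the LlamaTokenizer class declared above, loading could look like the sketch below; the local path and the sample text are assumptions.

from transformers import AutoTokenizer

# tokenizer_class in tokenizer_config.json routes this to LlamaTokenizer(Fast).
tok = AutoTokenizer.from_pretrained(".")
print(tok.bos_token, tok.eos_token, tok.pad_token)  # <s> </s> <pad>
print(tok("Hallo Welt").input_ids)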
train_config.json ADDED
@@ -0,0 +1,40 @@
+ {
+ "model_dir": "./training_outputs/EvaGPT-German-0.7B_2026_7_2026-01-08_17-33-14",
+ "csv_path": "./datasets/output-100-3k.csv",
+ "device": "cuda",
+ "train_mode": "lora",
+ "learning_rate": 0.0002,
+ "lr_schedule": "cosine",
+ "per_device_train_batch_size": 1,
+ "gradient_accumulation_steps": 7,
+ "num_train_epochs": 1.0,
+ "max_steps": null,
+ "chunk_size": 4096,
+ "max_seq_length": 4096,
+ "template_mode": "dialogplus",
+ "column_name": "text",
+ "lora_r": 100,
+ "lora_alpha": 125,
+ "shuffle": true,
+ "sort_by_length": true,
+ "use_ngrams": false,
+ "ngram_max": 12,
+ "ngram_top_k": 1500,
+ "ngram_min_chars": 16,
+ "ngram_min_words": 2,
+ "ngram_max_samples": 4000,
+ "ngram_budgeted": true,
+ "ngram_target_fit": 0.98,
+ "ngram_eval_samples": 512,
+ "ngram_add_batch": 64,
+ "ngram_min_count": 2,
+ "ngram_max_token_chars": 384,
+ "ngram_max_tokens_per_text": 4096,
+ "precision_mode": "bf16",
+ "gradient_checkpointing": true,
+ "save_dir": "./training_outputs",
+ "merge_lora_on_save": true,
+ "dataloader_num_workers": 0,
+ "max_grad_norm": 1.0,
+ "weight_decay": 0.01
+ }
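
The training script behind this file is not part of the commit. Purely as an illustration, the LoRA hyperparameters above (lora_r=100, lora_alpha=125) could map onto a peft LoraConfig roughly as follows; target_modules and the dropout value are guesses, not something train_config.json specifies.

from peft import LoraConfig

# Hypothetical mapping of train_config.json's LoRA settings onto peft.
lora_cfg = LoraConfig(
    r=100,                  # lora_r
    lora_alpha=125,         # lora_alpha
    lora_dropout=0.0,       # not specified in train_config.json; assumed
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # assumed
    task_type="CAUSAL_LM",
)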
training.log ADDED
The diff for this file is too large to render. See raw diff