Nitzanbanin committed on
Commit
b2bcfa3
·
verified ·
1 Parent(s): 93f7604

Save tokenizer and model configuration files (re-attempt)

Browse files
Files changed (3) hide show
  1. schema.yaml +119 -7
  2. tokenizer_config.json +139 -0
  3. vocab.json +5 -0
schema.yaml CHANGED
@@ -1,7 +1,119 @@
1
- tokenizer:
2
- type: factor
3
- factors: [prefix, root, pattern, suffix, morph_tags]
4
- model:
5
- type: transformer
6
- hidden_size: 512
7
- layers: 8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model_schema:
3
+ metadata:
4
+ name: IvriNet
5
+ version: 0.2
6
+ author: "נִצן בַנין"
7
+ description: >
8
+ Hebrew-first language model built from scratch,
9
+ using factor-based tokenization (prefix-root-pattern-suffix),
10
+ with formal morphological rules encoded in the tokenizer.
11
+ license: CC-BY-SA 4.0
12
+
13
+ tokenization:
14
+ type: factor_based
15
+ factors:
16
+ prefix:
17
+ description: תחיליות / clitics
18
+ examples: [ו-, ב-, כ-, ל-, מ-, ה-, ש-]
19
+ rules:
20
+ - attach_to_root_if_first_token
21
+ - separate_if_multiword_expression
22
+ root:
23
+ description: שורש תלת/ארבעי
24
+ examples: [כ-ת-ב, א-ה-ב, ל-מ-ד]
25
+ rules:
26
+ - detect_consonantal_pattern
27
+ - normalize_alef_variants
28
+ - mark_rare_roots_as_backoff
29
+ pattern:
30
+ description: בניין / משקל / template
31
+ examples: [פָּעַל, הפעיל, התפעל, מקטלה, קטילה]
32
+ rules:
33
+ - infer_from_vowels_or_consonantal_positions
34
+ - map_irregular_patterns_to_standard
35
+ suffix:
36
+ description: סופיות / inflections
37
+ examples: [-י, -ך, -נו, -כם, -יהם]
38
+ rules:
39
+ - map_to_person_number_gender
40
+ - separate_clitics_from_root_if_multiword
41
+ morph_tags:
42
+ description: דקדוק / grammatical features
43
+ examples:
44
+ - gender: ז, נ
45
+ - number: יחיד, רבים
46
+ - person: 1,2,3
47
+ - tense: עבר, הווה, עתיד
48
+ - definiteness: כן, לא
49
+ - smikhut: כן, לא
50
+ rules:
51
+ - assign_to_suffix_or_root_as_appropriate
52
+ backoff:
53
+ description: fallback for unknown or foreign words
54
+ type: byte_level
55
+ examples: [ASCII, Unicode rare chars]
56
+ rules:
57
+ - segment_unknown_words_to_bytes
58
+
59
+ exceptions:
60
+ proper_names:
61
+ handling: keep_as_single_token
62
+ foreign_terms:
63
+ handling: transliteration_or_single_token
64
+ emoticons_and_emoji:
65
+ handling: single_token_backoff
66
+
67
+ model:
68
+ architecture: transformer_decoder
69
+ parameters: 3e9
70
+ layers: 36
71
+ heads: 24
72
+ hidden_size: 4096
73
+ embedding_size: 1024
74
+ dropout: 0.1
75
+ attention_type: RoPE
76
+ context_window: 16384
77
+ factor_embedding_sharing: true
78
+ factor_types: [prefix, root, pattern, suffix, morph_tags, backoff]
79
+
80
+ training:
81
+ corpus:
82
+ size: 50000000000 # Changed 50GB to a numerical value (50 billion)
83
+ sources:
84
+ literary: 30%
85
+ news: 25%
86
+ spoken_transcripts: 20%
87
+ academic: 15%
88
+ mixed_other: 10%
89
+ preprocessing:
90
+ - normalize_unicode
91
+ - remove_html
92
+ - morphological_segmentation
93
+ - clitic_and_affix_detection
94
+ optimizer:
95
+ type: AdamW
96
+ learning_rate: 2e-4
97
+ weight_decay: 0.01
98
+ schedule:
99
+ warmup_steps: 2000
100
+ decay: cosine
101
+ objectives:
102
+ - MaskedRootPrediction
103
+ - TemplateCompletion
104
+ - AffixDenoising
105
+ - DiacriticsImputation
106
+ - SmikhutAgreement
107
+
108
+ multilingual_expansion:
109
+ target_languages: [english, arabic, russian]
110
+ approach:
111
+ - map_factor_layers_across_languages
112
+ - share_embeddings_for_common_factors
113
+ - fine_tune_jointly
114
+ - freeze_hebrew_core_for_stable_base
115
+
116
+ notes:
117
+ - Hebrew-first approach allows more context-awareness and fewer tokens per sentence
118
+ - Factor-based tokenizer encodes inherent linguistic rules
119
+ - Model is designed to be extensible to morphologically rich languages
tokenizer_config.json ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "[UNK]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": true,
13
+ "special": false
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "CLS",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": true,
22
+ "special": false
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "SEP",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": true,
31
+ "special": false
32
+ }
33
+ ],
34
+ "normalizer": {
35
+ "type": "Sequence",
36
+ "normalizers": [
37
+ {
38
+ "type": "NFD"
39
+ },
40
+ {
41
+ "type": "StripAccents"
42
+ },
43
+ {
44
+ "type": "Lowercase"
45
+ }
46
+ ]
47
+ },
48
+ "pre_tokenizer": {
49
+ "type": "Whitespace"
50
+ },
51
+ "post_processor": {
52
+ "type": "TemplateProcessing",
53
+ "single": [
54
+ {
55
+ "SpecialToken": {
56
+ "id": "CLS",
57
+ "type_id": 0
58
+ }
59
+ },
60
+ {
61
+ "Sequence": {
62
+ "id": "A",
63
+ "type_id": 0
64
+ }
65
+ },
66
+ {
67
+ "SpecialToken": {
68
+ "id": "SEP",
69
+ "type_id": 0
70
+ }
71
+ }
72
+ ],
73
+ "pair": [
74
+ {
75
+ "SpecialToken": {
76
+ "id": "CLS",
77
+ "type_id": 0
78
+ }
79
+ },
80
+ {
81
+ "Sequence": {
82
+ "id": "A",
83
+ "type_id": 0
84
+ }
85
+ },
86
+ {
87
+ "SpecialToken": {
88
+ "id": "SEP",
89
+ "type_id": 0
90
+ }
91
+ },
92
+ {
93
+ "Sequence": {
94
+ "id": "B",
95
+ "type_id": 0
96
+ }
97
+ },
98
+ {
99
+ "SpecialToken": {
100
+ "id": "SEP",
101
+ "type_id": 0
102
+ }
103
+ }
104
+ ],
105
+ "special_tokens": {
106
+ "CLS": {
107
+ "id": "CLS",
108
+ "ids": [
109
+ 1
110
+ ],
111
+ "tokens": [
112
+ "CLS"
113
+ ]
114
+ },
115
+ "SEP": {
116
+ "id": "SEP",
117
+ "ids": [
118
+ 2
119
+ ],
120
+ "tokens": [
121
+ "SEP"
122
+ ]
123
+ }
124
+ }
125
+ },
126
+ "decoder": null,
127
+ "model": {
128
+ "type": "BPE",
129
+ "dropout": null,
130
+ "unk_token": "[UNK]",
131
+ "continuing_subword_prefix": null,
132
+ "end_of_word_suffix": null,
133
+ "fuse_unk": false,
134
+ "byte_fallback": false,
135
+ "ignore_merges": false,
136
+ "vocab": {},
137
+ "merges": []
138
+ }
139
+ }
vocab.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "SEP": 2,
3
+ "CLS": 1,
4
+ "[UNK]": 0
5
+ }