Nitzanbanin committed on
Commit
b2bcfa3
·
verified ·
1 Parent(s): 93f7604

Save tokenizer and model configuration files (re-attempt)

Browse files
Files changed (3) hide show
  1. schema.yaml +119 -7
  2. tokenizer_config.json +139 -0
  3. vocab.json +5 -0
schema.yaml CHANGED
@@ -1,7 +1,119 @@
1
- tokenizer:
2
- type: factor
3
- factors: [prefix, root, pattern, suffix, morph_tags]
4
- model:
5
- type: transformer
6
- hidden_size: 512
7
- layers: 8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model_schema:
3
+ metadata:
4
+ name: IvriNet
5
+ version: 0.2
6
+ author: "נִצן בַנין"
7
+ description: >
8
+ Hebrew-first language model built from scratch,
9
+ using factor-based tokenization (prefix-root-pattern-suffix),
10
+ with formal morphological rules encoded in the tokenizer.
11
+ license: CC-BY-SA 4.0
12
+
13
+ tokenization:
14
+ type: factor_based
15
+ factors:
16
+ prefix:
17
+ description: תחיליות / clitics
18
+ examples: [ו-, ב-, כ-, ל-, מ-, ה-, ש-]
19
+ rules:
20
+ - attach_to_root_if_first_token
21
+ - separate_if_multiword_expression
22
+ root:
23
+ description: שורש תלת/ארבעי
24
+ examples: [כ-ת-ב, א-ה-ב, ל-מ-ד]
25
+ rules:
26
+ - detect_consonantal_pattern
27
+ - normalize_alef_variants
28
+ - mark_rare_roots_as_backoff
29
+ pattern:
30
+ description: בניין / משקל / template
31
+ examples: [פָּעַל, הפעיל, התפעל, מקטלה, קטילה]
32
+ rules:
33
+ - infer_from_vowels_or_consonantal_positions
34
+ - map_irregular_patterns_to_standard
35
+ suffix:
36
+ description: סופיות / inflections
37
+ examples: [-י, -ך, -נו, -כם, -יהם]
38
+ rules:
39
+ - map_to_person_number_gender
40
+ - separate_clitics_from_root_if_multiword
41
+ morph_tags:
42
+ description: דקדוק / grammatical features
43
+ examples:
44
+ - gender: ז, נ
45
+ - number: יחיד, רבים
46
+ - person: 1,2,3
47
+ - tense: עבר, הווה, עתיד
48
+ - definiteness: כן, לא
49
+ - smikhut: כן, לא
50
+ rules:
51
+ - assign_to_suffix_or_root_as_appropriate
52
+ backoff:
53
+ description: fallback for unknown or foreign words
54
+ type: byte_level
55
+ examples: [ASCII, Unicode rare chars]
56
+ rules:
57
+ - segment_unknown_words_to_bytes
58
+
59
+ exceptions:
60
+ proper_names:
61
+ handling: keep_as_single_token
62
+ foreign_terms:
63
+ handling: transliteration_or_single_token
64
+ emoticons_and_emoji:
65
+ handling: single_token_backoff
66
+
67
+ model:
68
+ architecture: transformer_decoder
69
+ parameters: 3e9
70
+ layers: 36
71
+ heads: 24
72
+ hidden_size: 4096
73
+ embedding_size: 1024
74
+ dropout: 0.1
75
+ attention_type: RoPE
76
+ context_window: 16384
77
+ factor_embedding_sharing: true
78
+ factor_types: [prefix, root, pattern, suffix, morph_tags, backoff]
79
+
80
+ training:
81
+ corpus:
82
+ size: 50000000000 # Changed 50GB to a numerical value (50 billion)
83
+ sources:
84
+ literary: 30%
85
+ news: 25%
86
+ spoken_transcripts: 20%
87
+ academic: 15%
88
+ mixed_other: 10%
89
+ preprocessing:
90
+ - normalize_unicode
91
+ - remove_html
92
+ - morphological_segmentation
93
+ - clitic_and_affix_detection
94
+ optimizer:
95
+ type: AdamW
96
+ learning_rate: 2e-4
97
+ weight_decay: 0.01
98
+ schedule:
99
+ warmup_steps: 2000
100
+ decay: cosine
101
+ objectives:
102
+ - MaskedRootPrediction
103
+ - TemplateCompletion
104
+ - AffixDenoising
105
+ - DiacriticsImputation
106
+ - SmikhutAgreement
107
+
108
+ multilingual_expansion:
109
+ target_languages: [english, arabic, russian]
110
+ approach:
111
+ - map_factor_layers_across_languages
112
+ - share_embeddings_for_common_factors
113
+ - fine_tune_jointly
114
+ - freeze_hebrew_core_for_stable_base
115
+
116
+ notes:
117
+ - Hebrew-first approach allows more context-awareness and fewer tokens per sentence
118
+ - Factor-based tokenizer encodes inherent linguistic rules
119
+ - Model is designed to be extensible to morphologically rich languages
tokenizer_config.json ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "[UNK]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": true,
13
+ "special": false
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "CLS",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": true,
22
+ "special": false
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "SEP",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": true,
31
+ "special": false
32
+ }
33
+ ],
34
+ "normalizer": {
35
+ "type": "Sequence",
36
+ "normalizers": [
37
+ {
38
+ "type": "NFD"
39
+ },
40
+ {
41
+ "type": "StripAccents"
42
+ },
43
+ {
44
+ "type": "Lowercase"
45
+ }
46
+ ]
47
+ },
48
+ "pre_tokenizer": {
49
+ "type": "Whitespace"
50
+ },
51
+ "post_processor": {
52
+ "type": "TemplateProcessing",
53
+ "single": [
54
+ {
55
+ "SpecialToken": {
56
+ "id": "CLS",
57
+ "type_id": 0
58
+ }
59
+ },
60
+ {
61
+ "Sequence": {
62
+ "id": "A",
63
+ "type_id": 0
64
+ }
65
+ },
66
+ {
67
+ "SpecialToken": {
68
+ "id": "SEP",
69
+ "type_id": 0
70
+ }
71
+ }
72
+ ],
73
+ "pair": [
74
+ {
75
+ "SpecialToken": {
76
+ "id": "CLS",
77
+ "type_id": 0
78
+ }
79
+ },
80
+ {
81
+ "Sequence": {
82
+ "id": "A",
83
+ "type_id": 0
84
+ }
85
+ },
86
+ {
87
+ "SpecialToken": {
88
+ "id": "SEP",
89
+ "type_id": 0
90
+ }
91
+ },
92
+ {
93
+ "Sequence": {
94
+ "id": "B",
95
+ "type_id": 0
96
+ }
97
+ },
98
+ {
99
+ "SpecialToken": {
100
+ "id": "SEP",
101
+ "type_id": 0
102
+ }
103
+ }
104
+ ],
105
+ "special_tokens": {
106
+ "CLS": {
107
+ "id": "CLS",
108
+ "ids": [
109
+ 1
110
+ ],
111
+ "tokens": [
112
+ "CLS"
113
+ ]
114
+ },
115
+ "SEP": {
116
+ "id": "SEP",
117
+ "ids": [
118
+ 2
119
+ ],
120
+ "tokens": [
121
+ "SEP"
122
+ ]
123
+ }
124
+ }
125
+ },
126
+ "decoder": null,
127
+ "model": {
128
+ "type": "BPE",
129
+ "dropout": null,
130
+ "unk_token": "[UNK]",
131
+ "continuing_subword_prefix": null,
132
+ "end_of_word_suffix": null,
133
+ "fuse_unk": false,
134
+ "byte_fallback": false,
135
+ "ignore_merges": false,
136
+ "vocab": {},
137
+ "merges": []
138
+ }
139
+ }
vocab.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "SEP": 2,
3
+ "CLS": 1,
4
+ "[UNK]": 0
5
+ }