Add files using upload-large-folder tool
Browse files
- README.md +130 -0
- chat_template.jinja +50 -0
- checkpoint.pt +3 -0
- config.json +29 -0
- generation_config.json +8 -0
- model.safetensors +3 -0
- tokenizer.json +0 -0
- tokenizer_config.json +31 -0
README.md
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
library_name: transformers
|
| 3 |
+
tags:
|
| 4 |
+
- daisy
|
| 5 |
+
- causal-lm
|
| 6 |
+
- pretrained
|
| 7 |
+
license: apache-2.0
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
# DaisyCore — daisy_milli
|
| 11 |
+
|
| 12 |
+
## Model Description
|
| 13 |
+
|
| 14 |
+
DaisyCore is a transformer with 26 layers, 14 attention heads, and a model dimension of 1,792. It uses block-causal sliding-window attention (window size 2,048) with the standard attention implementation.
|
| 15 |
+
|
| 16 |
+
## Architecture
|
| 17 |
+
|
| 18 |
+
| Property | Value |
|
| 19 |
+
|:---|:---|
|
| 20 |
+
| Architecture | DaisyCore |
|
| 21 |
+
| Layers | 26 |
|
| 22 |
+
| Attention Heads | 14 |
|
| 23 |
+
| Model Dimension | 1,792 |
|
| 24 |
+
| Head Dimension | 128 |
|
| 25 |
+
| Sliding Window Size | 2,048 |
|
| 26 |
+
| Max Sequence Length | 131,072 |
|
| 27 |
+
| Vocabulary Size | 49,152 |
|
| 28 |
+
| Attention Implementation | standard |
|
| 29 |
+
| Value Embeddings | True |
|
| 30 |
+
| Tied Embeddings | False |
|
| 31 |
+
| Skip Mix Mode | linear |
|
| 32 |
+
| Tokenizer | `jonathanmiddleton/daisy` |
|
| 33 |
+
| Dtype | bfloat16 |
|
| 34 |
+
| Parameters (total) | 2,323,120,245 |
|
| 35 |
+
| Parameters (non-embedding) | 1,001,914,485 |
|
| 36 |
+
| Parameters (embedding) | 1,321,205,760 |
|
| 37 |
+
|
| 38 |
+
## Training Progress
|
| 39 |
+
|
| 40 |
+
| Metric | Value |
|
| 41 |
+
|:---|:---|
|
| 42 |
+
| Checkpoint Step | 52,959 |
|
| 43 |
+
| Tokens Processed | 143.26B (143,262,744,576) |
|
| 44 |
+
| Target Tokens | 300.00B (300,000,000,000; set by `resume_target_tokens_override`, which supersedes `target_tokens`) |
|
| 45 |
+
| Progress | 47.8% |
|
| 46 |
+
| Best Validation Loss | 2.07058 |
|
| 47 |
+
| Evaluations Performed | 912 |
|
| 48 |
+
| Saved | 2026-03-06 00:15 UTC |
|
| 49 |
+
|
| 50 |
+
## Training Configuration
|
| 51 |
+
|
| 52 |
+
### Optimizers
|
| 53 |
+
|
| 54 |
+
| Optimizer | Parameter Group | Learning Rate |
|
| 55 |
+
|:---|:---|:---|
|
| 56 |
+
| AdamW | head_params | 0.003216 |
|
| 57 |
+
| AdamW | embed_params | 0.1865 |
|
| 58 |
+
| AdamW | scalar_params | 0.02099 |
|
| 59 |
+
| Muon | hidden_matrix_params | 0.025 |
|
| 60 |
+
|
| 61 |
+
### Schedule & Regularization
|
| 62 |
+
|
| 63 |
+
| Parameter | Value |
|
| 64 |
+
|:---|:---|
|
| 65 |
+
| LR Scale | 1.0 |
|
| 66 |
+
| LR Schedule | n_phase_linear |
|
| 67 |
+
| LR Schedule — begin_after_fraction | 0.0 |
|
| 68 |
+
| LR Schedule — cooldown_fraction | 0.0 |
|
| 69 |
+
| LR Schedule — floor | 0.0 |
|
| 70 |
+
| LR Schedule — phases | [{'progress': 0.0, 'scale': 1.0}, {'progress': 0.36117676, 'scale': 0.20527}, {'progress': 1.0, 'scale': 0.1}] |
|
| 71 |
+
| LR Schedule — warmup_fraction | 0.0 |
|
| 72 |
+
| Gradient Accumulation Steps | 5 |
|
| 73 |
+
| Muon Warmup Steps | 300 |
|
| 74 |
+
| Seed | 1337 |
|
| 75 |
+
|
| 76 |
+
### Training Data
|
| 77 |
+
|
| 78 |
+
| Type | Sequence Length | Path |
|
| 79 |
+
|:---|:---|:---|
|
| 80 |
+
| fineweb-edu-dedup | 16,384 | `data/fineweb-edu-dedup/fineweb-edu-dedup_jonathanmiddleton_daisy_train_*.bin[000600:005000]` |
|
| 81 |
+
|
| 82 |
+
### Checkpoint Provenance
|
| 83 |
+
|
| 84 |
+
- **Resumed from**: `JonathanMiddleton/daisy-milli-base-v18d.b`
|
| 85 |
+
|
| 86 |
+
## All Hyperparameters
|
| 87 |
+
|
| 88 |
+
| Parameter | Value |
|
| 89 |
+
|:---|:---|
|
| 90 |
+
| window_size | 2048 |
|
| 91 |
+
| vocab_size | 49152 |
|
| 92 |
+
| eos_token_id | 49131 |
|
| 93 |
+
| num_layers | 26 |
|
| 94 |
+
| num_heads | 14 |
|
| 95 |
+
| model_dim | 1792 |
|
| 96 |
+
| head_dim | 128 |
|
| 97 |
+
| max_seq_len | 131072 |
|
| 98 |
+
| model_spec | daisy_milli |
|
| 99 |
+
| model_class | models.daisy.daisy_core.DaisyCore |
|
| 100 |
+
| target_tokens | 100000000000 |
|
| 101 |
+
| full_window_target_tokens | 3000000000 |
|
| 102 |
+
| torch_coordinate_descent_tuning | False |
|
| 103 |
+
| torch_inductor_config_max_autotune | False |
|
| 104 |
+
| overfit | False |
|
| 105 |
+
| full_windows | False |
|
| 106 |
+
| wandb_log | True |
|
| 107 |
+
| wandb_project | milli |
|
| 108 |
+
| wandb_run_name | milli_v18d.d |
|
| 109 |
+
| wandb_group | pretrain |
|
| 110 |
+
| resume_checkpoint | JonathanMiddleton/daisy-milli-base-v18d.b |
|
| 111 |
+
| resume_target_tokens_override | 300000000000 |
|
| 112 |
+
| use_value_embeddings | True |
|
| 113 |
+
| use_tied_embeddings | False |
|
| 114 |
+
| seed | 1337 |
|
| 115 |
+
| task_val_debug_log_samples | False |
|
| 116 |
+
| log_interval | 16384 |
|
| 117 |
+
| muon_warmup_steps | 300 |
|
| 118 |
+
| lr_scale | 1.0 |
|
| 119 |
+
| cooldown_fraction | 0.0 |
|
| 120 |
+
| lr_schedule | {"name": "n_phase_linear", "config": {"cooldown_fraction": 0.0, "phases": [{"progress": 0.0, "scale": 1.0}, {"progress": 0.36117676, "scale": 0.20527}, {"progress": 1.0, "scale": 0.1}], "floor": 0.0, "warmup_fraction": 0.0, "begin_after_fraction": 0.0}} |
|
| 121 |
+
| grad_acc_steps | 5 |
|
| 122 |
+
| val_loss_every_tokens | 245760000 |
|
| 123 |
+
| checkpoint_warmup_tokens | 1 |
|
| 124 |
+
| checkpoint_per_n_tokens | 245760000 |
|
| 125 |
+
| save_checkpoint | True |
|
| 126 |
+
| benchmarks_frequency | 2 |
|
| 127 |
+
| mmlu_cache_bin_path | data/mmlu_cache/mmlu_cache.bin |
|
| 128 |
+
| mmlu_cache_bin_rebuild | False |
|
| 129 |
+
| task_training | False |
|
| 130 |
+
| track_last_n_layers | 0 |
|
chat_template.jinja
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{#- Daisy Chat Template v2 -#}
|
| 2 |
+
{#- Supports: ChatML format, tool calling, multipart content -#}
|
| 3 |
+
|
| 4 |
+
{#- Macro to render content (string or multipart) -#}
|
| 5 |
+
{%- macro render_content(content) -%}
|
| 6 |
+
{%- if content is string -%}
|
| 7 |
+
{{ content }}
|
| 8 |
+
{%- elif content is iterable -%}
|
| 9 |
+
{%- for part in content -%}
|
| 10 |
+
{%- if part.type == 'text' -%}
|
| 11 |
+
{{ part.text }}
|
| 12 |
+
{%- elif part.type == 'tool_call' -%}
|
| 13 |
+
<|tool_call|>{{ part.text }}<|/tool_call|>
|
| 14 |
+
{%- elif part.type == 'tool_result' -%}
|
| 15 |
+
<|tool_result|>{{ part.text }}<|/tool_result|>
|
| 16 |
+
{%- elif part.type == 'python' -%}
|
| 17 |
+
<|python|>{{ part.text }}<|/python|>
|
| 18 |
+
{%- elif part.type == 'output' -%}
|
| 19 |
+
<|output|>{{ part.text }}<|/output|>
|
| 20 |
+
{%- elif part.type == 'think' -%}
|
| 21 |
+
<|think|>{{ part.text }}<|/think|>
|
| 22 |
+
{%- endif -%}
|
| 23 |
+
{%- endfor -%}
|
| 24 |
+
{%- else -%}
|
| 25 |
+
{{ content }}
|
| 26 |
+
{%- endif -%}
|
| 27 |
+
{%- endmacro -%}
|
| 28 |
+
|
| 29 |
+
{#- Main message loop -#}
|
| 30 |
+
{%- for message in messages -%}
|
| 31 |
+
{%- if message.role == 'system' -%}
|
| 32 |
+
<|im_start|>system
|
| 33 |
+
{{ message.content }}<|im_end|>
|
| 34 |
+
{% elif message.role == 'user' -%}
|
| 35 |
+
<|im_start|>user
|
| 36 |
+
{{ message.content }}<|im_end|>
|
| 37 |
+
{% elif message.role == 'assistant' -%}
|
| 38 |
+
<|im_start|>assistant
|
| 39 |
+
{% generation %}{{ render_content(message.content) }}<|im_end|>{% endgeneration %}
|
| 40 |
+
{% elif message.role == 'tool' -%}
|
| 41 |
+
<|tool_result|>{{ message.content }}<|/tool_result|>
|
| 42 |
+
{%- endif -%}
|
| 43 |
+
{%- endfor -%}
|
| 44 |
+
|
| 45 |
+
{#- Generation prompt -#}
|
| 46 |
+
{%- if add_generation_prompt -%}
|
| 47 |
+
<|im_start|>assistant
|
| 48 |
+
{% generation %}{% endgeneration %}
|
| 49 |
+
{%- endif -%}
|
| 50 |
+
|
checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ae09dbaf90f145ef447f694d79818bc06c458b6fe0f32d5fd81dbe28e31f0c3d
|
| 3 |
+
size 16471242855
|
config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"DaisyForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attn_all_layers": true,
|
| 6 |
+
"attn_impl": "standard",
|
| 7 |
+
"bos_token_id": 49131,
|
| 8 |
+
"dtype": "float32",
|
| 9 |
+
"eos_token_id": 49131,
|
| 10 |
+
"eot_token_id": 49134,
|
| 11 |
+
"head_dim": 128,
|
| 12 |
+
"hidden_size": 1792,
|
| 13 |
+
"max_position_embeddings": 131072,
|
| 14 |
+
"model_dim": 1792,
|
| 15 |
+
"model_type": "daisy",
|
| 16 |
+
"num_attention_heads": 14,
|
| 17 |
+
"num_heads": 14,
|
| 18 |
+
"num_hidden_layers": 26,
|
| 19 |
+
"num_key_value_heads": 14,
|
| 20 |
+
"num_layers": 26,
|
| 21 |
+
"padded_embeddings": false,
|
| 22 |
+
"skip_mix_mode": "linear",
|
| 23 |
+
"tokenizer_name": "jonathanmiddleton/daisy",
|
| 24 |
+
"transformers_version": "5.3.0",
|
| 25 |
+
"use_tied_embeddings": false,
|
| 26 |
+
"use_value_embeddings": true,
|
| 27 |
+
"vocab_size": 49152,
|
| 28 |
+
"window_size": 2048
|
| 29 |
+
}
|
generation_config.json
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 49131,
|
| 4 |
+
"eos_token_id": 49131,
|
| 5 |
+
"output_attentions": false,
|
| 6 |
+
"output_hidden_states": false,
|
| 7 |
+
"transformers_version": "5.3.0"
|
| 8 |
+
}
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c296f22800259f9c4ed9bca652b00ec2653930886a8d527a78231c90c38568eb
|
| 3 |
+
size 4822418412
|
tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"bos_token": "<|endoftext|>",
|
| 5 |
+
"clean_up_tokenization_spaces": false,
|
| 6 |
+
"eos_token": "<|endoftext|>",
|
| 7 |
+
"extra_special_tokens": [
|
| 8 |
+
"<|tool_call|>",
|
| 9 |
+
"<|/tool_call|>",
|
| 10 |
+
"<|tool_result|>",
|
| 11 |
+
"<|/tool_result|>",
|
| 12 |
+
"<|python|>",
|
| 13 |
+
"<|/python|>",
|
| 14 |
+
"<|output|>",
|
| 15 |
+
"<|/output|>",
|
| 16 |
+
"<|think|>",
|
| 17 |
+
"<|/think|>",
|
| 18 |
+
"<|system|>",
|
| 19 |
+
"<|user|>",
|
| 20 |
+
"<|assistant|>",
|
| 21 |
+
"<|reserved_0|>",
|
| 22 |
+
"<|reserved_1|>",
|
| 23 |
+
"<|reserved_2|>",
|
| 24 |
+
"<|reserved_3|>"
|
| 25 |
+
],
|
| 26 |
+
"is_local": false,
|
| 27 |
+
"model_max_length": 131072,
|
| 28 |
+
"pad_token": "<|pad|>",
|
| 29 |
+
"tokenizer_class": "TokenizersBackend",
|
| 30 |
+
"unk_token": null
|
| 31 |
+
}
|