Upload folder using huggingface_hub

Browse files

Files changed (5) hide show

.gitattributes +1 -0
README.md +85 -0
config.json +98 -0
easydel-model.parameters +3 -0
generation_config.json +9 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+easydel-model.parameters filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,85 @@

+# BaseTrainer
+## 🚀 Trained With [EasyDeL](https://github.com/erfanzar/EasyDeL)
+EasyDeL is an open-source framework designed to enhance and streamline the training process of machine learning
+models. With a primary focus on JAX, EasyDeL aims to provide convenient and effective solutions for
+both training and serving Flax/JAX models on TPU/GPU.
+## 📦 Installation & Usage
+```python
+from easydel import AutoEasyDeLModelForCausalLM
+from jax import numpy as jnp, lax
+model = AutoEasyDeLModelForCausalLM.from_pretrained(
+    "REPO_ID/BaseTrainer",
+    dtype=...,
+    param_dtype=...,
+    precision=lax.Precision("fastest"),
+    auto_shard_model=True,
+)
+```
+## 🔧 Training Configuration
+### Model Details
+- **Architecture**: qwen2
+- **Platform**: TPU
+- **Number of Devices**: 16
+### Training Parameters
+- **Learning Rate**: 5e-05 → 5e-06
+- **Optimizer**: adamw
+- **Scheduler**: cosine
+- **Warmup Steps**: 160
+- **Weight Decay**: 0.02
+- **Loss Config**: LossConfig(
+    ignore_index: -100
+    label_smoothing: 0.0
+    z_loss: 0.0
+    loss_normalizing_factor: 'NUM_REAL_TARGET_TOKENS'
+    num_labels: None
+    problem_type: None
+    divide_weight_sum: False
+    shift_tokens: True
+    break_on_nan: True
+    reduction: None
+    num_classification_labels: None
+    classification_problem_type: None
+)
+### Training Setup
+- **Epochs**: 5
+- **Batch Size**: 16
+- **Sequence Length**: 4096
+- **Dtype**: `<class 'jax.numpy.bfloat16'>`
+- **Params Dtype**: `<class 'jax.numpy.bfloat16'>`
+### Advanced Configuration
+- **Gradient Checkpointing**: *(not set — empty string in `config.json`)*
+- **Gradient Accumulation Steps**: 1
+- **Max Training Steps**: None
+- **Max Evaluation Steps**: None
+- **Training Duration**: 7H
+### Sharding Configuration
+```python
+# Partition Rules
+( ('model/embed_tokens/embedding', PartitionSpec('tp', ('fsdp', 'sp'))),
+  ( 'self_attn/(q_proj|k_proj|v_proj)/kernel',
+    PartitionSpec(('fsdp', 'sp'), 'tp')),
+  ('self_attn/o_proj/kernel', PartitionSpec('tp', ('fsdp', 'sp'))),
+  ('mlp/gate_proj/kernel', PartitionSpec(('fsdp', 'sp'), 'tp')),
+  ('mlp/down_proj/kernel', PartitionSpec('tp', ('fsdp', 'sp'))),
+  ('mlp/up_proj/kernel', PartitionSpec(('fsdp', 'sp'), 'tp')),
+  ('input_layernorm/kernel', PartitionSpec(None,)),
+  ('post_attention_layernorm/kernel', PartitionSpec(None,)),
+  ('model/norm/kernel', PartitionSpec(None,)),
+  ('lm_head/kernel', PartitionSpec(('fsdp', 'sp'), 'tp')),
+  ('.*', PartitionSpec(None,)))
+```
+---
+*Generated with EasyDeL v0.1.2*

config.json ADDED Viewed

	@@ -0,0 +1,98 @@

+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "attn_mechanism": "flash_attn2",
+  "axis_dims": [
+    1,
+    -1,
+    1,
+    1
+  ],
+  "axis_names": [
+    "dp",
+    "fsdp",
+    "tp",
+    "sp"
+  ],
+  "backend": null,
+  "bits": null,
+  "blocksize_b": 1,
+  "blocksize_k": 128,
+  "blocksize_q": 128,
+  "bos_token_id": 151643,
+  "dcn_axis_dims": null,
+  "easy_method": "train",
+  "embd_pdrop": 0.0,
+  "eos_token_id": 151643,
+  "fcm_max_ratio": 0.0,
+  "fcm_min_ratio": 0.0,
+  "flash_attention_backward_pass_impl": "triton",
+  "freq_max_position_embeddings": 4096,
+  "gradient_checkpointing": "",
+  "hardware_abstraction": false,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 3584,
+  "initializer_range": 0.02,
+  "intermediate_size": 18944,
+  "kv_cache_quantization_blocksize": 64,
+  "kv_cache_quantization_method": "None",
+  "kv_cache_sharding_sequence_axis_name": "sp",
+  "mask_max_position_embeddings": 4096,
+  "max_position_embeddings": 131072,
+  "max_window_layers": 28,
+  "model_type": "qwen2",
+  "num_attention_heads": 28,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 4,
+  "number_rep_kv": 1,
+  "pallas_k_block_size": 128,
+  "pallas_m_block_size": 128,
+  "pallas_n_block_size": 128,
+  "partition_axis": [
+    [
+      "fsdp",
+      "dp"
+    ],
+    "sp",
+    "sp",
+    "tp",
+    "sp",
+    "tp",
+    null,
+    null,
+    null,
+    null,
+    "tp",
+    "sp",
+    null
+  ],
+  "platform": "jax",
+  "pretraining_tp": 1,
+  "quantization_blocksize": 64,
+  "quantization_method": "None",
+  "quantization_pattern": ".*",
+  "resid_pdrop": 0.0,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "scan_attention_layers": false,
+  "scan_layers": true,
+  "scan_mlp_chunk_size": 1024,
+  "scan_ring_attention": true,
+  "sequence_axis_name": "sp",
+  "shard_attention_computation": true,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.50.3",
+  "use_cache": true,
+  "use_mrope": false,
+  "use_scan_mlp": false,
+  "use_sharded_kv_caching": false,
+  "use_sharding_constraint": false,
+  "use_sliding_window": false,
+  "vocab_size": 151667
+}

easydel-model.parameters ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5b18e89af5a5df4e1d683c3faf46f080bc41c5a4013daca98184bd5cc0eda3b0
+size 15225580736

generation_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 151646,
+  "do_sample": true,
+  "eos_token_id": 151643,
+  "temperature": 0.6,
+  "top_p": 0.95,
+  "transformers_version": "4.50.3"
+}