Upload folder using huggingface_hub
Browse files
- README.md +1 -1
- __pycache__/config_tiny_mistral.cpython-310.pyc +0 -0
- __pycache__/dataloader.cpython-310.pyc +0 -0
- __pycache__/modeling_mistral.cpython-310.pyc +0 -0
- config_tiny_mistral.py +3 -2
- dataloader.py +1 -1
- modeling_mistral.py +7 -7
- run_train.py +2 -3
README.md
CHANGED
|
@@ -16,4 +16,4 @@ python config_tiny_mistral.py
|
|
| 16 |
# Run training
|
| 17 |
export CUDA_DEVICE_MAX_CONNECTIONS=1 # important for some distributed operations
|
| 18 |
torchrun --nproc_per_node=8 run_train.py --config-file config_tiny_mistral.yaml
|
| 19 |
-
```
|
|
|
|
| 16 |
# Run training
|
| 17 |
export CUDA_DEVICE_MAX_CONNECTIONS=1 # important for some distributed operations
|
| 18 |
torchrun --nproc_per_node=8 run_train.py --config-file config_tiny_mistral.yaml
|
| 19 |
+
```
|
__pycache__/config_tiny_mistral.cpython-310.pyc
ADDED
|
Binary file (3.99 kB). View file
|
|
|
__pycache__/dataloader.cpython-310.pyc
ADDED
|
Binary file (2.81 kB). View file
|
|
|
__pycache__/modeling_mistral.cpython-310.pyc
ADDED
|
Binary file (24.7 kB). View file
|
|
|
config_tiny_mistral.py
CHANGED
|
@@ -6,6 +6,8 @@ python config_tiny_mistral.py
|
|
| 6 |
```
|
| 7 |
"""
|
| 8 |
import os
|
|
|
|
|
|
|
| 9 |
|
| 10 |
from nanotron.config import (
|
| 11 |
CheckpointsArgs,
|
|
@@ -23,8 +25,6 @@ from nanotron.config import (
|
|
| 23 |
TokensArgs,
|
| 24 |
)
|
| 25 |
from nanotron.logging import human_format
|
| 26 |
-
from dataclasses import dataclass
|
| 27 |
-
from typing import Optional
|
| 28 |
|
| 29 |
|
| 30 |
@dataclass
|
|
@@ -58,6 +58,7 @@ class MistralConfig:
|
|
| 58 |
if self.num_key_value_heads is None:
|
| 59 |
self.num_key_value_heads = self.num_attention_heads
|
| 60 |
|
|
|
|
| 61 |
model_config = MistralConfig(
|
| 62 |
# Config for a tiny model with 1.62M parameters
|
| 63 |
bos_token_id=1,
|
|
|
|
| 6 |
```
|
| 7 |
"""
|
| 8 |
import os
|
| 9 |
+
from dataclasses import dataclass
|
| 10 |
+
from typing import Optional
|
| 11 |
|
| 12 |
from nanotron.config import (
|
| 13 |
CheckpointsArgs,
|
|
|
|
| 25 |
TokensArgs,
|
| 26 |
)
|
| 27 |
from nanotron.logging import human_format
|
|
|
|
|
|
|
| 28 |
|
| 29 |
|
| 30 |
@dataclass
|
|
|
|
| 58 |
if self.num_key_value_heads is None:
|
| 59 |
self.num_key_value_heads = self.num_attention_heads
|
| 60 |
|
| 61 |
+
|
| 62 |
model_config = MistralConfig(
|
| 63 |
# Config for a tiny model with 1.62M parameters
|
| 64 |
bos_token_id=1,
|
dataloader.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
from nanotron.config import (
|
| 2 |
PretrainDatasetsArgs,
|
| 3 |
)
|
|
@@ -13,7 +14,6 @@ from nanotron.trainer import DistributedTrainer
|
|
| 13 |
from nanotron.utils import (
|
| 14 |
main_rank_first,
|
| 15 |
)
|
| 16 |
-
from nanotron import logging
|
| 17 |
|
| 18 |
try:
|
| 19 |
from huggingface_hub import __version__ as hf_hub_version
|
|
|
|
| 1 |
+
from nanotron import logging
|
| 2 |
from nanotron.config import (
|
| 3 |
PretrainDatasetsArgs,
|
| 4 |
)
|
|
|
|
| 14 |
from nanotron.utils import (
|
| 15 |
main_rank_first,
|
| 16 |
)
|
|
|
|
| 17 |
|
| 18 |
try:
|
| 19 |
from huggingface_hub import __version__ as hf_hub_version
|
modeling_mistral.py
CHANGED
|
@@ -23,16 +23,13 @@ from flash_attn.flash_attn_interface import (
|
|
| 23 |
flash_attn_with_kvcache,
|
| 24 |
)
|
| 25 |
from flash_attn.layers.rotary import RotaryEmbedding as FlashRotaryEmbedding
|
| 26 |
-
from torch import nn
|
| 27 |
-
from transformers import MistralConfig
|
| 28 |
-
from transformers.activations import ACT2FN
|
| 29 |
-
|
| 30 |
from nanotron import distributed as dist
|
| 31 |
from nanotron import logging
|
| 32 |
from nanotron.config import ParallelismArgs, RecomputeGranularity
|
| 33 |
-
from nanotron.
|
| 34 |
from nanotron.logging import log_rank
|
| 35 |
from nanotron.models import NanotronModel
|
|
|
|
| 36 |
from nanotron.parallel import ParallelContext
|
| 37 |
from nanotron.parallel.parameters import NanotronParameter
|
| 38 |
from nanotron.parallel.pipeline_parallel.block import (
|
|
@@ -49,7 +46,9 @@ from nanotron.parallel.tensor_parallel.nn import (
|
|
| 49 |
)
|
| 50 |
from nanotron.random import RandomStates
|
| 51 |
from nanotron.utils import checkpoint_method
|
| 52 |
-
from
|
|
|
|
|
|
|
| 53 |
|
| 54 |
logger = logging.get_logger(__name__)
|
| 55 |
|
|
@@ -852,6 +851,7 @@ class MistralForTraining(NanotronModel):
|
|
| 852 |
):
|
| 853 |
super().__init__()
|
| 854 |
import warnings
|
|
|
|
| 855 |
warnings.warn("This is just a Llama Model, not a Mistral one for demo purpose. Please fix implementation")
|
| 856 |
self.model = MistralModel(config=config, parallel_context=parallel_context, parallel_config=parallel_config)
|
| 857 |
self.loss = PipelineBlock(
|
|
@@ -1120,4 +1120,4 @@ def get_flops(
|
|
| 1120 |
else:
|
| 1121 |
raise ValueError("recompute_granularity must be one of 'full' or 'selective'")
|
| 1122 |
|
| 1123 |
-
return model_flops, hardware_flops
|
|
|
|
| 23 |
flash_attn_with_kvcache,
|
| 24 |
)
|
| 25 |
from flash_attn.layers.rotary import RotaryEmbedding as FlashRotaryEmbedding
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
from nanotron import distributed as dist
|
| 27 |
from nanotron import logging
|
| 28 |
from nanotron.config import ParallelismArgs, RecomputeGranularity
|
| 29 |
+
from nanotron.generation.generate_store import AttachableStore
|
| 30 |
from nanotron.logging import log_rank
|
| 31 |
from nanotron.models import NanotronModel
|
| 32 |
+
from nanotron.nn.layer_norm import TritonRMSNorm
|
| 33 |
from nanotron.parallel import ParallelContext
|
| 34 |
from nanotron.parallel.parameters import NanotronParameter
|
| 35 |
from nanotron.parallel.pipeline_parallel.block import (
|
|
|
|
| 46 |
)
|
| 47 |
from nanotron.random import RandomStates
|
| 48 |
from nanotron.utils import checkpoint_method
|
| 49 |
+
from torch import nn
|
| 50 |
+
from transformers import MistralConfig
|
| 51 |
+
from transformers.activations import ACT2FN
|
| 52 |
|
| 53 |
logger = logging.get_logger(__name__)
|
| 54 |
|
|
|
|
| 851 |
):
|
| 852 |
super().__init__()
|
| 853 |
import warnings
|
| 854 |
+
|
| 855 |
warnings.warn("This is just a Llama Model, not a Mistral one for demo purpose. Please fix implementation")
|
| 856 |
self.model = MistralModel(config=config, parallel_context=parallel_context, parallel_config=parallel_config)
|
| 857 |
self.loss = PipelineBlock(
|
|
|
|
| 1120 |
else:
|
| 1121 |
raise ValueError("recompute_granularity must be one of 'full' or 'selective'")
|
| 1122 |
|
| 1123 |
+
return model_flops, hardware_flops
|
run_train.py
CHANGED
|
@@ -9,11 +9,10 @@ torchrun --nproc_per_node=8 run_train.py --config-file config_tiny_mistral.yaml
|
|
| 9 |
"""
|
| 10 |
import argparse
|
| 11 |
|
| 12 |
-
from
|
| 13 |
from dataloader import get_dataloader
|
|
|
|
| 14 |
from nanotron.trainer import DistributedTrainer
|
| 15 |
-
from config_tiny_mistral import MistralConfig
|
| 16 |
-
|
| 17 |
|
| 18 |
|
| 19 |
def get_args():
|
|
|
|
| 9 |
"""
|
| 10 |
import argparse
|
| 11 |
|
| 12 |
+
from config_tiny_mistral import MistralConfig
|
| 13 |
from dataloader import get_dataloader
|
| 14 |
+
from modeling_mistral import MistralForTraining
|
| 15 |
from nanotron.trainer import DistributedTrainer
|
|
|
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
def get_args():
|