Update configuration_quasar.py
Browse files- configuration_quasar.py +16 -7
configuration_quasar.py
CHANGED
|
@@ -18,9 +18,7 @@ QUASAR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
|
| 18 |
class QuasarConfig(PretrainedConfig):
|
| 19 |
r"""
|
| 20 |
This is the configuration class to store the configuration of a [`QuasarModel`]. It is used to instantiate a Quasar
|
| 21 |
-
model according to the specified arguments, defining the model architecture.
|
| 22 |
-
defaults will yield a similar configuration to that of the Quasar
|
| 23 |
-
[microsoft/quasar-1](https://huggingface.co/microsoft/quasar-1).
|
| 24 |
|
| 25 |
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
| 26 |
documentation from [`PretrainedConfig`] for more information.
|
|
@@ -83,15 +81,26 @@ class QuasarConfig(PretrainedConfig):
|
|
| 83 |
Denotes beginning of sequences token id.
|
| 84 |
eos_token_id (`int`, *optional*, defaults to 2):
|
| 85 |
Denotes end of sequences token id.
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
Example:
|
| 88 |
|
| 89 |
```python
|
| 90 |
-
>>> from transformers import AutoModel,
|
| 91 |
|
| 92 |
|
| 93 |
-
>>> # Initializing a Quasar
|
| 94 |
-
>>> configuration =
|
| 95 |
|
| 96 |
>>> # Initializing a model from the configuration
|
| 97 |
>>> model = QuasarModel(configuration, trust_remote_code=True)
|
|
|
|
| 18 |
class QuasarConfig(PretrainedConfig):
|
| 19 |
r"""
|
| 20 |
This is the configuration class to store the configuration of a [`QuasarModel`]. It is used to instantiate a Quasar
|
| 21 |
+
model according to the specified arguments, defining the model architecture.
|
|
|
|
|
|
|
| 22 |
|
| 23 |
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
| 24 |
documentation from [`PretrainedConfig`] for more information.
|
|
|
|
| 81 |
Denotes beginning of sequences token id.
|
| 82 |
eos_token_id (`int`, *optional*, defaults to 2):
|
| 83 |
Denotes end of sequences token id.
|
| 84 |
+
duplicate_trick (`bool`, *optional*, defaults to `True`):
|
| 85 |
+
Whether to use the trick of self layers calling
|
| 86 |
+
duplicate_grad (`bool`, *optional*, defaults to `True`):
|
| 87 |
+
Whether or not to do a double grad step during training. This is not compatible with Gradient Checkpointing.
|
| 88 |
+
remove_ff_bias (`bool`, *optional*, defaults to `True`):
|
| 89 |
+
Whether or not to remove the feed-forward bias.
|
| 90 |
+
gated_activation (`bool`, *optional*, defaults to `False`):
|
| 91 |
+
Whether or not to use a GeluGLU Activation
|
| 92 |
+
simple_norm (`bool`, *optional*, defaults to `False`):
|
| 93 |
+
Whether or not to use a simpler version of RMS Layer Norm
|
| 94 |
+
sliding_window (`int`, *optional*, defaults to 2048):
|
| 95 |
+
If specified, it enables a sliding context window to extend the model context from 2048 to 32K.
|
| 96 |
Example:
|
| 97 |
|
| 98 |
```python
|
| 99 |
+
>>> from transformers import AutoModel, AutoConfig
|
| 100 |
|
| 101 |
|
| 102 |
+
>>> # Initializing a Quasar style configuration
|
| 103 |
+
>>> configuration = AutoConfig.from_pretrained("AstraMindAI/AstraQuasar-4B")
|
| 104 |
|
| 105 |
>>> # Initializing a model from the configuration
|
| 106 |
>>> model = QuasarModel(configuration, trust_remote_code=True)
|