from transformers import PretrainedConfig


class DFMConfig(PretrainedConfig):
    model_type = "dfm"

    def __init__(
        self,
        vocab_size=50257,
        hidden_size=2048,
        cond_dim=256,
        n_blocks=21,
        n_heads=32,
        dropout=0.1,
        sequence_length=1024,
        source_distribution="mask",
        flow_scheduler_type="polynomial",
        flow_exponent=1.0,
        flow_loss_function="generalized_kl",
        sampling_steps=1024,
        bos_token_id=50256,
        eos_token_id=50256,
        mask_token_id=50257,
        tokenizer_name="gpt2",
        dtype="bfloat16",
        **kwargs,
    ):
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )
        # Transformer backbone dimensions.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.cond_dim = cond_dim  # conditioning embedding width
        self.n_blocks = n_blocks
        self.n_heads = n_heads
        self.dropout = dropout
        self.sequence_length = sequence_length
        # Discrete flow matching hyperparameters.
        self.source_distribution = source_distribution
        self.flow_scheduler_type = flow_scheduler_type
        self.flow_exponent = flow_exponent
        self.flow_loss_function = flow_loss_function
        self.sampling_steps = sampling_steps
        # The mask token sits one past the GPT-2 vocab (ids 0-50256),
        # so embeddings need room for vocab_size + 1 entries.
        self.mask_token_id = mask_token_id
        self.tokenizer_name = tokenizer_name
        self.dtype = dtype
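

# Usage sketch: round-trip the config through the standard transformers
# serialization path. AutoConfig.register and save_pretrained /
# from_pretrained are stock transformers APIs; the "dfm-config" output
# directory below is just an example name.
if __name__ == "__main__":
    from transformers import AutoConfig

    # Make AutoConfig aware of the custom model_type declared above.
    AutoConfig.register("dfm", DFMConfig)

    config = DFMConfig()                  # all defaults: GPT-2 vocab, 21 blocks
    config.save_pretrained("dfm-config")  # writes dfm-config/config.json
    reloaded = AutoConfig.from_pretrained("dfm-config")
    assert reloaded.model_type == "dfm"
    assert reloaded.mask_token_id == config.mask_token_id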