# Per-architecture lookup table.
#
# Each top-level key is an architecture identifier. Its value is a dict with:
#   "config_names" - maps a generic field name (context_length, vocab_size,
#                    width, heads, layers, layer_attr, token_embeddings_attr)
#                    to the string used for that field by this architecture.
#   "pooler"       - the name of the pooler associated with this architecture.
#
# NOTE(review): the mapped strings look like Hugging Face `transformers`
# config / module attribute names - confirm against the code that consumes
# this table.
arch_dict = {
    "roberta": {
        "config_names": {
            "context_length": "max_position_embeddings",
            "vocab_size": "vocab_size",
            "width": "hidden_size",
            "heads": "num_attention_heads",
            "layers": "num_hidden_layers",
            "layer_attr": "layer",
            "token_embeddings_attr": "embeddings"
        },
        "pooler": "mean_pooler",
    },
    # Same mapping as "roberta"; kept as a separate literal so the two entries
    # never share (and cannot accidentally co-mutate) one dict object.
    "xlm-roberta": {
        "config_names": {
            "context_length": "max_position_embeddings",
            "vocab_size": "vocab_size",
            "width": "hidden_size",
            "heads": "num_attention_heads",
            "layers": "num_hidden_layers",
            "layer_attr": "layer",
            "token_embeddings_attr": "embeddings"
        },
        "pooler": "mean_pooler",
    },
    "mt5": {
        "config_names": {
            # Empty string: no name is supplied for the context length here.
            # NOTE(review): presumably this architecture has no fixed maximum
            # sequence length - confirm how consumers treat "".
            "context_length": "",
            "vocab_size": "vocab_size",
            "width": "d_model",
            "heads": "num_heads",
            "layers": "num_layers",
            "layer_attr": "block",
            "token_embeddings_attr": "embed_tokens"
        },
        "pooler": "mean_pooler",
    },
}