tiny-recursive-model / modeling_tiny_recursive.py
from transformers import PreTrainedModel, PretrainedConfig
from transformers.models.gpt2.configuration_gpt2 import GPT2Config
from transformers.models.gpt2.modeling_gpt2 import GPT2Attention, GPT2MLP
from transformers.generation import GenerationMixin
from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
import torch
import torch.nn as nn


class TRMConfig(PretrainedConfig):
    model_type = "recursive_gpt"

    def __init__(
        self,
        vocab_size=50257,
        n_positions=1024,
        n_embd=512,
        n_physical_layers=3,
        n_loops=8,
        n_head=8,
        embd_pdrop=0.1,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_physical_layers = n_physical_layers
        self.n_loops = n_loops
        self.n_head = n_head
        self.embd_pdrop = embd_pdrop
        # Required for transformers compatibility
        self.hidden_size = n_embd
        self.num_attention_heads = n_head
        self.num_hidden_layers = n_physical_layers
        self.n_inner = None
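

# The original upload leaves the "Logic Core" as a placeholder that points to a
# notebook implementation not included in this file. The block below is a minimal
# stand-in, assuming a GPT-2 style pre-norm block built from the imported
# GPT2Attention / GPT2MLP modules; the same few blocks are re-applied on every loop.
class RecursiveBlock(nn.Module):
    """One pre-norm transformer block shared across all recursion loops."""

    def __init__(self, config):
        super().__init__()
        # GPT2Attention / GPT2MLP expect a GPT2Config, so derive one from TRMConfig.
        gpt2_config = GPT2Config(
            vocab_size=config.vocab_size,
            n_positions=config.n_positions,
            n_embd=config.n_embd,
            n_head=config.n_head,
            n_layer=config.n_physical_layers,
        )
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = GPT2Attention(gpt2_config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        # n_inner=None follows the GPT-2 default of a 4 * n_embd feed-forward.
        self.mlp = GPT2MLP(4 * config.n_embd, gpt2_config)

    def forward(self, hidden_states, attention_mask=None):
        # Causal masking happens inside GPT2Attention; attention_mask carries only
        # the additive padding mask in GPT-2's (batch, 1, 1, seq_len) format.
        attn_output = self.attn(self.ln_1(hidden_states), attention_mask=attention_mask)[0]
        hidden_states = hidden_states + attn_output
        hidden_states = hidden_states + self.mlp(self.ln_2(hidden_states))
        return hidden_states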


class TinyRecursiveModel(PreTrainedModel, GenerationMixin):
    config_class = TRMConfig
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        # 1. Embeddings
        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
        self.wpe = nn.Embedding(config.n_positions, config.n_embd)
        self.drop = nn.Dropout(config.embd_pdrop)
        # 2. The Logic Core: n_physical_layers blocks applied n_loops times, giving an
        #    effective depth of n_physical_layers * n_loops with the parameter count of
        #    the physical stack. RecursiveBlock above stands in for the notebook code.
        self.core = nn.ModuleList(
            RecursiveBlock(config) for _ in range(config.n_physical_layers)
        )
        self.ln_f = nn.LayerNorm(config.n_embd)
        # 3. Language modeling head
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.post_init()

    def get_input_embeddings(self):
        return self.wte

    def get_output_embeddings(self):
        return self.lm_head

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        # Sketch of a standard causal-LM forward pass; the original upload left this
        # method empty.
        device = input_ids.device
        seq_len = input_ids.size(1)

        position_ids = torch.arange(seq_len, device=device).unsqueeze(0)
        hidden_states = self.drop(self.wte(input_ids) + self.wpe(position_ids))

        # Convert the 0/1 padding mask into GPT-2's additive (batch, 1, 1, seq) format.
        additive_mask = None
        if attention_mask is not None:
            additive_mask = attention_mask[:, None, None, :].to(hidden_states.dtype)
            additive_mask = (1.0 - additive_mask) * torch.finfo(hidden_states.dtype).min

        # The recursion: run the small physical stack n_loops times, sharing weights
        # across loops.
        for _ in range(self.config.n_loops):
            for block in self.core:
                hidden_states = block(hidden_states, attention_mask=additive_mask)

        hidden_states = self.ln_f(hidden_states)
        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Next-token prediction: shift so that position t predicts token t + 1.
            shift_logits = logits[:, :-1, :].contiguous()
            shift_labels = labels[:, 1:].contiguous()
            loss = nn.CrossEntropyLoss()(
                shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
            )

        return CausalLMOutputWithCrossAttentions(loss=loss, logits=logits)

    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        # No KV cache in this sketch: the full sequence is re-encoded at every step.
        return {"input_ids": input_ids, "attention_mask": kwargs.get("attention_mask")}