zy22b committed (verified)
Commit a73d8fb · Parent(s): 74176b3

Upload folder using huggingface_hub

README.md CHANGED
@@ -11,6 +11,7 @@ tags:
 datasets:
 - humanml3d
 pipeline_tag: text-generation
+library_name: transformers
 ---
 
 # GeoMotionGPT
@@ -19,36 +20,98 @@ GeoMotionGPT is a motion-to-text model that converts human motion sequences into
 
 ## Model Components
 
-This repository contains two model components:
+This model integrates two components:
 
-### 1. Motion Tokenizer (`motion_tokenizer/`)
-- **Architecture**: Decoder-only Vector Quantizer (DVQ) with Gumbel-Softmax Straight-Through (GSST) quantization
+### 1. Motion Tokenizer (DVQ-GSST)
+- **Architecture**: Decoder-only Vector Quantizer with Gumbel-Softmax Straight-Through quantization
 - **Codebook Size**: 512 tokens
 - **Input**: 263-dimensional motion features (HumanML3D format)
-- **Temporal Downsampling**: 4x
+- **Temporal Downsampling**: 8x (3 downsampling layers with stride 2)
 
-### 2. Language Model (`language_model/`)
-- **Base Model**: GPT-2
+### 2. Language Model (Fine-tuned GPT-2)
+- **Base Model**: GPT-2 (124M parameters)
 - **Task**: Motion-to-Text generation
 - **Training**: Fine-tuned with orthogonality regularization (λ=0.01)
-- **Motion Tokens**: 512 additional tokens for motion representation
+- **Total Vocab**: 50772 tokens (50257 text + 512 motion + 3 special)
 
-## Usage
+## Quick Start
 
 ```python
+from transformers import AutoModelForCausalLM
 import torch
-from safetensors.torch import load_file
 
-# Load motion tokenizer
-motion_tokenizer_weights = load_file("motion_tokenizer/model.safetensors")
+# Load the model
+model = AutoModelForCausalLM.from_pretrained(
+    "zy22b/GeoMotionGPT",
+    trust_remote_code=True
+)
 
-# Load language model
-lm_weights = load_file("language_model/model.safetensors")
+# Access the motion tokenizer
+motion_tokenizer = model.motion_tokenizer
+
+# Example: Tokenize motion (batch, time, 263)
+motion = torch.randn(1, 100, 263)  # Random motion features
+tokens = motion_tokenizer.encode(motion)  # -> (batch, time // 8)
+print(f"Motion tokens shape: {tokens.shape}")
+
+# Example: Decode tokens back to motion
+reconstructed = motion_tokenizer.decode(tokens)  # -> (batch, time, 263)
+```
+
+## Usage with HumanML3D Data
+
+```python
+import numpy as np
+import torch
+from transformers import AutoModelForCausalLM
+
+# Load model
+model = AutoModelForCausalLM.from_pretrained(
+    "zy22b/GeoMotionGPT",
+    trust_remote_code=True
+)
+motion_tokenizer = model.motion_tokenizer
+
+# Load HumanML3D motion file
+motion = np.load("path/to/new_joint_vecs/000000.npy")  # (T, 263)
+
+# Load normalization parameters
+mean = np.load("path/to/Mean.npy")
+std = np.load("path/to/Std.npy")
+
+# Normalize
+motion_norm = (motion - mean) / std
+
+# Convert to tensor and add batch dimension
+motion_tensor = torch.FloatTensor(motion_norm).unsqueeze(0)  # (1, T, 263)
+
+# Tokenize
+with torch.no_grad():
+    tokens = motion_tokenizer.encode(motion_tensor)
+
+print(f"Input shape: {motion_tensor.shape}")
+print(f"Token shape: {tokens.shape}")
+print(f"Tokens: {tokens[0].tolist()}")
+```
+
+## Model Architecture
+
+```
+GeoMotionGPTForCausalLM
+├── motion_tokenizer (MotionTokenizer)
+│   ├── encoder (MotionEncoder)
+│   │   └── 1D CNN with ResNet blocks
+│   ├── decoder (MotionDecoder)
+│   │   └── 1D CNN with nearest-neighbor upsampling and ResNet blocks
+│   └── quantizer (GumbelSoftmaxQuantizer)
+│       └── 512-entry codebook
+└── language_model (GPT2LMHeadModel)
+    └── 12-layer transformer
+```
 
 ## Training Details
 
-- **Motion Tokenizer**: Trained on HumanML3D dataset with DVQ quantization
+- **Motion Tokenizer**: Trained on the HumanML3D dataset with DVQ-GSST quantization
 - **Language Model**: Fine-tuned GPT-2 with:
   - Orthogonality loss (λ=0.01) for motion token embeddings
   - Codebook-initialized motion embeddings
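
Note that `generate_text` (defined in `modeling_geomotiongpt.py` below) returns raw GPT-2 token IDs rather than strings, and this commit ships no tokenizer files, so decoding is left to the caller. A minimal sketch of the full motion-to-text path, assuming the stock `gpt2` tokenizer matches the model's 50257 text token IDs:

```python
import torch
from transformers import AutoModelForCausalLM, GPT2TokenizerFast

model = AutoModelForCausalLM.from_pretrained("zy22b/GeoMotionGPT", trust_remote_code=True)
model.eval()

# Assumption: the stock GPT-2 tokenizer covers the model's text vocabulary.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

motion = torch.randn(1, 100, 263)  # placeholder for normalized HumanML3D features
with torch.no_grad():
    motion_tokens = model.motion_tokenizer.encode(motion)  # (1, T') with T' ~ T // 8
    generated_ids = model.generate_text(motion_tokens)     # raw token IDs, not strings

print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))
```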
__init__.py ADDED
@@ -0,0 +1,3 @@
+# GeoMotionGPT Model Package
+from .configuration_geomotiongpt import GeoMotionGPTConfig
+from .modeling_geomotiongpt import GeoMotionGPTForCausalLM, GeoMotionGPTPreTrainedModel, MotionTokenizer
__pycache__/configuration_geomotiongpt.cpython-311.pyc ADDED
Binary file (4.96 kB)
 
__pycache__/modeling_geomotiongpt.cpython-311.pyc ADDED
Binary file (26.6 kB)
 
config.json ADDED
@@ -0,0 +1,37 @@
+{
+  "architectures": [
+    "GeoMotionGPTForCausalLM"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_geomotiongpt.GeoMotionGPTConfig",
+    "AutoModelForCausalLM": "modeling_geomotiongpt.GeoMotionGPTForCausalLM"
+  },
+  "model_type": "geomotiongpt",
+  "motion_vocab_size": 512,
+  "motion_input_dim": 263,
+  "motion_hidden_dim": 512,
+  "motion_down_t": 3,
+  "motion_depth": 3,
+  "motion_dilation_growth_rate": 3,
+  "text_vocab_size": 50257,
+  "vocab_size": 50772,
+  "n_positions": 1024,
+  "n_embd": 768,
+  "n_layer": 12,
+  "n_head": 12,
+  "n_inner": null,
+  "activation_function": "gelu_new",
+  "resid_pdrop": 0.1,
+  "embd_pdrop": 0.1,
+  "attn_pdrop": 0.1,
+  "layer_norm_epsilon": 1e-05,
+  "initializer_range": 0.02,
+  "mot_factor": 1.0,
+  "attention_mode": "all",
+  "lambda_ortho": 0.01,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "pad_token_id": 50256,
+  "torch_dtype": "float32",
+  "transformers_version": "4.41.0"
+}
configuration_geomotiongpt.py ADDED
@@ -0,0 +1,123 @@
+"""
+GeoMotionGPT Configuration
+
+This module contains the configuration class for GeoMotionGPT, a motion-to-text model
+that combines a VQ-VAE motion tokenizer with a fine-tuned GPT-2 language model.
+"""
+
+from transformers import PretrainedConfig
+
+
+class GeoMotionGPTConfig(PretrainedConfig):
+    """
+    Configuration class for GeoMotionGPT model.
+
+    GeoMotionGPT consists of two components:
+    1. Motion Tokenizer (DVQ-GSST): Converts 263-dim HumanML3D motion features to discrete tokens
+    2. Language Model (GPT-2): Generates text descriptions from motion tokens
+
+    Args:
+        motion_vocab_size (`int`, *optional*, defaults to 512):
+            Size of the motion codebook vocabulary.
+        motion_input_dim (`int`, *optional*, defaults to 263):
+            Input dimension of motion features (HumanML3D format).
+        motion_hidden_dim (`int`, *optional*, defaults to 512):
+            Hidden dimension for motion encoder.
+        motion_down_t (`int`, *optional*, defaults to 3):
+            Number of temporal downsampling layers.
+        motion_depth (`int`, *optional*, defaults to 3):
+            Depth of ResNet blocks in encoder.
+        text_vocab_size (`int`, *optional*, defaults to 50257):
+            Size of the text vocabulary (GPT-2).
+        n_positions (`int`, *optional*, defaults to 1024):
+            Maximum sequence length.
+        n_embd (`int`, *optional*, defaults to 768):
+            Embedding dimension for GPT-2.
+        n_layer (`int`, *optional*, defaults to 12):
+            Number of transformer layers.
+        n_head (`int`, *optional*, defaults to 12):
+            Number of attention heads.
+        mot_factor (`float`, *optional*, defaults to 1.0):
+            Factor for motion embedding dimension.
+        attention_mode (`str`, *optional*, defaults to "all"):
+            Cross-modal attention mode.
+        lambda_ortho (`float`, *optional*, defaults to 0.01):
+            Orthogonality regularization weight.
+
+    Example:
+        ```python
+        from transformers import AutoConfig
+
+        config = AutoConfig.from_pretrained("zy22b/GeoMotionGPT", trust_remote_code=True)
+        print(config.motion_vocab_size)  # 512
+        ```
+    """
+
+    model_type = "geomotiongpt"
+
+    def __init__(
+        self,
+        # Motion tokenizer config
+        motion_vocab_size: int = 512,
+        motion_input_dim: int = 263,
+        motion_hidden_dim: int = 512,
+        motion_down_t: int = 3,
+        motion_depth: int = 3,
+        motion_dilation_growth_rate: int = 3,
+        # Language model config (GPT-2)
+        text_vocab_size: int = 50257,
+        n_positions: int = 1024,
+        n_embd: int = 768,
+        n_layer: int = 12,
+        n_head: int = 12,
+        n_inner: int = None,
+        activation_function: str = "gelu_new",
+        resid_pdrop: float = 0.1,
+        embd_pdrop: float = 0.1,
+        attn_pdrop: float = 0.1,
+        layer_norm_epsilon: float = 1e-5,
+        initializer_range: float = 0.02,
+        # Multi-modal config
+        mot_factor: float = 1.0,
+        attention_mode: str = "all",
+        lambda_ortho: float = 0.01,
+        # Special tokens
+        bos_token_id: int = 50256,
+        eos_token_id: int = 50256,
+        pad_token_id: int = 50256,
+        **kwargs
+    ):
+        # Motion tokenizer parameters
+        self.motion_vocab_size = motion_vocab_size
+        self.motion_input_dim = motion_input_dim
+        self.motion_hidden_dim = motion_hidden_dim
+        self.motion_down_t = motion_down_t
+        self.motion_depth = motion_depth
+        self.motion_dilation_growth_rate = motion_dilation_growth_rate
+
+        # Language model parameters
+        self.text_vocab_size = text_vocab_size
+        self.vocab_size = text_vocab_size + motion_vocab_size + 3  # +3 for special motion tokens (BOT, EOT, PAD)
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.n_inner = n_inner
+        self.activation_function = activation_function
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attn_pdrop = attn_pdrop
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+
+        # Multi-modal parameters
+        self.mot_factor = mot_factor
+        self.attention_mode = attention_mode
+        self.lambda_ortho = lambda_ortho
+
+        super().__init__(
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            pad_token_id=pad_token_id,
+            **kwargs
+        )
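
The `vocab_size` arithmetic above fixes the unified token ID layout that `generate_text` relies on when it shifts motion tokens by `text_vocab_size`. A small sketch of the implied ranges, run from a local checkout of the repo (the ordering of the three special motion tokens is not pinned down anywhere in this commit, so only the ranges are shown):

```python
from configuration_geomotiongpt import GeoMotionGPTConfig

config = GeoMotionGPTConfig()
assert config.vocab_size == config.text_vocab_size + config.motion_vocab_size + 3  # 50772

# Text tokens occupy IDs 0..50256, motion codes 50257..50768,
# and the three special motion tokens 50769..50771.
text_ids = range(0, config.text_vocab_size)
motion_ids = range(config.text_vocab_size, config.text_vocab_size + config.motion_vocab_size)
special_ids = range(config.text_vocab_size + config.motion_vocab_size, config.vocab_size)

# generate_text applies the same offset when building LM inputs:
#   input_ids = motion_tokens + config.text_vocab_size
```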
model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95961f9795c0c9620cca77ed684da258cb181970bc1612ff404e28b84ee1e473
+size 766672340
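
Only the Git-LFS pointer is stored in-tree; the roughly 766 MB checkpoint itself is fetched on clone or download. An optional sketch for verifying that a downloaded file matches the pointer's oid:

```python
import hashlib

EXPECTED = "95961f9795c0c9620cca77ed684da258cb181970bc1612ff404e28b84ee1e473"

sha = hashlib.sha256()
with open("model.safetensors", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha.update(chunk)

assert sha.hexdigest() == EXPECTED, "checkpoint does not match the LFS pointer"
```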
modeling_geomotiongpt.py ADDED
@@ -0,0 +1,523 @@
+"""
+GeoMotionGPT Model
+
+This module contains the model implementation for GeoMotionGPT, integrating:
+1. Motion Tokenizer (DVQ-GSST VQ-VAE)
+2. Language Model (fine-tuned GPT-2 for motion-to-text)
+
+Usage:
+    ```python
+    from transformers import AutoModelForCausalLM
+
+    model = AutoModelForCausalLM.from_pretrained("zy22b/GeoMotionGPT", trust_remote_code=True)
+    motion_tokenizer = model.motion_tokenizer
+
+    # Tokenize motion
+    motion_tokens = motion_tokenizer.encode(motion_features)
+
+    # Generate text (returns raw token IDs)
+    generated_ids = model.generate_text(motion_tokens)
+    ```
+"""
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Optional, Tuple, List, Union
+from transformers import PreTrainedModel, GPT2LMHeadModel, GPT2Config
+from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
+
+# Handle both package and standalone imports
+try:
+    from .configuration_geomotiongpt import GeoMotionGPTConfig
+except ImportError:
+    from configuration_geomotiongpt import GeoMotionGPTConfig
+
+
+# =====================================================
+# Motion Tokenizer Components (DVQ-GSST)
+# =====================================================
+
+class Swish(nn.Module):
+    """Swish activation function."""
+    def forward(self, x):
+        return x * torch.sigmoid(x)
+
+
+class ResConv1DBlock(nn.Module):
+    """Single residual convolution block."""
+
+    def __init__(self, n_in, n_state, dilation=1, activation='relu', norm=None):
+        super().__init__()
+        padding = dilation
+        self.norm = norm
+
+        if norm == "LN":
+            self.norm1 = nn.LayerNorm(n_in)
+            self.norm2 = nn.LayerNorm(n_in)
+        elif norm == "GN":
+            self.norm1 = nn.GroupNorm(num_groups=32, num_channels=n_in, eps=1e-6, affine=True)
+            self.norm2 = nn.GroupNorm(num_groups=32, num_channels=n_in, eps=1e-6, affine=True)
+        elif norm == "BN":
+            self.norm1 = nn.BatchNorm1d(num_features=n_in, eps=1e-6, affine=True)
+            self.norm2 = nn.BatchNorm1d(num_features=n_in, eps=1e-6, affine=True)
+        else:
+            self.norm1 = nn.Identity()
+            self.norm2 = nn.Identity()
+
+        if activation == "relu":
+            self.activation1 = nn.ReLU()
+            self.activation2 = nn.ReLU()
+        elif activation == "silu":
+            self.activation1 = Swish()
+            self.activation2 = Swish()
+        elif activation == "gelu":
+            self.activation1 = nn.GELU()
+            self.activation2 = nn.GELU()
+
+        self.conv1 = nn.Conv1d(n_in, n_state, 3, 1, padding, dilation)
+        self.conv2 = nn.Conv1d(n_state, n_in, 1, 1, 0)
+
+    def forward(self, x):
+        x_orig = x
+        if self.norm == "LN":
+            x = self.norm1(x.transpose(-2, -1))
+            x = self.activation1(x.transpose(-2, -1))
+        else:
+            x = self.norm1(x)
+            x = self.activation1(x)
+        x = self.conv1(x)
+        if self.norm == "LN":
+            x = self.norm2(x.transpose(-2, -1))
+            x = self.activation2(x.transpose(-2, -1))
+        else:
+            x = self.norm2(x)
+            x = self.activation2(x)
+        x = self.conv2(x)
+        return x + x_orig
+
+
+class Resnet1D(nn.Module):
+    """1D ResNet block composed of multiple ResConv1DBlocks."""
+
+    def __init__(self, n_in, n_depth, dilation_growth_rate=1,
+                 reverse_dilation=True, activation='relu', norm=None):
+        super().__init__()
+        blocks = [
+            ResConv1DBlock(n_in, n_in, dilation=dilation_growth_rate ** depth,
+                           activation=activation, norm=norm)
+            for depth in range(n_depth)
+        ]
+        if reverse_dilation:
+            blocks = blocks[::-1]
+        self.model = nn.Sequential(*blocks)
+
+    def forward(self, x):
+        return self.model(x)
+
+
+class MotionEncoder(nn.Module):
+    """Encoder for motion features with temporal downsampling."""
+
+    def __init__(self, input_dim=263, hidden_dim=512, nb_code=512,
+                 down_t=3, stride_t=2, depth=3, dilation_growth_rate=3,
+                 activation='relu', norm=None):
+        super().__init__()
+        blocks = []
+        filter_t, pad_t = stride_t * 2, stride_t // 2
+        blocks.append(nn.Conv1d(input_dim, hidden_dim, 3, 1, 1))
+        blocks.append(nn.ReLU())
+        for _ in range(down_t):
+            block = nn.Sequential(
+                nn.Conv1d(hidden_dim, hidden_dim, filter_t, stride_t, pad_t),
+                Resnet1D(hidden_dim, depth, dilation_growth_rate,
+                         reverse_dilation=False, activation=activation, norm=norm),
+            )
+            blocks.append(block)
+        blocks.append(nn.Conv1d(hidden_dim, nb_code, 3, 1, 1))
+        self.model = nn.Sequential(*blocks)
+
+    def forward(self, x):
+        return self.model(x)
+
+
+class MotionDecoder(nn.Module):
+    """Decoder for reconstructing motion from quantized features."""
+
+    def __init__(self, output_dim=263, hidden_dim=512, code_dim=512,
+                 down_t=3, stride_t=2, depth=3, dilation_growth_rate=3,
+                 activation='relu', norm=None):
+        super().__init__()
+        blocks = []
+        blocks.append(nn.Conv1d(code_dim, hidden_dim, 3, 1, 1))
+        blocks.append(nn.ReLU())
+        for _ in range(down_t):
+            block = nn.Sequential(
+                Resnet1D(hidden_dim, depth, dilation_growth_rate,
+                         reverse_dilation=True, activation=activation, norm=norm),
+                nn.Upsample(scale_factor=2, mode='nearest'),
+                nn.Conv1d(hidden_dim, hidden_dim, 3, 1, 1)
+            )
+            blocks.append(block)
+        blocks.append(nn.Conv1d(hidden_dim, hidden_dim, 3, 1, 1))
+        blocks.append(nn.ReLU())
+        blocks.append(nn.Conv1d(hidden_dim, output_dim, 3, 1, 1))
+        self.model = nn.Sequential(*blocks)
+
+    def forward(self, x):
+        return self.model(x)
+
+
+class GumbelSoftmaxQuantizer(nn.Module):
+    """Gumbel-Softmax Straight-Through quantizer for VQ-VAE."""
+
+    def __init__(self, nb_code=512, code_dim=512):
+        super().__init__()
+        self.nb_code = nb_code
+        self.code_dim = code_dim
+        self.codebook = nn.Embedding(nb_code, code_dim)
+        nn.init.uniform_(self.codebook.weight, -1.0 / nb_code, 1.0 / nb_code)
+        self.tau = 0.4
+
+    def quantize(self, x):
+        """Quantize encoder output logits to discrete indices."""
+        return x.argmax(dim=-1)
+
+    def dequantize(self, indices):
+        """Convert indices back to codebook embeddings."""
+        return self.codebook(indices)
+
+    def forward(self, x_encoder):
+        """Forward pass with Gumbel-Softmax sampling."""
+        N, C, T = x_encoder.shape
+        x = x_encoder.permute(0, 2, 1).contiguous().view(-1, C)
+
+        # Gumbel-Softmax with straight-through estimator
+        y_hard_st = F.gumbel_softmax(x, tau=self.tau, hard=True, dim=-1)
+        x_quantized = torch.matmul(y_hard_st, self.codebook.weight)
+
+        return x_quantized.view(N, T, -1).permute(0, 2, 1).contiguous()
+
+
+class MotionTokenizer(nn.Module):
+    """
+    DVQ-GSST Motion Tokenizer.
+
+    Converts continuous motion features (263-dim HumanML3D format) to discrete tokens.
+
+    Args:
+        config: GeoMotionGPTConfig containing motion tokenizer parameters
+
+    Example:
+        ```python
+        motion = torch.randn(1, 100, 263)  # (batch, time, features)
+        tokens = motion_tokenizer.encode(motion)  # (batch, time // 8)
+        ```
+    """
+
+    def __init__(self, config: GeoMotionGPTConfig):
+        super().__init__()
+        self.config = config
+
+        self.encoder = MotionEncoder(
+            input_dim=config.motion_input_dim,
+            hidden_dim=config.motion_hidden_dim,
+            nb_code=config.motion_vocab_size,
+            down_t=config.motion_down_t,
+            depth=config.motion_depth,
+            dilation_growth_rate=config.motion_dilation_growth_rate,
+        )
+
+        self.decoder = MotionDecoder(
+            output_dim=config.motion_input_dim,
+            hidden_dim=config.motion_hidden_dim,
+            code_dim=config.motion_vocab_size,
+            down_t=config.motion_down_t,
+            depth=config.motion_depth,
+            dilation_growth_rate=config.motion_dilation_growth_rate,
+        )
+
+        self.quantizer = GumbelSoftmaxQuantizer(
+            nb_code=config.motion_vocab_size,
+            code_dim=config.motion_vocab_size,
+        )
+
+    def encode(self, motion: torch.Tensor) -> torch.Tensor:
+        """
+        Encode motion features to discrete tokens.
+
+        Args:
+            motion: Motion features of shape (batch, time, 263)
+
+        Returns:
+            Token indices of shape (batch, time // downsample_ratio)
+        """
+        # (batch, time, 263) -> (batch, 263, time)
+        x = motion.permute(0, 2, 1).float()
+
+        # Encode
+        x_enc = self.encoder(x)  # (batch, nb_code, time')
+
+        # (batch, nb_code, time') -> (batch, time', nb_code)
+        x_enc = x_enc.permute(0, 2, 1).contiguous()
+        N, T, C = x_enc.shape
+
+        # Get token indices
+        indices = self.quantizer.quantize(x_enc.view(-1, C))
+        return indices.view(N, T)
+
+    def decode(self, tokens: torch.Tensor) -> torch.Tensor:
+        """
+        Decode tokens back to motion features.
+
+        Args:
+            tokens: Token indices of shape (batch, time')
+
+        Returns:
+            Motion features of shape (batch, time, 263)
+        """
+        # Get embeddings from tokens
+        x = self.quantizer.dequantize(tokens)  # (batch, time', code_dim)
+
+        # (batch, time', code_dim) -> (batch, code_dim, time')
+        x = x.permute(0, 2, 1).contiguous()
+
+        # Decode
+        x_out = self.decoder(x)  # (batch, 263, time)
+
+        # (batch, 263, time) -> (batch, time, 263)
+        return x_out.permute(0, 2, 1)
+
+    def forward(self, motion: torch.Tensor):
+        """Forward pass for training (encode -> quantize -> decode)."""
+        x = motion.permute(0, 2, 1).float()
+        x_enc = self.encoder(x)
+        x_quant = self.quantizer(x_enc)
+        x_dec = self.decoder(x_quant)
+        return x_dec.permute(0, 2, 1)
+
+
+# =====================================================
+# Main GeoMotionGPT Model
+# =====================================================
+
+class GeoMotionGPTPreTrainedModel(PreTrainedModel):
+    """Base class for GeoMotionGPT models."""
+
+    config_class = GeoMotionGPTConfig
+    base_model_prefix = "geomotiongpt"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        """Initialize weights."""
+        if isinstance(module, (nn.Linear, nn.Conv1d)):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+class GeoMotionGPTForCausalLM(GeoMotionGPTPreTrainedModel):
+    """
+    GeoMotionGPT Model for motion-to-text generation.
+
+    This model combines:
+    1. A VQ-VAE motion tokenizer (DVQ-GSST) for converting motion to discrete tokens
+    2. A fine-tuned GPT-2 model for generating text from motion tokens
+
+    Example:
+        ```python
+        from transformers import AutoModelForCausalLM
+        import torch
+
+        # Load model
+        model = AutoModelForCausalLM.from_pretrained(
+            "zy22b/GeoMotionGPT",
+            trust_remote_code=True
+        )
+
+        # Access motion tokenizer
+        motion_tokenizer = model.motion_tokenizer
+
+        # Tokenize motion (batch, time, 263) -> (batch, tokens)
+        motion = torch.randn(1, 100, 263)
+        motion_tokens = motion_tokenizer.encode(motion)
+
+        # Generate token IDs from motion tokens (decode with a GPT-2 tokenizer)
+        generated_ids = model.generate_text(motion_tokens)
+        ```
+    """
+
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config: GeoMotionGPTConfig):
+        super().__init__(config)
+
+        # Motion tokenizer
+        self.motion_tokenizer = MotionTokenizer(config)
+
+        # Build GPT-2 config
+        gpt2_config = GPT2Config(
+            vocab_size=config.vocab_size,
+            n_positions=config.n_positions,
+            n_embd=config.n_embd,
+            n_layer=config.n_layer,
+            n_head=config.n_head,
+            n_inner=config.n_inner,
+            activation_function=config.activation_function,
+            resid_pdrop=config.resid_pdrop,
+            embd_pdrop=config.embd_pdrop,
+            attn_pdrop=config.attn_pdrop,
+            layer_norm_epsilon=config.layer_norm_epsilon,
+            initializer_range=config.initializer_range,
+            bos_token_id=config.bos_token_id,
+            eos_token_id=config.eos_token_id,
+        )
+
+        # Language model (GPT-2)
+        self.language_model = GPT2LMHeadModel(gpt2_config)
+
+        # Motion token embeddings (separate from text embeddings)
+        mot_embed_dim = int(config.n_embd // config.n_head * config.mot_factor) * config.n_head
+        self.motion_embed = nn.Embedding(
+            config.motion_vocab_size + 3,  # +3 for special tokens (BOT, EOT, PAD)
+            mot_embed_dim
+        )
+        self.motion_head = nn.Linear(mot_embed_dim, config.motion_vocab_size + 3, bias=False)
+
+        # Projection layers for multi-modal fusion
+        self.motion_to_text_proj = nn.Linear(mot_embed_dim, config.n_embd)
+        self.text_to_motion_proj = nn.Linear(config.n_embd, mot_embed_dim)
+
+        # Initialize weights
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.language_model.transformer.wte
+
+    def set_input_embeddings(self, value):
+        self.language_model.transformer.wte = value
+
+    def get_output_embeddings(self):
+        return self.language_model.lm_head
+
+    def set_output_embeddings(self, new_embeddings):
+        self.language_model.lm_head = new_embeddings
+
+    def encode_motion(self, motion: torch.Tensor) -> torch.Tensor:
+        """
+        Encode motion features to discrete tokens.
+
+        Args:
+            motion: Motion features of shape (batch, time, 263)
+
+        Returns:
+            Token indices of shape (batch, time // 8)
+        """
+        return self.motion_tokenizer.encode(motion)
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs
+    ):
+        """
+        Forward pass through the language model.
+
+        For motion-to-text generation, use the `generate_text` method instead.
+        """
+        return self.language_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            labels=labels,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
+        """Prepare inputs for text generation."""
+        return self.language_model.prepare_inputs_for_generation(
+            input_ids, past_key_values=past_key_values, **kwargs
+        )
+
+    @torch.no_grad()
+    def generate_text(
+        self,
+        motion_tokens: torch.Tensor,
+        max_new_tokens: int = 128,
+        num_beams: int = 4,
+        temperature: float = 0.7,
+        top_p: float = 0.9,
+        do_sample: bool = True,
+        **kwargs
+    ) -> torch.LongTensor:
+        """
+        Generate text descriptions from motion tokens.
+
+        Args:
+            motion_tokens: Motion token indices of shape (batch, seq_len)
+            max_new_tokens: Maximum number of new tokens to generate
+            num_beams: Number of beams for beam search
+            temperature: Sampling temperature
+            top_p: Top-p sampling parameter
+            do_sample: Whether to use sampling
+
+        Returns:
+            Generated token IDs of shape (batch, new_tokens); decode them to
+            strings with a GPT-2 text tokenizer.
+        """
+        device = motion_tokens.device
+        batch_size = motion_tokens.shape[0]
+
+        # Offset motion tokens (they come after text tokens in the unified vocab)
+        motion_offset = self.config.text_vocab_size
+        input_ids = motion_tokens + motion_offset
+
+        # Add BOS token at the start
+        bos_tokens = torch.full(
+            (batch_size, 1),
+            self.config.bos_token_id,
+            dtype=torch.long,
+            device=device
+        )
+        input_ids = torch.cat([bos_tokens, input_ids], dim=1)
+
+        # Generate
+        outputs = self.language_model.generate(
+            input_ids=input_ids,
+            max_new_tokens=max_new_tokens,
+            num_beams=num_beams,
+            temperature=temperature,
+            top_p=top_p,
+            do_sample=do_sample,
+            pad_token_id=self.config.pad_token_id,
+            eos_token_id=self.config.eos_token_id,
+            **kwargs
+        )
+
+        # Keep only the generated part
+        generated_ids = outputs[:, input_ids.shape[1]:]
+
+        # Note: actual text decoding requires a tokenizer;
+        # return raw generated IDs for now
+        return generated_ids
+
+
+# Register for AutoClass
+GeoMotionGPTConfig.register_for_auto_class()
+GeoMotionGPTForCausalLM.register_for_auto_class("AutoModelForCausalLM")
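
Note that the orthogonality regularization advertised in the README (λ=0.01) only appears here as the stored `config.lambda_ortho`; this inference-only commit does not include the training loss. A common formulation of such a penalty, shown purely as an illustrative sketch rather than the repository's actual training code:

```python
import torch
import torch.nn.functional as F

def orthogonality_loss(embed_weight: torch.Tensor) -> torch.Tensor:
    """Penalize off-diagonal overlap between L2-normalized embedding rows:
    || E_n E_n^T - I ||_F^2."""
    e = F.normalize(embed_weight, dim=-1)             # (V_m, D)
    gram = e @ e.t()                                  # (V_m, V_m) cosine similarities
    identity = torch.eye(e.size(0), device=e.device)
    return ((gram - identity) ** 2).sum()

# Hypothetical use during fine-tuning:
#   loss = lm_loss + config.lambda_ortho * orthogonality_loss(model.motion_embed.weight)
```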