GeoMotionGPT: Geometry-Aligned Motion Understanding with Large Language Models
Paper
•
2601.07632
•
Published
•
1
GeoMotionGPT is a motion-to-text model that converts human motion sequences into natural language descriptions.
This model integrates two components:
import torch
from transformers import AutoModelForCausalLM

# Load the pretrained checkpoint (model code lives in the repo, hence trust_remote_code).
gpt = AutoModelForCausalLM.from_pretrained(
    "zy22b/GeoMotionGPT",
    trust_remote_code=True
)

# The motion VQ tokenizer ships as a submodule of the loaded model.
tokenizer = gpt.motion_tokenizer

# Round-trip demo on random motion features shaped (batch, time, 263).
demo_motion = torch.randn(1, 100, 263)
with torch.no_grad():
    codes = tokenizer.encode(demo_motion)  # temporal downsampling -> (batch, time//8)
    print(f"Motion tokens shape: {codes.shape}")

# Decode the discrete codes back into continuous motion features.
with torch.no_grad():
    recovered = tokenizer.decode(codes)  # -> (batch, time, 263)
    print(f"Reconstructed shape: {recovered.shape}")
import numpy as np
import torch
from transformers import AutoModelForCausalLM

# Load the checkpoint and grab its motion tokenizer.
hf_model = AutoModelForCausalLM.from_pretrained(
    "zy22b/GeoMotionGPT",
    trust_remote_code=True
)
vq = hf_model.motion_tokenizer

# A HumanML3D sample: (T, 263) feature vector per frame.
raw = np.load("datasets/humanml3d/new_joint_vecs/000000.npy")

# Dataset-level normalization statistics.
mu = np.load("datasets/humanml3d/Mean.npy")
sigma = np.load("datasets/humanml3d/Std.npy")

# Standardize, then add a batch axis: (1, T, 263).
motion_tensor = torch.FloatTensor((raw - mu) / sigma).unsqueeze(0)

# Quantize the sequence into discrete codebook ids.
with torch.no_grad():
    tokens = vq.encode(motion_tensor)

print(f"Input shape: {motion_tensor.shape}")  # e.g., torch.Size([1, 116, 263])
print(f"Token shape: {tokens.shape}")  # e.g., torch.Size([1, 14])
print(f"Tokens: {tokens[0].tolist()}")  # e.g., [138, 104, 508, ...]
The following is a complete, self-contained script for motion-to-text generation.
Requirements:
pip install torch transformers numpy safetensors huggingface_hub
Complete Code:
"""
GeoMotionGPT: Complete Motion-to-Text Generation
Self-contained script - requires only: torch, transformers, numpy, safetensors
"""
import torch
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer, GPT2LMHeadModel, GPT2Config
from safetensors.torch import load_file
class GeoMotionGPTGenerator:
    """Complete Motion-to-Text Generator using HuggingFace models.

    Wraps the GeoMotionGPT motion tokenizer (a 512-entry VQ codebook) and a
    GPT-2 language model whose vocabulary is extended with motion tokens,
    exposing `generate_text` for motion -> caption inference.
    """

    def __init__(self, device="cuda" if torch.cuda.is_available() else "cpu"):
        """Download both sub-models and assemble the generation pipeline.

        Args:
            device: Torch device string; defaults to CUDA when available.
        """
        self.device = torch.device(device)
        self.text_vocab_size = 50257      # size of the plain GPT-2 BPE vocabulary
        self.motion_codebook_size = 512   # number of VQ codebook entries

        # The motion tokenizer comes bundled inside the HF checkpoint.
        print("Loading motion tokenizer from HuggingFace...")
        hf_model = AutoModelForCausalLM.from_pretrained(
            "zy22b/GeoMotionGPT",
            trust_remote_code=True
        )
        self.motion_tokenizer = hf_model.motion_tokenizer.to(self.device)
        self.motion_tokenizer.eval()

        # Text tokenizer: GPT-2 BPE; pad with EOS since GPT-2 has no pad token.
        print("Loading GPT2 tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
        self.tokenizer.pad_token = self.tokenizer.eos_token

        # Extend the vocabulary: one token per codebook entry, plus control tokens.
        motion_tokens = [f'<motion_id_{i}>' for i in range(self.motion_codebook_size)]
        special_tokens = ['<start_of_motion>', '<end_of_motion>', '<masked_motion>', '<pad_motion>']
        self.tokenizer.add_tokens(motion_tokens)
        self.tokenizer.add_special_tokens({'additional_special_tokens': special_tokens})

        # Language model: GPT-2 base geometry with the extended vocabulary.
        # NOTE(review): 50257 text + 512 motion + the 4 special tokens added
        # above would be 50773, yet the checkpoint vocab is 50772 — confirm
        # against the trained embedding table which token is excluded.
        print("Building language model...")
        config = GPT2Config(
            vocab_size=50772,  # 50257 text + 512 motion + special tokens
            n_positions=1024,
            n_embd=768,
            n_layer=12,
            n_head=12,
        )
        self.language_model = GPT2LMHeadModel(config).to(self.device)

        # Pull the trained LM weights into the freshly built GPT-2.
        print("Loading language model weights...")
        self._load_weights()
        self.language_model.eval()
        print("Model ready!")

    def _load_weights(self):
        """Load language model weights from HuggingFace into `self.language_model`."""
        from huggingface_hub import hf_hub_download

        # Download (or reuse the local cache of) the checkpoint weights.
        weights_path = hf_hub_download(
            repo_id="zy22b/GeoMotionGPT",
            filename="model.safetensors"
        )
        state_dict = load_file(weights_path)

        # The checkpoint stores LM weights under a 'language_model.' prefix;
        # strip it so keys line up with a bare GPT2LMHeadModel.
        new_state_dict = {}
        for k, v in state_dict.items():
            if k.startswith("language_model."):
                new_key = k[len("language_model."):]
                new_state_dict[new_key] = v
        # strict=False tolerates keys absent from the checkpoint (e.g. tied heads).
        self.language_model.load_state_dict(new_state_dict, strict=False)

    def motion_tokens_to_string(self, tokens: torch.Tensor) -> str:
        """Convert motion token IDs to the textual prompt format.

        Args:
            tokens: Integer tensor of shape (batch, n_tokens); only the first
                batch element is used.

        Returns:
            A string like '<motion_id_512><motion_id_138>...<motion_id_513>'.

        NOTE(review): the ids 512/513 are spelled '<motion_id_512>'/'<motion_id_513>'
        although only '<motion_id_0>'..'<motion_id_511>' were registered as
        tokenizer tokens; the registered control tokens are '<start_of_motion>'
        and '<end_of_motion>'. Confirm against the training-time prompt format
        before changing these strings.
        """
        token_list = tokens[0].cpu().tolist()
        mot_start = f'<motion_id_{self.motion_codebook_size}>'  # <start_of_motion>
        mot_end = f'<motion_id_{self.motion_codebook_size + 1}>'  # <end_of_motion>
        motion_str = ''.join([f'<motion_id_{int(t)}>' for t in token_list])
        return mot_start + motion_str + mot_end

    def generate_text(self, motion: np.ndarray, mean: np.ndarray, std: np.ndarray,
                      max_new_tokens: int = 40) -> tuple:
        """
        Generate a text description from a motion sequence.

        Args:
            motion: Raw motion array of shape (T, 263)
            mean: Mean for normalization (263,)
            std: Std for normalization (263,)
            max_new_tokens: Maximum tokens to generate

        Returns:
            Tuple of (generated text description, list of motion token ids).
        """
        # Normalize with dataset statistics, then add a batch axis.
        motion_norm = (motion - mean) / std
        motion_tensor = torch.FloatTensor(motion_norm).unsqueeze(0).to(self.device)

        # Quantize the motion into discrete codebook ids.
        with torch.no_grad():
            motion_tokens = self.motion_tokenizer.encode(motion_tensor)

        # Wrap the ids into the textual prompt format expected by the LM.
        motion_string = self.motion_tokens_to_string(motion_tokens)
        prompt = f"Generate text: {motion_string} \n "

        input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)

        # Greedy decoding via the HF generate loop.
        with torch.no_grad():
            output_ids = self.language_model.generate(
                input_ids,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                eos_token_id=self.tokenizer.eos_token_id,
                pad_token_id=self.tokenizer.pad_token_id,
            )

        # Keep only the newly generated ids, then drop any motion/special ids
        # so the decoded string contains plain text only.
        generated_ids = output_ids[0, input_ids.shape[1]:]
        text_ids = [tid.item() for tid in generated_ids if tid.item() < self.text_vocab_size]
        generated_text = self.tokenizer.decode(text_ids, skip_special_tokens=True)
        return generated_text.strip(), motion_tokens[0].tolist()
if __name__ == "__main__":
    import argparse

    # Command-line interface for one-shot motion-to-text inference.
    cli = argparse.ArgumentParser(description="GeoMotionGPT Motion-to-Text Generation")
    cli.add_argument(
        "--motion_file", type=str, required=True,
        help="Path to HumanML3D motion .npy file",
    )
    cli.add_argument("--mean_file", type=str, default="Mean.npy", help="Path to Mean.npy")
    cli.add_argument("--std_file", type=str, default="Std.npy", help="Path to Std.npy")
    cli.add_argument("--device", type=str, default="cuda", help="Device to use (cuda/cpu)")
    opts = cli.parse_args()

    # Build the full pipeline (downloads weights on first use).
    m2t = GeoMotionGPTGenerator(device=opts.device)

    # Load the motion sample and its normalization statistics.
    seq = np.load(opts.motion_file)
    mu = np.load(opts.mean_file)
    sigma = np.load(opts.std_file)
    print(f"\nInput motion shape: {seq.shape}")

    # Run motion-to-text and report the intermediate tokens too.
    caption, codes = m2t.generate_text(seq, mu, sigma)
    print(f"Motion tokens ({len(codes)}): {codes}")
    print(f"\nGenerated text: {caption}")
Example Usage:
python geomotiongpt_inference.py \
--motion_file datasets/humanml3d/new_joint_vecs/000000.npy \
--mean_file datasets/humanml3d/Mean.npy \
--std_file datasets/humanml3d/Std.npy
Expected Output:
Loading motion tokenizer from HuggingFace...
Loading GPT2 tokenizer...
Building language model...
Loading language model weights...
Model ready!
Input motion shape: (116, 263)
Motion tokens (14): [138, 104, 508, 21, 498, 229, 144, 484, 393, 393, 144, 144, 144, 414]
Generated text: a person kicks something with their left foot.
GeoMotionGPTForCausalLM
βββ motion_tokenizer (MotionTokenizer)
β βββ quantizer (MotionQuantizer)
β β βββ 1D CNN with ResNet blocks
β βββ decoder (MotionDecoder)
β β βββ 1D Transposed CNN with ResNet blocks
β βββ codebook
β βββ 512-entry codebook
βββ language_model (GPT2LMHeadModel)
βββ 12-layer transformer
If you use this model, please cite:
@misc{ye2026geomotiongpt,
title={GeoMotionGPT: Geometry-Aligned Motion Understanding with Large Language Models},
author={Zhankai Ye and Bofan Li and Yukai Jin and Shuoqiu Li and Wei Wang and Yanfu Zhang and Shangqian Gao and Xin Liu},
year={2026},
eprint={2601.07632},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2601.07632},
}
MIT License
Base model
openai-community/gpt2