|
|
--- |
|
|
license: apache-2.0 |
|
|
base_model: |
|
|
- nvidia/Alpamayo-R1-10B |
|
|
--- |
|
|
nvidia/Alpamayo-R1-10B 4bit Model. |
|
|
|
|
|
이 모델은 자율주행 중 수집된 데이터로 이벤트를 예측하는 용도로 활용할 수 있습니다. |
|
|
자율주행을 하는 게 아니라 자율주행 중 특정 상황이 발생한 것을 알려주는 기능을 합니다. |
|
|
|
|
|
|
|
|
```Runinfo |
|
|
Download the model to ./Alpamayo-R1-10B-4bit |
|
|
|
|
|
Runs on GPUs with 12 GB or 16 GB of memory |
|
|
|
|
|
With 12 GB of memory, num_frames must be between 1 and 8; larger values cause OOM |
|
|
|
|
|
Requires Transformers 4.57.5 (5.0.0rc does not work) |
|
|
|
|
|
nvidia/Alpamayo-R1-10B 이 대용량 메모리를 요구하고 4bit 로 로딩하여 저장한 모델입니다. |
|
|
12G 에서도 실행 가능해졌습니다만 주어지는 프레임 수는 1~8 정도, 그 이상이면 OOM이 떨어집니다. |
|
|
트랜스포머 버전 5.0.0rc에서는 동작하지 않습니다. |
|
|
|
|
|
git clone https://github.com/NVlabs/alpamayo 하고 |
|
|
cd alpamayo |
|
|
pip install . 로 설치해야 합니다만 |
|
|
|
|
|
pyproject.toml을 수정하는 게 좋습니다. |
|
|
python 3.13을 사용하면 requires-python = "==3.13.*" |
|
|
transformers 와 torch 라인을 제거하고 설치하면 설치된 버전이 교체되지 않습니다. |
|
|
``` |
|
|
----------------------------------- |
|
|
```python |
|
|
import torch |
|
|
import numpy as np |
|
|
from alpamayo_r1.models.alpamayo_r1 import AlpamayoR1 |
|
|
from alpamayo_r1.load_physical_aiavdataset import load_physical_aiavdataset |
|
|
from alpamayo_r1 import helper |
|
|
|
|
|
# --- Model and processor ----------------------------------------------------
# Load the checkpoint from the local download directory, then move it to the GPU.
model_path = "Alpamayo-R1-10B-4bit"
model = AlpamayoR1.from_pretrained(model_path, dtype=torch.bfloat16)
model = model.to("cuda")
processor = helper.get_processor(model.tokenizer)

# --- Input clip -------------------------------------------------------------
# Fetching the clip requires Hugging Face credentials: set an access token or
# run `huggingface-cli login` beforehand.
clip_id = "030c760c-ae38-49aa-9ad8-f5650a545d26"
print(f"Loading dataset for clip_id: {clip_id}...")
data = load_physical_aiavdataset(clip_id, t0_us=15_100_000, num_frames=1)
print("Dataset loaded.")

# --- Prompt construction ----------------------------------------------------
# Merge the first two axes of the frame tensor so the helper receives a single
# flat batch of images.
flat_frames = data["image_frames"].flatten(0, 1)
messages = helper.create_message(flat_frames)

chat_template_options = {
    "tokenize": True,
    "add_generation_prompt": False,
    "continue_final_message": True,
    "return_dict": True,
    "return_tensors": "pt",
}
tokenized_prompt = processor.apply_chat_template(messages, **chat_template_options)

# Bundle the prompt with the ego-motion history and move everything to the GPU.
rollout_inputs = helper.to_device(
    {
        "tokenized_data": tokenized_prompt,
        "ego_history_xyz": data["ego_history_xyz"],
        "ego_history_rot": data["ego_history_rot"],
    },
    "cuda",
)

# --- Sampling ---------------------------------------------------------------
torch.cuda.manual_seed_all(42)  # fixed seed for the CUDA generators
sampling_options = {
    "top_p": 0.98,
    "temperature": 0.6,
    # Feel free to raise this for more output trajectories and CoC traces.
    "num_traj_samples": 1,
    "max_generation_length": 256,
    "return_extra": True,
}
with torch.autocast("cuda", dtype=torch.bfloat16):
    pred_xyz, pred_rot, extra = model.sample_trajectories_from_data_with_vlm_rollout(
        data=rollout_inputs,
        **sampling_options,
    )

# --- Report -----------------------------------------------------------------
print("Chain-of-Causation (per trajectory):\n", extra["cot"][0])

# minADE: per sampled trajectory, average the xy distance to the recorded
# future path over all steps, then keep the smallest average.
future_xyz = data["ego_future_xyz"].cpu()
gt_xy = future_xyz[0, 0, :, :2].T.numpy()  # xy rows first, steps second
sampled_xyz = pred_xyz.cpu().numpy()
pred_xy = sampled_xyz[0, 0, :, :, :2].transpose(0, 2, 1)  # same xy/steps order per sample
per_sample_ade = np.linalg.norm(pred_xy - gt_xy[None, ...], axis=1).mean(-1)
min_ade = per_sample_ade.min()
print("minADE:", min_ade, "meters")

nondeterminism_note = (
    "Note: VLA-reasoning models produce nondeterministic outputs due to trajectory sampling, "
    "hardware differences, etc. With num_traj_samples=1 (set for GPU memory compatibility), "
    "variance in minADE is expected. For visual sanity checks, see notebooks/inference.ipynb"
)
print(nondeterminism_note)
|
|
``` |
|
|
-------------------- |
|
|
```Result: |
|
|
|
|
|
|
|
|
Chain-of-Causation (per trajectory): |
|
|
[['Nudge to the left to pass the stopped truck encroaching into the lane.']] |
|
|
minADE: 1.7749525 meters |
|
|
Note: VLA-reasoning models produce nondeterministic outputs due to trajectory sampling, hardware differences, etc. With num_traj_samples=1 (set for GPU memory compatibility), variance in minADE is expected. For visual sanity checks, see notebooks/inference.ipynb |
|
|
``` |
|
|
|
|
|
|
|
|
나는 1장의 이미지로 판독하는 것을 테스트 하려고 아래와 같은 예제를 만들었다. |
|
|
데이터 로딩 없이 기본 초기화를 하여 영상의 영점에서 시작하는 것임을 예상한다. |
|
|
구동하기 위해서 최소 12G 이상인 GPU카드를 사용해야 하고, 응답속도도 꽤 지연이 걸려 |
|
|
실제 자동차에 적용하기엔 무리인 것 같다. |
|
|
|
|
|
```python |
|
|
#ZeroTime init Base Image(1 photo on load image) |
|
|
import torch |
|
|
import numpy as np |
|
|
from PIL import Image |
|
|
from alpamayo_r1.models.alpamayo_r1 import AlpamayoR1 |
|
|
from alpamayo_r1.load_physical_aiavdataset import load_physical_aiavdataset |
|
|
from alpamayo_r1 import helper |
|
|
|
|
|
# Single-image smoke test: instead of downloading a dataset clip, fabricate a
# neutral ego-motion record and run the model on one photo loaded from disk.

num_history_steps = 16  # number of past steps
num_future_steps = 64  # number of future steps

# Dummy position data (xyz coordinates): all zeros for every step.
ego_history_xyz = torch.zeros((1, 1, num_history_steps, 3))  # (batch, agent, steps, xyz)
ego_future_xyz = torch.zeros((1, 1, num_future_steps, 3))

# Dummy rotation data (3x3 rotation matrices): identity for every step.
ego_history_rot = torch.eye(3).repeat(1, 1, num_history_steps, 1, 1)  # (1, 1, steps, 3, 3)
ego_future_rot = torch.eye(3).repeat(1, 1, num_future_steps, 1, 1)

print("ego_history_xyz:", ego_history_xyz.shape)
print("ego_future_xyz:", ego_future_xyz.shape)
print("ego_history_rot:", ego_history_rot.shape)
print("ego_future_rot:", ego_future_rot.shape)

N_cameras = 1
camera_indices = torch.arange(N_cameras, dtype=torch.long)  # (N_cameras,), explicitly long

data = {
    "camera_indices": camera_indices,  # (N_cameras,)
    "ego_history_xyz": ego_history_xyz,  # (1, 1, num_history_steps, 3)
    "ego_history_rot": ego_history_rot,  # (1, 1, num_history_steps, 3, 3)
    "ego_future_xyz": ego_future_xyz,  # (1, 1, num_future_steps, 3)
    "ego_future_rot": ego_future_rot,  # (1, 1, num_future_steps, 3, 3)
    # Timestamp entries are not supplied in this dummy record:
    #   "relative_timestamps"  (N_cameras, num_frames)
    #   "absolute_timestamps"  (N_cameras, num_frames)
}

# Path of the JPG file to run the prediction on.
img_path = "IMG_20260116_065921.jpg"
# Open inside a context manager so the file handle is released right after the
# pixels have been copied into the converted RGB image.
with Image.open(img_path) as image_file:
    image = image_file.convert("RGB")

# helper.create_message takes a tensor of frames, so convert the PIL image.
# Keep the raw uint8 pixel values and use a channel-first (1, 3, H, W) layout.
# NOTE(review): upstream create_message documents (N, C, H, W) input and the
# Hugging Face image processor applies its own 1/255 rescale, so feeding
# floats already scaled to [0, 1] would darken the image -- confirm against
# the installed alpamayo_r1 / transformers versions.
image_array = np.array(image)  # (H, W, 3), uint8
image_tensor = torch.from_numpy(image_array).permute(2, 0, 1).unsqueeze(0).contiguous()

# Build the chat messages.
messages = helper.create_message(image_tensor)

# Load the checkpoint from the local download directory and move it to the GPU.
model_path = "Alpamayo-R1-10B-4bit"
model = AlpamayoR1.from_pretrained(model_path, dtype=torch.bfloat16).to("cuda")
processor = helper.get_processor(model.tokenizer)

# Tokenization settings.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=False,
    continue_final_message=True,
    return_dict=True,
    return_tensors="pt",
)

model_inputs = {
    "tokenized_data": inputs,
    "ego_history_xyz": data["ego_history_xyz"],
    "ego_history_rot": data["ego_history_rot"],
}
model_inputs = helper.to_device(model_inputs, "cuda")

torch.cuda.manual_seed_all(42)  # fixed seed for the CUDA generators
with torch.autocast("cuda", dtype=torch.bfloat16):
    pred_xyz, pred_rot, extra = model.sample_trajectories_from_data_with_vlm_rollout(
        data=model_inputs,
        top_p=0.98,
        temperature=0.6,
        num_traj_samples=1,  # Feel free to raise this for more output trajectories and CoC traces.
        max_generation_length=256,
        return_extra=True,
    )

# the size is [batch_size, num_traj_sets, num_traj_samples]
print("Chain-of-Causation (per trajectory):\n", extra["cot"][0])

# ego_future_xyz is the all-zero dummy defined above, so the value printed as
# "minADE" is only the mean xy distance of the predicted path from the origin,
# not an error against a real recorded trajectory.
gt_xy = data["ego_future_xyz"].cpu()[0, 0, :, :2].T.numpy()
pred_xy = pred_xyz.cpu().numpy()[0, 0, :, :, :2].transpose(0, 2, 1)
diff = np.linalg.norm(pred_xy - gt_xy[None, ...], axis=1).mean(-1)
min_ade = diff.min()
print("minADE:", min_ade, "meters")
print(
    "Note: VLA-reasoning models produce nondeterministic outputs due to trajectory sampling, "
    "hardware differences, etc. With num_traj_samples=1 (set for GPU memory compatibility), "
    "variance in minADE is expected. For visual sanity checks, see notebooks/inference.ipynb"
)
|
|
``` |
|
|
|
|
|
```output |
|
|
|
|
|
Chain-of-Causation (per trajectory): |
|
|
[['Keep lane to continue driving since the lane ahead is clear.']] |
|
|
minADE: 0.55852604 meters |
|
|
Note: VLA-reasoning models produce nondeterministic outputs due to trajectory sampling, hardware differences, etc. With num_traj_samples=1 (set for GPU memory compatibility), variance in minADE is expected. For visual sanity checks, see notebooks/inference.ipynb |
|
|
|
|
|
``` |