Files changed (1) hide show
  1. README.md +24 -1
README.md CHANGED
@@ -52,6 +52,27 @@ returns a list of (timestamps, object_id, pixel_x, pixel_y) output points.
52
 
53
  ### Video Pointing Example:
54
  ```python
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  video_path = "https://storage.googleapis.com/oe-training-public/demo_videos/many_penguins.mp4"
56
  video_messages = [
57
  {
@@ -73,12 +94,14 @@ inputs = processor.apply_chat_template(
73
  return_pointing_metadata=True
74
  )
75
 
 
 
76
  inputs = {k: v.to("cuda") for k, v in inputs.items()}
77
 
78
  with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
79
  output = model.generate(
80
  **inputs,
81
- logits_processor=model.build_logit_processor_from_inputs(inputs)
82
  max_new_tokens=200
83
  )
84
 
 
52
 
53
  ### Video Pointing Example:
54
  ```python
55
+
56
+ from transformers import AutoProcessor, AutoModelForImageTextToText
57
+ import torch
58
+ import numpy as np
59
+
60
+ checkpoint_dir = "allenai/MolmoPoint-Vid-4B" # or path to a converted HF checkpoint
61
+
62
+ model = AutoModelForImageTextToText.from_pretrained(
63
+ checkpoint_dir,
64
+ trust_remote_code=True,
65
+ dtype="auto",
66
+ device_map="cuda",
67
+ )
68
+
69
+ processor = AutoProcessor.from_pretrained(
70
+ checkpoint_dir,
71
+ trust_remote_code=True,
72
+ padding_side="left",
73
+ )
74
+
75
+
76
  video_path = "https://storage.googleapis.com/oe-training-public/demo_videos/many_penguins.mp4"
77
  video_messages = [
78
  {
 
94
  return_pointing_metadata=True
95
  )
96
 
97
+ metadata = inputs.pop("metadata")
98
+
99
  inputs = {k: v.to("cuda") for k, v in inputs.items()}
100
 
101
  with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
102
  output = model.generate(
103
  **inputs,
104
+ logits_processor=model.build_logit_processor_from_inputs(inputs),
105
  max_new_tokens=200
106
  )
107