Robotics · Transformers · Safetensors · qwen2_5_vl · image-to-text · text-generation-inference
jan-hq committed (verified) · Commit 57ff4e4 · 1 Parent(s): 6cb08fc

Update README.md

Files changed (1): README.md (+61 −1)
* Model type: Qwen 2.5 3B Instruct, fine-tuned for hand pose estimation
* License: Apache-2.0 license

## How to Get Started

```python
import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from qwen_vl_utils import process_vision_info

# 1. Load model and processor
device = "cuda" if torch.cuda.is_available() else "cpu"
model_path = "path/to/qwen2.5_vl/checkpoint-1500/"

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
).eval().to(device)

processor = AutoProcessor.from_pretrained(
    model_path,
    min_pixels=256 * 28 * 28,
    max_pixels=1280 * 28 * 28,
    trust_remote_code=True,
)

# 2. Prepare your image
image = Image.open("your_hand_image.png").convert("RGB")

# 3. Create messages
messages = [
    {"role": "system", "content": "You are a specialized Vision Language Model designed to accurately estimate joint angles from hand pose images..."},
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": image,
                "min_pixels": 1003520,
                "max_pixels": 1003520,
            },
            {"type": "text", "text": "<Pose>"},
        ],
    },
]

# 4. Process and get predictions
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt").to(device)

# 5. Generate output
generated_ids = model.generate(**inputs, max_new_tokens=1024)
generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

print(output_text)  # The joint angles in XML format
```

The output will be joint angles in radians in XML format:
```xml
<lh_WRJ2>angle</lh_WRJ2><lh_WRJ1>angle</lh_WRJ1><lh_FFJ4>angle</lh_FFJ4>...
```
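The model returns the angles as a raw tag-per-joint string. If you want them as numbers for downstream use, a minimal parsing sketch could look like the following (the `parse_joint_angles` helper and the sample values are illustrative, not part of the model's API):

```python
import re

def parse_joint_angles(output_text: str) -> dict:
    """Parse '<name>value</name>' pairs into {joint_name: angle_in_radians}."""
    # Group 1 captures the tag name, group 2 the numeric value;
    # the \1 backreference requires a matching closing tag.
    pattern = re.compile(r"<(\w+)>\s*(-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)\s*</\1>")
    return {name: float(value) for name, value in pattern.findall(output_text)}

# Illustrative sample output (the angle values here are made up)
sample = "<lh_WRJ2>0.12</lh_WRJ2><lh_WRJ1>-0.05</lh_WRJ1><lh_FFJ4>0.30</lh_FFJ4>"
angles = parse_joint_angles(sample)
print(angles)  # {'lh_WRJ2': 0.12, 'lh_WRJ1': -0.05, 'lh_FFJ4': 0.3}
```

Malformed or unmatched tags are simply skipped by the regex, so a truncated generation degrades to a partial dict rather than an exception.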

## Citation
BibTeX: []