ayh015 committed on
Commit
3a1265d
·
1 Parent(s): e9ad225
data/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
# Package entry point: re-export HICODet from the sibling ``hicodet`` module
# so callers can write ``from data import HICODet``.
from .hicodet import HICODet

# Explicit public API of the package; only HICODet is exported.
__all__ = ["HICODet"]
data/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (235 Bytes). View file
 
data/__pycache__/constants.cpython-311.pyc ADDED
Binary file (38.2 kB). View file
 
data/__pycache__/convsersation.cpython-311.pyc ADDED
Binary file (6.49 kB). View file
 
data/__pycache__/hicodet.cpython-311.pyc ADDED
Binary file (17.5 kB). View file
 
data/__pycache__/pose_hicodet.cpython-311.pyc ADDED
Binary file (18.4 kB). View file
 
data/constants.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Human-readable names of the 17 person keypoints, as lowercase words
# separated by spaces.
# NOTE(review): the order looks like the conventional COCO keypoint order, so
# consumers presumably index this list by keypoint id -- confirm before
# reordering or inserting entries.
COCO_KEYPOINT_NAME = [
    'nose',
    'left eye', 'right eye',
    'left ear', 'right ear',
    'left shoulder', 'right shoulder',
    'left elbow', 'right elbow',
    'left wrist', 'right wrist',
    'left hip', 'right hip',
    'left knee', 'right knee',
    'left ankle', 'right ankle',
]
4
+
# Token spelling of each keypoint name: the same 17 entries, in the same
# order as COCO_KEYPOINT_NAME, with spaces replaced by underscores and the
# result wrapped in angle brackets.
# NOTE(review): presumably these are special tokens registered with a
# tokenizer elsewhere -- confirm against the consumer before renaming.
COCO_KEYPOINT_NAME_TOKEN = [
    '<nose>',
    '<left_eye>', '<right_eye>',
    '<left_ear>', '<right_ear>',
    '<left_shoulder>', '<right_shoulder>',
    '<left_elbow>', '<right_elbow>',
    '<left_wrist>', '<right_wrist>',
    '<left_hip>', '<right_hip>',
    '<left_knee>', '<right_knee>',
    '<left_ankle>', '<right_ankle>',
]
8
+
# Ten coarse body-part names in a fixed order.
# NOTE(review): what the ordering encodes is not visible in this module --
# presumably it matches part-level annotation indices used elsewhere; confirm
# before reordering.
PART_ORDER = [
    "right foot",
    "right leg",
    "left leg",
    "left foot",
    "hip",
    "head",
    "right hand",
    "right arm",
    "left arm",
    "left hand",
]
10
+
# Maps a keypoint name to a one-sentence natural-language description of
# where that keypoint sits on the body.
#
# The first 17 keys are spelled like the entries of COCO_KEYPOINT_NAME
# (lowercase, space-separated). The remaining 7 keys ('neck', 'torso',
# 'pelvis', 'left toe', 'right toe', 'head_top', 'thorax') are extra
# keypoints; note that 'head_top' uses an underscore rather than a space.
#
# The sentences are runtime text (presumably inserted into prompts -- confirm
# against the consumer), so their exact wording is kept as-is, including the
# small phrasing differences between some left/right pairs.
KeypointLocationDescription = {
    'nose': 'The nose is the central, protruding feature on their face, located just above the upper lip.',
    'left eye': 'The left eye is the visual organ on the left side of their face, typically located above the left cheek and beside the nose.',
    'right eye': 'The right eye is the visual organ on the right side of their face, typically located above the right cheek and beside the nose.',
    'left ear': 'The left ear is the auditory organ on the left side of their head, typically located to the side of the left temple.',
    'right ear': 'The right ear is the auditory organ on the right side of their head, typically located to the side of the right temple.',
    'left shoulder': 'The left shoulder is the joint connecting the left arm and the torso, typically situated on the upper left side of the chest.',
    'right shoulder': 'The right shoulder is the joint connecting the right arm and the torso, typically situated on the upper right side of the chest.',
    'left elbow': 'The left elbow is the joint connecting the left upper arm and the left forearm, typically situated in the middle of the left arm, between left shoulder and left wrist.',
    'right elbow': 'The right elbow is the joint connecting the right upper arm and the right forearm, typically situated in the middle of the right arm, between right shoulder and right wrist.',
    'left wrist': 'The left wrist is the joint connecting the left forearm and the left hand, typically located at the base of the left hand.',
    'right wrist': 'The right wrist is the joint connecting the right forearm and the right hand, typically located at the base of the right hand.',
    'left hip': 'The left hip is the joint connecting the left thigh to the pelvis, typically located on the left side of the lower torso.',
    'right hip': 'The right hip is the joint connecting the right thigh to the pelvis, typically located on the right side of the lower torso.',
    'left knee': 'The left knee is the joint connecting the left thigh and the left lower leg, typically situated in the middle of the left leg, it is located between the left hip and left ankle.',
    'right knee': 'The right knee is the joint connecting the upper leg and lower leg on the right side, it is located between the right hip and right ankle.',
    'left ankle': 'The left ankle is the joint connecting the left lower leg and the left foot, typically located at the base of the left leg.',
    'right ankle': 'The right ankle is the joint connecting the right lower leg and the right foot, typically located at the base of the right leg.',
    'neck': "The neck is the part of the body connecting the head to the torso, typically situated between the shoulders.",
    'torso': "The torso is the central part of the body, typically encompassing the chest, abdomen, and back.",
    'pelvis': "The pelvis is the bony structure that forms the base of the spine and connects the torso to the lower body, typically located between the left hip and right hip.",
    'left toe': "The left toe is located at the end of the left foot, typically at the front or tip of the foot.",
    'right toe': "The right toe is the digit located at the end of the right foot, typically at the front or tip of the foot.",
    'head_top': "The head_top keypoint is the highest point on their head, typically at the crown.",
    'thorax': "The thorax is the central part of the torso, typically located between the neck and the abdomen.",
}
37
+
38
+ KeypointLocationQuestion = {
39
+ 'nose': ["Please provide the coordinates for the nose of the person, which is the central, protruding feature on their face, located just above the upper lip.", "I'd like to know the precise coordinates of the nose, positioned at the center of the person's face, slightly below the eyes and between them.", "Where can I find the coordinates for the nose of this individual? It's the prominent feature in the middle of their facial structure.", "Could you please output the coordinates for the nose, which is typically situated in the middle of the face, between the eyes and above the upper lip?", "I'm interested in the coordinates of the nose. It's the facial feature at the center of the face, responsible for the sense of smell.", "Where are the coordinates for the nose? It's the prominent part of the face located between the eyes and just above the upper lip.", "Please provide the coordinates for the nose, found at the center of the person's face, between the eyes and above the upper lip.", "I'd like to see the coordinates of the nose, which is positioned in the middle of the face, slightly below the eyes and between them.", "Where can I locate the coordinates for the nose of this person? It's the central feature on their face, responsible for smelling.", "Could you output the coordinates for the nose, the central facial feature located between the eyes and just above the upper lip?"],
40
+ 'left eye': ["Please provide the coordinates for the left eye of the person, which is the visual organ on the left side of their face, typically located above the left cheek and beside the nose.", "I'd like to know the precise coordinates of the left eye, situated on the left side of the person's face, to the side of the nose and above the left cheek.", "Where can I find the coordinates for the left eye of this individual? It's the organ responsible for vision on the left side of their face.", "Could you please output the coordinates for the left eye, which is typically found on the left side of the face, beside the nose and above the left cheek?", "I'm interested in the coordinates of the left eye. It's the visual organ on the left side of the face, adjacent to the nose and above the left cheek.", "Where are the coordinates for the left eye? It's the eye situated on the left side of the face, above the left cheek, and beside the nose.", "Please provide the coordinates for the left eye, located on the left side of the person's face, beside the nose and above the left cheek.", "I'd like to see the coordinates of the left eye, which is positioned on the left side of the face, above the left cheek, and adjacent to the nose.", "Where can I locate the coordinates for the left eye of this person? It's the visual organ responsible for left-sided vision.", "Could you output the coordinates for the left eye, the eye on the left side of the face, above the left cheek, and beside the nose?"],
41
+ 'right eye': ["Please provide the coordinates for the right eye of the person, which is the visual organ on the right side of their face, typically located above the right cheek and beside the nose.", "I'd like to know the precise coordinates of the right eye, situated on the right side of the person's face, to the side of the nose and above the right cheek.", "Where can I find the coordinates for the right eye of this individual? It's the organ responsible for vision on the right side of their face.", "Could you please output the coordinates for the right eye, which is typically found on the right side of the face, beside the nose and above the right cheek?", "I'm interested in the coordinates of the right eye. It's the visual organ on the right side of the face, adjacent to the nose and above the right cheek.", "Where are the coordinates for the right eye? It's the eye situated on the right side of the face, above the right cheek, and beside the nose.", "Please provide the coordinates for the right eye, located on the right side of the person's face, beside the nose and above the right cheek.", "I'd like to see the coordinates of the right eye, which is positioned on the right side of the face, above the right cheek, and adjacent to the nose.", "Where can I locate the coordinates for the right eye of this person? It's the visual organ responsible for right-sided vision.", "Could you output the coordinates for the right eye, the eye on the right side of the face, above the right cheek, and beside the nose?"],
42
+ 'left ear': ["Please provide the coordinates for the left ear of the person, which is the auditory organ on the left side of their head, typically located to the side of the left temple.", "I'd like to know the precise coordinates of the left ear, situated on the left side of the person's head, to the side of the left temple and above the left jawline.", "Where can I find the coordinates for the left ear of this individual? It's the auditory organ on the left side of their head.", "Could you please output the coordinates for the left ear, which is typically found on the left side of the head, to the side of the left temple and above the left jawline?", "I'm interested in the coordinates of the left ear. It's the auditory organ on the left side of the head, adjacent to the left temple and above the left jawline.", "Where are the coordinates for the left ear? It's the ear situated on the left side of the head, beside the left temple and above the left jawline.", "Please provide the coordinates for the left ear, located on the left side of the person's head, beside the left temple and above the left jawline.", "I'd like to see the coordinates of the left ear, which is positioned on the left side of the head, above the left jawline, and adjacent to the left temple.", "Where can I locate the coordinates for the left ear of this person? It's the auditory organ on the left side of the head.", "Could you output the coordinates for the left ear, the ear on the left side of the head, above the left jawline, and beside the left temple?"],
43
+ 'right ear': ["Please provide the coordinates for the right ear of the person, which is the auditory organ on the right side of their head, typically located to the side of the right temple.", "I'd like to know the precise coordinates of the right ear, situated on the right side of the person's head, to the side of the right temple and above the right jawline.", "Where can I find the coordinates for the right ear of this individual? It's the auditory organ on the right side of their head.", "Could you please output the coordinates for the right ear, which is typically found on the right side of the head, to the side of the right temple and above the right jawline?", "I'm interested in the coordinates of the right ear. It's the auditory organ on the right side of the head, adjacent to the right temple and above the right jawline.", "Where are the coordinates for the right ear? It's the ear situated on the right side of the head, beside the right temple and above the right jawline.", "Please provide the coordinates for the right ear, located on the right side of the person's head, beside the right temple and above the right jawline.", "I'd like to see the coordinates of the right ear, which is positioned on the right side of the head, above the right jawline, and adjacent to the right temple.", "Where can I locate the coordinates for the right ear of this person? It's the auditory organ on the right side of the head.", "Could you output the coordinates for the right ear, the ear on the right side of the head, above the right jawline, and beside the right temple?"],
44
+ 'left shoulder': ["Please provide the coordinates for the left shoulder of the person, which is the joint connecting the left arm and the torso, typically situated on the upper left side of the chest.", "I'd like to know the precise coordinates of the left shoulder, located on the upper part of the person's chest, on the left side.", "Where can I find the coordinates for the left shoulder of this individual? It's the joint connecting the left arm to the upper body.", "Could you please output the coordinates for the left shoulder, typically found on the upper left side of the chest, where the arm meets the torso?", "I'm interested in the coordinates of the left shoulder. It's the joint on the upper left part of the chest that allows arm movement.", "Where are the coordinates for the left shoulder? It's the shoulder joint situated on the upper left side of the chest.", "Please provide the coordinates for the left shoulder, located on the upper left side of the chest, where the arm attaches to the torso.", "I'd like to see the coordinates of the left shoulder, which is positioned on the upper part of the chest, on the left side.", "Where can I locate the coordinates for the left shoulder of this person? It's the joint that connects the left arm to the upper body.", "Could you output the coordinates for the left shoulder, the joint on the upper left part of the chest where the arm articulates with the torso?"],
45
+ 'right shoulder': ["Please provide the coordinates for the right shoulder of the person, which is the joint connecting the right arm and the torso, typically situated on the upper right side of the chest.", "I'd like to know the precise coordinates of the right shoulder, located on the upper part of the person's chest, on the right side.", "Where can I find the coordinates for the right shoulder of this individual? It's the joint connecting the right arm to the upper body.", "Could you please output the coordinates for the right shoulder, typically found on the upper right side of the chest, where the arm meets the torso?", "I'm interested in the coordinates of the right shoulder. It's the joint on the upper right part of the chest that allows arm movement.", "Where are the coordinates for the right shoulder? It's the shoulder joint situated on the upper right side of the chest.", "Please provide the coordinates for the right shoulder, located on the upper right side of the chest, where the arm attaches to the torso.", "I'd like to see the coordinates of the right shoulder, which is positioned on the upper part of the chest, on the right side.", "Where can I locate the coordinates for the right shoulder of this person? It's the joint that connects the right arm to the upper body.", "Could you output the coordinates for the right shoulder, the joint on the upper right part of the chest where the arm articulates with the torso?"],
46
+ 'left elbow': ["Please provide the coordinates for the left elbow of the person, which is the joint connecting the left upper arm (humerus) and the left forearm (radius and ulna), typically situated in the middle of the left arm.", "I'd like to know the precise coordinates of the left elbow, located in the middle of the left arm, where the upper arm meets the forearm.", "Where can I find the coordinates for the left elbow of this individual? It's the joint responsible for bending and straightening the left arm.", "Could you please output the coordinates for the left elbow, typically found in the center of the left arm, where the upper arm and forearm meet?", "I'm interested in the coordinates of the left elbow. It's the joint on the left arm that facilitates arm movement.", "Where are the coordinates for the left elbow? It's the elbow joint situated in the middle of the left arm.", "Please provide the coordinates for the left elbow, located in the middle of the left arm, where the upper arm connects to the forearm.", "I'd like to see the coordinates of the left elbow, which is positioned in the center of the left arm.", "Where can I locate the coordinates for the left elbow of this person? It's the joint that allows flexion and extension of the left arm.", "Could you output the coordinates for the left elbow, the joint in the middle of the left arm responsible for arm movement?"],
47
+ 'right elbow': ["Please provide the coordinates for the right elbow of the person, which is the joint connecting the right upper arm (humerus) and the right forearm (radius and ulna), typically situated in the middle of the right arm.", "I'd like to know the precise coordinates of the right elbow, located in the middle of the right arm, where the upper arm meets the forearm.", "Where can I find the coordinates for the right elbow of this individual? It's the joint responsible for bending and straightening the right arm.", "Could you please output the coordinates for the right elbow, typically found in the center of the right arm, where the upper arm and forearm meet?", "I'm interested in the coordinates of the right elbow. It's the joint on the right arm that facilitates arm movement.", "Where are the coordinates for the right elbow? It's the elbow joint situated in the middle of the right arm.", "Please provide the coordinates for the right elbow, located in the middle of the right arm, where the upper arm connects to the forearm.", "I'd like to see the coordinates of the right elbow, which is positioned in the center of the right arm.", "Where can I locate the coordinates for the right elbow of this person? It's the joint that allows flexion and extension of the right arm.", "Could you output the coordinates for the right elbow, the joint in the middle of the right arm responsible for arm movement?"],
48
+ 'left wrist': ["Please provide the coordinates for the left wrist of the person, which is the joint connecting the left forearm (radius and ulna) and the left hand, typically located at the base of the left hand.", "I'd like to know the precise coordinates of the left wrist, situated at the base of the left hand, where the forearm meets the hand.", "Where can I find the coordinates for the left wrist of this individual? It's the joint responsible for hand movement on the left side.", "Could you please output the coordinates for the left wrist, typically found at the base of the left hand, where the forearm connects to the hand?", "I'm interested in the coordinates of the left wrist. It's the joint on the left side that facilitates movement of the hand and wrist.", "Where are the coordinates for the left wrist? It's the wrist joint situated at the base of the left hand.", "Please provide the coordinates for the left wrist, located at the base of the left hand, where the forearm articulates with the hand.", "I'd like to see the coordinates of the left wrist, which is positioned at the base of the left hand.", "Where can I locate the coordinates for the left wrist of this person? It's the joint that allows movement of the left hand and wrist.", "Could you output the coordinates for the left wrist, the joint at the base of the left hand responsible for hand and wrist movement?"],
49
+ 'right wrist': ["Please provide the coordinates for the right wrist of the person, which is the joint connecting the right forearm (radius and ulna) and the right hand, typically located at the base of the right hand.", "I'd like to know the precise coordinates of the right wrist, situated at the base of the right hand, where the forearm meets the hand.", "Where can I find the coordinates for the right wrist of this individual? It's the joint responsible for hand movement on the right side.", "Could you please output the coordinates for the right wrist, typically found at the base of the right hand, where the forearm connects to the hand?", "I'm interested in the coordinates of the right wrist. It's the joint on the right side that facilitates movement of the hand and wrist.", "Where are the coordinates for the right wrist? It's the wrist joint situated at the base of the right hand.", "Please provide the coordinates for the right wrist, located at the base of the right hand, where the forearm articulates with the hand.", "I'd like to see the coordinates of the right wrist, which is positioned at the base of the right hand.", "Where can I locate the coordinates for the right wrist of this person? It's the joint that allows movement of the right hand and wrist.", "Could you output the coordinates for the right wrist, the joint at the base of the right hand responsible for hand and wrist movement?"],
50
+ 'left hip': ["Please provide the coordinates for the left hip of the person, which is the joint connecting the left thigh to the pelvis, typically located on the left side of the lower torso.", "I'd like to know the precise coordinates of the left hip, situated on the left side of the lower torso, where the thigh bone meets the pelvis.", "Where can I find the coordinates for the left hip of this individual? It's the joint responsible for leg movement on the left side.", "Could you please output the coordinates for the left hip, typically found on the left side of the lower torso, where the thigh articulates with the pelvis?", "I'm interested in the coordinates of the left hip. It's the joint on the left side that facilitates movement of the leg.", "Where are the coordinates for the left hip? It's the hip joint situated on the left side of the lower torso.", "Please provide the coordinates for the left hip, located on the left side of the lower torso, where the thigh connects to the pelvis.", "I'd like to see the coordinates of the left hip, which is positioned on the left side of the lower torso.", "Where can I locate the coordinates for the left hip of this person? It's the joint that allows movement of the left leg.", "Could you output the coordinates for the left hip, the joint on the left side responsible for leg movement?"],
51
+ 'right hip': ["Please provide the coordinates for the right hip of the person, which is the joint connecting the right thigh to the pelvis, typically located on the right side of the lower torso.", "I'd like to know the precise coordinates of the right hip, situated on the right side of the lower torso, where the thigh bone meets the pelvis.", "Where can I find the coordinates for the right hip of this individual? It's the joint responsible for leg movement on the right side.", "Could you please output the coordinates for the right hip, typically found on the right side of the lower torso, where the thigh articulates with the pelvis?", "I'm interested in the coordinates of the right hip. It's the joint on the right side that facilitates movement of the leg.", "Where are the coordinates for the right hip? It's the hip joint situated on the right side of the lower torso.", "Please provide the coordinates for the right hip, located on the right side of the lower torso, where the thigh connects to the pelvis.", "I'd like to see the coordinates of the right hip, which is positioned on the right side of the lower torso.", "Where can I locate the coordinates for the right hip of this person? It's the joint that allows movement of the right leg.", "Could you output the coordinates for the right hip, the joint on the right side responsible for leg movement?"],
52
+ 'left knee': ["Please provide the coordinates for the left knee of the person, which is the joint connecting the left thigh and the left lower leg, typically situated in the middle of the left leg.", "I'd like to know the precise coordinates of the left knee, located in the middle of the left leg, where the upper leg (thigh) meets the lower leg (shin).", "Where can I find the coordinates for the left knee of this individual? It's the joint responsible for bending and straightening the left leg.", "Could you please output the coordinates for the left knee, typically found in the center of the left leg, where the upper and lower leg bones meet?", "I'm interested in the coordinates of the left knee. It's the joint on the left leg that facilitates leg movement.", "Where are the coordinates for the left knee? It's the knee joint situated in the middle of the left leg.", "Please provide the coordinates for the left knee, located in the middle of the left leg, where the thigh connects to the lower leg.", "I'd like to see the coordinates of the left knee, which is positioned in the center of the left leg.", "Where can I locate the coordinates for the left knee of this person? It's the joint that allows movement of the left leg.", "Could you output the coordinates for the left knee, the joint in the middle of the left leg responsible for leg movement?"],
53
+ 'right knee': ["Please provide the coordinates for the right knee of the person, which is the joint connecting the upper leg (thigh) and lower leg (shin) on the right side.", "I'd like to know the precise coordinates of the right knee, situated at the midpoint between the hip and the ankle on the right leg.", "Where can I find the coordinates for the right knee of this individual? It's the pivotal joint on the right leg, roughly in the middle.", "Could you please output the coordinates for the right knee, which is typically located at the front of the right leg, midway between the hip and the ankle?", "I'm interested in the coordinates of the right knee joint. It's the point where the upper and lower parts of the right leg meet.", "Where are the coordinates for the right knee? It's the joint on the right leg that allows bending and is located between the hip and the ankle.", "Please provide the coordinates for the right knee, which is found on the right leg and facilitates movement between the thigh and the shin.", "I'd like to see the coordinates of the right knee, which is positioned on the right leg, about halfway between the hip and the ankle.", "Where can I locate the coordinates for the right knee of this person? It's the joint connecting the upper and lower sections of the right leg.", "Could you output the coordinates for the right knee, the joint on the right leg that enables flexion, situated between the hip and the ankle?"],
54
+ 'left ankle': ["Please provide the coordinates for the left ankle of the person, which is the joint connecting the left lower leg (shin) and the left foot, typically located at the base of the left leg.", "I'd like to know the precise coordinates of the left ankle, situated at the base of the left leg, where the lower leg (shin) meets the foot.", "Where can I find the coordinates for the left ankle of this individual? It's the joint responsible for foot movement on the left side.", "Could you please output the coordinates for the left ankle, typically found at the base of the left leg, where the lower leg (shin) articulates with the foot?", "I'm interested in the coordinates of the left ankle. It's the joint on the left leg that facilitates foot movement.", "Where are the coordinates for the left ankle? It's the ankle joint situated at the base of the left leg.", "Please provide the coordinates for the left ankle, located at the base of the left leg, where the shin connects to the foot.", "I'd like to see the coordinates of the left ankle, which is positioned at the base of the left leg.", "Where can I locate the coordinates for the left ankle of this person? It's the joint that allows movement of the left foot.", "Could you output the coordinates for the left ankle, the joint at the base of the left leg responsible for foot movement?"],
55
+ 'right ankle': ["Please provide the coordinates for the right ankle of the person, which is the joint connecting the right lower leg (shin) and the right foot, typically located at the base of the right leg.", "I'd like to know the precise coordinates of the right ankle, situated at the base of the right leg, where the lower leg (shin) meets the foot.", "Where can I find the coordinates for the right ankle of this individual? It's the joint responsible for foot movement on the right side.", "Could you please output the coordinates for the right ankle, typically found at the base of the right leg, where the lower leg (shin) articulates with the foot?", "I'm interested in the coordinates of the right ankle. It's the joint on the right leg that facilitates foot movement.", "Where are the coordinates for the right ankle? It's the ankle joint situated at the base of the right leg.", "Please provide the coordinates for the right ankle, located at the base of the right leg, where the shin connects to the foot.", "I'd like to see the coordinates of the right ankle, which is positioned at the base of the right leg.", "Where can I locate the coordinates for the right ankle of this person? It's the joint that allows movement of the right foot.", "Could you output the coordinates for the right ankle, the joint at the base of the right leg responsible for foot movement?"],
56
+ 'neck': ["Please provide the coordinates for the neck of the person, which is the part of the body connecting the head to the torso, typically situated between the shoulders.", "I'd like to know the precise coordinates of the neck, located in the area between the head and the upper body, often marked by its curvature.", "Where can I find the coordinates for the neck of this individual? It's the part of the body that supports the head and allows movement.", "Could you please output the coordinates for the neck, typically found in the region connecting the head to the shoulders?", "I'm interested in the coordinates of the neck. It's the vital part of the body that connects the head to the torso.", "Where are the coordinates for the neck? It's the area situated between the head and the upper body.", "Please provide the coordinates for the neck, located between the head and the shoulders, serving as a bridge between the two.", "I'd like to see the coordinates of the neck, which is positioned between the head and the upper body.", "Where can I locate the coordinates for the neck of this person? It's the part that allows the head to move and turn.", "Could you output the coordinates for the neck, the vital junction between the head and the torso?"],
57
+ 'pelvis': ["Please provide the coordinates for the pelvis of the person, which is the bony structure that forms the base of the spine and connects the torso to the lower body, typically located in the lower part of the abdomen and buttocks.", "I'd like to know the precise coordinates of the pelvis, situated in the lower part of the body where the spine meets the hips and legs.", "Where can I find the coordinates for the pelvis of this individual? It's the central structure that supports the upper body and connects to the lower body.", "Could you please output the coordinates for the pelvis, typically found in the region between the abdomen and the hips?", "I'm interested in the coordinates of the pelvis. It's the foundational structure that connects the upper and lower parts of the body.", "Where are the coordinates for the pelvis? It's the bony structure located in the lower abdomen and buttocks.", "Please provide the coordinates for the pelvis, located at the base of the spine and forming the core of the body's support.", "I'd like to see the coordinates of the pelvis, which is positioned in the lower part of the body and connects the torso to the legs.", "Where can I locate the coordinates for the pelvis of this person? It's the structure that helps with balance and stability.", "Could you output the coordinates for the pelvis, the central structure connecting the upper and lower body?"],
58
+ 'left toe': ["Please provide the coordinates for the left toe of the person, which is the digit located at the end of the left foot, typically at the front or tip of the foot.", "I'd like to know the precise coordinates of the left toe, situated at the forward end of the left foot, often marked by the toenail.", "Where can I find the coordinates for the left toe of this individual? It's the digit at the front of the left foot used for balance and movement.", "Could you please output the coordinates for the left toe, typically found at the tip of the left foot where the toes begin?", "I'm interested in the coordinates of the left toe. It's the digit on the left foot's front end that assists in walking and balance.", "Where are the coordinates for the left toe? It's the toe located at the end of the left foot, near the toenail.", "Please provide the coordinates for the left toe, situated at the front end of the left foot, often marked by the toenail.", "I'd like to see the coordinates of the left toe, which is positioned at the tip of the left foot.", "Where can I locate the coordinates for the left toe of this person? It's the digit responsible for touch and balance on the left foot.", "Could you output the coordinates for the left toe, the digit at the front end of the left foot that aids in walking and balance?"],
59
+ 'right toe': ["Please provide the coordinates for the right toe of the person, which is the digit located at the end of the right foot, typically at the front or tip of the foot.", "I'd like to know the precise coordinates of the right toe, situated at the forward end of the right foot, often marked by the toenail.", "Where can I find the coordinates for the right toe of this individual? It's the digit at the front of the right foot used for balance and movement.", "Could you please output the coordinates for the right toe, typically found at the tip of the right foot where the toes begin?", "I'm interested in the coordinates of the right toe. It's the digit on the right foot's front end that assists in walking and balance.", "Where are the coordinates for the right toe? It's the toe located at the end of the right foot, near the toenail.", "Please provide the coordinates for the right toe, situated at the front end of the right foot, often marked by the toenail.", "I'd like to see the coordinates of the right toe, which is positioned at the tip of the right foot.", "Where can I locate the coordinates for the right toe of this person? It's the digit responsible for touch and balance on the right foot.", "Could you output the coordinates for the right toe, the digit at the front end of the right foot that aids in walking and balance?"],
60
+ 'head_top': ["Please provide the coordinates for the head top of the person, which is the highest point on their head, typically at the crown.", "I'd like to know the precise coordinates of the head top, situated at the very top of the person's head.", "Where can I find the coordinates for the head top of this individual? It's the highest point on the head, often marked by the skull's apex.", "Could you please output the coordinates for the head top, typically located at the crown of the head?", "I'm interested in the coordinates of the head top. It's the highest point on the person's head.", "Where are the coordinates for the head top? It's the topmost point on the head, usually at the crown.", "Please provide the coordinates for the head top, located at the very top of the head, often marked by the hairline.", "I'd like to see the coordinates of the head top, which is positioned at the highest point on the head.", "Where can I locate the coordinates for the head top of this person? It's the topmost point of their skull.", "Could you output the coordinates for the head top, the highest point on the person's head?"],
61
+ 'thorax': ["Please provide the coordinates for the thorax of the person, which is the central part of the torso, typically located between the neck and the abdomen.", "I'd like to know the precise coordinates of the thorax, situated in the middle of the upper body, often marked by the ribcage.", "Where can I find the coordinates for the thorax of this individual? It's the central region of the torso that houses the heart and lungs.", "Could you please output the coordinates for the thorax, typically found in the middle of the upper body, where the ribcage surrounds vital organs?", "I'm interested in the coordinates of the thorax. It's the central part of the torso responsible for protecting vital organs.", "Where are the coordinates for the thorax? It's the midsection of the upper body, often covered by the chest.", "Please provide the coordinates for the thorax, located between the neck and the abdomen, serving as a vital core of the body.", "I'd like to see the coordinates of the thorax, which is positioned in the middle of the upper body.", "Where can I locate the coordinates for the thorax of this person? It's the region that houses the heart and lungs.", "Could you output the coordinates for the thorax, the central part of the torso responsible for protecting vital organs?"],
62
+ 'torso': ["Please provide the coordinates for the torso of the person, which is the central part of the body, typically encompassing the chest, abdomen, and back.", "I'd like to know the precise coordinates of the torso, situated in the middle of the body and extending from the chest to the waist.", "Where can I find the coordinates for the torso of this individual? It's the core region that houses many vital organs.", "Could you please output the coordinates for the torso, typically found in the central area of the body, covering the chest and abdomen?", "I'm interested in the coordinates of the torso. It's the main part of the body that includes the chest and abdominal regions.", "Where are the coordinates for the torso? It's the central core of the body that plays a crucial role in protecting vital organs.", "Please provide the coordinates for the torso, located in the middle of the body, between the chest and the waist.", "I'd like to see the coordinates of the torso, which is positioned at the core of the body and includes the chest and abdomen.", "Where can I locate the coordinates for the torso of this person? It's the central region responsible for housing and protecting vital organs.", "Could you output the coordinates for the torso, the central part of the body encompassing the chest, abdomen, and back?"]
63
+ }
data/convsersation.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import dataclasses
3
+ from enum import auto, Enum
4
+ from typing import List, Tuple
5
+ from collections import defaultdict
6
+ from .constants import PART_ORDER, COCO_KEYPOINT_NAME
7
+
8
def read_hoi_file_2_dict(hoi_config):
    """Parse an HOI list file into ``{hoi_id: [object, action]}``.

    Each non-blank line that does not start with ``#`` must hold exactly
    three whitespace-separated fields: an integer id, an object name and
    an action name. Any other field count raises ``ValueError`` from the
    tuple unpacking.

    Args:
        hoi_config: Path to the text file (read as UTF-8).

    Returns:
        dict mapping the integer id to a two-element list ``[obj, action]``.
    """
    mapping = {}
    with open(hoi_config, "r", encoding="utf-8") as handle:
        for raw in handle:
            entry = raw.strip()
            # Only data lines are parsed; blanks and '#' comments are ignored.
            if entry and not entry.startswith("#"):
                hoi_id, obj_name, verb = entry.split()
                mapping[int(hoi_id)] = [obj_name, verb]
    return mapping
18
+
19
def read_part_state_file_2_dict(part_state_config):
    """Parse a ``key: value`` file into ``{key: [value, ...]}``.

    Blank lines and lines starting with ``#`` are ignored. Every other
    line is split on its first ``:`` only, both halves are stripped, and
    the value is appended to the list kept for that key, so values keep
    file order. A line without ``:`` raises ``ValueError`` from the
    unpacking.

    Args:
        part_state_config: Path to the text file (read as UTF-8).

    Returns:
        ``collections.defaultdict(list)`` of key -> list of values.
    """
    states_by_part = defaultdict(list)
    with open(part_state_config, "r", encoding="utf-8") as handle:
        for raw in handle:
            entry = raw.strip()
            if entry == "" or entry.startswith("#"):
                continue
            # maxsplit=1 keeps any further ':' inside the value.
            part, state = (piece.strip() for piece in entry.split(":", 1))
            states_by_part[part].append(state)
    return states_by_part
32
+
33
@dataclasses.dataclass
class Conversation:
    """Builds the system and user prompts for HOI visual-evidence annotation.

    The instance holds a system prompt plus two lookup tables read from
    ``<data_path>/Configs``: the HOI id -> [object, action] table and the
    body-part -> list-of-states table.

    NOTE(review): the class declares no dataclass fields and defines its own
    ``__init__``, so the ``@dataclasses.dataclass`` decorator only contributes
    the generated ``__repr__``/``__eq__`` - confirm that it is intended.
    """

    def __init__(self, system='', data_path=''):
        """Store the system prompt and load both reference tables.

        Args:
            system: Custom system prompt; the built-in one is used when this
                is exactly the empty string.
            data_path: Directory that contains the ``Configs`` folder.
        """
        super().__init__()
        if system != '':
            self.system = system
        else:
            # The text is kept flush-left so the prompt content is exactly
            # what the literal spells out.
            self.system = f"""
You are an AI assistant. You will be given an image that contains a main human subject.
Task:
Describe the visual evidence in the image that supports the subject’s action, with an emphasis on human body parts and their interactions with objects.

Hints:
You may be given hints about (1) the action and (2) related objects and possible supporting body parts. You can use these hints, but you may also add other relevant evidence you observe.

Required Constraints:
- Start with ONE sentence that summarizes the main action in natural language.
- When you mention any keypoint or body part, you MUST use names ONLY from: {COCO_KEYPOINT_NAME}.
- Do NOT invent body-part names outside these sets (no synonyms, no paraphrases).
- If you are unsure which name applies, either omit the body-part mention or choose the closest valid name from the lists.
- Write your description in clear, concise sentences grounded in visible evidence.

Optional Constraints :
- Write naturally. Avoid repeating the same sentence pattern.
- Keep each evidence item to one line. No redundant "both left/right do the same" unless necessary.
"""

        hoi_list_file = os.path.join(data_path, 'Configs/hico_hoi_list.txt')
        part_state_file = os.path.join(data_path, 'Configs/Part_State_76.txt')
        self.hoi_reference = read_hoi_file_2_dict(hoi_list_file)
        self.part_state_reference = read_part_state_file_2_dict(part_state_file)

    def _humanpart2word(self, action_labels):
        """Translate part/state id pairs into ``[part_name, state]`` pairs.

        Args:
            action_labels: Iterable of dicts with integer entries
                ``'human_part'`` (index into ``PART_ORDER``) and
                ``'partstate'`` (index into that part's state list).

        Returns:
            list of ``[part_name, part_state]``; one entry is produced for
            every reference key that occurs as a substring of the part name.
        """
        described = []
        for label in action_labels:
            part_idx = label['human_part']
            state_idx = label['partstate']
            part_name = PART_ORDER[part_idx]
            # Reference keys are matched by substring against the part name,
            # e.g. a key "foot" would match the part name "right foot".
            for key, states in self.part_state_reference.items():
                if key in part_name:
                    described.append([part_name, states[state_idx]])
        return described

    def _actionid2word(self, hoi_id):
        """Return the ``(object, action)`` words registered for ``hoi_id``."""
        obj_word, act_word = self.hoi_reference[hoi_id]
        return obj_word, act_word

    def get_prompt(self, meta):
        """Compose the user prompt for one annotated instance.

        Args:
            meta: Dict whose ``'hoi_obj'`` entry provides ``'hoi_id'`` and
                ``'action_labels'``.

        Returns:
            The prompt string with the action/object words and the part-state
            cues filled in.
        """
        hoi_obj = meta['hoi_obj']
        obj_in_word, act_in_word = self._actionid2word(hoi_obj['hoi_id'])
        action_labels_in_words = self._humanpart2word(hoi_obj['action_labels'])

        return f"""
Given the image, describe the visual evidence (especially body parts) that supports the action.
Hints: The action to support is [{act_in_word} with {obj_in_word}]. Possible visual evidence cues include: {action_labels_in_words}.
Use these cues as guidance. Only mention cues you can actually see in the image.
"""
96
+
97
+
98
+
99
+
100
+
101
+
102
+
103
+
104
+ if __name__ == "__main__":
105
+ pass
data/hicodet.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HICODet dataset under PyTorch framework
3
+
4
+ Fred Zhang <frederic.zhang@anu.edu.au>
5
+
6
+ The Australian National University
7
+ Australian Centre for Robotic Vision
8
+ """
9
+
10
+ import os
11
+ import json
12
+ import numpy as np
13
+
14
+ from typing import Optional, List, Callable, Tuple
15
+ from pocket.data import ImageDataset, DataSubset
16
+
17
class HICODetSubset(DataSubset):
    """Subset view of a HICODet dataset with per-subset statistics.

    Positions in the subset are translated twice: ``self.pool`` maps a
    subset position to a dataset position, and ``self._idx`` maps that to
    the index used by the annotation tables.
    """

    def __init__(self, *args) -> None:
        super().__init__(*args)

    def filename(self, idx: int) -> str:
        """Override: return the image file name in the subset"""
        intra = self._idx[self.pool[idx]]
        return self._filenames[intra]

    def image_size(self, idx: int) -> Tuple[int, int]:
        """Override: return the size (width, height) of an image in the subset"""
        intra = self._idx[self.pool[idx]]
        return self._image_sizes[intra]

    @property
    def anno_interaction(self) -> List[int]:
        """Override: Number of annotated box pairs for each interaction class"""
        counts = [0] * self.num_interation_cls
        for pool_idx in self.pool:
            for hoi in self._anno[self._idx[pool_idx]]['hoi']:
                counts[hoi] += 1
        return counts

    @property
    def anno_object(self) -> List[int]:
        """Override: Number of annotated box pairs for each object class"""
        counts = [0] * self.num_object_cls
        per_interaction = self.anno_interaction
        # Each correspondence row is [hoi_idx, obj_idx, verb_idx].
        for hoi_idx, obj_idx, _ in self._class_corr:
            counts[obj_idx] += per_interaction[hoi_idx]
        return counts

    @property
    def anno_action(self) -> List[int]:
        """Override: Number of annotated box pairs for each action class"""
        counts = [0] * self.num_action_cls
        per_interaction = self.anno_interaction
        for hoi_idx, _, verb_idx in self._class_corr:
            counts[verb_idx] += per_interaction[hoi_idx]
        return counts
51
+
52
class HICODet(ImageDataset):
    """
    HICO-DET dataset backed by a single json annotation file.

    Arguments:
        root(str): Root directory where images are downloaded to
        anno_file(str): Path to json annotation file
        transform(callable, optional): A function/transform that takes in an PIL image
            and returns a transformed version
        target_transform(callable, optional): A function/transform that takes in the
            target and transforms it
        transforms (callable, optional): A function/transform that takes input sample
            and its target as entry and returns a transformed version.
    """
    def __init__(self, root: str, anno_file: str,
            transform: Optional[Callable] = None,
            target_transform: Optional[Callable] = None,
            transforms: Optional[Callable] = None) -> None:
        super(HICODet, self).__init__(root, transform, target_transform, transforms)
        with open(anno_file, 'r') as f:
            anno = json.load(f)

        # Class counts; the attribute name `num_interation_cls` (sic) is kept
        # because HICODetSubset reads it under that spelling.
        self.num_object_cls = 80
        self.num_interation_cls = 600
        self.num_action_cls = 117
        self._anno_file = anno_file

        # Load annotations
        self._load_annotation_and_metadata(anno)

    def __len__(self) -> int:
        """Return the number of images"""
        return len(self._idx)

    def __getitem__(self, i: int) -> tuple:
        """
        Arguments:
            i(int): Index to an image

        Returns:
            tuple[image, target]: By default, the tuple consists of a PIL image and a
                dict with the following keys:
                    "boxes_h": list[list[4]]
                    "boxes_o": list[list[4]]
                    "hoi":: list[N]
                    "verb": list[N]
                    "object": list[N]
        """
        intra_idx = self._idx[i]
        return self._transforms(
            self.load_image(os.path.join(self._root, self._filenames[intra_idx])),
            self._anno[intra_idx]
        )

    def __repr__(self) -> str:
        """Return the executable string representation"""
        # The optional transform arguments are intentionally left out.
        return '{}(root={!r}, anno_file={!r})'.format(
            self.__class__.__name__, self._root, self._anno_file)

    def __str__(self) -> str:
        """Return the readable string representation"""
        reprstr = 'Dataset: ' + self.__class__.__name__ + '\n'
        reprstr += '\tNumber of images: {}\n'.format(self.__len__())
        reprstr += '\tImage directory: {}\n'.format(self._root)
        # Report the annotation file itself, not the image root.
        reprstr += '\tAnnotation file: {}\n'.format(self._anno_file)
        return reprstr

    @property
    def annotations(self) -> List[dict]:
        """Per-image annotation dicts as loaded from the annotation file"""
        return self._anno

    @property
    def class_corr(self) -> List[Tuple[int, int, int]]:
        """
        Class correspondence matrix in zero-based index
        [
            [hoi_idx, obj_idx, verb_idx],
            ...
        ]

        Returns:
            list[list[3]]
        """
        return self._class_corr.copy()

    @property
    def object_n_verb_to_interaction(self) -> List[list]:
        """
        The interaction classes corresponding to an object-verb pair

        HICODet.object_n_verb_to_interaction[obj_idx][verb_idx] gives interaction class
        index if the pair is valid, None otherwise

        Returns:
            list[list[117]]
        """
        lut = np.full([self.num_object_cls, self.num_action_cls], None)
        for i, j, k in self._class_corr:
            lut[j, k] = i
        return lut.tolist()

    @property
    def object_to_interaction(self) -> List[list]:
        """
        The interaction classes that involve each object type

        Returns:
            list[list]
        """
        obj_to_int = [[] for _ in range(self.num_object_cls)]
        for corr in self._class_corr:
            obj_to_int[corr[1]].append(corr[0])
        return obj_to_int

    @property
    def object_to_verb(self) -> List[list]:
        """
        The valid verbs for each object type

        Returns:
            list[list]
        """
        obj_to_verb = [[] for _ in range(self.num_object_cls)]
        for corr in self._class_corr:
            obj_to_verb[corr[1]].append(corr[2])
        return obj_to_verb

    @property
    def anno_interaction(self) -> List[int]:
        """
        Number of annotated box pairs for each interaction class

        Returns:
            list[600]
        """
        return self._num_anno.copy()

    @property
    def anno_object(self) -> List[int]:
        """
        Number of annotated box pairs for each object class

        Returns:
            list[80]
        """
        num_anno = [0 for _ in range(self.num_object_cls)]
        for corr in self._class_corr:
            num_anno[corr[1]] += self._num_anno[corr[0]]
        return num_anno

    @property
    def anno_action(self) -> List[int]:
        """
        Number of annotated box pairs for each action class

        Returns:
            list[117]
        """
        num_anno = [0 for _ in range(self.num_action_cls)]
        for corr in self._class_corr:
            num_anno[corr[2]] += self._num_anno[corr[0]]
        return num_anno

    @property
    def objects(self) -> List[str]:
        """
        Object names

        Returns:
            list[str]
        """
        return self._objects.copy()

    @property
    def verbs(self) -> List[str]:
        """
        Verb (action) names

        Returns:
            list[str]
        """
        return self._verbs.copy()

    @property
    def interactions(self) -> List[str]:
        """
        Combination of verbs and objects

        Returns:
            list[str]
        """
        # Read the private lists directly: the public `objects` property
        # copies the whole list on every access.
        return [self._verbs[j] + ' ' + self._objects[i]
            for _, i, j in self._class_corr]

    def split(self, ratio: float) -> Tuple[HICODetSubset, HICODetSubset]:
        """
        Split the dataset according to given ratio

        Arguments:
            ratio(float): The percentage of training set between 0 and 1
        Returns:
            train(Dataset)
            val(Dataset)
        """
        perm = np.random.permutation(len(self._idx))
        n = int(len(perm) * ratio)
        return HICODetSubset(self, perm[:n]), HICODetSubset(self, perm[n:])

    def filename(self, idx: int) -> str:
        """Return the image file name given the index"""
        return self._filenames[self._idx[idx]]

    def image_size(self, idx: int) -> Tuple[int, int]:
        """Return the size (width, height) of an image"""
        return self._image_sizes[self._idx[idx]]

    def _load_annotation_and_metadata(self, f: dict) -> None:
        """
        Populate the dataset tables from the decoded annotation file.

        Arguments:
            f(dict): Dictionary loaded from {anno_file}.json
        """
        # Keep only images that are not listed as empty. A set lookup avoids
        # the quadratic cost of calling list.remove once per empty image.
        empty = set(f['empty'])
        idx = [i for i in range(len(f['filenames'])) if i not in empty]

        num_anno = [0 for _ in range(self.num_interation_cls)]
        for anno in f['annotation']:
            for hoi in anno['hoi']:
                num_anno[hoi] += 1

        self._idx = idx
        self._num_anno = num_anno

        self._anno = f['annotation']
        self._filenames = f['filenames']
        self._image_sizes = f['size']
        self._class_corr = f['correspondence']
        self._empty_idx = f['empty']
        self._objects = f['objects']
        self._verbs = f['verbs']
data/pose_hicodet.py ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import cv2
3
+ import json
4
+ import logging
5
+ import random
6
+ from typing import Dict
7
+
8
+ import torch
9
+ from torch.utils.data import Dataset
10
+ from torchvision import transforms
11
+ import numpy as np
12
+
13
+ import transformers
14
+ from pycocotools.coco import COCO
15
+
16
+ from .constants import COCO_KEYPOINT_NAME, KeypointLocationDescription, KeypointLocationQuestion
17
+ from .constants import COCO_KEYPOINT_NAME_TOKEN
18
+
19
+ DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
20
+ PREFIX_IMAGE = "Image: "
21
+ PREFIX_NO_IMAGE = "Image: N/A"
22
+ BEGIN_DESCRIPTION = "<des>"
23
+ END_DESCRIPTION = "</des>"
24
+ IGNORE_INDEX = -100
25
+ DEFAULT_EOS_TOKEN = "</s>"
26
+ BEGIN_OPTIONS = "<opt>"
27
+ END_OPTIONS = "</opt>"
28
+ BEGIN_LOC = "<loc>"
29
+ END_LOC = "</loc>"
30
+ BEGIN_QUESTION = "<qes>"
31
+ END_QUESTION = "</qes>"
32
+
33
class PoseHICODetDataset(Dataset):
    """Dataset for supervised fine-tuning.

    Every sample is one person instance for which a pose annotation box and
    an instance-level HOI annotation (with ``action_labels``) overlap almost
    exactly. ``__getitem__`` returns the person crop resized to the
    configured square size together with the instance metadata.
    """
    def __init__(self, data_path: str,
                 multimodal_cfg: dict,
                 ):
        """
        Args:
            data_path: Dataset root containing the ``Annotation`` folder.
            multimodal_cfg: Must provide ``'image_size'`` (side of the square
                output crop) and ``'image_folder'`` (image directory).
        """
        super(PoseHICODetDataset, self).__init__()
        logging.warning("Loading data...")
        self.multimodal_cfg = multimodal_cfg
        self.mllm_image_size = multimodal_cfg['image_size']
        self.aspect_ratio = 1.0
        self.pixel_std = 200
        self.num_joints = 17
        self.num_joints_full_body = 136
        self.list_data_dict = self._load_data(data_path)

    def _iou(self, a, b):
        """Return the intersection-over-union of two ``[x1, y1, x2, y2]`` boxes."""
        x1, y1, x2, y2 = a
        X1, Y1, X2, Y2 = b
        iw = max(0, min(x2, X2) - max(x1, X1))
        ih = max(0, min(y2, Y2) - max(y1, Y1))
        inter = iw * ih
        # The small epsilon keeps the division defined for degenerate boxes.
        return inter / ((x2 - x1) * (y2 - y1) + (X2 - X1) * (Y2 - Y1) - inter + 1e-9)

    def _match_pose_hoi_objs(self, pose_objs, hoi_objs, iou_thresh=0.9):
        """Pair pose annotations with HOI annotations of the same person.

        A pair is kept when the HOI entry carries ``'action_labels'`` and its
        ``'human_bbox'`` (x1, y1, x2, y2) overlaps the pose ``'bbox'``
        (x, y, w, h) with IoU of at least ``iou_thresh``.

        Returns:
            Two equally long lists ``(matched_pose_objs, matched_hoi_objs)``;
            entry ``i`` of each belongs to the same pair.
        """
        matched_pose_objs = []
        matched_hoi_objs = []

        for pose_obj in pose_objs:
            # Convert the pose box to corner form once per pose object.
            x, y, w, h = pose_obj['bbox']
            pose_xyxy = [x, y, x + w, y + h]
            for hoi_obj in hoi_objs:
                if 'action_labels' not in hoi_obj:
                    continue
                if self._iou(hoi_obj['human_bbox'], pose_xyxy) < iou_thresh:
                    continue
                matched_pose_objs.append(pose_obj)
                matched_hoi_objs.append(hoi_obj)

        return matched_pose_objs, matched_hoi_objs

    def _keypoints_to_joints(self, keypoints):
        """Convert a flat ``[x, y, v, ...]`` keypoint list to joint arrays.

        Returns:
            ``(joints_3d, joints_3d_vis)``, both float32 arrays of shape
            ``(num_joints_full_body, 3)``. ``joints_3d`` holds x, y and a zero
            third column; ``joints_3d_vis`` holds the visibility flag capped
            at 1 in its first two columns and zero in the third.
        """
        n = self.num_joints_full_body
        kps = np.asarray(keypoints[:n * 3], dtype=np.float32).reshape(n, 3)

        joints_3d = np.zeros((n, 3), dtype=np.float32)
        joints_3d[:, :2] = kps[:, :2]

        vis = np.minimum(kps[:, 2], 1)
        joints_3d_vis = np.zeros((n, 3), dtype=np.float32)
        joints_3d_vis[:, 0] = vis
        joints_3d_vis[:, 1] = vis
        return joints_3d, joints_3d_vis

    def _load_data(self, data_path):
        """Build the list of per-instance sample dicts from the annotations."""
        # load pose annotation via coco api
        coco_path = os.path.join(data_path, 'Annotation/hico-fullbody-pose/halpe_train_v1.json')
        coco = COCO(coco_path)

        # load instance-level hoi+part state annotation via json
        json_path = os.path.join(data_path, "Annotation/hico-det-instance-level/hico-det-training-set-instance-level.json")
        with open(json_path, "r", encoding="utf-8") as f:
            hoi_data = json.load(f)

        instance_id = 0
        list_data_dict = []
        for index in coco.getImgIds():
            # load pose data per image id
            im_ann = coco.loadImgs(index)[0]
            annIds = coco.getAnnIds(imgIds=index, iscrowd=False)
            pose_objs = coco.loadAnns(annIds)

            # load hoi data per image id
            file_name = im_ann['file_name']
            hoi_objs = hoi_data[file_name]['labels']

            pose_objs, hoi_objs = self._match_pose_hoi_objs(pose_objs, hoi_objs)

            for pose_obj, hoi_obj in zip(pose_objs, hoi_objs):
                if pose_obj['category_id'] != 1:
                    continue

                # ignore objs without keypoints annotation
                if max(pose_obj['keypoints']) == 0:
                    continue

                # Invariant established by _match_pose_hoi_objs.
                assert 'action_labels' in hoi_obj

                joints_3d, joints_3d_vis = self._keypoints_to_joints(pose_obj['keypoints'])

                center, scale = self._box2cs(pose_obj['bbox'][:4])
                list_data_dict.append({
                    'file_name': file_name,
                    'image_id': index,
                    'center': center,
                    'scale': scale,
                    # the first 17 keypoints are aligned with COCO's 17 keypoints definition.
                    'joints_3d': joints_3d[:self.num_joints],
                    'joints_3d_vis': joints_3d_vis[:self.num_joints],
                    'instance_id': instance_id,
                    'hoi_obj': hoi_obj,
                })
                instance_id += 1

        logging.warning("The number of training samples is {}".format(len(list_data_dict)))
        logging.warning("Formatting inputs...Skip in lazy mode")
        return list_data_dict

    def __len__(self):
        return len(self.list_data_dict)

    def __getitem__(self, i):
        """Return ``{'image', 'has_image', 'meta'}`` for sample ``i``."""
        sources = self.list_data_dict[i]
        image = self._get_image_item(sources)[0]

        data_dict = {}
        data_dict["image"] = image
        data_dict["has_image"] = True
        data_dict["meta"] = sources
        return data_dict

    def _get_image_item(self, sources):
        """Load the image of one sample and warp the person box to a square crop.

        Returns:
            ``(image, joints, joints_vis, center, scale)``; the joints are
            returned as stored (they are not mapped into the crop).

        Raises:
            FileNotFoundError: if the image cannot be read.
        """
        file_name = sources['file_name']
        image_folder = self.multimodal_cfg['image_folder']
        image_file = os.path.join(image_folder, file_name)
        image = cv2.imread(
            image_file, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION
        )
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError("Failed to read image: {}".format(image_file))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # process image
        joints = sources['joints_3d']
        joints_vis = sources['joints_3d_vis']
        c = sources['center']
        s = sources['scale']
        r = 0

        out_size = (int(self.mllm_image_size), int(self.mllm_image_size))
        trans = get_affine_transform(c, s, r, out_size)
        image = cv2.warpAffine(
            image,
            trans,
            out_size,
            flags=cv2.INTER_LINEAR)

        return image, joints, joints_vis, c, s

    def _box2cs(self, box):
        """Convert an ``[x, y, w, h]`` box to ``(center, scale)``."""
        x, y, w, h = box[:4]
        return self._xywh2cs(x, y, w, h)

    def _xywh2cs(self, x, y, w, h):
        """Return the box center and its size in units of ``pixel_std``.

        The shorter side is first enlarged so that the box matches
        ``self.aspect_ratio``. No extra padding factor is applied.
        """
        center = np.zeros((2), dtype=np.float32)
        center[0] = x + w * 0.5
        center[1] = y + h * 0.5

        if w > self.aspect_ratio * h:
            h = w * 1.0 / self.aspect_ratio
        elif w < self.aspect_ratio * h:
            w = h * self.aspect_ratio
        scale = np.array(
            [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std],
            dtype=np.float32)

        return center, scale

    def _generate_target(self, joints, joints_vis):
        '''
        Render one Gaussian heatmap per joint.

        NOTE(review): this method reads self.heatmap_size, self.sigma and
        self.vitpose_image_size, none of which are set in __init__; they must
        be assigned before calling it - confirm intended values.

        :param joints:  [num_joints, 3]
        :param joints_vis: [num_joints, 3]
        :return: target, target_weight(1: visible, 0: invisible)
        '''
        target_weight = np.ones((self.num_joints, 1), dtype=np.float32)
        target_weight[:, 0] = joints_vis[:, 0]
        target = np.zeros((self.num_joints,
                           self.heatmap_size[1],
                           self.heatmap_size[0]),
                          dtype=np.float32)

        tmp_size = self.sigma * 3
        feat_stride = self.vitpose_image_size / self.heatmap_size

        # The Gaussian patch depends only on sigma, so build it once.
        size = 2 * tmp_size + 1
        x = np.arange(0, size, 1, np.float32)
        y = x[:, np.newaxis]
        x0 = y0 = size // 2
        # The gaussian is not normalized, we want the center value to equal 1
        g = np.exp(- ((x - x0) ** 2 + (y - y0) ** 2) / (2 * self.sigma ** 2))

        for joint_id in range(self.num_joints):
            mu_x = int(joints[joint_id][0] / feat_stride[0] + 0.5)
            mu_y = int(joints[joint_id][1] / feat_stride[1] + 0.5)
            # Check that any part of the gaussian is in-bounds
            ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)]
            br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)]
            if ul[0] >= self.heatmap_size[0] or ul[1] >= self.heatmap_size[1] \
                    or br[0] < 0 or br[1] < 0:
                # Entirely outside the heatmap: mark the joint as unused.
                target_weight[joint_id] = 0
                continue

            # Usable gaussian range
            g_x = max(0, -ul[0]), min(br[0], self.heatmap_size[0]) - ul[0]
            g_y = max(0, -ul[1]), min(br[1], self.heatmap_size[1]) - ul[1]
            # Image range
            img_x = max(0, ul[0]), min(br[0], self.heatmap_size[0])
            img_y = max(0, ul[1]), min(br[1], self.heatmap_size[1])

            v = target_weight[joint_id]
            if v > 0.5:
                target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = \
                    g[g_y[0]:g_y[1], g_x[0]:g_x[1]]

        return target, target_weight
258
+
259
def fliplr_joints(joints, joints_vis, width, matched_parts):
    """
    Mirror joint coordinates horizontally and swap left/right joints.

    Both input arrays are modified in place: the x column of ``joints`` is
    reflected within an image of the given ``width``, then the rows named by
    every pair in ``matched_parts`` are exchanged in both arrays.

    Returns the element-wise product ``joints * joints_vis`` together with
    ``joints_vis``.
    """
    # Flip horizontal
    joints[:, 0] = width - joints[:, 0] - 1

    # Change left-right parts
    for pair in matched_parts:
        first, second = pair[0], pair[1]
        for arr in (joints, joints_vis):
            # Fancy indexing copies the right-hand side, so this is a true swap.
            arr[[first, second], :] = arr[[second, first], :]

    return joints * joints_vis, joints_vis
274
+
275
def transform_preds(coords, center, scale, output_size):
    """Map points through the inverse of the crop transform.

    The inverse affine transform for ``(center, scale, output_size)`` is
    built with zero rotation and applied to the first two columns of every
    row of ``coords``; any remaining columns of the result stay zero.

    Returns:
        Array with the same shape as ``coords``.
    """
    inverse = get_affine_transform(center, scale, 0, output_size, inv=1)
    mapped = np.zeros(coords.shape)
    for row, point in enumerate(coords):
        mapped[row, 0:2] = affine_transform(point[0:2], inverse)
    return mapped
281
+
282
def get_affine_transform(
        center, scale, rot, output_size,
        shift=None, inv=0
):
    """Build the 2x3 affine matrix between a box and an output image.

    Args:
        center: Box center ``(x, y)``.
        scale: Box size divided by 200, either a scalar (used for both axes)
            or a two-element sequence/array ``(w, h)``.
        rot: Rotation in degrees.
        output_size: ``(width, height)`` of the output image.
        shift: Optional offset as a fraction of the box size; defaults to no
            shift.
        inv: If truthy, return the output -> source transform instead of
            source -> output.

    Returns:
        The matrix returned by ``cv2.getAffineTransform``.
    """
    if shift is None:
        shift = np.array([0, 0], dtype=np.float32)

    # Accept scalars, lists, tuples and arrays alike. A plain list would
    # otherwise fail on the float multiplication below.
    scale = np.asarray(scale)
    if scale.ndim == 0:
        scale = np.array([scale, scale])

    scale_tmp = scale * 200.0
    src_w = scale_tmp[0]
    dst_w = output_size[0]
    dst_h = output_size[1]

    rot_rad = np.pi * rot / 180
    src_dir = get_dir([0, src_w * -0.5], rot_rad)
    dst_dir = np.array([0, dst_w * -0.5], np.float32)

    # Three point pairs define the transform: the center, a point offset
    # from the center along the (rotated) up direction, and a third point
    # perpendicular to those two.
    src = np.zeros((3, 2), dtype=np.float32)
    dst = np.zeros((3, 2), dtype=np.float32)
    src[0, :] = center + scale_tmp * shift
    src[1, :] = center + src_dir + scale_tmp * shift
    dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
    dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir

    src[2:, :] = get_3rd_point(src[0, :], src[1, :])
    dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :])

    if inv:
        trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
    else:
        trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))

    return trans
315
+
316
+
317
def affine_transform(pt, t):
    """Apply the affine matrix ``t`` to the 2-D point ``pt``.

    The point is extended to homogeneous form ``[x, y, 1]`` before the
    matrix product; the first two components of the result are returned.
    """
    homogeneous = np.array([pt[0], pt[1], 1.])
    return np.dot(t, homogeneous)[:2]
321
+
322
+
323
def get_3rd_point(a, b):
    """Return ``b`` offset by the vector ``a - b`` rotated by 90 degrees."""
    delta = a - b
    perpendicular = np.array([-delta[1], delta[0]], dtype=np.float32)
    return b + perpendicular
326
+
327
+
328
def get_dir(src_point, rot_rad):
    """Rotate the 2-D point ``src_point`` by ``rot_rad`` radians.

    Returns:
        Two-element list ``[x', y']``.
    """
    sn, cs = np.sin(rot_rad), np.cos(rot_rad)
    px, py = src_point[0], src_point[1]
    return [px * cs - py * sn, px * sn + py * cs]
scripts/annotate.sh ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Launch multi-GPU annotation of HICO-Det with tools/annotate.py.
# Run from the repository root so that ./ on PYTHONPATH resolves the
# `data` and `tools` packages.

# Comma-separated GPU ids; the number of ids must match --nproc_per_node.
IDX=0,4
export PYTHONPATH=$PYTHONPATH:./

data_path=../datasets/HICO-Det
model_path=./model_weights/qwen3_8b_vl_instruct
output_dir=outputs

if [ -d "${output_dir}" ]; then
    echo "dir already exists"
else
    mkdir -p "${output_dir}"
fi

CUDA_VISIBLE_DEVICES=$IDX OMP_NUM_THREADS=1 torchrun --nnodes=1 --nproc_per_node=2 --master_port=25005 \
    tools/annotate.py \
    --model-path "${model_path}" \
    --data-path "${data_path}" \
    --output-dir "${output_dir}"
tools/__init__.py ADDED
File without changes
tools/annotate.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import argparse
4
+ import numpy as np
5
+ from tqdm import tqdm
6
+
7
+ import torch
8
+ import torch.distributed as dist
9
+ from torch.utils.data import DataLoader
10
+ from torchvision import transforms as T
11
+
12
+ from data.pose_hicodet import PoseHICODetDataset
13
+ from data.convsersation import Conversation
14
+
15
+ import re
16
+ from dataclasses import dataclass
17
+
18
+ from transformers import Qwen3VLForConditionalGeneration
19
+ from transformers import AutoTokenizer, AutoConfig, AutoProcessor
20
+
21
def disable_torch_init():
    """Turn the default parameter initialisation of common layers into a no-op.

    ``reset_parameters`` is replaced on ``torch.nn.Linear`` and
    ``torch.nn.LayerNorm`` so that constructing a model skips the random
    initialisation that loading pretrained weights would overwrite anyway.
    The patch is global and stays in effect for the rest of the process.
    """
    for layer_cls in (torch.nn.Linear, torch.nn.LayerNorm):
        layer_cls.reset_parameters = lambda self: None
27
+
28
+ import os, json
29
+ import torch
30
+ import torch.distributed as dist
31
+
32
def gather_labels_and_save(labels, output_path):
    """Collect the per-rank ``labels`` lists and write them as one JSON file.

    When a ``torch.distributed`` process group is active, every rank
    contributes its list via ``all_gather_object``; rank 0 concatenates the
    lists in rank order and writes the file, and all ranks synchronise on a
    barrier afterwards. Without an initialised process group the function
    simply writes ``labels`` from the current process.

    Args:
        labels: List of JSON-serialisable records produced by this process.
        output_path: Destination file; missing parent directories are
            created by the writing rank.
    """
    distributed = dist.is_available() and dist.is_initialized()

    if distributed:
        rank = dist.get_rank()
        gathered = [None] * dist.get_world_size()
        dist.all_gather_object(gathered, labels)  # gathered[i] comes from rank i
    else:
        rank = 0
        gathered = [labels]

    if rank == 0:
        merged = [record for part in gathered for record in part]

        # Create the target directory up front: an exception on rank 0 here
        # would leave the other ranks waiting on the barrier below.
        parent = os.path.dirname(output_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(merged, f, ensure_ascii=False, indent=2)

    if distributed:
        dist.barrier()  # keep other ranks alive until rank 0 has finished writing
49
+
50
@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate dataset samples into processor inputs for generation.

    Each sample is a dict with an ``'image'`` entry and a ``'meta'`` entry.
    The prompt text for a sample is produced by ``Conversation.get_prompt``
    from its ``meta``; prompt and image are then wrapped in a system/user
    chat message, rendered with the processor's chat template and tokenised
    together with the images.
    """

    def __init__(self, processor, data_path):
        """Store the processor and build the prompt generator.

        Args:
            processor: Object providing ``apply_chat_template`` and a
                ``__call__`` that accepts ``text``/``images``.
            data_path: Forwarded to ``Conversation``.
        """
        self.processor = processor
        self.conv = Conversation(
            system='',
            data_path=data_path
        )

    def __call__(self, data_dicts):
        """Build the model inputs for a list of samples.

        Args:
            data_dicts: Sequence of sample dicts with ``'image'`` and
                ``'meta'`` keys.

        Returns:
            ``(batch_tensors, result_meta)``: the processor output for the
            whole batch and the list of per-sample ``meta`` objects, in the
            same order as ``data_dicts``.
        """
        batch_prompts = []
        batch_images = []
        result_meta = []

        for data_dict in data_dicts:
            batch_images.append(data_dict['image'])
            batch_prompts.append(self.conv.get_prompt(data_dict['meta']))
            result_meta.append(data_dict['meta'])

        # Iterate over the prompts directly: wrapping the list in zip() would
        # hand out 1-tuples and put a tuple into the "text" field.
        messages = [
            [
                {"role": "system",
                 "content": [
                     {"type": "text",
                      "text": self.conv.system},
                 ]},
                {"role": "user",
                 "content": [
                     {"type": "image"},
                     {"type": "text",
                      "text": prompt},
                 ]},
            ]
            for prompt in batch_prompts
        ]

        # add_generation_prompt=True makes the rendered text end with the
        # assistant turn opener so the model continues with its answer.
        prompts = [
            self.processor.apply_chat_template(
                m, tokenize=False, add_generation_prompt=True)
            for m in messages
        ]
        batch_tensors = self.processor(
            text=prompts,
            images=batch_images,
            return_tensors="pt",
            padding=True
        )
        return batch_tensors, result_meta
95
+
96
@torch.no_grad()
def worker(model, processor, dataset, args, output_dir):
    """Annotate this process's shard of ``dataset`` and save the merged result.

    The dataset is split round-robin across processes (sample ``i`` goes to
    rank ``i % world_size``). For every sample the model generates a text
    description, which is stored together with the sample's metadata. The
    records of all ranks are finally merged into ``labels.json``.

    Args:
        model: Generation model, already placed on the current CUDA device.
        processor: Processor used for collation and for decoding the output.
        dataset: Indexable dataset yielding dicts with ``'image'``/``'meta'``.
        args: Parsed CLI arguments; ``args.data_path`` is read here.
        output_dir: Directory that receives ``labels.json``.
    """
    # Shard by the global rank when the launcher provides it: LOCAL_RANK
    # repeats on every node and would duplicate shards in multi-node runs.
    if "RANK" in os.environ:
        rank = int(os.environ["RANK"])
    else:
        rank = int(os.environ["LOCAL_RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    indices = list(range(rank, len(dataset), world_size))
    print("==>" + " Worker {} Started, responsible for {} images".format(rank, len(indices)))

    sub_dataset = torch.utils.data.Subset(dataset, indices)
    batch_size = 1
    data_loader = DataLoader(
        sub_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,
        collate_fn=DataCollatorForSupervisedDataset(processor, args.data_path),
    )

    labels = []
    # Every batch of the shard is processed; an early exit from this loop
    # would leave labels.json with only the first sample of each rank.
    for batch_tensors, result_meta in tqdm(data_loader):
        batch_tensors = {
            k: v.cuda()
            for k, v in batch_tensors.items()
            if isinstance(v, torch.Tensor)
        }
        input_ids = batch_tensors['input_ids']

        # Only the generated token ids are used below, so per-step scores and
        # logits are not requested (they would be kept for up to 1600 steps).
        with torch.inference_mode():
            output_dict = model.generate(do_sample=False,
                                         return_dict_in_generate=True,
                                         max_new_tokens=1600,
                                         **batch_tensors)

        output_ids = output_dict['sequences']

        for input_id, output_id, meta in zip(input_ids, output_ids, result_meta):
            # The returned sequence starts with the prompt; verify that and
            # decode only the continuation.
            input_token_len = input_id.shape[0]
            n_diff_input_output = (input_id != output_id[:input_token_len]).sum().item()
            if n_diff_input_output > 0:
                print(f'[Warning] Sample: {n_diff_input_output} output_ids are not the same as the input_ids')
            output = processor.tokenizer.batch_decode(
                output_id[input_token_len:].unsqueeze(0),
                skip_special_tokens=True,
            )[0]

            labels.append({
                'file_name': meta['file_name'],
                'image_id': meta['image_id'],
                'instance_id': meta['instance_id'],
                'keypoints': meta['joints_3d'].reshape(-1).tolist(),
                'vis': meta['joints_3d_vis'].reshape(-1).tolist(),
                'im_height': meta['hoi_obj']['height'],
                'im_width': meta['hoi_obj']['width'],
                'human_bbox': meta['hoi_obj']['human_bbox'],
                'object_bbox': meta['hoi_obj']['object_bbox'],
                'action_labels': meta['hoi_obj']['action_labels'],
                'description': output,
            })

    output_path = os.path.join(output_dir, 'labels.json')
    gather_labels_and_save(labels, output_path=output_path)
147
+
148
def eval_model(args):
    """Set up the distributed run, load model and data, then start ``worker``.

    Args:
        args: Parsed CLI arguments providing ``model_path``, ``data_path``
            and ``output_dir``.
    """
    # Process group and device selection; LOCAL_RANK / WORLD_SIZE are read
    # from the environment set up by the launcher.
    torch.distributed.init_process_group(backend='nccl')
    rank = int(os.environ["LOCAL_RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    print(f'Init process group: world_size: {world_size}, rank: {rank}')
    torch.cuda.set_device(rank)

    # Model: skip default weight init, load in bfloat16, move to the GPU.
    disable_torch_init()
    model = Qwen3VLForConditionalGeneration.from_pretrained(
        args.model_path,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
    )
    model = model.cuda()
    model.eval()

    # Processor: left padding so generated tokens directly follow the prompt
    # in a padded batch; the EOS token doubles as the padding token.
    processor = AutoProcessor.from_pretrained(args.model_path, trust_remote_code=True)
    tokenizer = processor.tokenizer
    tokenizer.padding_side = "left"
    tokenizer.pad_token = tokenizer.eos_token

    image_folder = os.path.join(args.data_path, 'Images/images/train2015')
    multimodal_cfg = dict(
        image_folder=image_folder,
        data_augmentation=False,
        image_size=336,
    )
    dataset = PoseHICODetDataset(
        data_path=args.data_path,
        multimodal_cfg=multimodal_cfg,
    )

    worker(model, processor, dataset, args, args.output_dir)
177
+
178
if __name__ == "__main__":
    # Command-line entry point: all three options are plain strings.
    cli_options = (
        ("--model-path", "facebook/opt-350m"),
        ("--data-path", ""),
        ("--output-dir", ""),
    )

    parser = argparse.ArgumentParser()
    for flag, default_value in cli_options:
        parser.add_argument(flag, type=str, default=default_value)
    args = parser.parse_args()

    eval_model(args)
186
+