|
|
import os |
|
|
import dataclasses |
|
|
from enum import auto, Enum |
|
|
from typing import List, Tuple |
|
|
from collections import defaultdict |
|
|
from .constants import PART_ORDER, COCO_KEYPOINT_NAME |
|
|
|
|
|
def read_hoi_file_2_dict(hoi_config):
    """Load the HOI list file into a dictionary keyed by integer id.

    Blank lines and lines starting with ``#`` are ignored. Every other line
    must consist of exactly three whitespace-separated tokens: a numeric id,
    an object name and an action name.

    Args:
        hoi_config: Path of the text file to read (opened as UTF-8).

    Returns:
        A dict mapping ``int(id)`` to the two-element list ``[obj, action]``.

    Raises:
        ValueError: If a data line does not have exactly three tokens, or
            its first token is not an integer.
    """
    mapping = {}
    with open(hoi_config, "r", encoding="utf-8") as handle:
        for raw in handle:
            entry = raw.strip()
            # Only non-empty, non-comment lines carry data.
            if entry and not entry.startswith("#"):
                index, obj_name, verb = entry.split()
                mapping[int(index)] = [obj_name, verb]
    return mapping
|
|
|
|
|
def read_part_state_file_2_dict(part_state_config):
    """Group the values of a ``key: value`` text file by key.

    Blank lines and lines starting with ``#`` are ignored. Every other line
    is split at its first ``:``; both halves are stripped of surrounding
    whitespace and the value is appended to the list stored under the key,
    so values keep the order in which they appear in the file.

    Args:
        part_state_config: Path of the text file to read (opened as UTF-8).

    Returns:
        A ``defaultdict(list)`` mapping each key to its list of values.

    Raises:
        ValueError: If a data line contains no ``:``.
    """
    grouped = defaultdict(list)
    with open(part_state_config, "r", encoding="utf-8") as handle:
        for entry in (raw.strip() for raw in handle):
            if entry == "" or entry.startswith("#"):
                continue

            # Split once only, so the value itself may contain further colons.
            part, state = (piece.strip() for piece in entry.split(":", 1))
            grouped[part].append(state)
    return grouped
|
|
|
|
|
class Conversation:
    """Prompt builder for describing the visual evidence of a human action.

    Holds the system prompt plus two lookup tables loaded from ``data_path``:

    * ``hoi_reference`` - read from ``Configs/hico_hoi_list.txt``; maps an
      integer HOI id to ``[obj, action]``.
    * ``part_state_reference`` - read from ``Configs/Part_State_76.txt``;
      maps a key to its list of state strings.

    This is deliberately a plain class and not a dataclass: it declares no
    annotated fields, so a dataclass-generated ``__eq__`` would compare every
    instance as equal and would also make instances unhashable.
    """

    def __init__(self, system: str = '', data_path: str = ''):
        """Set the system prompt and load both reference files.

        Args:
            system: System prompt to use. The empty string selects the
                built-in default prompt.
            data_path: Directory that contains the ``Configs`` folder.
        """
        super().__init__()
        if system == '':
            # The prompt body is written flush-left so that no source
            # indentation ends up inside the prompt text.
            self.system = f"""
You are an AI assistant. You will be given an image that contains a main human subject.
Task:
Describe the visual evidence in the image that supports the subject’s action, with an emphasis on human body parts and their interactions with objects.

Hints:
You may be given hints about (1) the action and (2) related objects and possible supporting body parts. You can use these hints, but you may also add other relevant evidence you observe.

Required Constraints:
- Start with ONE sentence that summarizes the main action in natural language.
- When you mention any keypoint or body part, you MUST use names ONLY from: {COCO_KEYPOINT_NAME}.
- Do NOT invent body-part names outside these sets (no synonyms, no paraphrases).
- If you are unsure which name applies, either omit the body-part mention or choose the closest valid name from the lists.
- Write your description in clear, concise sentences grounded in visible evidence.

Optional Constraints :
- Write naturally. Avoid repeating the same sentence pattern.
- Keep each evidence item to one line. No redundant "both left/right do the same" unless necessary.
"""
        else:
            self.system = system

        self.hoi_reference = read_hoi_file_2_dict(os.path.join(data_path, 'Configs/hico_hoi_list.txt'))
        self.part_state_reference = read_part_state_file_2_dict(os.path.join(data_path, 'Configs/Part_State_76.txt'))

    def _humanpart2word(self, action_labels):
        """Translate part/state index pairs into ``[part_name, part_state]``.

        Args:
            action_labels: Iterable of mappings, each providing
                ``'human_part'`` (an index into ``PART_ORDER``) and
                ``'partstate'`` (an index into a state list).

        Returns:
            A list with one ``[part_name, part_state]`` entry for every key
            of ``part_state_reference`` that occurs as a substring of the
            part name. A label whose part name matches no key contributes
            nothing; one that matches several keys contributes several
            entries.
        """
        action_labels_in_words = []
        for label in action_labels:
            human_part_id = label['human_part']
            part_state_id = label['partstate']

            part_name = PART_ORDER[human_part_id]
            # Keys are matched by substring against the part name.
            for key, states in self.part_state_reference.items():
                if key in part_name:
                    action_labels_in_words.append([part_name, states[part_state_id]])
        return action_labels_in_words

    def _actionid2word(self, hoi_id):
        """Return the ``(obj, act)`` pair stored for ``hoi_id``.

        Raises:
            KeyError: If ``hoi_id`` is not present in ``hoi_reference``.
        """
        obj, act = self.hoi_reference[hoi_id]
        return obj, act

    def get_prompt(self, meta):
        """Build the user prompt for one sample.

        Args:
            meta: Mapping with a ``'hoi_obj'`` entry, which itself provides
                ``'hoi_id'`` and ``'action_labels'`` (see
                ``_humanpart2word`` for the shape of each label).

        Returns:
            The prompt text, naming the action/object pair and listing the
            part-state cues as hints.
        """
        hoi_obj = meta['hoi_obj']

        hoi_id = hoi_obj['hoi_id']
        obj_in_word, act_in_word = self._actionid2word(hoi_id)
        action_labels = hoi_obj['action_labels']
        action_labels_in_words = self._humanpart2word(action_labels)

        prompt = f"""
Given the image, describe the visual evidence (especially body parts) that supports the action.
Hints: The action to support is [{act_in_word} with {obj_in_word}]. Possible visual evidence cues include: {action_labels_in_words}.
Use these cues as guidance. Only mention cues you can actually see in the image.
"""
        return prompt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Placeholder entry point: the guard body is intentionally empty.
if __name__ == "__main__":
    pass