File size: 4,282 Bytes

3a1265d

import os
import dataclasses
from enum import auto, Enum
from typing import List, Tuple
from collections import defaultdict
from .constants import PART_ORDER, COCO_KEYPOINT_NAME

def read_hoi_file_2_dict(hoi_config):
    """Parse an HOI list file into ``{id: [object, action]}``.

    Each meaningful line must consist of exactly three whitespace-separated
    tokens: an integer id, an object name and an action name. Blank lines
    and lines whose first non-blank character is ``#`` are skipped.

    Args:
        hoi_config: Path of the UTF-8 text file to read.

    Returns:
        A dict mapping the integer id to a two-element list
        ``[object, action]``.

    Raises:
        ValueError: If a line does not split into exactly three tokens or
            its first token is not an integer.
    """
    mapping = {}
    with open(hoi_config, "r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        for entry in stripped_lines:
            if entry and not entry.startswith("#"):
                # Unpacking enforces the three-column layout.
                hoi_id, object_name, verb = entry.split()
                mapping[int(hoi_id)] = [object_name, verb]
    return mapping

def read_part_state_file_2_dict(part_state_config):
    """Parse a ``key: value`` file, grouping values by key in file order.

    Blank lines and lines whose first non-blank character is ``#`` are
    skipped. Every other line is split at its first ``:``; both halves are
    stripped of surrounding whitespace, and the value is appended to the
    list stored under the key.

    Args:
        part_state_config: Path of the UTF-8 text file to read.

    Returns:
        A ``defaultdict(list)`` mapping each key to its values, in the
        order they appear in the file.

    Raises:
        ValueError: If a meaningful line contains no ``:``.
    """
    grouped = defaultdict(list)
    with open(part_state_config, "r", encoding="utf-8") as handle:
        for raw in handle:
            entry = raw.strip()
            if entry and not entry.startswith("#"):
                # maxsplit=1 keeps any later ":" inside the value.
                name, state = (piece.strip() for piece in entry.split(":", 1))
                grouped[name].append(state)
    return grouped

@dataclasses.dataclass
class Conversation:
    """Build the system prompt and per-sample user prompt for action evidence.

    On construction the instance loads two lookup tables from ``data_path``:

    * ``hoi_reference``: HOI id -> ``[object, action]``
      (read by ``read_hoi_file_2_dict``).
    * ``part_state_reference``: part-state key -> list of state strings
      (read by ``read_part_state_file_2_dict``).

    NOTE(review): the ``dataclass`` decorator is kept only to preserve the
    existing interface. The class declares no fields, so the hand-written
    ``__init__`` below is the one in effect, while the generated ``__eq__``
    and ``__repr__`` ignore ``system`` and the lookup tables.
    """

    def __init__(self, system: str = '', data_path: str = ''):
        """Set the system prompt and load the reference tables.

        Args:
            system: System prompt to use. When it equals the empty string,
                the built-in default prompt (which embeds
                ``COCO_KEYPOINT_NAME``) is used instead.
            data_path: Directory that contains ``Configs/hico_hoi_list.txt``
                and ``Configs/Part_State_76.txt``.
        """
        super().__init__()
        if system == '':
            # The leading spaces inside this literal are part of the prompt
            # text and are kept exactly as they were.
            self.system = f"""
           You are an AI assistant. You will be given an image that contains a main human subject.
           Task:
           Describe the visual evidence in the image that supports the subject’s action, with an emphasis on human body parts and their interactions with objects.

           Hints:
           You may be given hints about (1) the action and (2) related objects and possible supporting body parts. You can use these hints, but you may also add other relevant evidence you observe.

           Required Constraints:
           - Start with ONE sentence that summarizes the main action in natural language.
           - When you mention any keypoint or body part, you MUST use names ONLY from: {COCO_KEYPOINT_NAME}.
           - Do NOT invent body-part names outside these sets (no synonyms, no paraphrases).
           - If you are unsure which name applies, either omit the body-part mention or choose the closest valid name from the lists.
           - Write your description in clear, concise sentences grounded in visible evidence.

           Optional Constraints :
           - Write naturally. Avoid repeating the same sentence pattern.
           - Keep each evidence item to one line. No redundant "both left/right do the same" unless necessary.
           """
        else:
            self.system = system

        self.hoi_reference = read_hoi_file_2_dict(os.path.join(data_path, 'Configs/hico_hoi_list.txt'))
        self.part_state_reference = read_part_state_file_2_dict(os.path.join(data_path, 'Configs/Part_State_76.txt'))

    def _humanpart2word(self, action_labels) -> List[list]:
        """Translate part/state index pairs into ``[part_name, state]`` pairs.

        Args:
            action_labels: Iterable of mappings, each with a ``'human_part'``
                entry (index into ``PART_ORDER``) and a ``'partstate'`` entry
                (index into the state list of the matching part-state key).

        Returns:
            One ``[part_name, part_state]`` list per input label, in order.

        Raises:
            KeyError: If no key of ``part_state_reference`` is a substring of
                the part name. Previously this case either raised
                ``UnboundLocalError`` or silently reused the state resolved
                for the preceding label.
            IndexError: If an index is out of range for ``PART_ORDER`` or for
                the matched state list.
        """
        action_labels_in_words = []
        for label in action_labels:
            part_name = PART_ORDER[label['human_part']]
            state_index = label['partstate']

            # Reset for every label so a miss can never leak the previous
            # label's state. Keys are matched as substrings of the part name;
            # when several keys match, the last one wins (as before).
            matched_states = None
            for key, states in self.part_state_reference.items():
                if key in part_name:
                    matched_states = states
            if matched_states is None:
                raise KeyError(
                    f"no part-state key matches part name {part_name!r}"
                )

            action_labels_in_words.append([part_name, matched_states[state_index]])
        return action_labels_in_words

    def _actionid2word(self, hoi_id) -> Tuple[str, str]:
        """Return ``(object, action)`` for ``hoi_id`` from ``hoi_reference``.

        Raises:
            KeyError: If ``hoi_id`` is not present in ``hoi_reference``.
        """
        obj, act = self.hoi_reference[hoi_id]
        return obj, act

    def get_prompt(self, meta) -> str:
        """Build the user prompt carrying the action and body-part hints.

        Args:
            meta: Mapping with a ``'hoi_obj'`` entry that itself provides
                ``'hoi_id'`` (key into ``hoi_reference``) and
                ``'action_labels'`` (see ``_humanpart2word``).

        Returns:
            The prompt text. Its embedded leading whitespace comes from the
            literal below and is kept unchanged.
        """
        hoi_obj = meta['hoi_obj']

        obj_in_word, act_in_word = self._actionid2word(hoi_obj['hoi_id'])
        action_labels_in_words = self._humanpart2word(hoi_obj['action_labels'])

        prompt = f"""
               Given the image, describe the visual evidence (especially body parts) that supports the action.
               Hints: The action to support is [{act_in_word} with {obj_in_word}]. Possible visual evidence cues include: {action_labels_in_words}.
               Use these cues as guidance. Only mention cues you can actually see in the image.
        """
        return prompt






    

# Script entry point: currently a placeholder that performs no work.
if __name__ == "__main__":
   pass