AutoLLMAnnotation / data /convsersation.py

init

3a1265d 20 days ago

4.28 kB

	import os
	import dataclasses
	from enum import auto, Enum
	from typing import List, Tuple
	from collections import defaultdict
	from .constants import PART_ORDER, COCO_KEYPOINT_NAME

	def read_hoi_file_2_dict(hoi_config):
	    """Load an HOI list file into a ``{id: [object, action]}`` dictionary.

	    Blank lines and lines whose first non-space character is ``#`` are
	    ignored. Every remaining line must consist of exactly three
	    whitespace-separated fields: an integer id, an object name and an
	    action name.

	    Args:
	        hoi_config: Path of the UTF-8 text file to read.

	    Returns:
	        dict mapping each integer id to the two-element list
	        ``[object, action]``. If an id occurs more than once, the last
	        occurrence wins.

	    Raises:
	        ValueError: If a data line does not have exactly three fields or
	            its first field is not an integer.
	    """
	    with open(hoi_config, "r", encoding="utf-8") as handle:
	        stripped_lines = (raw.strip() for raw in handle)
	        # Tokenise only the lines that actually carry data.
	        records = [
	            entry.split()
	            for entry in stripped_lines
	            if entry and not entry.startswith("#")
	        ]

	    # Unpacking three names per record enforces the three-field layout.
	    return {
	        int(index): [obj_name, verb]
	        for index, obj_name, verb in records
	    }

	def read_part_state_file_2_dict(part_state_config):
	    """Group the ``key: value`` lines of a part-state file by key.

	    Blank lines and lines starting with ``#`` are skipped. Each remaining
	    line is split at its first ``:`` only, so the value itself may contain
	    further colons. Key and value are stripped of surrounding whitespace.

	    Args:
	        part_state_config: Path of the UTF-8 text file to read.

	    Returns:
	        ``defaultdict(list)`` mapping each key to its values, in the order
	        they appear in the file.

	    Raises:
	        ValueError: If a data line contains no ``:``.
	    """
	    grouped = defaultdict(list)
	    with open(part_state_config, "r", encoding="utf-8") as handle:
	        for raw in handle:
	            entry = raw.strip()
	            if entry == "" or entry.startswith("#"):
	                continue
	            # maxsplit=1 keeps any later ":" inside the value.
	            part, state = (piece.strip() for piece in entry.split(":", 1))
	            grouped[part].append(state)
	    return grouped

	class Conversation:
	    """Build the system prompt and per-sample user prompts for annotation.

	    The instance holds three pieces of state:

	    * ``system`` -- the system prompt text.
	    * ``hoi_reference`` -- ``{hoi_id: [object, action]}`` loaded from
	      ``Configs/hico_hoi_list.txt`` under ``data_path``.
	    * ``part_state_reference`` -- ``{part_key: [state, ...]}`` loaded from
	      ``Configs/Part_State_76.txt`` under ``data_path``.

	    This is intentionally a plain class rather than a dataclass: it
	    declares no fields and sets its state in ``__init__``, so a generated
	    ``__eq__`` would treat every instance as equal and make instances
	    unhashable.
	    """

	    def __init__(self, system='', data_path=''):
	        """Create the prompt builder and load both lookup tables.

	        Args:
	            system: System prompt to use. When empty (or ``None``) the
	                built-in default prompt is used; it embeds
	                ``COCO_KEYPOINT_NAME`` as the list of allowed body-part
	                names.
	            data_path: Directory that contains the ``Configs`` folder with
	                the two reference files.
	        """
	        super().__init__()
	        if not system:
	            self.system = f"""
	You are an AI assistant. You will be given an image that contains a main human subject.
	Task:
	Describe the visual evidence in the image that supports the subject’s action, with an emphasis on human body parts and their interactions with objects.

	Hints:
	You may be given hints about (1) the action and (2) related objects and possible supporting body parts. You can use these hints, but you may also add other relevant evidence you observe.

	Required Constraints:
	- Start with ONE sentence that summarizes the main action in natural language.
	- When you mention any keypoint or body part, you MUST use names ONLY from: {COCO_KEYPOINT_NAME}.
	- Do NOT invent body-part names outside these sets (no synonyms, no paraphrases).
	- If you are unsure which name applies, either omit the body-part mention or choose the closest valid name from the lists.
	- Write your description in clear, concise sentences grounded in visible evidence.

	Optional Constraints :
	- Write naturally. Avoid repeating the same sentence pattern.
	- Keep each evidence item to one line. No redundant "both left/right do the same" unless necessary.
	"""
	        else:
	            self.system = system

	        self.hoi_reference = read_hoi_file_2_dict(os.path.join(data_path, 'Configs/hico_hoi_list.txt'))
	        self.part_state_reference = read_part_state_file_2_dict(os.path.join(data_path, 'Configs/Part_State_76.txt'))

	    def _humanpart2word(self, action_labels):
	        """Translate body-part / part-state ids into word pairs.

	        Args:
	            action_labels: Iterable of mappings, each with a
	                ``'human_part'`` entry (index into ``PART_ORDER``) and a
	                ``'partstate'`` entry (index into the state list of the
	                matching part key).

	        Returns:
	            List of ``[part_name, part_state]`` pairs. A pair is added for
	            every key of ``part_state_reference`` that occurs as a
	            substring of the part name; a label whose part name matches no
	            key contributes nothing.

	        Raises:
	            KeyError: If a label lacks ``'human_part'`` or ``'partstate'``.
	            IndexError: If an id is out of range for ``PART_ORDER`` or for
	                the matched state list.
	        """
	        action_labels_in_words = []
	        for label in action_labels:
	            human_part_id = label['human_part']
	            part_state_id = label['partstate']
	            part_name = PART_ORDER[human_part_id]

	            # Reference keys are matched by substring against the part name.
	            for key, states in self.part_state_reference.items():
	                if key in part_name:
	                    action_labels_in_words.append([part_name, states[part_state_id]])
	        return action_labels_in_words

	    def _actionid2word(self, hoi_id):
	        """Return the ``(object, action)`` words registered for ``hoi_id``.

	        Raises:
	            KeyError: If ``hoi_id`` is not present in ``hoi_reference``.
	        """
	        obj, act = self.hoi_reference[hoi_id]
	        return obj, act

	    def get_prompt(self, meta):
	        """Build the user prompt for one annotated sample.

	        Args:
	            meta: Mapping with a ``'hoi_obj'`` entry that itself provides
	                ``'hoi_id'`` and ``'action_labels'``.

	        Returns:
	            Prompt text naming the action and object for the HOI id and
	            listing the body-part cues as ``[part_name, part_state]``
	            pairs.
	        """
	        hoi_obj = meta['hoi_obj']

	        obj_in_word, act_in_word = self._actionid2word(hoi_obj['hoi_id'])
	        action_labels_in_words = self._humanpart2word(hoi_obj['action_labels'])

	        prompt = f"""
	Given the image, describe the visual evidence (especially body parts) that supports the action.
	Hints: The action to support is [{act_in_word} with {obj_in_word}]. Possible visual evidence cues include: {action_labels_in_words}.
	Use these cues as guidance. Only mention cues you can actually see in the image.
	"""
	        return prompt








	# Import-only module: running it directly performs no work.
	if __name__ == "__main__":
	    pass