import random

# Seed the RNG so the few-shot examples sampled below are reproducible across runs.
random.seed(42)

mcqa_example_pool = [
    {"Q": "What is Theresa Woo's profession?",
     "Options": ["A. Police Officer.", "B. Journalist.", "C. Doctor.", "D. Lawyer."],
     "Answer": "B. Journalist."},

    {"Q": "What is true about the theory of the formation of the earth that NASA believes in?",
     "Options": ["A. It can explain the time span of the earth formation.", "B. It cannot explain the formation of gas giants.", "C. It can explain the creation of giant worlds.", "D. Solar wind takes part in it."],
     "Answer": "D"},

    {"Q": "What color is the makeup bag in the video?",
     "Options": ["A. Black.", "B. Brown.", "C. Bright blue.", "D. Pink."],
     "Answer": "B. Brown."},

    {"Q": "What is the color of the parrot that stands on wood?",
     "Options": ["A. Blue.", "B. Green.", "C. White.", "D. Black."],
     "Answer": "B"},

    {"Q": "Which element has the highest peak in the spectra of the sample taken by Dr. Rudy Reimer?",
     "Options": ["A. Iron.", "B. Zirconium.", "C. Helium.", "D. Aluminum."],
     "Answer": "B. Zirconium."},

    {"Q": "What color is the laptop the woman is holding?",
     "Options": ["A. Black.", "B. Pink.", "C. Silver.", "D. Green."],
     "Answer": "B"},

    {"Q": "In line with the video evidence, which of the following statements about the world's longest railway line is not correct?",
     "Options": ["A. It was built by Russia and China.", "B. The length of it is 9289km.", "C. It was completed in 1916.", "D. French loans helped a lot in the process of building it."],
     "Answer": "A. It was built by Russia and China."},

    {"Q": "Based on the video, how is the group dressed for their performance?",
     "Options": ["A. Costumes from different musical eras.", "B. Casual streetwear.", "C. Formal attire with black and white colors.", "D. Matching school uniforms."],
     "Answer": "C"},

    {"Q": "What do the two people have in common?",
     "Options": ["A. They are both piano learners.", "B. They both wear glasses.", "C. They are both good at playing the violin.", "D. They both wear black shirts."],
     "Answer": "D. They both wear black shirts."},

    {"Q": "What is the lichen possibly like according to its image in the video?",
     "Options": ["A. It is black.", "B. Strip-shaped single-cell aggregates.", "C. Round single-cell aggregates.", "D. It can be very huge."],
     "Answer": "B"},

    {"Q": "What are the colors of the famous statue of Augustus Caesar?",
     "Options": ["A. Totally white.", "B. Mainly white and red.", "C. Mainly white and green.", "D. Mainly red and green."],
     "Answer": "B. Mainly white and red."},

    {"Q": "Which car was the first to be designed with an automobile roof?",
     "Options": ["A. Type 44.", "B. Type 46.", "C. Type 41.", "D. Type 55."],
     "Answer": "A"},

    {"Q": "Which of the following features can describe Hamlet in the video?",
     "Options": ["A. Wearing a brown coat.", "B. Blonde hair.", "C. Thick beard.", "D. Long hair."],
     "Answer": "D. Long hair."},

    {"Q": "Which of the following methods of revealing the original color of the ancient artwork is not mentioned in the video?",
     "Options": ["A. By imagining the color that fits in well with the artwork.", "B. Ultraviolet light.", "C. Pigment analysis.", "D. Sampling some visible colors."],
     "Answer": "A"},

    {"Q": "What outfit was Cardi B wearing?",
     "Options": ["A. A soft, pastel blue gown with a strapless design that fits snugly, highlighting her shoulders and neckline.",
                 "B. A striking red dress with a black underdress or lining.",
                 "C. A gorgeous orange sequined dress with a form-fitting silhouette.",
                 "D. A blue dress with a unique, sculptural design that resembles a 'waterfall of fabric' that flowed around her."],
     "Answer": "D. A blue dress with a unique, sculptural design that resembles a 'waterfall of fabric' that flowed around her."}
]
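
# Optional sanity check (a sketch, not part of the original pipeline). The pool
# stores answers in two formats -- a bare option letter ('D') or the full option
# text ('B. Journalist.') -- so we only verify that the leading letter of each
# answer matches one of the options.
def _check_mcqa_pool(pool):
    for example in pool:
        letter = example["Answer"][0]  # first character is the option letter
        assert any(opt.startswith(letter + ".") for opt in example["Options"]), example["Q"]

_check_mcqa_pool(mcqa_example_pool)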


def prompt_miradata_based_text_constraint_mcqa(dense_caption, background_caption, main_object_caption):
    task_inst_part = (
        "You are an AI assistant tasked with generating **high-quality attribute perception questions** based on a video snippet description from a long video.\n\n"
        "## TASK:\n"
        "Generate **one** high-quality **attribute perception question** that requires identifying properties or characteristics of objects/people (such as colors, materials, professions, ownership, appearance, position, role, or other static properties), **while using composite features or contextual constraints to ensure answer uniqueness** in longer videos.\n"
        "You must also provide **4 answer options (A–D)**, with only one correct answer, which is clearly supported by the visual or narrative content of the video description.\n\n"
        "## CRITICAL RULES:\n"
        "1. **Uniqueness Guarantee**: Each question must include one of the following constraints to ensure answer uniqueness:\n"
        "- **Composite Feature**: Combine multiple attributes (e.g., 'What is the color of the hat worn by the woman *in the red dress*?').\n"
        "- **Contextual Constraint**: Use scene or background information (e.g., 'What is the brand of the car *in the parking lot*?').\n"
        "- **Functional/Usage Constraint**: Describe the object's function, purpose, or role (e.g., 'What is the material of the gloves *used for cooking*?').\n"
        "2. **Static Attributes Only**: Focus on static features such as colors, materials, professions, ownership, appearance, position, role, or other static properties.\n"
        "3. **Description Grounding**: Answers must be directly verifiable from the provided text.\n"
        "4. **Focus on Visual Entity Attributes**: The question must test the model’s ability to recognize the specific attribute of **objects**.\n"
        "5. **Avoid Extraneous Information**: Do not rely on subtitles, voiceovers, or audio cues unless explicitly mentioned in the description.\n"
        "6. **Clear and Logical Phrasing**: Keep the question clear, specific, and logically phrased to avoid ambiguity.\n\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Question...', 'Options': ['A. ...', ..., 'D. ...'], 'Answer': 'X'}]\n\n"
    )

    # Sample three few-shot examples and format them for the system prompt.
    chosen_examples = random.sample(mcqa_example_pool, 3)
    example_part = "## EXAMPLES:\n"
    for idx, example in enumerate(chosen_examples):
        example_part += (
            f"{idx + 1}. {{'Q': '{example['Q']}',\n"
            "   'Options': [\n"
            f"       '{example['Options'][0]}',\n"
            f"       '{example['Options'][1]}',\n"
            f"       '{example['Options'][2]}',\n"
            f"       '{example['Options'][3]}'\n"
            "   ],\n"
            f"   'Answer': '{example['Answer']}'}}\n\n"
        )

    system_prompt = task_inst_part + example_part

    user_prompt = (
        "I have provided you with three descriptions, each covering a different aspect of a specific clip from a long video. Below are these descriptions:\n\n"
        "**Dense Description:**\n"
        f"{dense_caption}\n\n"
        "**Background Description:**\n"
        f"{background_caption}\n\n"
        "**Main Object Description:**\n"
        f"{main_object_caption}\n\n"
        "Based on these descriptions and the system instructions, generate **one** high-quality attribute perception question-and-answer pair.\n\n"
        "## REQUIREMENTS:\n"
        "- The question must focus on **identifying the specific attribute of visible objects, such as colors, materials, professions, ownership, appearance, position, role, or other static properties.**\n"
        "- You must use a composite feature, contextual constraint, or functional/usage constraint in the question, thereby ensuring answer uniqueness.\n"
        "- The answer must be directly observable in the description without any reasoning or inference.\n\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Options': ['A. ...', 'B. ...', 'C. ...', 'D. ...'], 'Answer': 'Correct answer here...'}]\n\n"
        "**Only return the QA pair in the specified JSON list format.**"
    )

    return system_prompt, user_prompt


def prompt_miradata_based_text_mcqa(dense_caption, background_caption, main_object_caption):
    task_inst_part = (
        "You are an AI assistant tasked with generating **high-quality attribute perception questions** based on a video snippet description from a long video.\n\n"
        "## TASK:\n"
        "Generate **one** high-quality **attribute perception question** that requires identifying properties or characteristics of objects/people (such as colors, materials, professions, ownership, appearance, position, role, etc.), **while using composite features or contextual constraints to ensure answer uniqueness** in longer videos.\n"
        "You must also provide **4 answer options (A–D)**, with only one correct answer, which is clearly supported by the visual or narrative content of the video description.\n\n"
        "## INSTRUCTIONS:\n"
        "- **Focus on Attributes**: The question must test the model’s ability to recognize **colors, materials, professions, ownership, factual claims, or other static properties**.\n"
        "- **Ground in Description**: All answers must be verifiable from the provided description without temporal reasoning or external knowledge.\n"
        "- **Avoid Actions**: Do not ask about dynamic behaviors or events (e.g., 'What is X doing?').\n"
        "- **Explicit Evidence**: Ensure the correct answer is directly stated or visually implied in the description.\n"
        "- **Output Format**: Format the output as a list of dictionaries with:\n"
        "  - `'Q'`: The question.\n"
        "  - `'Options'`: Four options labeled 'A' to 'D'.\n"
        "  - `'Answer'`: The correct option (e.g., `'A'`).\n\n"
    )

    # Sample three few-shot examples and format them for the system prompt.
    chosen_examples = random.sample(mcqa_example_pool, 3)
    example_part = "## EXAMPLES:\n"
    for idx, example in enumerate(chosen_examples):
        example_part += (
            f"{idx + 1}. {{'Q': '{example['Q']}',\n"
            "   'Options': [\n"
            f"       '{example['Options'][0]}',\n"
            f"       '{example['Options'][1]}',\n"
            f"       '{example['Options'][2]}',\n"
            f"       '{example['Options'][3]}'\n"
            "   ],\n"
            f"   'Answer': '{example['Answer']}'}}\n\n"
        )

    guidelines_part = (
        "## GUIDELINES FOR CREATING QUESTIONS:\n"
        "- **Attribute Specificity**: Focus on singular properties (not 'Describe X').\n"
        "- **Factual Grounding**: For truth-judgment questions, base options on explicit claims in the description.\n"
        "- **Plausible Distractors**: Wrong options should be semantically related.\n"
        "- **No Implicit Knowledge**: Avoid questions requiring unstated information.\n\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Question...', 'Options': ['A. ...', ..., 'D. ...'], 'Answer': 'X'}]"
    )

    system_prompt = task_inst_part + example_part + guidelines_part

    user_prompt = (
        "I have provided you with three descriptions, each covering a different aspect of a specific clip from a long video. Below are these descriptions:\n\n"
        "**Dense Description:**\n"
        f"{dense_caption}\n\n"
        "**Background Description:**\n"
        f"{background_caption}\n\n"
        "**Main Object Description:**\n"
        f"{main_object_caption}\n\n"
        "Based on these descriptions and the system instructions, generate **one** high-quality attribute perception question-and-answer pair.\n\n"
        "## REQUIREMENTS:\n"
        "- The question must focus on **identifying the specific attribute of visible objects, such as colors, materials, professions, ownership, appearance, position, role, or other static properties.**\n"
        "- The answer must be directly observable in the description without any reasoning or inference.\n\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Options': ['A. ...', 'B. ...', 'C. ...', 'D. ...'], 'Answer': 'Correct answer here...'}]\n\n"
        "**Only return the QA pair in the specified JSON list format.**"
    )

    return system_prompt, user_prompt
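
# The builders above ask the model for a Python-literal-style list (single-quoted
# keys and strings), which strict JSON parsers reject. A minimal parsing sketch,
# assuming the model follows the requested format; parse_qa_response is a
# hypothetical helper, not part of the original pipeline.
import ast

def parse_qa_response(response_text):
    """Parse a model response into a list of QA dicts; return None on failure."""
    try:
        qa_list = ast.literal_eval(response_text.strip())
    except (ValueError, SyntaxError):
        return None
    return qa_list if isinstance(qa_list, list) else None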


openqa_example_pool = [
    {"Q": "What is Theresa Woo's profession?",
     "Answer": "Theresa Woo's profession is a journalist."},

    {"Q": "What is true about the theory of the formation of the earth that NASA believes in?",
     "Answer": "The theory of the formation of the earth that NASA believes in involves solar wind as part of the process."},

    {"Q": "What color is the makeup bag in the video?",
     "Answer": "The makeup bag in the video is brown."},

    {"Q": "What is the color of the parrot that stands on wood?",
     "Answer": "The color of the parrot that stands on wood is green."},

    {"Q": "Which element has the highest peak in the spectra of the sample taken by Dr. Rudy Reimer?",
     "Answer": "The element with the highest peak in the spectra of the sample taken by Dr. Rudy Reimer is zirconium."},

    {"Q": "What color is the laptop the woman is holding?",
     "Answer": "The laptop the woman is holding is pink."},

    {"Q": "In line with the video evidence, which statement about the world's longest railway line is not correct?",
     "Answer": "The statement that the world's longest railway line was built by Russia and China is not correct."},

    {"Q": "Based on the video, how is the group dressed for their performance?",
     "Answer": "The group is dressed in formal attire with black and white colors for their performance."},

    {"Q": "What do the two people have in common?",
     "Answer": "The two people both wear black shirts."},

    {"Q": "What is the lichen possibly like according to its image in the video?",
     "Answer": "According to its image in the video, the lichen is possibly a strip-shaped single-cell aggregate."},

    {"Q": "What are the colors of the famous statue of Augustus Caesar?",
     "Answer": "The colors of the famous statue of Augustus Caesar are mainly white and red."},

    {"Q": "Which car was the first to be designed with an automobile roof?",
     "Answer": "The first car to be designed with an automobile roof was the Type 44."},

    {"Q": "Which feature can describe Hamlet in the video?",
     "Answer": "Hamlet in the video can be described as having long hair."},

    {"Q": "Which method of revealing the original color of ancient artwork is not mentioned in the video?",
     "Answer": "The method of revealing the original color of ancient artwork by imagining the color that fits in well with the artwork is not mentioned in the video."},

    {"Q": "What outfit was Cardi B wearing?",
     "Answer": "Cardi B was wearing a blue dress with a unique, sculptural design that resembles a 'waterfall of fabric' flowing around her."}
]


def prompt_miradata_based_text_constraint_openqa(dense_caption, background_caption, main_object_caption):
    task_inst_part = (
        "You are an AI assistant tasked with generating **high-quality attribute perception questions** based on a video snippet description from a long video.\n\n"
        "## TASK:\n"
        "Generate **one** high-quality **attribute perception question** that requires identifying properties or characteristics of objects/people (such as colors, materials, professions, ownership, appearance, position, role, or other static properties), **while using composite features or contextual constraints to ensure answer uniqueness** in longer videos.\n"
        "The answer must be provided as a complete sentence, clearly supported by the visual or narrative content of the video description.\n\n"
        "## CRITICAL RULES:\n"
        "1. **Uniqueness Guarantee**: Each question must include one of the following constraints to ensure answer uniqueness:\n"
        "- **Composite Feature**: Combine multiple attributes (e.g., 'What is the color of the hat worn by the woman *in the red dress*?').\n"
        "- **Contextual Constraint**: Use scene or background information (e.g., 'What is the brand of the car *in the parking lot*?').\n"
        "- **Functional/Usage Constraint**: Describe the object's function, purpose, or role (e.g., 'What is the material of the gloves *used for cooking*?').\n"
        "2. **Static Attributes Only**: Focus on static features such as colors, materials, professions, ownership, appearance, position, role, or other static properties.\n"
        "3. **Description Grounding**: Answers must be directly verifiable from the provided text.\n"
        "4. **Focus on Visual Entity Attributes**: The question must test the model’s ability to recognize the specific attribute of **objects**.\n"
        "5. **Avoid Extraneous Information**: Do not rely on subtitles, voiceovers, or audio cues unless explicitly mentioned in the description.\n"
        "6. **Clear and Logical Phrasing**: Keep the question clear, specific, and logically phrased to avoid ambiguity.\n\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Question...', 'Answer': 'Complete sentence answer...'}]\n\n"
    )

    # Sample three few-shot examples and format them for the system prompt.
    chosen_examples = random.sample(openqa_example_pool, 3)
    example_part = "## EXAMPLES:\n"
    for idx, example in enumerate(chosen_examples):
        example_part += (
            f"{idx + 1}. {{'Q': '{example['Q']}',\n"
            f"   'Answer': '{example['Answer']}'}}\n\n"
        )

    system_prompt = task_inst_part + example_part

    user_prompt = (
        "I have provided you with three descriptions, each covering a different aspect of a specific clip from a long video. Below are these descriptions:\n\n"
        "**Dense Description:**\n"
        f"{dense_caption}\n\n"
        "**Background Description:**\n"
        f"{background_caption}\n\n"
        "**Main Object Description:**\n"
        f"{main_object_caption}\n\n"
        "Based on these descriptions and the system instructions, generate **one** high-quality attribute perception question-and-answer pair.\n\n"
        "## REQUIREMENTS:\n"
        "- The question must focus on **identifying the specific attribute of visible objects, such as colors, materials, professions, ownership, appearance, position, role, or other static properties.**\n"
        "- You must use a composite feature, contextual constraint, or functional/usage constraint in the question, thereby ensuring answer uniqueness.\n"
        "- The answer must be directly observable in the description without any reasoning or inference.\n\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Answer': 'Your complete sentence answer here...'}]\n\n"
        "**Only return the QA pair in the specified JSON list format.**"
    )

    return system_prompt, user_prompt


def prompt_miradata_based_text_openqa(dense_caption, background_caption, main_object_caption):
    task_inst_part = (
        "You are an AI assistant tasked with generating **high-quality attribute perception questions** based on a video snippet description from a long video.\n\n"
        "## TASK:\n"
        "Generate **one** high-quality **attribute perception question** that requires identifying properties or characteristics of objects/people (such as colors, materials, professions, ownership, appearance, position, role, etc.), **while using composite features or contextual constraints to ensure answer uniqueness** in longer videos.\n"
        "The answer must be provided as a complete sentence, clearly supported by the visual or narrative content of the video description.\n\n"
        "## INSTRUCTIONS:\n"
        "- **Focus on Attributes**: The question must test the model’s ability to recognize **colors, materials, professions, ownership, factual claims, or other static properties**.\n"
        "- **Ground in Description**: All answers must be verifiable from the provided description without temporal reasoning or external knowledge.\n"
        "- **Avoid Actions**: Do not ask about dynamic behaviors or events (e.g., 'What is X doing?').\n"
        "- **Explicit Evidence**: Ensure the correct answer is directly stated or visually implied in the description.\n"
        "- **Output Format**: Format the output as a list of dictionaries with:\n"
        "  - `'Q'`: The question.\n"
        "  - `'Answer'`: The correct answer as a complete sentence.\n\n"
    )

    # Sample three few-shot examples and format them for the system prompt.
    chosen_examples = random.sample(openqa_example_pool, 3)
    example_part = "## EXAMPLES:\n"
    for idx, example in enumerate(chosen_examples):
        example_part += (
            f"{idx + 1}. {{'Q': '{example['Q']}',\n"
            f"   'Answer': '{example['Answer']}'}}\n\n"
        )

    guidelines_part = (
        "## GUIDELINES FOR CREATING QUESTIONS:\n"
        "- **Attribute Specificity**: Focus on singular properties (not 'Describe X').\n"
        "- **Factual Grounding**: For truth-judgment questions, base answers on explicit claims in the description.\n"
        "- **Complete Sentence Answers**: Always provide the answer as a grammatically correct, complete sentence.\n"
        "- **No Implicit Knowledge**: Avoid questions requiring unstated information.\n\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Question...', 'Answer': 'Your complete sentence answer here...'}]"
    )

    system_prompt = task_inst_part + example_part + guidelines_part

    user_prompt = (
        "I have provided you with three descriptions, each covering a different aspect of a specific clip from a long video. Below are these descriptions:\n\n"
        "**Dense Description:**\n"
        f"{dense_caption}\n\n"
        "**Background Description:**\n"
        f"{background_caption}\n\n"
        "**Main Object Description:**\n"
        f"{main_object_caption}\n\n"
        "Based on these descriptions and the system instructions, generate **one** high-quality attribute perception question-and-answer pair.\n\n"
        "## REQUIREMENTS:\n"
        "- The question must focus on **identifying the specific attribute of visible objects, such as colors, materials, professions, ownership, appearance, position, role, or other static properties.**\n"
        "- The answer must be directly observable in the description without any reasoning or inference.\n\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Answer': 'Your complete sentence answer here...'}]\n\n"
        "**Only return the QA pair in the specified JSON list format.**"
    )

    return system_prompt, user_prompt
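

# A minimal smoke test with hypothetical placeholder captions (not real MiraData
# annotations), showing how the four prompt builders are called.
if __name__ == "__main__":
    dense = "A woman in a red dress holds a silver laptop in a cafe."
    background = "The cafe has wooden tables and a chalkboard menu on the wall."
    main_obj = "The main object is the silver laptop with a round sticker on its lid."

    for builder in (
        prompt_miradata_based_text_constraint_mcqa,
        prompt_miradata_based_text_mcqa,
        prompt_miradata_based_text_constraint_openqa,
        prompt_miradata_based_text_openqa,
    ):
        system_prompt, user_prompt = builder(dense, background, main_obj)
        print(f"=== {builder.__name__} ===")
        print(user_prompt[:120], "...\n")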