# Prompt templates and few-shot example pools for generating video
# object-recognition QA (multiple-choice and open-ended) from MiraData-style
# clip captions (dense / background / main-object descriptions).
import random
random.seed(42)
# Few-shot examples for the multiple-choice QA prompt builders.
# Each entry has a question, four lettered options, and the correct answer.
# Answers are stored as the bare option letter ('A'-'D') to match the
# "## OUTPUT FORMAT" instruction in the prompts (the pool previously mixed
# bare letters with full option text, which taught an inconsistent format).
mcqa_example_pool = [
    {
        "Q": "What does Jon Snow use to fight with Ramsay Bolton?",
        "Options": ["A. A shield.", "B. A sword.", "C. An Axe.", "D. A spear."],
        "Answer": "A"
    },
    {
        "Q": "What card does the male judge pick?",
        "Options": ["A. 2 of spades.", "B. 2 of diamonds.", "C. 2 of hearts.", "D. 2 of clubs."],
        "Answer": "A"
    },
    {
        "Q": "Who finally find the lost city?",
        "Options": ["A. Terra preta.", "B. Fawcett.", "C. European expeditions.", "D. Dr.Michael Heckenberger."],
        "Answer": "D"
    },
    {
        "Q": "What sport are the two teams of athletes playing?",
        "Options": ["A. Ice hockey.", "B. Soccer.", "C. Rugby.", "D. Basketball."],
        "Answer": "C"
    },
    {
        "Q": "What item is not used to decorate the Christmas tree?",
        "Options": ["A. Red balls.", "B. Lights.", "C. Green stars.", "D. Icicles."],
        "Answer": "C"
    },
    {
        "Q": "What is the main subject matter of the advertisement featured in the video?",
        "Options": ["A. Audible app.", "B. Music listening app.", "C. Shopping app.", "D. Video online playing app."],
        "Answer": "A"
    },
    {
        "Q": "What country's practice game is this?",
        "Options": ["A. UK.", "B. USA.", "C. Canada.", "D. Australia."],
        "Answer": "B"
    },
    {
        "Q": "According to the video, which team ultimately won?",
        "Options": ["A. China.", "B. Italy.", "C. USA.", "D. France."],
        "Answer": "A"
    },
    {
        "Q": "Which cellular structure is responsible for receiving proteins according to the video?",
        "Options": ["A. Golgi apparatus (Golgi body).", "B. Nucleus.", "C. Ribosome.", "D. Mitochondrion."],
        "Answer": "A"
    },
    {
        "Q": "At the beginning, what is the player's rank?",
        "Options": ["A. Third.", "B. First.", "C. Second.", "D. Last."],
        "Answer": "D"
    },
    {
        "Q": "Which team in the video reached the finish line first?",
        "Options": ["A. USA team.", "B. Canadian team.", "C. Ghana team.", "D. South Africa team."],
        "Answer": "B"
    },
    {
        "Q": "What is the identity of the athlete in the video who committed fouls on all attempts except the first one?",
        "Options": ["A. He is an athlete of the Chinese team.", "B. He is an athlete of the Jamaican team.", "C. He is a neutral individual athlete.", "D. It is not mentioned in the video."],
        "Answer": "C"
    },
    {
        "Q": "The main character of the video is observing the surface of the moon when he notices a straight line, what is it?",
        "Options": ["A. Lunar Ridge.", "B. Collapsed lava tubes.", "C. Rift valley systems.", "D. Scratch marks."],
        "Answer": "B"
    },
    {
        "Q": "Which woman works as a chef?",
        "Options": ["A. Diamante.", "B. Carola Ordenes.", "C. Amina.", "D. Ghizlane."],
        "Answer": "A"
    },
    {
        "Q": "What kind of chess are the old people in the video playing?",
        "Options": ["A. Mahjong.", "B. Go.", "C. Chinese chess.", "D. Five-in-a-row."],
        "Answer": "C"
    },
    {
        "Q": "Which ingredient is not used in the video?",
        "Options": ["A. Hot glue.", "B. Pieces of burlap.", "C. Florals.", "D. Plastic bottles."],
        "Answer": "D"
    },
    {
        "Q": "Who does the video focus on regarding their work with globular clusters?",
        "Options": ["A. Harlow Shapley.", "B. Walter Baade.", "C. William Herschel.", "D. Henrietta Swan Levitt."],
        "Answer": "A"
    }
]
def prompt_miradata_based_text_constraint_mcqa(dense_caption, background_caption, main_object_caption):
    """Build (system_prompt, user_prompt) for constraint-based MCQA generation.

    The system prompt asks for one object-recognition multiple-choice question
    whose answer uniqueness is guaranteed by a specific action, event, or
    composite feature, and embeds three few-shot examples sampled from
    ``mcqa_example_pool``.

    Args:
        dense_caption: Dense description of the video clip.
        background_caption: Background description of the clip.
        main_object_caption: Main-object description of the clip.

    Returns:
        tuple[str, str]: ``(system_prompt, user_prompt)``.
    """
    task_inst_part = (
        "You are an AI assistant tasked with generating **high-quality object recognition questions** based on a video snippet description from a long video.\n\n"
        "## TASK:\n"
        "Generate **one** high-quality **object recognition question** that requires identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc. **while using an event, action or composite feature to constrain the question, thereby ensuring answer uniqueness** in long videos.\n"
        "You must also provide **4 answer options (A–D)**, with only one correct answer, which is clearly supported by the visual or narrative content of the video description.\n\n"
        # Fix: the header and rules 1-3 previously lacked "\n" terminators and
        # were concatenated onto a single run-on line in the rendered prompt.
        "## CRITICAL RULES:\n"
        "1. **Uniqueness Guarantee**: Each question must include either:\n"
        "   - A **specific action** (e.g., 'What does the woman use to cut the ribbon?'), OR\n"
        "   - A **specific event** (e.g., 'What falls off the table when the dog bumps into it?'), OR\n"
        "   - A **composite feature** (e.g., 'What does the girl in the red dress hold in her hand?').\n"
        "2. **Visual Grounding**: Answers must be verifiable from a single frame or short clip.\n"
        # Fix: heading was garbled as "Description DescrGroundingiption".
        "3. **Description Grounding**: Ensure that the answer is grounded in the video's description, not general knowledge or external information.\n"
        "4. **No Temporal Reasoning**: Avoid questions requiring comparing frames (e.g., 'what happened next?').\n"
        "5. **Focus on Visual Entities**: The question must test the model’s ability to recognize **objects**.\n"
        "6. **Avoid Extraneous Information**: Do not rely on subtitles, voiceovers, or audio cues unless explicitly mentioned in the description.\n"
        "7. **Clear and Logical Phrasing**: Keep the question clear, specific, and logically phrased to avoid ambiguity.\n\n"
        "## OUTPUT FORMAT: Format the output as a list of dictionaries with the following keys:\n"
        "  - `'Q'`: The question.\n"
        "  - `'Options'`: A list of four answer options labeled 'A', 'B', 'C', and 'D'.\n"
        "  - `'Answer'`: The correct answer (e.g., `'A'`, `'B'`, etc.).\n"
        "\n"
    )
    # Render three randomly chosen few-shot examples as pseudo-JSON dicts.
    example_lines = ["## EXAMPLES:\n"]
    for idx, example in enumerate(random.sample(mcqa_example_pool, 3), start=1):
        options = example['Options']
        example_lines.append(
            f"{idx}. {{'Q': '{example['Q']}',\n"
            "    'Options': [\n"
            f"        '{options[0]}',\n"
            f"        '{options[1]}',\n"
            f"        '{options[2]}',\n"
            f"        '{options[3]}'\n"
            "    ],\n"
            f"    'Answer': '{example['Answer']}'}}\n"
            "\n"
        )
    system_prompt = task_inst_part + "".join(example_lines)
    user_prompt = (
        # Fix: grammar ("descriptions ... Below are these descriptions"),
        # matching the openqa variants later in this file.
        "I have provided you with three different aspect descriptions of a specific clip from a long video. Below are these descriptions:\n\n"
        "**Dense Description:**\n"
        f"{dense_caption}\n\n"
        "**Background Description:**\n"
        f"{background_caption}\n\n"
        "**Main Object Description:**\n"
        f"{main_object_caption}\n\n"
        "Based on these descriptions and the system instructions, generate **one** high-quality object recognition question-and-answer pair.\n\n"
        "## REQUIREMENTS:\n"
        "- The question must focus on **identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.**\n"
        "- You must use an action, event or composite feature in the question to constrain the question, thereby ensuring answer uniqueness.\n"
        "- The answer must be directly observable in the description without any reasoning or inference.\n\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Options': ['A. ...', 'B. ...', 'C. ...', 'D. ...'], 'Answer': 'Correct answer here...'}]\n\n"
        "**Only return the QA pair in the specified JSON list format.**"
    )
    return system_prompt, user_prompt
def prompt_miradata_based_text_mcqa(dense_caption, background_caption, main_object_caption):
    """Build (system_prompt, user_prompt) for unconstrained MCQA generation.

    Unlike the ``_constraint_`` variant, the question is not required to be
    pinned down by an action/event/composite feature. Embeds three few-shot
    examples sampled from ``mcqa_example_pool`` plus extra guidelines.

    Args:
        dense_caption: Dense description of the video clip.
        background_caption: Background description of the clip.
        main_object_caption: Main-object description of the clip.

    Returns:
        tuple[str, str]: ``(system_prompt, user_prompt)``.
    """
    task_inst_part = (
        "You are an AI assistant tasked with generating **high-quality object recognition questions** based on a video snippet description from a long video.\n\n"
        "## TASK:\n"
        "Generate **one** high-quality **object recognition question** that requires identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.\n"
        "You must also provide **4 answer options (A–D)**, with only one correct answer, which is clearly supported by the visual or narrative content of the video description.\n\n"
        "## INSTRUCTIONS:\n"
        "- **Focus on Visual Entities**: The question must test the model’s ability to recognize **objects**.\n"
        "- **Ground in Visuals**: All answers must be verifiable by pausing a single frame or short clip. Avoid actions, motivations, or temporal reasoning.\n"
        "- **Ground in the Description**: Ensure that the answer is grounded in the video's description, not general knowledge or external information.\n"
        "- **Avoid Extraneous Information**: Do not rely on subtitles, voiceovers, or audio cues unless explicitly mentioned in the description.\n"
        "- **Clear and Logical Phrasing**: Keep the question clear, specific, and logically phrased to avoid ambiguity.\n"
        "- **Output Format**: Format the output as a list of dictionaries with the following keys:\n"
        "  - `'Q'`: The question.\n"
        "  - `'Options'`: A list of four answer options labeled 'A', 'B', 'C', and 'D'.\n"
        "  - `'Answer'`: The correct answer (e.g., `'A'`, `'B'`, etc.).\n"
        "\n"
    )
    # Render three randomly chosen few-shot examples as pseudo-JSON dicts.
    example_lines = ["## EXAMPLES:\n"]
    for idx, example in enumerate(random.sample(mcqa_example_pool, 3), start=1):
        options = example['Options']
        example_lines.append(
            f"{idx}. {{'Q': '{example['Q']}',\n"
            "    'Options': [\n"
            f"        '{options[0]}',\n"
            f"        '{options[1]}',\n"
            f"        '{options[2]}',\n"
            f"        '{options[3]}'\n"
            "    ],\n"
            f"    'Answer': '{example['Answer']}'}}\n"
            "\n"
        )
    guidelines_part = (
        "## GUIDELINES FOR CREATING QUESTIONS:\n"
        # Fix: "object" -> "objects" for grammar, matching the openqa variant.
        "- **Specificity**: Ask about singular, clearly defined objects.\n"
        "- **Visual Certainty**: Ensure the correct answer is unambiguous.\n"
        "- **Description Grounding**: Base all questions and answers on the video description.\n"
        "- **Plausible Distractors**: Wrong options should be visually similar (e.g., other kitchen tools if asking about a pan).\n"
        "- **No Implicit Knowledge**: Avoid questions requiring domain knowledge (e.g., 'What brand is the car?' is invalid unless the logo is visible).\n"
        "\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Options': ['A. ...', 'B. ...', 'C. ...', 'D. ...'], 'Answer': 'Correct answer here...'}]"
    )
    system_prompt = task_inst_part + "".join(example_lines) + guidelines_part
    user_prompt = (
        # Fix: grammar ("descriptions ... Below are these descriptions"),
        # matching the openqa variants later in this file.
        "I have provided you with three different aspect descriptions of a specific clip in a video. Below are these descriptions:\n\n"
        "**Dense Description:**\n"
        f"{dense_caption}\n\n"
        "**Background Description:**\n"
        f"{background_caption}\n\n"
        "**Main Object Description:**\n"
        f"{main_object_caption}\n\n"
        "Based on these descriptions and the system instructions, generate **one** high-quality object recognition question-and-answer pair.\n\n"
        "## REQUIREMENTS:\n"
        "- The question must focus on **identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.**\n"
        "- The answer must be directly observable in the description without any reasoning or inference.\n\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Options': ['A. ...', 'B. ...', 'C. ...', 'D. ...'], 'Answer': 'Correct answer here...'}]\n\n"
        "**Only return the QA pair in the specified JSON list format.**"
    )
    return system_prompt, user_prompt
# Few-shot examples for the open-ended QA prompt builders: each entry pairs a
# question with a complete-sentence answer (no multiple-choice options),
# mirroring the scenarios in mcqa_example_pool.
openqa_example_pool = [
    {
        "Q": "What does Jon Snow use to fight with Ramsay Bolton?",
        "Answer": "Jon Snow uses a shield to fight with Ramsay Bolton."
    },
    {
        "Q": "What card does the male judge pick?",
        "Answer": "The male judge picks the 2 of spades."
    },
    {
        "Q": "Who finally finds the lost city?",
        "Answer": "Dr. Michael Heckenberger is the person who finally finds the lost city."
    },
    {
        "Q": "What sport are the two teams of athletes playing?",
        "Answer": "The two teams of athletes are playing rugby."
    },
    {
        "Q": "What item is not used to decorate the Christmas tree?",
        "Answer": "Green stars are not used to decorate the Christmas tree."
    },
    {
        "Q": "What is the main subject matter of the advertisement featured in the video?",
        "Answer": "The main subject matter of the advertisement featured in the video is the Audible app."
    },
    {
        "Q": "What country's practice game is this?",
        "Answer": "This is a practice game from the USA."
    },
    {
        "Q": "According to the video, which team ultimately won?",
        "Answer": "According to the video, the team that ultimately won is China."
    },
    {
        "Q": "Which cellular structure is responsible for receiving proteins according to the video?",
        "Answer": "According to the video, the Golgi apparatus (Golgi body) is responsible for receiving proteins."
    },
    {
        "Q": "At the beginning, what is the player's rank?",
        "Answer": "At the beginning, the player's rank is last."
    },
    {
        "Q": "Which team in the video reached the finish line first?",
        "Answer": "In the video, the Canadian team reached the finish line first."
    },
    {
        "Q": "What is the identity of the athlete in the video who committed fouls on all attempts except the first one?",
        "Answer": "The athlete in the video who committed fouls on all attempts except the first one is a neutral individual athlete."
    },
    {
        "Q": "The main character of the video is observing the surface of the moon when he notices a straight line, what is it?",
        "Answer": "The straight line that the main character notices on the surface of the moon is collapsed lava tubes."
    },
    {
        "Q": "Which woman works as a chef?",
        "Answer": "The woman who works as a chef is Diamante."
    },
    {
        "Q": "What kind of chess are the old people in the video playing?",
        "Answer": "The old people in the video are playing Chinese chess."
    },
    {
        "Q": "Which ingredient is not used in the video?",
        "Answer": "Plastic bottles are not used in the video."
    },
    {
        "Q": "Who does the video focus on regarding their work with globular clusters?",
        "Answer": "The video focuses on Harlow Shapley regarding his work with globular clusters."
    }
]
def prompt_miradata_based_text_constraint_openqa(dense_caption, background_caption, main_object_caption):
    """Build (system_prompt, user_prompt) for constraint-based open-ended QA.

    The system prompt asks for one object-recognition question whose answer
    uniqueness is guaranteed by a specific action, event, or composite
    feature, answered in a complete sentence. Embeds three few-shot examples
    sampled from ``openqa_example_pool``.

    Args:
        dense_caption: Dense description of the video clip.
        background_caption: Background description of the clip.
        main_object_caption: Main-object description of the clip.

    Returns:
        tuple[str, str]: ``(system_prompt, user_prompt)``.
    """
    task_inst_part = (
        "You are an AI assistant tasked with generating **high-quality object recognition questions** based on a video snippet description from a long video.\n\n"
        "## TASK:\n"
        "Generate **one** high-quality **object recognition question** that requires identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc. **while using an event, action or composite feature to constrain the question, thereby ensuring answer uniqueness** in long videos.\n\n"
        # Fix: the header and rules 1-2 previously lacked "\n" terminators and
        # were concatenated onto a single run-on line in the rendered prompt.
        "## CRITICAL RULES:\n"
        "1. **Uniqueness Guarantee**: Each question must include either:\n"
        "   - A **specific action** (e.g., 'What does the woman use to cut the ribbon?'), OR\n"
        "   - A **specific event** (e.g., 'What falls off the table when the dog bumps into it?'), OR\n"
        "   - A **composite feature** (e.g., 'What does the girl in the red dress hold in her hand?').\n"
        "2. **Visual Grounding**: Answers must be verifiable from a single frame or short clip.\n"
        "3. **Description Grounding**: Ensure that the answer is grounded in the video's description, not general knowledge or external information.\n"
        "4. **No Temporal Reasoning**: Avoid questions requiring comparing frames (e.g., 'what happened next?').\n"
        "5. **Focus on Visual Entities**: The question must test the model’s ability to recognize **objects**.\n"
        "6. **Avoid Extraneous Information**: Do not rely on subtitles, voiceovers, or audio cues unless explicitly mentioned in the description.\n"
        "7. **Clear and Logical Phrasing**: Keep the question clear, specific, and logically phrased to avoid ambiguity.\n\n"
        "## OUTPUT FORMAT: Format the output as a list of dictionaries with the following keys:\n"
        "  - `'Q'`: The question.\n"
        "  - `'Answer'`: The correct answer as a complete sentence.\n"
        "\n"
    )
    # Use the OpenQA example pool (question + full-sentence answer, no options).
    example_lines = ["## EXAMPLES:\n"]
    for idx, example in enumerate(random.sample(openqa_example_pool, 3), start=1):
        example_lines.append(
            f"{idx}. {{'Q': '{example['Q']}',\n"
            f"    'Answer': '{example['Answer']}'}}\n"
            "\n"
        )
    system_prompt = task_inst_part + "".join(example_lines)
    user_prompt = (
        "I have provided you with three different aspect descriptions of a specific clip from a long video. Below are these descriptions:\n\n"
        "**Dense Description:**\n"
        f"{dense_caption}\n\n"
        "**Background Description:**\n"
        f"{background_caption}\n\n"
        "**Main Object Description:**\n"
        f"{main_object_caption}\n\n"
        "Based on these descriptions and the system instructions, generate **one** high-quality object recognition question-and-answer pair.\n\n"
        "## REQUIREMENTS:\n"
        "- The question must focus on **identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.**\n"
        "- You must use an action, event or composite feature in the question to constrain the question, thereby ensuring answer uniqueness.\n"
        "- The answer must be directly observable in the description without any reasoning or inference.\n\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Answer': 'Your complete sentence answer here...'}]\n\n"
        "**Only return the QA pair in the specified JSON list format.**"
    )
    return system_prompt, user_prompt
def prompt_miradata_based_text_openqa(dense_caption, background_caption, main_object_caption):
    """Build (system_prompt, user_prompt) for unconstrained open-ended QA.

    Unlike the ``_constraint_`` variant, the question is not required to be
    pinned down by an action/event/composite feature. Embeds three few-shot
    examples sampled from ``openqa_example_pool`` plus extra guidelines.
    (The redundant mid-file ``import random`` that preceded this function was
    removed; ``random`` is imported once at the top of the module.)

    Args:
        dense_caption: Dense description of the video clip.
        background_caption: Background description of the clip.
        main_object_caption: Main-object description of the clip.

    Returns:
        tuple[str, str]: ``(system_prompt, user_prompt)``.
    """
    task_inst_part = (
        "You are an AI assistant tasked with generating **high-quality object recognition questions** based on a video snippet description from a long video.\n\n"
        "## TASK:\n"
        "Generate **one** high-quality **object recognition question** that requires identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.\n\n"
        "The answer must be provided as a complete sentence, clearly supported by the visual or narrative content of the video description.\n\n"
        "## INSTRUCTIONS:\n"
        "- **Focus on Visual Entities**: The question must test the model’s ability to recognize **objects**.\n"
        "- **Ground in Visuals**: All answers must be verifiable by pausing a single frame or short clip. Avoid actions, motivations, or temporal reasoning.\n"
        "- **Ground in the Description**: Ensure that the answer is grounded in the video's description, not general knowledge or external information.\n"
        "- **Avoid Extraneous Information**: Do not rely on subtitles, voiceovers, or audio cues unless explicitly mentioned in the description.\n"
        "- **Clear and Logical Phrasing**: Keep the question clear, specific, and logically phrased to avoid ambiguity.\n"
        "- **Output Format**: Format the output as a list of dictionaries with the following keys:\n"
        "  - `'Q'`: The question.\n"
        "  - `'Answer'`: The correct answer as a complete sentence.\n"
        "\n"
    )
    # Use the OpenQA example pool (question + full-sentence answer, no options).
    example_lines = ["## EXAMPLES:\n"]
    for idx, example in enumerate(random.sample(openqa_example_pool, 3), start=1):
        example_lines.append(
            f"{idx}. {{'Q': '{example['Q']}',\n"
            f"    'Answer': '{example['Answer']}'}}\n"
            "\n"
        )
    guidelines_part = (
        "## GUIDELINES FOR CREATING QUESTIONS:\n"
        "- **Specificity**: Ask about singular, clearly defined objects.\n"
        "- **Visual Certainty**: Ensure the correct answer is unambiguous and directly observable in the description.\n"
        "- **Description Grounding**: Base all questions and answers on the video description.\n"
        "- **No Implicit Knowledge**: Avoid questions requiring domain knowledge (e.g., 'What brand is the car?' is invalid unless the logo is visible).\n"
        "- **Complete Sentence Answers**: Always provide the answer as a grammatically correct, complete sentence.\n"
        "\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Answer': 'Your complete sentence answer here...'}]"
    )
    system_prompt = task_inst_part + "".join(example_lines) + guidelines_part
    user_prompt = (
        "I have provided you with three different aspect descriptions of a specific clip in a video. Below are these descriptions:\n\n"
        "**Dense Description:**\n"
        f"{dense_caption}\n\n"
        "**Background Description:**\n"
        f"{background_caption}\n\n"
        "**Main Object Description:**\n"
        f"{main_object_caption}\n\n"
        "Based on these descriptions and the system instructions, generate **one** high-quality object recognition question-and-answer pair.\n\n"
        "## REQUIREMENTS:\n"
        "- The question must focus on **identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.**\n"
        "- The answer must be directly observable in the description without any reasoning or inference.\n\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Answer': 'Your complete sentence answer here...'}]\n\n"
        "**Only return the QA pair in the specified JSON list format.**"
    )
    return system_prompt, user_prompt