# File size: 23,475 Bytes
# 55500d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
import random
random.seed(42)  # fixed seed so random.sample() of the example pools below is reproducible

# Few-shot example pool for multiple-choice QA prompts. Each entry has a
# question 'Q', four 'Options' labeled A-D, and an 'Answer'. Note that the
# 'Answer' values deliberately mix formats: some are a bare letter ('A'),
# others repeat the letter plus the option text ('A. A shield') — this shows
# the model that both answer styles occur.
mcqa_example_pool = [
    {
        "Q": "What does Jon Snow use to fight with Ramsay Bolton?",
        "Options": ["A. A shield.", "B. A sword.", "C. An Axe.", "D. A spear."],
        "Answer": "A. A shield"
    },
    {
        "Q": "What card does the male judge pick?",
        "Options": ["A. 2 of spades.", "B. 2 of diamonds.", "C. 2 of hearts.", "D. 2 of clubs."],
        "Answer": "A"
    },
    {
        "Q": "Who finally find the lost city?",
        "Options": ["A. Terra preta.", "B. Fawcett.", "C. European expeditions.", "D. Dr.Michael Heckenberger."],
        "Answer": "D. Dr.Michael Heckenberger."
    },
    {
        "Q": "What sport are the two teams of athletes playing?",
        "Options": ["A. Ice hockey.", "B. Soccer.", "C. Rugby.", "D. Basketball."],
        "Answer": "C"
    },
    {
        "Q": "What item is not used to decorate the Christmas tree?",
        "Options": ["A. Red balls.", "B. Lights.", "C. Green stars.", "D. Icicles."],
        "Answer": "C. Green stars."
    },
    {
        "Q": "What is the main subject matter of the advertisement featured in the video?",
        "Options": ["A. Audible app.", "B. Music listening app.", "C. Shopping app.", "D. Video online playing app."],
        "Answer": "A"
    },
    {
        "Q": "What country's practice game is this?",
        "Options": ["A. UK.", "B. USA.", "C. Canada.", "D. Australia."],
        "Answer": "B. USA."
    },
    {
        "Q": "According to the video, which team ultimately won?",
        "Options": ["A. China.", "B. Italy.", "C. USA.", "D. France."],
        "Answer": "A"
    },
    {
        "Q": "Which cellular structure is responsible for receiving proteins according to the video?",
        "Options": ["A. Golgi apparatus (Golgi body).", "B. Nucleus.", "C. Ribosome.", "D. Mitochondrion."],
        "Answer": "A. Golgi apparatus (Golgi body)."
    },
    {
        "Q": "At the beginning, what is the player's rank?",
        "Options": ["A. Third.", "B. First.", "C. Second.", "D. Last."],
        "Answer": "D"
    },
    {
        "Q": "Which team in the video reached the finish line first?",
        "Options": ["A. USA team.", "B. Canadian team.", "C. Ghana team.", "D. South Africa team."],
        "Answer": "B"
    },
    {
        "Q": "What is the identity of the athlete in the video who committed fouls on all attempts except the first one?",
        "Options": ["A. He is an athlete of the Chinese team.", "B. He is an athlete of the Jamaican team.", "C. He is a neutral individual athlete.", "D. It is not mentioned in the video."],
        "Answer": "C. He is a neutral individual athlete."
    },
    {
        "Q": "The main character of the video is observing the surface of the moon when he notices a straight line, what is it?",
        "Options": ["A. Lunar Ridge.", "B. Collapsed lava tubes.", "C. Rift valley systems.", "D. Scratch marks."],
        "Answer": "B"
    },
    {
        "Q": "Which woman works as a chef?",
        "Options": ["A. Diamante.", "B. Carola Ordenes.", "C. Amina.", "D. Ghizlane."],
        "Answer": "A"
    },
    {
        "Q": "What kind of chess are the old people in the video playing?",
        "Options": ["A. Mahjong.", "B. Go.", "C. Chinese chess.", "D. Five-in-a-row."],
        "Answer": "C. Chinese chess."
    },
    {
        "Q": "Which ingredient is not used in the video?",
        "Options": ["A. Hot glue.", "B. Pieces of burlap.", "C. Florals.", "D. Plastic bottles."],
        "Answer": "D"
    },
    {
        "Q": "Who does the video focus on regarding their work with globular clusters?",
        "Options": ["A. Harlow Shapley.", "B. Walter Baade.", "C. William Herschel.", "D. Henrietta Swan Levitt."],
        "Answer": "A"
    }
]

def prompt_miradata_based_text_constraint_mcqa(dense_caption, background_caption, main_object_caption):
    """Build (system_prompt, user_prompt) for one *constrained* MCQA pair.

    The system prompt asks for a single object-recognition multiple-choice
    question whose phrasing embeds an action, event, or composite feature to
    guarantee answer uniqueness in long videos, followed by 3 few-shot
    examples sampled from ``mcqa_example_pool``.

    Fixes vs. the previous version:
    - "Description DescrGroundingiption" was a garbled rule title; restored to
      "Description Grounding" (matching the openqa variant of this prompt).
    - The CRITICAL RULES 1 lines were missing ``\\n`` terminators, so rule 1
      and its three bullet items ran together on a single line.
    - User-prompt grammar aligned with the openqa variant ("descriptions").

    Args:
        dense_caption: Dense description of the video clip.
        background_caption: Background description of the clip.
        main_object_caption: Main-object description of the clip.

    Returns:
        tuple[str, str]: ``(system_prompt, user_prompt)``.
    """
    task_inst_part = (
        "You are an AI assistant tasked with generating **high-quality object recognition questions** based on a video snippet description from a long video.\n\n"
        "## TASK:\n"
        "Generate **one** high-quality **object recognition question** that requires identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc. **while using an event, action or composite feature to constrain the question, thereby ensuring answer uniqueness** in long videos.\n"
        "You must also provide **4 answer options (A–D)**, with only one correct answer, which is clearly supported by the visual or narrative content of the video description.\n\n"
        "## CRITICAL RULES:\n"
        "1. **Uniqueness Guarantee**: Each question must include either:\n"
        "   - A **specific action** (e.g., 'What does the woman use to cut the ribbon?'), OR\n"
        "   - A **specific event** (e.g., 'What falls off the table when the dog bumps into it?'), OR\n"
        "   - A **composite feature** (e.g., 'What does the girl in the red dress hold in her hand?').\n"
        "2. **Visual Grounding**: Answers must be verifiable from a single frame or short clip.\n"
        "3. **Description Grounding**: Ensure that the answer is grounded in the video's description, not general knowledge or external information.\n"
        "4. **No Temporal Reasoning**: Avoid questions requiring comparing frames (e.g., 'what happened next?').\n"
        "5. **Focus on Visual Entities**: The question must test the model’s ability to recognize **objects**.\n"
        "6. **Avoid Extraneous Information**: Do not rely on subtitles, voiceovers, or audio cues unless explicitly mentioned in the description.\n"
        "7. **Clear and Logical Phrasing**: Keep the question clear, specific, and logically phrased to avoid ambiguity.\n\n"
        "## OUTPUT FORMAT: Format the output as a list of dictionaries with the following keys:\n"
        "  - `'Q'`: The question.\n"
        "  - `'Options'`: A list of four answer options labeled 'A', 'B', 'C', and 'D'.\n"
        "  - `'Answer'`: The correct answer (e.g., `'A'`, `'B'`, etc.).\n"
        "\n"
    )

    # Sample 3 few-shot examples (deterministic under the module-level seed).
    choosed_example_pool = random.sample(mcqa_example_pool, 3)
    example_part = "## EXAMPLES:\n"
    for idx, example in enumerate(choosed_example_pool):
        Q = example['Q']
        Options = example['Options']
        Answer = example['Answer']
        # Render each example as a pseudo-Python dict literal, one per numbered item.
        example_part += (
            f"{idx+1}. {{'Q': '{Q}',\n"
            "   'Options': [\n"
            f"       '{Options[0]}',\n"
            f"       '{Options[1]}',\n"
            f"       '{Options[2]}',\n"
            f"       '{Options[3]}'\n"
            "   ],\n"
            f"   'Answer': '{Answer}'}}\n"
            "\n"
        )

    system_prompt = task_inst_part + example_part

    user_prompt = (
        "I have provided you with three different aspect descriptions of a specific clip from a long video. Below are these descriptions:\n\n"
        "**Dense Description:**\n"
        f"{dense_caption}\n\n"
        "**Background Description:**\n"
        f"{background_caption}\n\n"
        "**Main Object Description:**\n"
        f"{main_object_caption}\n\n"
        "Based on these descriptions and the system instructions, generate **one** high-quality object recognition question-and-answer pair.\n\n"
        "## REQUIREMENTS:\n"
        "- The question must focus on **identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.**\n"
        "- You must use an action, event or composite feature in the question to constrain the question, thereby ensuring answer uniqueness.\n"
        "- The answer must be directly observable in the description without any reasoning or inference.\n\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Options': ['A. ...', 'B. ...', 'C. ...', 'D. ...'], 'Answer': 'Correct answer here...'}]\n\n"
        "**Only return the QA pair in the specified JSON list format.**"
    )

    return system_prompt, user_prompt

def prompt_miradata_based_text_mcqa(dense_caption, background_caption, main_object_caption):
    """Compose (system_prompt, user_prompt) for one unconstrained MCQA pair.

    The system prompt is instruction text, then 3 few-shot examples sampled
    from ``mcqa_example_pool``, then question-writing guidelines. The user
    prompt embeds the three caption strings verbatim.

    Args:
        dense_caption: Dense description of the video clip.
        background_caption: Background description of the clip.
        main_object_caption: Main-object description of the clip.

    Returns:
        tuple[str, str]: ``(system_prompt, user_prompt)``.
    """
    instruction_text = (
        "You are an AI assistant tasked with generating **high-quality object recognition questions** based on a video snippet description from a long video.\n\n"
        "## TASK:\n"
        "Generate **one** high-quality **object recognition question** that requires identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.\n"
        "You must also provide **4 answer options (A–D)**, with only one correct answer, which is clearly supported by the visual or narrative content of the video description.\n\n"
        "## INSTRUCTIONS:\n"
        "- **Focus on Visual Entities**: The question must test the model’s ability to recognize **objects**.\n"
        "- **Ground in Visuals**: All answers must be verifiable by pausing a single frame or short clip. Avoid actions, motivations, or temporal reasoning.\n"
        "- **Ground in the Description**: Ensure that the answer is grounded in the video's description, not general knowledge or external information.\n"
        "- **Avoid Extraneous Information**: Do not rely on subtitles, voiceovers, or audio cues unless explicitly mentioned in the description.\n"
        "- **Clear and Logical Phrasing**: Keep the question clear, specific, and logically phrased to avoid ambiguity.\n"
        "- **Output Format**: Format the output as a list of dictionaries with the following keys:\n"
        "  - `'Q'`: The question.\n"
        "  - `'Options'`: A list of four answer options labeled 'A', 'B', 'C', and 'D'.\n"
        "  - `'Answer'`: The correct answer (e.g., `'A'`, `'B'`, etc.).\n"
        "\n"
    )

    # Render each sampled example as a numbered pseudo-dict literal.
    sampled_examples = random.sample(mcqa_example_pool, 3)
    example_chunks = ["## EXAMPLES:\n"]
    for position, item in enumerate(sampled_examples, start=1):
        option_a, option_b, option_c, option_d = item['Options']
        example_chunks.append(
            f"{position}. {{'Q': '{item['Q']}',\n"
            "   'Options': [\n"
            f"       '{option_a}',\n"
            f"       '{option_b}',\n"
            f"       '{option_c}',\n"
            f"       '{option_d}'\n"
            "   ],\n"
            f"   'Answer': '{item['Answer']}'}}\n"
            "\n"
        )
    examples_text = "".join(example_chunks)

    guidelines_text = (
        "## GUIDELINES FOR CREATING QUESTIONS:\n"
        "- **Specificity**: Ask about singular, clearly defined object.\n"
        "- **Visual Certainty**: Ensure the correct answer is unambiguous.\n"
        "- **Description Grounding**: Base all questions and answers on the video description.\n"
        "- **Plausible Distractors**: Wrong options should be visually similar (e.g., other kitchen tools if asking about a pan).\n"
        "- **No Implicit Knowledge**: Avoid questions requiring domain knowledge (e.g., 'What brand is the car?' is invalid unless the logo is visible).\n"
        "\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Options': ['A. ...', 'B. ...', 'C. ...', 'D. ...'], 'Answer': 'Correct answer here...'}]")

    system_prompt = instruction_text + examples_text + guidelines_text

    user_prompt = (
        "I have provided you with three different aspect description of a specific clip in a video. Below is these description:\n\n"
        "**Dense Description:**\n"
        f"{dense_caption}\n\n"
        "**Background Description:**\n"
        f"{background_caption}\n\n"
        "**Main Object Description:**\n"
        f"{main_object_caption}\n\n"
        "Based on these description and the system instructions, generate **one** high-quality object recognition question-and-answer pair.\n\n"
        "## REQUIREMENTS:\n"
        "- The question must focus on **identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.**\n"
        "- The answer must be directly observable in the description without any reasoning or inference.\n\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Options': ['A. ...', 'B. ...', 'C. ...', 'D. ...'], 'Answer': 'Correct answer here...'}]\n\n"
        "**Only return the QA pair in the specified JSON list format.**"
    )

    return system_prompt, user_prompt

# Few-shot example pool for open-ended QA prompts. Each entry has a question
# 'Q' and an 'Answer' phrased as a complete sentence (no options, unlike
# mcqa_example_pool above).
openqa_example_pool = [
    {
        "Q": "What does Jon Snow use to fight with Ramsay Bolton?",
        "Answer": "Jon Snow uses a shield to fight with Ramsay Bolton."
    },
    {
        "Q": "What card does the male judge pick?",
        "Answer": "The male judge picks the 2 of spades."
    },
    {
        "Q": "Who finally finds the lost city?",
        "Answer": "Dr. Michael Heckenberger is the person who finally finds the lost city."
    },
    {
        "Q": "What sport are the two teams of athletes playing?",
        "Answer": "The two teams of athletes are playing rugby."
    },
    {
        "Q": "What item is not used to decorate the Christmas tree?",
        "Answer": "Green stars are not used to decorate the Christmas tree."
    },
    {
        "Q": "What is the main subject matter of the advertisement featured in the video?",
        "Answer": "The main subject matter of the advertisement featured in the video is the Audible app."
    },
    {
        "Q": "What country's practice game is this?",
        "Answer": "This is a practice game from the USA."
    },
    {
        "Q": "According to the video, which team ultimately won?",
        "Answer": "According to the video, the team that ultimately won is China."
    },
    {
        "Q": "Which cellular structure is responsible for receiving proteins according to the video?",
        "Answer": "According to the video, the Golgi apparatus (Golgi body) is responsible for receiving proteins."
    },
    {
        "Q": "At the beginning, what is the player's rank?",
        "Answer": "At the beginning, the player's rank is last."
    },
    {
        "Q": "Which team in the video reached the finish line first?",
        "Answer": "In the video, the Canadian team reached the finish line first."
    },
    {
        "Q": "What is the identity of the athlete in the video who committed fouls on all attempts except the first one?",
        "Answer": "The athlete in the video who committed fouls on all attempts except the first one is a neutral individual athlete."
    },
    {
        "Q": "The main character of the video is observing the surface of the moon when he notices a straight line, what is it?",
        "Answer": "The straight line that the main character notices on the surface of the moon is collapsed lava tubes."
    },
    {
        "Q": "Which woman works as a chef?",
        "Answer": "The woman who works as a chef is Diamante."
    },
    {
        "Q": "What kind of chess are the old people in the video playing?",
        "Answer": "The old people in the video are playing Chinese chess."
    },
    {
        "Q": "Which ingredient is not used in the video?",
        "Answer": "Plastic bottles are not used in the video."
    },
    {
        "Q": "Who does the video focus on regarding their work with globular clusters?",
        "Answer": "The video focuses on Harlow Shapley regarding his work with globular clusters."
    }
]


def prompt_miradata_based_text_constraint_openqa(dense_caption, background_caption, main_object_caption):
    """Build (system_prompt, user_prompt) for one *constrained* open-ended QA pair.

    Same constrained-question rules as the MCQA variant, but the answer is a
    complete sentence and the few-shot examples come from
    ``openqa_example_pool``.

    Fix vs. the previous version: the "CRITICAL RULES" heading, rule 1, and
    its three bullet items were missing ``\\n`` terminators, so they all ran
    together on a single line (rules 2-7 already ended in ``\\n``).

    Args:
        dense_caption: Dense description of the video clip.
        background_caption: Background description of the clip.
        main_object_caption: Main-object description of the clip.

    Returns:
        tuple[str, str]: ``(system_prompt, user_prompt)``.
    """
    task_inst_part = (
        "You are an AI assistant tasked with generating **high-quality object recognition questions** based on a video snippet description from a long video.\n\n"
        "## TASK:\n"
        "Generate **one** high-quality **object recognition question** that requires identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc. **while using an event, action or composite feature to constrain the question, thereby ensuring answer uniqueness** in long videos.\n\n"
        "## CRITICAL RULES:\n"
        "1. **Uniqueness Guarantee**: Each question must include either:\n"
        "   - A **specific action** (e.g., 'What does the woman use to cut the ribbon?'), OR\n"
        "   - A **specific event** (e.g., 'What falls off the table when the dog bumps into it?'), OR\n"
        "   - A **composite feature** (e.g., 'What does the girl in the red dress hold in her hand?').\n"
        "2. **Visual Grounding**: Answers must be verifiable from a single frame or short clip.\n"
        "3. **Description Grounding**: Ensure that the answer is grounded in the video's description, not general knowledge or external information.\n"
        "4. **No Temporal Reasoning**: Avoid questions requiring comparing frames (e.g., 'what happened next?').\n"
        "5. **Focus on Visual Entities**: The question must test the model’s ability to recognize **objects**.\n"
        "6. **Avoid Extraneous Information**: Do not rely on subtitles, voiceovers, or audio cues unless explicitly mentioned in the description.\n"
        "7. **Clear and Logical Phrasing**: Keep the question clear, specific, and logically phrased to avoid ambiguity.\n\n"
        "## OUTPUT FORMAT: Format the output as a list of dictionaries with the following keys:\n"
        "  - `'Q'`: The question.\n"
        "  - `'Answer'`: The correct answer as a complete sentence.\n"
        "\n"
    )

    # Use the open-ended QA example pool (answers are complete sentences).
    choosed_example_pool = random.sample(openqa_example_pool, 3)
    example_part = "## EXAMPLES:\n"
    for idx, example in enumerate(choosed_example_pool):
        Q = example['Q']
        Answer = example['Answer']
        example_part += (
            f"{idx+1}. {{'Q': '{Q}',\n"
            f"   'Answer': '{Answer}'}}\n"
            "\n"
        )

    system_prompt = task_inst_part + example_part

    user_prompt = (
        "I have provided you with three different aspect descriptions of a specific clip from a long video. Below are these descriptions:\n\n"
        "**Dense Description:**\n"
        f"{dense_caption}\n\n"
        "**Background Description:**\n"
        f"{background_caption}\n\n"
        "**Main Object Description:**\n"
        f"{main_object_caption}\n\n"
        "Based on these descriptions and the system instructions, generate **one** high-quality object recognition question-and-answer pair.\n\n"
        "## REQUIREMENTS:\n"
        "- The question must focus on **identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.**\n"
        "- You must use an action, event or composite feature in the question to constrain the question, thereby ensuring answer uniqueness.\n"
        "- The answer must be directly observable in the description without any reasoning or inference.\n\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Answer': 'Your complete sentence answer here...'}]\n\n"
        "**Only return the QA pair in the specified JSON list format.**"
    )

    return system_prompt, user_prompt

import random

def prompt_miradata_based_text_openqa(dense_caption, background_caption, main_object_caption):
    """Compose (system_prompt, user_prompt) for one unconstrained open-ended QA pair.

    The system prompt is instruction text, then 3 few-shot examples sampled
    from ``openqa_example_pool``, then question-writing guidelines. The user
    prompt embeds the three caption strings verbatim.

    Args:
        dense_caption: Dense description of the video clip.
        background_caption: Background description of the clip.
        main_object_caption: Main-object description of the clip.

    Returns:
        tuple[str, str]: ``(system_prompt, user_prompt)``.
    """
    instruction_text = (
        "You are an AI assistant tasked with generating **high-quality object recognition questions** based on a video snippet description from a long video.\n\n"
        "## TASK:\n"
        "Generate **one** high-quality **object recognition question** that requires identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.\n\n"
        "The answer must be provided as a complete sentence, clearly supported by the visual or narrative content of the video description.\n\n"
        "## INSTRUCTIONS:\n"
        "- **Focus on Visual Entities**: The question must test the model’s ability to recognize **objects**.\n"
        "- **Ground in Visuals**: All answers must be verifiable by pausing a single frame or short clip. Avoid actions, motivations, or temporal reasoning.\n"
        "- **Ground in the Description**: Ensure that the answer is grounded in the video's description, not general knowledge or external information.\n"
        "- **Avoid Extraneous Information**: Do not rely on subtitles, voiceovers, or audio cues unless explicitly mentioned in the description.\n"
        "- **Clear and Logical Phrasing**: Keep the question clear, specific, and logically phrased to avoid ambiguity.\n"
        "- **Output Format**: Format the output as a list of dictionaries with the following keys:\n"
        "  - `'Q'`: The question.\n"
        "  - `'Answer'`: The correct answer as a complete sentence.\n"
        "\n"
    )

    # Use the open-ended QA example pool; render each as a numbered pseudo-dict.
    sampled_examples = random.sample(openqa_example_pool, 3)
    example_chunks = ["## EXAMPLES:\n"]
    for position, item in enumerate(sampled_examples, start=1):
        example_chunks.append(
            f"{position}. {{'Q': '{item['Q']}',\n"
            f"   'Answer': '{item['Answer']}'}}\n"
            "\n"
        )
    examples_text = "".join(example_chunks)

    guidelines_text = (
        "## GUIDELINES FOR CREATING QUESTIONS:\n"
        "- **Specificity**: Ask about singular, clearly defined objects.\n"
        "- **Visual Certainty**: Ensure the correct answer is unambiguous and directly observable in the description.\n"
        "- **Description Grounding**: Base all questions and answers on the video description.\n"
        "- **No Implicit Knowledge**: Avoid questions requiring domain knowledge (e.g., 'What brand is the car?' is invalid unless the logo is visible).\n"
        "- **Complete Sentence Answers**: Always provide the answer as a grammatically correct, complete sentence.\n"
        "\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Answer': 'Your complete sentence answer here...'}]"
    )

    system_prompt = instruction_text + examples_text + guidelines_text

    user_prompt = (
        "I have provided you with three different aspect descriptions of a specific clip in a video. Below are these descriptions:\n\n"
        "**Dense Description:**\n"
        f"{dense_caption}\n\n"
        "**Background Description:**\n"
        f"{background_caption}\n\n"
        "**Main Object Description:**\n"
        f"{main_object_caption}\n\n"
        "Based on these descriptions and the system instructions, generate **one** high-quality object recognition question-and-answer pair.\n\n"
        "## REQUIREMENTS:\n"
        "- The question must focus on **identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.**\n"
        "- The answer must be directly observable in the description without any reasoning or inference.\n\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Answer': 'Your complete sentence answer here...'}]\n\n"
        "**Only return the QA pair in the specified JSON list format.**"
    )

    return system_prompt, user_prompt