# File size: 23,475 Bytes
# 55500d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
import random
random.seed(42)  # fixed seed so random.sample() of the example pools below is reproducible

# Few-shot example pool for multiple-choice QA prompts. Each entry has a
# question 'Q', four 'Options' labeled A-D, and an 'Answer'. Note that the
# 'Answer' values deliberately mix formats: some are a bare letter ('A'),
# others repeat the letter plus the option text ('A. A shield') — this shows
# the model that both answer styles occur.
mcqa_example_pool = [
    {
        "Q": "What does Jon Snow use to fight with Ramsay Bolton?",
        "Options": ["A. A shield.", "B. A sword.", "C. An Axe.", "D. A spear."],
        "Answer": "A. A shield"
    },
    {
        "Q": "What card does the male judge pick?",
        "Options": ["A. 2 of spades.", "B. 2 of diamonds.", "C. 2 of hearts.", "D. 2 of clubs."],
        "Answer": "A"
    },
    {
        "Q": "Who finally find the lost city?",
        "Options": ["A. Terra preta.", "B. Fawcett.", "C. European expeditions.", "D. Dr.Michael Heckenberger."],
        "Answer": "D. Dr.Michael Heckenberger."
    },
    {
        "Q": "What sport are the two teams of athletes playing?",
        "Options": ["A. Ice hockey.", "B. Soccer.", "C. Rugby.", "D. Basketball."],
        "Answer": "C"
    },
    {
        "Q": "What item is not used to decorate the Christmas tree?",
        "Options": ["A. Red balls.", "B. Lights.", "C. Green stars.", "D. Icicles."],
        "Answer": "C. Green stars."
    },
    {
        "Q": "What is the main subject matter of the advertisement featured in the video?",
        "Options": ["A. Audible app.", "B. Music listening app.", "C. Shopping app.", "D. Video online playing app."],
        "Answer": "A"
    },
    {
        "Q": "What country's practice game is this?",
        "Options": ["A. UK.", "B. USA.", "C. Canada.", "D. Australia."],
        "Answer": "B. USA."
    },
    {
        "Q": "According to the video, which team ultimately won?",
        "Options": ["A. China.", "B. Italy.", "C. USA.", "D. France."],
        "Answer": "A"
    },
    {
        "Q": "Which cellular structure is responsible for receiving proteins according to the video?",
        "Options": ["A. Golgi apparatus (Golgi body).", "B. Nucleus.", "C. Ribosome.", "D. Mitochondrion."],
        "Answer": "A. Golgi apparatus (Golgi body)."
    },
    {
        "Q": "At the beginning, what is the player's rank?",
        "Options": ["A. Third.", "B. First.", "C. Second.", "D. Last."],
        "Answer": "D"
    },
    {
        "Q": "Which team in the video reached the finish line first?",
        "Options": ["A. USA team.", "B. Canadian team.", "C. Ghana team.", "D. South Africa team."],
        "Answer": "B"
    },
    {
        "Q": "What is the identity of the athlete in the video who committed fouls on all attempts except the first one?",
        "Options": ["A. He is an athlete of the Chinese team.", "B. He is an athlete of the Jamaican team.", "C. He is a neutral individual athlete.", "D. It is not mentioned in the video."],
        "Answer": "C. He is a neutral individual athlete."
    },
    {
        "Q": "The main character of the video is observing the surface of the moon when he notices a straight line, what is it?",
        "Options": ["A. Lunar Ridge.", "B. Collapsed lava tubes.", "C. Rift valley systems.", "D. Scratch marks."],
        "Answer": "B"
    },
    {
        "Q": "Which woman works as a chef?",
        "Options": ["A. Diamante.", "B. Carola Ordenes.", "C. Amina.", "D. Ghizlane."],
        "Answer": "A"
    },
    {
        "Q": "What kind of chess are the old people in the video playing?",
        "Options": ["A. Mahjong.", "B. Go.", "C. Chinese chess.", "D. Five-in-a-row."],
        "Answer": "C. Chinese chess."
    },
    {
        "Q": "Which ingredient is not used in the video?",
        "Options": ["A. Hot glue.", "B. Pieces of burlap.", "C. Florals.", "D. Plastic bottles."],
        "Answer": "D"
    },
    {
        "Q": "Who does the video focus on regarding their work with globular clusters?",
        "Options": ["A. Harlow Shapley.", "B. Walter Baade.", "C. William Herschel.", "D. Henrietta Swan Levitt."],
        "Answer": "A"
    }
]

def prompt_miradata_based_text_constraint_mcqa(dense_caption, background_caption, main_object_caption):
    """Build (system_prompt, user_prompt) for one *constrained* MCQA pair.

    The system prompt asks for a single object-recognition multiple-choice
    question whose phrasing embeds an action, event, or composite feature to
    guarantee answer uniqueness in long videos, followed by 3 few-shot
    examples sampled from ``mcqa_example_pool``.

    Fixes vs. the previous version:
    - "Description DescrGroundingiption" was a garbled rule title; restored to
      "Description Grounding" (matching the openqa variant of this prompt).
    - The CRITICAL RULES 1 lines were missing ``\\n`` terminators, so rule 1
      and its three bullet items ran together on a single line.
    - User-prompt grammar aligned with the openqa variant ("descriptions").

    Args:
        dense_caption: Dense description of the video clip.
        background_caption: Background description of the clip.
        main_object_caption: Main-object description of the clip.

    Returns:
        tuple[str, str]: ``(system_prompt, user_prompt)``.
    """
    task_inst_part = (
        "You are an AI assistant tasked with generating **high-quality object recognition questions** based on a video snippet description from a long video.\n\n"
        "## TASK:\n"
        "Generate **one** high-quality **object recognition question** that requires identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc. **while using an event, action or composite feature to constrain the question, thereby ensuring answer uniqueness** in long videos.\n"
        "You must also provide **4 answer options (A–D)**, with only one correct answer, which is clearly supported by the visual or narrative content of the video description.\n\n"
        "## CRITICAL RULES:\n"
        "1. **Uniqueness Guarantee**: Each question must include either:\n"
        "   - A **specific action** (e.g., 'What does the woman use to cut the ribbon?'), OR\n"
        "   - A **specific event** (e.g., 'What falls off the table when the dog bumps into it?'), OR\n"
        "   - A **composite feature** (e.g., 'What does the girl in the red dress hold in her hand?').\n"
        "2. **Visual Grounding**: Answers must be verifiable from a single frame or short clip.\n"
        "3. **Description Grounding**: Ensure that the answer is grounded in the video's description, not general knowledge or external information.\n"
        "4. **No Temporal Reasoning**: Avoid questions requiring comparing frames (e.g., 'what happened next?').\n"
        "5. **Focus on Visual Entities**: The question must test the model’s ability to recognize **objects**.\n"
        "6. **Avoid Extraneous Information**: Do not rely on subtitles, voiceovers, or audio cues unless explicitly mentioned in the description.\n"
        "7. **Clear and Logical Phrasing**: Keep the question clear, specific, and logically phrased to avoid ambiguity.\n\n"
        "## OUTPUT FORMAT: Format the output as a list of dictionaries with the following keys:\n"
        "  - `'Q'`: The question.\n"
        "  - `'Options'`: A list of four answer options labeled 'A', 'B', 'C', and 'D'.\n"
        "  - `'Answer'`: The correct answer (e.g., `'A'`, `'B'`, etc.).\n"
        "\n"
    )

    # Sample 3 few-shot examples (deterministic under the module-level seed).
    choosed_example_pool = random.sample(mcqa_example_pool, 3)
    example_part = "## EXAMPLES:\n"
    for idx, example in enumerate(choosed_example_pool):
        Q = example['Q']
        Options = example['Options']
        Answer = example['Answer']
        # Render each example as a pseudo-Python dict literal, one per numbered item.
        example_part += (
            f"{idx+1}. {{'Q': '{Q}',\n"
            "   'Options': [\n"
            f"       '{Options[0]}',\n"
            f"       '{Options[1]}',\n"
            f"       '{Options[2]}',\n"
            f"       '{Options[3]}'\n"
            "   ],\n"
            f"   'Answer': '{Answer}'}}\n"
            "\n"
        )

    system_prompt = task_inst_part + example_part

    user_prompt = (
        "I have provided you with three different aspect descriptions of a specific clip from a long video. Below are these descriptions:\n\n"
        "**Dense Description:**\n"
        f"{dense_caption}\n\n"
        "**Background Description:**\n"
        f"{background_caption}\n\n"
        "**Main Object Description:**\n"
        f"{main_object_caption}\n\n"
        "Based on these descriptions and the system instructions, generate **one** high-quality object recognition question-and-answer pair.\n\n"
        "## REQUIREMENTS:\n"
        "- The question must focus on **identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.**\n"
        "- You must use an action, event or composite feature in the question to constrain the question, thereby ensuring answer uniqueness.\n"
        "- The answer must be directly observable in the description without any reasoning or inference.\n\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Options': ['A. ...', 'B. ...', 'C. ...', 'D. ...'], 'Answer': 'Correct answer here...'}]\n\n"
        "**Only return the QA pair in the specified JSON list format.**"
    )

    return system_prompt, user_prompt

def prompt_miradata_based_text_mcqa(dense_caption, background_caption, main_object_caption):
    """Compose (system_prompt, user_prompt) for one unconstrained MCQA pair.

    The system prompt is instruction text, then 3 few-shot examples sampled
    from ``mcqa_example_pool``, then question-writing guidelines. The user
    prompt embeds the three caption strings verbatim.

    Args:
        dense_caption: Dense description of the video clip.
        background_caption: Background description of the clip.
        main_object_caption: Main-object description of the clip.

    Returns:
        tuple[str, str]: ``(system_prompt, user_prompt)``.
    """
    instruction_text = (
        "You are an AI assistant tasked with generating **high-quality object recognition questions** based on a video snippet description from a long video.\n\n"
        "## TASK:\n"
        "Generate **one** high-quality **object recognition question** that requires identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.\n"
        "You must also provide **4 answer options (A–D)**, with only one correct answer, which is clearly supported by the visual or narrative content of the video description.\n\n"
        "## INSTRUCTIONS:\n"
        "- **Focus on Visual Entities**: The question must test the model’s ability to recognize **objects**.\n"
        "- **Ground in Visuals**: All answers must be verifiable by pausing a single frame or short clip. Avoid actions, motivations, or temporal reasoning.\n"
        "- **Ground in the Description**: Ensure that the answer is grounded in the video's description, not general knowledge or external information.\n"
        "- **Avoid Extraneous Information**: Do not rely on subtitles, voiceovers, or audio cues unless explicitly mentioned in the description.\n"
        "- **Clear and Logical Phrasing**: Keep the question clear, specific, and logically phrased to avoid ambiguity.\n"
        "- **Output Format**: Format the output as a list of dictionaries with the following keys:\n"
        "  - `'Q'`: The question.\n"
        "  - `'Options'`: A list of four answer options labeled 'A', 'B', 'C', and 'D'.\n"
        "  - `'Answer'`: The correct answer (e.g., `'A'`, `'B'`, etc.).\n"
        "\n"
    )

    # Render each sampled example as a numbered pseudo-dict literal.
    sampled_examples = random.sample(mcqa_example_pool, 3)
    example_chunks = ["## EXAMPLES:\n"]
    for position, item in enumerate(sampled_examples, start=1):
        option_a, option_b, option_c, option_d = item['Options']
        example_chunks.append(
            f"{position}. {{'Q': '{item['Q']}',\n"
            "   'Options': [\n"
            f"       '{option_a}',\n"
            f"       '{option_b}',\n"
            f"       '{option_c}',\n"
            f"       '{option_d}'\n"
            "   ],\n"
            f"   'Answer': '{item['Answer']}'}}\n"
            "\n"
        )
    examples_text = "".join(example_chunks)

    guidelines_text = (
        "## GUIDELINES FOR CREATING QUESTIONS:\n"
        "- **Specificity**: Ask about singular, clearly defined object.\n"
        "- **Visual Certainty**: Ensure the correct answer is unambiguous.\n"
        "- **Description Grounding**: Base all questions and answers on the video description.\n"
        "- **Plausible Distractors**: Wrong options should be visually similar (e.g., other kitchen tools if asking about a pan).\n"
        "- **No Implicit Knowledge**: Avoid questions requiring domain knowledge (e.g., 'What brand is the car?' is invalid unless the logo is visible).\n"
        "\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Options': ['A. ...', 'B. ...', 'C. ...', 'D. ...'], 'Answer': 'Correct answer here...'}]")

    system_prompt = instruction_text + examples_text + guidelines_text

    user_prompt = (
        "I have provided you with three different aspect description of a specific clip in a video. Below is these description:\n\n"
        "**Dense Description:**\n"
        f"{dense_caption}\n\n"
        "**Background Description:**\n"
        f"{background_caption}\n\n"
        "**Main Object Description:**\n"
        f"{main_object_caption}\n\n"
        "Based on these description and the system instructions, generate **one** high-quality object recognition question-and-answer pair.\n\n"
        "## REQUIREMENTS:\n"
        "- The question must focus on **identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.**\n"
        "- The answer must be directly observable in the description without any reasoning or inference.\n\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Options': ['A. ...', 'B. ...', 'C. ...', 'D. ...'], 'Answer': 'Correct answer here...'}]\n\n"
        "**Only return the QA pair in the specified JSON list format.**"
    )

    return system_prompt, user_prompt

# Few-shot example pool for open-ended QA prompts. Each entry has a question
# 'Q' and an 'Answer' phrased as a complete sentence (no options, unlike
# mcqa_example_pool above).
openqa_example_pool = [
    {
        "Q": "What does Jon Snow use to fight with Ramsay Bolton?",
        "Answer": "Jon Snow uses a shield to fight with Ramsay Bolton."
    },
    {
        "Q": "What card does the male judge pick?",
        "Answer": "The male judge picks the 2 of spades."
    },
    {
        "Q": "Who finally finds the lost city?",
        "Answer": "Dr. Michael Heckenberger is the person who finally finds the lost city."
    },
    {
        "Q": "What sport are the two teams of athletes playing?",
        "Answer": "The two teams of athletes are playing rugby."
    },
    {
        "Q": "What item is not used to decorate the Christmas tree?",
        "Answer": "Green stars are not used to decorate the Christmas tree."
    },
    {
        "Q": "What is the main subject matter of the advertisement featured in the video?",
        "Answer": "The main subject matter of the advertisement featured in the video is the Audible app."
    },
    {
        "Q": "What country's practice game is this?",
        "Answer": "This is a practice game from the USA."
    },
    {
        "Q": "According to the video, which team ultimately won?",
        "Answer": "According to the video, the team that ultimately won is China."
    },
    {
        "Q": "Which cellular structure is responsible for receiving proteins according to the video?",
        "Answer": "According to the video, the Golgi apparatus (Golgi body) is responsible for receiving proteins."
    },
    {
        "Q": "At the beginning, what is the player's rank?",
        "Answer": "At the beginning, the player's rank is last."
    },
    {
        "Q": "Which team in the video reached the finish line first?",
        "Answer": "In the video, the Canadian team reached the finish line first."
    },
    {
        "Q": "What is the identity of the athlete in the video who committed fouls on all attempts except the first one?",
        "Answer": "The athlete in the video who committed fouls on all attempts except the first one is a neutral individual athlete."
    },
    {
        "Q": "The main character of the video is observing the surface of the moon when he notices a straight line, what is it?",
        "Answer": "The straight line that the main character notices on the surface of the moon is collapsed lava tubes."
    },
    {
        "Q": "Which woman works as a chef?",
        "Answer": "The woman who works as a chef is Diamante."
    },
    {
        "Q": "What kind of chess are the old people in the video playing?",
        "Answer": "The old people in the video are playing Chinese chess."
    },
    {
        "Q": "Which ingredient is not used in the video?",
        "Answer": "Plastic bottles are not used in the video."
    },
    {
        "Q": "Who does the video focus on regarding their work with globular clusters?",
        "Answer": "The video focuses on Harlow Shapley regarding his work with globular clusters."
    }
]


def prompt_miradata_based_text_constraint_openqa(dense_caption, background_caption, main_object_caption):
    """Build (system_prompt, user_prompt) for one *constrained* open-ended QA pair.

    Same constrained-question rules as the MCQA variant, but the answer is a
    complete sentence and the few-shot examples come from
    ``openqa_example_pool``.

    Fix vs. the previous version: the "CRITICAL RULES" heading, rule 1, and
    its three bullet items were missing ``\\n`` terminators, so they all ran
    together on a single line (rules 2-7 already ended in ``\\n``).

    Args:
        dense_caption: Dense description of the video clip.
        background_caption: Background description of the clip.
        main_object_caption: Main-object description of the clip.

    Returns:
        tuple[str, str]: ``(system_prompt, user_prompt)``.
    """
    task_inst_part = (
        "You are an AI assistant tasked with generating **high-quality object recognition questions** based on a video snippet description from a long video.\n\n"
        "## TASK:\n"
        "Generate **one** high-quality **object recognition question** that requires identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc. **while using an event, action or composite feature to constrain the question, thereby ensuring answer uniqueness** in long videos.\n\n"
        "## CRITICAL RULES:\n"
        "1. **Uniqueness Guarantee**: Each question must include either:\n"
        "   - A **specific action** (e.g., 'What does the woman use to cut the ribbon?'), OR\n"
        "   - A **specific event** (e.g., 'What falls off the table when the dog bumps into it?'), OR\n"
        "   - A **composite feature** (e.g., 'What does the girl in the red dress hold in her hand?').\n"
        "2. **Visual Grounding**: Answers must be verifiable from a single frame or short clip.\n"
        "3. **Description Grounding**: Ensure that the answer is grounded in the video's description, not general knowledge or external information.\n"
        "4. **No Temporal Reasoning**: Avoid questions requiring comparing frames (e.g., 'what happened next?').\n"
        "5. **Focus on Visual Entities**: The question must test the model’s ability to recognize **objects**.\n"
        "6. **Avoid Extraneous Information**: Do not rely on subtitles, voiceovers, or audio cues unless explicitly mentioned in the description.\n"
        "7. **Clear and Logical Phrasing**: Keep the question clear, specific, and logically phrased to avoid ambiguity.\n\n"
        "## OUTPUT FORMAT: Format the output as a list of dictionaries with the following keys:\n"
        "  - `'Q'`: The question.\n"
        "  - `'Answer'`: The correct answer as a complete sentence.\n"
        "\n"
    )

    # Use the open-ended QA example pool (answers are complete sentences).
    choosed_example_pool = random.sample(openqa_example_pool, 3)
    example_part = "## EXAMPLES:\n"
    for idx, example in enumerate(choosed_example_pool):
        Q = example['Q']
        Answer = example['Answer']
        example_part += (
            f"{idx+1}. {{'Q': '{Q}',\n"
            f"   'Answer': '{Answer}'}}\n"
            "\n"
        )

    system_prompt = task_inst_part + example_part

    user_prompt = (
        "I have provided you with three different aspect descriptions of a specific clip from a long video. Below are these descriptions:\n\n"
        "**Dense Description:**\n"
        f"{dense_caption}\n\n"
        "**Background Description:**\n"
        f"{background_caption}\n\n"
        "**Main Object Description:**\n"
        f"{main_object_caption}\n\n"
        "Based on these descriptions and the system instructions, generate **one** high-quality object recognition question-and-answer pair.\n\n"
        "## REQUIREMENTS:\n"
        "- The question must focus on **identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.**\n"
        "- You must use an action, event or composite feature in the question to constrain the question, thereby ensuring answer uniqueness.\n"
        "- The answer must be directly observable in the description without any reasoning or inference.\n\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Answer': 'Your complete sentence answer here...'}]\n\n"
        "**Only return the QA pair in the specified JSON list format.**"
    )

    return system_prompt, user_prompt

import random

def prompt_miradata_based_text_openqa(dense_caption, background_caption, main_object_caption):
    """Compose (system_prompt, user_prompt) for one unconstrained open-ended QA pair.

    The system prompt is instruction text, then 3 few-shot examples sampled
    from ``openqa_example_pool``, then question-writing guidelines. The user
    prompt embeds the three caption strings verbatim.

    Args:
        dense_caption: Dense description of the video clip.
        background_caption: Background description of the clip.
        main_object_caption: Main-object description of the clip.

    Returns:
        tuple[str, str]: ``(system_prompt, user_prompt)``.
    """
    instruction_text = (
        "You are an AI assistant tasked with generating **high-quality object recognition questions** based on a video snippet description from a long video.\n\n"
        "## TASK:\n"
        "Generate **one** high-quality **object recognition question** that requires identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.\n\n"
        "The answer must be provided as a complete sentence, clearly supported by the visual or narrative content of the video description.\n\n"
        "## INSTRUCTIONS:\n"
        "- **Focus on Visual Entities**: The question must test the model’s ability to recognize **objects**.\n"
        "- **Ground in Visuals**: All answers must be verifiable by pausing a single frame or short clip. Avoid actions, motivations, or temporal reasoning.\n"
        "- **Ground in the Description**: Ensure that the answer is grounded in the video's description, not general knowledge or external information.\n"
        "- **Avoid Extraneous Information**: Do not rely on subtitles, voiceovers, or audio cues unless explicitly mentioned in the description.\n"
        "- **Clear and Logical Phrasing**: Keep the question clear, specific, and logically phrased to avoid ambiguity.\n"
        "- **Output Format**: Format the output as a list of dictionaries with the following keys:\n"
        "  - `'Q'`: The question.\n"
        "  - `'Answer'`: The correct answer as a complete sentence.\n"
        "\n"
    )

    # Use the open-ended QA example pool; render each as a numbered pseudo-dict.
    sampled_examples = random.sample(openqa_example_pool, 3)
    example_chunks = ["## EXAMPLES:\n"]
    for position, item in enumerate(sampled_examples, start=1):
        example_chunks.append(
            f"{position}. {{'Q': '{item['Q']}',\n"
            f"   'Answer': '{item['Answer']}'}}\n"
            "\n"
        )
    examples_text = "".join(example_chunks)

    guidelines_text = (
        "## GUIDELINES FOR CREATING QUESTIONS:\n"
        "- **Specificity**: Ask about singular, clearly defined objects.\n"
        "- **Visual Certainty**: Ensure the correct answer is unambiguous and directly observable in the description.\n"
        "- **Description Grounding**: Base all questions and answers on the video description.\n"
        "- **No Implicit Knowledge**: Avoid questions requiring domain knowledge (e.g., 'What brand is the car?' is invalid unless the logo is visible).\n"
        "- **Complete Sentence Answers**: Always provide the answer as a grammatically correct, complete sentence.\n"
        "\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Answer': 'Your complete sentence answer here...'}]"
    )

    system_prompt = instruction_text + examples_text + guidelines_text

    user_prompt = (
        "I have provided you with three different aspect descriptions of a specific clip in a video. Below are these descriptions:\n\n"
        "**Dense Description:**\n"
        f"{dense_caption}\n\n"
        "**Background Description:**\n"
        f"{background_caption}\n\n"
        "**Main Object Description:**\n"
        f"{main_object_caption}\n\n"
        "Based on these descriptions and the system instructions, generate **one** high-quality object recognition question-and-answer pair.\n\n"
        "## REQUIREMENTS:\n"
        "- The question must focus on **identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.**\n"
        "- The answer must be directly observable in the description without any reasoning or inference.\n\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Answer': 'Your complete sentence answer here...'}]\n\n"
        "**Only return the QA pair in the specified JSON list format.**"
    )

    return system_prompt, user_prompt