# File size: 1,990 Bytes

"""Example: run DW-KhotTaeVL-2B-QueryFrames on a single video MCQ.

Requirements::

    pip install torch transformers pillow decord huggingface_hub

This script loads the QueryFrames wrapper, samples 32 candidate frames
from the input video, picks the 8 most relevant to the question via
CLIP-ViT-L/14, and answers via stock Qwen3-VL-2B-Instruct.
"""
from dw_queryframes import QueryFrames


def main() -> None:
    """Run two example MCQ queries on ``example.mp4`` and print results.

    One ``QueryFrames`` instance is created (``n_frames=8`` out of
    ``n_candidates=32``) and queried twice with ``answer_mcq``: first
    without a ``task_type``, then with ``task_type="Object Reasoning"``.
    Selected keys of each returned mapping are written to stdout.
    """
    video = "example.mp4"

    # Constructor settings gathered in one place; unpacked as keyword
    # arguments so the call is identical to spelling them out inline.
    wrapper_config = {
        "base_model": "Qwen/Qwen3-VL-2B-Instruct",
        "clip_model": "openai/clip-vit-large-patch14",
        "device": "auto",
        "n_frames": 8,
        "n_candidates": 32,
    }
    engine = QueryFrames(**wrapper_config)

    # ---- Query 1: plain MCQ, no task_type passed (the default mode) ----
    chef_options = [
        "Chops fresh green herbs",
        "Pours broth into the pot",
        "Stirs the oil in the pot",
        "Adds salt to the pot",
    ]
    plain = engine.answer_mcq(
        video_path=video,
        question="What does the chef do after pouring the oil into the pot?",
        options=chef_options,
    )
    print("[MCQ mode (no task_type)]")
    print(f"  pred         : {plain['pred']}")
    print(f"  raw output   : {plain['raw']!r}")
    print(f"  frames used  : {plain['frames_used']}")
    print(f"  CLIP latency : {plain['latency_clip_s']} s")
    print(f"  GEN  latency : {plain['latency_gen_s']} s")

    # ---- Query 2: task-aware MCQ ----
    # Used when a task taxonomy is available (e.g. from Video-MME or an
    # operational workflow). Per the original example's note, this task
    # type is expected to take the uniform-fallback path.
    cabbage_options = [
        "It is being stirred",
        "It is being chopped",
        "It is being served",
        "It is being washed",
    ]
    task_aware = engine.answer_mcq(
        video_path=video,
        question="What is happening to the cabbage in the frying pan?",
        options=cabbage_options,
        task_type="Object Reasoning",
    )
    print("\n[Task-aware MCQ mode]")
    print(f"  pred         : {task_aware['pred']}")
    # Original example annotates the expected value as 'uniform_fallback'.
    print(f"  frames used  : {task_aware['frames_used']}")


# Run the demo only when this file is executed as a script, so importing
# the module does not trigger model construction or inference.
if __name__ == "__main__":
    main()