File size: 8,756 Bytes

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"load model\"\"\"\n",
    "import torch\n",
    "from PIL import Image, ImageDraw\n",
    "from qwen_vl_utils import process_vision_info\n",
    "from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor\n",
    "import os\n",
    "import json\n",
    "import codecs \n",
    "from peft import PeftModel\n",
    "import argparse\n",
    "import random \n",
    "import re\n",
    "\n",
    "\n",
    "# Per-image pixel budgets, passed to the vision preprocessing through each image\n",
    "# entry's \"max_pixels\" key and written as multiples of a 28x28 pixel area.\n",
    "PATCH_AREA = 28 * 28\n",
    "max_pixels_temp = 160 * PATCH_AREA  # budget for the round-1 (temporal grounding) frames\n",
    "max_pixels_narr = 760 * PATCH_AREA  # base budget for the round-2 (narration) frames\n",
    "min_pixels_narr = 240 * PATCH_AREA  # kept for reference; not read anywhere in this notebook\n",
    "\n",
    "# Single checkpoint id shared by the model weights and the processor.\n",
    "MODEL_ID = 'FRank62Wu/ShowUI-Narrator'\n",
    "\n",
    "model = Qwen2VLForConditionalGeneration.from_pretrained(\n",
    "    MODEL_ID,\n",
    "    torch_dtype=\"auto\",\n",
    "    device_map=\"cuda\",\n",
    ")\n",
    "\n",
    "processor = AutoProcessor.from_pretrained(MODEL_ID)\n",
    "# Use the end-of-sequence token as the padding token.\n",
    "processor.tokenizer.pad_token = processor.tokenizer.eos_token"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<6> and <8>.\n"
     ]
    }
   ],
   "source": [
    "\n",
    "# Round-1 prompt prefix; {N} is filled in with the index of the last frame.\n",
    "_SYSTEM_PROMPT = 'For the given video frames of a GUI action, The frames are decribed in the format of <0> to <{N}>.'\n",
    "\n",
    "# Round-2 prompt prefix listing the fields the narration should contain.\n",
    "_SYSTEM_PROMPT_NARR = '''You are an ai assistant to narrate the action of the user for the video frames in the following detail.\n",
    "'Action': The type of action\n",
    "'Element': The target of the action\n",
    "'Source': The starting position (Applicable for action type: Drag)\n",
    "'Destination': The ending position (Applicable for action type: Drag)\n",
    "'Purpose': The intended result of the action\n",
    "The Action include left click, right click, double click, drag, or Keyboard type.\n",
    "'''\n",
    "\n",
    "# Interchangeable phrasings of the round-1 question; one is drawn at random per query.\n",
    "Action_no_reference_grounding = [\n",
    "    'Describe the start frame and the end frame of the action in this video?',\n",
    "    'When Did the action happened in this video? Tell me the start frame and the end frame.',\n",
    "    'Locate the start and the end frame of the action in this video',\n",
    "    'Observe the cursor in this GUI video, marking start and end frame of the action in video frames.',\n",
    "]\n",
    "\n",
    "# Interchangeable phrasings of the round-2 request; one is drawn at random per query.\n",
    "Dense_narration_query = [\n",
    "    'Narrate the action in the given video.',\n",
    "    'Describe the action of the user in the given frames',\n",
    "    'Describe the action in this video.',\n",
    "    'Narrate the action detail of the user in the video.',\n",
    "]\n",
    "\n",
    "# Prefix placed in front of '/storage/...' when the frame paths are built.\n",
    "path_to_data = ''\n",
    "\n",
    "# Frames of the clip are stored as 0_crop.png ... {num_frames - 1}_crop.png in frame_dir.\n",
    "num_frames = 10\n",
    "frame_dir = f\"{path_to_data}/storage/test_benchmark_Act2Cap/303\"\n",
    "\n",
    "# The prompt names the last frame index, so it is derived from the frame count rather\n",
    "# than kept in sync by hand with a copy-pasted list of image entries.\n",
    "query = _SYSTEM_PROMPT.format(N=num_frames - 1) + ' ' + random.choice(Action_no_reference_grounding)\n",
    "\n",
    "# One image entry per frame, in frame order, each capped at the round-1 pixel budget.\n",
    "frame_entries = [\n",
    "    {'type': \"image\", \"image\": f\"{frame_dir}/{frame_idx}_crop.png\", \"max_pixels\": max_pixels_temp}\n",
    "    for frame_idx in range(num_frames)\n",
    "]\n",
    "\n",
    "# Single user turn: all frames first, then the text query as the last content entry.\n",
    "messages = [\n",
    "    {\n",
    "        'role': 'user',\n",
    "        'content': frame_entries + [{'type': \"text\", 'text': query}],\n",
    "    }\n",
    "]\n",
    "\n",
    "\n",
    "\n",
    "## round_1 for temporal grounding\n",
    "# Render the chat messages into a prompt string that ends with the generation prompt.\n",
    "grounding_prompt = processor.apply_chat_template(\n",
    "    messages,\n",
    "    tokenize=False,\n",
    "    add_generation_prompt=True,\n",
    ")\n",
    "\n",
    "# Extract the vision inputs described by the messages and batch them with the prompt.\n",
    "grounding_images, grounding_videos = process_vision_info(messages)\n",
    "grounding_inputs = processor(\n",
    "    text=[grounding_prompt],\n",
    "    images=grounding_images,\n",
    "    videos=grounding_videos,\n",
    "    padding=True,\n",
    "    return_tensors=\"pt\",\n",
    ").to(\"cuda\")\n",
    "\n",
    "grounding_ids = model.generate(**grounding_inputs, max_new_tokens=128)\n",
    "\n",
    "# Drop the first len(prompt) tokens of every sequence so only the new tokens are decoded.\n",
    "prompt_lengths = [len(prompt_ids) for prompt_ids in grounding_inputs.input_ids]\n",
    "answer_ids = [sequence[length:] for sequence, length in zip(grounding_ids, prompt_lengths)]\n",
    "\n",
    "decoded_answers = processor.batch_decode(\n",
    "    answer_ids,\n",
    "    skip_special_tokens=True,\n",
    "    clean_up_tokenization_spaces=False,\n",
    ")\n",
    "# The batch holds a single request, so the first decoded string is the answer.\n",
    "output_text = decoded_answers[0]\n",
    "\n",
    "print(output_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\"Action\": \"double click\", \"Element\": \"sc2 trans shape button\", \"Source\": null, \"Destination\": null, \"Purpose\": \" Select the SC2 Trans Shape.\"}\n"
     ]
    }
   ],
   "source": [
    "# round_2 for dense narration caption\n",
    "\n",
    "# Image entries of the round-1 request, in frame order. Filtering on the entry type keeps\n",
    "# the trailing text entry out, so a list position is exactly a frame index.\n",
    "frame_entries = [entry for entry in messages[0]['content'] if entry.get('type') == 'image']\n",
    "last_frame = len(frame_entries) - 1\n",
    "\n",
    "# The round-1 answer is expected to look like '<6> and <8>.'. Take the first two numeric\n",
    "# tags as start/end frame; if they are missing, fall back to the whole clip.\n",
    "span_match = re.search(r\"<(\\d+)>.*?<(\\d+)>\", output_text)\n",
    "if span_match:\n",
    "    s1, e1 = int(span_match.group(1)), int(span_match.group(2))\n",
    "else:\n",
    "    s1, e1 = 0, last_frame\n",
    "\n",
    "# Order and clamp the span so a reversed or out-of-range answer still selects\n",
    "# at least one existing frame.\n",
    "s1, e1 = sorted((s1, e1))\n",
    "s1, e1 = min(s1, last_frame), min(e1, last_frame)\n",
    "\n",
    "query = _SYSTEM_PROMPT_NARR + ' ' + random.choice(Dense_narration_query)\n",
    "\n",
    "# Up to four frames each get the full narration budget; longer spans share three\n",
    "# budgets' worth of pixels evenly across the selected frames.\n",
    "num_selected = e1 - s1 + 1\n",
    "if num_selected <= 4:\n",
    "    pixels_narr = max_pixels_narr\n",
    "else:\n",
    "    pixels_narr = max_pixels_narr * 3 / num_selected\n",
    "\n",
    "# Copy the selected entries (the round-1 dicts stay untouched) with the new pixel cap.\n",
    "selected_images = [\n",
    "    {**frame, 'max_pixels': int(pixels_narr)}\n",
    "    for frame in frame_entries[s1:e1 + 1]\n",
    "]\n",
    "\n",
    "# Built under its own name: `messages` keeps the round-1 request, so this cell can be\n",
    "# re-run without slicing an already-narrowed frame list.\n",
    "narration_messages = [\n",
    "    {\n",
    "        'role': 'user',\n",
    "        'content': selected_images + [{'type': \"text\", 'text': query}],\n",
    "    }\n",
    "]\n",
    "\n",
    "text = processor.apply_chat_template(\n",
    "    narration_messages, tokenize=False, add_generation_prompt=True,\n",
    ")\n",
    "\n",
    "image_inputs, video_inputs = process_vision_info(narration_messages)\n",
    "inputs = processor(\n",
    "    text=[text],\n",
    "    images=image_inputs,\n",
    "    videos=video_inputs,\n",
    "    padding=True,\n",
    "    return_tensors=\"pt\",\n",
    ")\n",
    "inputs = inputs.to(\"cuda\")\n",
    "generated_ids = model.generate(**inputs, max_new_tokens=128)\n",
    "\n",
    "# Drop the first len(prompt) tokens of every sequence so only the new tokens are decoded.\n",
    "generated_ids_trimmed = [\n",
    "    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n",
    "]\n",
    "output_text_narration = processor.batch_decode(\n",
    "    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n",
    ")[0]\n",
    "\n",
    "print(output_text_narration)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "wqc_qwen2vl",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}