{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "6511a91c-ed20-41ff-befb-699bda1912a3",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-03-25T06:38:11.914633Z",
     "iopub.status.busy": "2026-03-25T06:38:11.914498Z",
     "iopub.status.idle": "2026-03-25T06:38:26.481581Z",
     "shell.execute_reply": "2026-03-25T06:38:26.480877Z",
     "shell.execute_reply.started": "2026-03-25T06:38:11.914618Z"
    }
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a70b6714abe946bfbd7f496bb0913fc8",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading (incomplete total...): 0.00B [00:00, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f9fe923a4e3a4de6bb7e948de7995ea9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "dde6f6e9831547b4a89e36084b117b56",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading weights:   0%|          | 0/824 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[text] batch=0, tokens=4, pos=0..3 (t=h=w): [0, 1, 2, 3]\n",
      "get_vision_position_ids: grid_thw=tensor([ 1, 18, 18], device='cuda:0'), llm_grid_thw=(1, 9, 9), start_position=4\n",
      "  temp_merge_size=1, spatial_merge_size=2\n",
      "  image_seq_length=81\n",
      "  position_width (repeat)=[4, 5, 6, 7, 8, 9, 10, 11, 12, 4]...[12, 4, 5, 6, 7, 8, 9, 10, 11, 12]\n",
      "  position_height (repeat_interleave)=[4, 4, 4, 4, 4, 4, 4, 4, 4, 5]...[11, 12, 12, 12, 12, 12, 12, 12, 12, 12]\n",
      "  position_temporal (torch.full) (before spacing)=[4, 4, 4, 4, 4, 4, 4, 4, 4, 4]...[4, 4, 4, 4, 4, 4, 4, 4, 4, 4]\n",
      "  time_interval=2\n",
      "  position_temporal (after spacing)=[8, 8, 8, 8, 8, 8, 8, 8, 8, 8]...[8, 8, 8, 8, 8, 8, 8, 8, 8, 8]\n",
      "[vision pos] grid_thw=tensor([ 1, 18, 18], device='cuda:0'), start=4\n",
      "  t: [8, 8, 8, 8, 8, 8, 8, 8, 8, 8]...[8, 8, 8, 8, 8, 8, 8, 8, 8, 8]\n",
      "  h: [4, 4, 4, 4, 4, 4, 4, 4, 4, 5]...[11, 12, 12, 12, 12, 12, 12, 12, 12, 12]\n",
      "  w: [4, 5, 6, 7, 8, 9, 10, 11, 12, 4]...[12, 4, 5, 6, 7, 8, 9, 10, 11, 12]\n",
      "[text] batch=0, tokens=9, pos=13..21 (t=h=w): [13, 14, 15, 16, 17, 18, 19, 20, 21]\n",
      "[LLM prefill] position_ids shape: torch.Size([3, 1, 94])  (3=t/h/w, bs, seq_len)\n",
      "  batch 0 (shape: 94):\n",
      "    t: [0, 1, 2, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 13, 14, 15, 16, 17, 18, 19, 20, 21] \n",
      "    h: [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] \n",
      "    w: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] \n",
      "\n",
      "Manual Prompt Response: The image shows a black Porsche Panamera parked on a road. The car is positioned at an angle, with the rear of the vehicle facing the camera. The Porsche logo and model name are visible on the trunk. The car has a sleek design with a long hood and short rear deck. The taillights are prominent, and the license plate is clearly visible. The background appears to be a blurred landscape, suggesting motion or a focus on the car itself.\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor\n",
    "from qwen_vl_utils import process_vision_info\n",
    "\n",
    "# Seed torch's RNGs up front so Restart & Run All reproduces the same response\n",
    "# in case the checkpoint's generation config enables sampling.\n",
    "SEED = 0\n",
    "torch.manual_seed(SEED)\n",
    "\n",
    "# 1. Load model and processor (weights are loaded as float16)\n",
    "model_name = \"Qwen/Qwen2.5-VL-3B-Instruct\"\n",
    "model = Qwen2_5_VLForConditionalGeneration.from_pretrained(\n",
    "    model_name,\n",
    "    torch_dtype=torch.float16,\n",
    "    device_map=\"auto\",\n",
    ")\n",
    "processor = AutoProcessor.from_pretrained(model_name)\n",
    "\n",
    "# 2. Inputs. NOTE: despite its name, image_url holds a path relative to the\n",
    "# notebook's working directory, not a URL.\n",
    "image_url = \"./car-1_256_0.jpg\"\n",
    "user_query = \"Describe the image\"\n",
    "\n",
    "# 3. Build the chat prompt by hand.\n",
    "# <|im_start|> / <|im_end|> delimit each role's turn. <|vision_start|> and\n",
    "# <|vision_end|> only bracket the image span; the single <|image_pad|>\n",
    "# placeholder between them is what gets expanded into one token per merged\n",
    "# image patch (81 tokens for the 18x18 patch grid in the saved run).\n",
    "prompt = \"\".join(\n",
    "    [\n",
    "        \"<|im_start|>user\\n\",\n",
    "        \"<|vision_start|><|image_pad|><|vision_end|>\",\n",
    "        f\"{user_query}<|im_end|>\\n\",\n",
    "        \"<|im_start|>assistant\\n\",\n",
    "    ]\n",
    ")\n",
    "\n",
    "# 4. Fetch the image and apply the resizing logic via the qwen_vl_utils helper.\n",
    "# Only the image entry is needed here; the text of the turn is already in `prompt`.\n",
    "image_entry = {\"type\": \"image\", \"image\": image_url}\n",
    "messages = [{\"role\": \"user\", \"content\": [image_entry]}]\n",
    "image_inputs, _ = process_vision_info(messages)\n",
    "\n",
    "# 5. Tokenize the prompt, preprocess the image, and move the tensors to the model's device\n",
    "inputs = processor(\n",
    "    text=[prompt],\n",
    "    images=image_inputs,\n",
    "    videos=None,\n",
    "    padding=True,\n",
    "    return_tensors=\"pt\",\n",
    ").to(model.device)\n",
    "\n",
    "# 6. Generate\n",
    "generated_ids = model.generate(**inputs, max_new_tokens=100)\n",
    "\n",
    "# Each output row starts with the prompt tokens; drop them and keep the continuation.\n",
    "prompt_len = inputs.input_ids.shape[1]\n",
    "generated_ids_trimmed = [row[prompt_len:] for row in generated_ids]\n",
    "\n",
    "output_text = processor.batch_decode(\n",
    "    generated_ids_trimmed,\n",
    "    skip_special_tokens=True,\n",
    "    clean_up_tokenization_spaces=False,\n",
    ")\n",
    "\n",
    "print(f\"\\nManual Prompt Response: {output_text[0]}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "f45df021-6302-4f47-9e06-8070577885a2",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-03-25T05:56:12.534299Z",
     "iopub.status.busy": "2026-03-25T05:56:12.534123Z",
     "iopub.status.idle": "2026-03-25T05:56:12.537469Z",
     "shell.execute_reply": "2026-03-25T05:56:12.536915Z",
     "shell.execute_reply.started": "2026-03-25T05:56:12.534284Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'<|im_start|>user\\n<|vision_start|><|image_pad|><|vision_end|>Describe the image<|im_end|>\\n<|im_start|>assistant\\n'"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Echo the hand-built chat prompt so the special-token layout can be checked by eye.\n",
    "# Relies on `prompt` from the model/prompt cell having been run in this kernel.\n",
    "prompt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "504fa71b-42b4-4f53-8988-25fcfba38d13",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-03-25T06:38:26.483841Z",
     "iopub.status.busy": "2026-03-25T06:38:26.483701Z",
     "iopub.status.idle": "2026-03-25T06:38:26.488982Z",
     "shell.execute_reply": "2026-03-25T06:38:26.488521Z",
     "shell.execute_reply.started": "2026-03-25T06:38:26.483826Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(torch.Size([1, 1, 94, 128]), torch.float16)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# cos.pt is not written by any cell in this notebook; presumably it was dumped\n",
    "# from an instrumented model forward pass -- TODO confirm and document its source.\n",
    "# weights_only=True restricts unpickling to tensors and plain containers, so a\n",
    "# tampered file cannot run arbitrary code when it is loaded.\n",
    "cos = torch.load('cos.pt', weights_only=True)\n",
    "cos.shape, cos.dtype"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "a874f8d4-efa9-4ba8-9e57-c019da0775bd",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-03-25T06:43:28.437717Z",
     "iopub.status.busy": "2026-03-25T06:43:28.437502Z",
     "iopub.status.idle": "2026-03-25T06:43:28.440474Z",
     "shell.execute_reply": "2026-03-25T06:43:28.439711Z",
     "shell.execute_reply.started": "2026-03-25T06:43:28.437702Z"
    }
   },
   "outputs": [],
   "source": [
    "# Print tensors with 10 digits after the decimal point. This is a global\n",
    "# setting, so it affects every tensor repr for the rest of the session.\n",
    "PRINT_DIGITS = 10\n",
    "torch.set_printoptions(precision=PRINT_DIGITS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "f44460e3-58e9-4fd2-898a-06e8a00f9365",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-03-25T06:43:37.079158Z",
     "iopub.status.busy": "2026-03-25T06:43:37.078953Z",
     "iopub.status.idle": "2026-03-25T06:43:37.086609Z",
     "shell.execute_reply": "2026-03-25T06:43:37.086029Z",
     "shell.execute_reply.started": "2026-03-25T06:43:37.079144Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[[0.5405273438, 0.6923828125, 0.7963867188, 0.8662109375, 0.9125976562,\n",
       "          0.9428710938, 0.9628906250, 0.9755859375, 0.9843750000, 0.9897460938,\n",
       "          0.9931640625, 0.9956054688, 0.9970703125, 0.9980468750, 0.9990234375,\n",
       "          0.9990234375, 0.9995117188, 0.9995117188, 1.0000000000, 1.0000000000,\n",
       "          1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n",
       "          1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n",
       "          1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n",
       "          1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n",
       "          1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n",
       "          1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n",
       "          1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n",
       "          1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n",
       "          1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 0.5405273438,\n",
       "          0.6923828125, 0.7963867188, 0.8662109375, 0.9125976562, 0.9428710938,\n",
       "          0.9628906250, 0.9755859375, 0.9843750000, 0.9897460938, 0.9931640625,\n",
       "          0.9956054688, 0.9970703125, 0.9980468750, 0.9990234375, 0.9990234375,\n",
       "          0.9995117188, 0.9995117188, 1.0000000000, 1.0000000000, 1.0000000000,\n",
       "          1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n",
       "          1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n",
       "          1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n",
       "          1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n",
       "          1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n",
       "          1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n",
       "          1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n",
       "          1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n",
       "          1.0000000000, 1.0000000000, 1.0000000000]]], device='cuda:0',\n",
       "       dtype=torch.float16)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# cos was (1, 1, 94, 128) in the saved run; index 1 on dim 2 picks one 128-wide\n",
    "# row -- presumably the rotary cos values for sequence position 1 (TODO confirm).\n",
    "# The trailing full slice is implicit, so this is the same as cos[:, :, 1, :].\n",
    "cos[:, :, 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "039f1456-ffa4-40b2-8ba1-a0cd5f74733e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}