{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "6511a91c-ed20-41ff-befb-699bda1912a3", "metadata": { "execution": { "iopub.execute_input": "2026-03-25T08:44:53.813781Z", "iopub.status.busy": "2026-03-25T08:44:53.813650Z", "iopub.status.idle": "2026-03-25T08:45:08.246826Z", "shell.execute_reply": "2026-03-25T08:45:08.246118Z", "shell.execute_reply.started": "2026-03-25T08:44:53.813766Z" } }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "61572014a82e4ee0a3756b44f486678f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (incomplete total...): 0.00B [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "75e4fe58bc664443a44cfa0287da5a71", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Fetching 2 files: 0%| | 0/2 [00:00 and <|vision_end|> tags tell the processor \n", "# where to inject the image features.\n", "prompt = (\n", " \"<|im_start|>user\\n\"\n", " \"<|vision_start|><|image_pad|><|vision_end|>\"\n", " f\"{user_query}<|im_end|>\\n\"\n", " \"<|im_start|>assistant\\n\"\n", ")\n", "\n", "# 4. Process the vision information\n", "# We still use this utility to fetch the image and handle resizing logic\n", "messages = [{\"role\": \"user\", \"content\": [{\"type\": \"image\", \"image\": image_url}]}]\n", "image_inputs, _ = process_vision_info(messages)\n", "\n", "# 5. Tokenize and Prepare Tensors\n", "inputs = processor(\n", " text=[prompt],\n", " images=image_inputs,\n", " videos=None,\n", " padding=True,\n", " return_tensors=\"pt\",\n", ")\n", "inputs = inputs.to(model.device)\n", "\n", "# 6. 
Generate\n", "generated_ids = model.generate(**inputs, max_new_tokens=100)\n", "\n", "# Trim the prompt tokens from the result\n", "generated_ids_trimmed = [\n", " out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n", "]\n", "\n", "output_text = processor.batch_decode(\n", " generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n", ")\n", "\n", "print(f\"\\nManual Prompt Response: {output_text[0]}\")" ] }, { "cell_type": "code", "execution_count": 2, "id": "f45df021-6302-4f47-9e06-8070577885a2", "metadata": { "execution": { "iopub.execute_input": "2026-03-25T08:45:08.247563Z", "iopub.status.busy": "2026-03-25T08:45:08.247274Z", "iopub.status.idle": "2026-03-25T08:45:08.251422Z", "shell.execute_reply": "2026-03-25T08:45:08.250728Z", "shell.execute_reply.started": "2026-03-25T08:45:08.247548Z" } }, "outputs": [ { "data": { "text/plain": [ "'<|im_start|>user\\n<|vision_start|><|image_pad|><|vision_end|>Describe the image<|im_end|>\\n<|im_start|>assistant\\n'" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "prompt" ] }, { "cell_type": "code", "execution_count": 3, "id": "504fa71b-42b4-4f53-8988-25fcfba38d13", "metadata": { "execution": { "iopub.execute_input": "2026-03-25T08:45:08.252004Z", "iopub.status.busy": "2026-03-25T08:45:08.251839Z", "iopub.status.idle": "2026-03-25T08:45:08.262159Z", "shell.execute_reply": "2026-03-25T08:45:08.261665Z", "shell.execute_reply.started": "2026-03-25T08:45:08.251990Z" } }, "outputs": [ { "data": { "text/plain": [ "(torch.Size([1, 1, 94, 128]), torch.float16)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cos = torch.load('cos.pt')\n", "cos.shape, cos.dtype" ] }, { "cell_type": "code", "execution_count": 4, "id": "a874f8d4-efa9-4ba8-9e57-c019da0775bd", "metadata": { "execution": { "iopub.execute_input": "2026-03-25T08:45:08.262852Z", "iopub.status.busy": 
"2026-03-25T08:45:08.262705Z", "iopub.status.idle": "2026-03-25T08:45:08.269483Z", "shell.execute_reply": "2026-03-25T08:45:08.268795Z", "shell.execute_reply.started": "2026-03-25T08:45:08.262840Z" } }, "outputs": [], "source": [ "torch.set_printoptions(precision=14)" ] }, { "cell_type": "code", "execution_count": 5, "id": "f44460e3-58e9-4fd2-898a-06e8a00f9365", "metadata": { "execution": { "iopub.execute_input": "2026-03-25T08:45:08.269891Z", "iopub.status.busy": "2026-03-25T08:45:08.269762Z", "iopub.status.idle": "2026-03-25T08:45:08.295575Z", "shell.execute_reply": "2026-03-25T08:45:08.294849Z", "shell.execute_reply.started": "2026-03-25T08:45:08.269879Z" }, "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "tensor([[[0.54052734375000, 0.69238281250000, 0.79638671875000,\n", " 0.86621093750000, 0.91259765625000, 0.94287109375000,\n", " 0.96289062500000, 0.97558593750000, 0.98437500000000,\n", " 0.98974609375000, 0.99316406250000, 0.99560546875000,\n", " 0.99707031250000, 0.99804687500000, 0.99902343750000,\n", " 0.99902343750000, 0.99951171875000, 0.99951171875000,\n", " 1.00000000000000, 1.00000000000000, 1.00000000000000,\n", " 1.00000000000000, 1.00000000000000, 1.00000000000000,\n", " 1.00000000000000, 1.00000000000000, 1.00000000000000,\n", " 1.00000000000000, 1.00000000000000, 1.00000000000000,\n", " 1.00000000000000, 1.00000000000000, 1.00000000000000,\n", " 1.00000000000000, 1.00000000000000, 1.00000000000000,\n", " 1.00000000000000, 1.00000000000000, 1.00000000000000,\n", " 1.00000000000000, 1.00000000000000, 1.00000000000000,\n", " 1.00000000000000, 1.00000000000000, 1.00000000000000,\n", " 1.00000000000000, 1.00000000000000, 1.00000000000000,\n", " 1.00000000000000, 1.00000000000000, 1.00000000000000,\n", " 1.00000000000000, 1.00000000000000, 1.00000000000000,\n", " 1.00000000000000, 1.00000000000000, 1.00000000000000,\n", " 1.00000000000000, 1.00000000000000, 1.00000000000000,\n", " 1.00000000000000, 1.00000000000000, 1.00000000000000,\n", 
" 1.00000000000000, 0.54052734375000, 0.69238281250000,\n", " 0.79638671875000, 0.86621093750000, 0.91259765625000,\n", " 0.94287109375000, 0.96289062500000, 0.97558593750000,\n", " 0.98437500000000, 0.98974609375000, 0.99316406250000,\n", " 0.99560546875000, 0.99707031250000, 0.99804687500000,\n", " 0.99902343750000, 0.99902343750000, 0.99951171875000,\n", " 0.99951171875000, 1.00000000000000, 1.00000000000000,\n", " 1.00000000000000, 1.00000000000000, 1.00000000000000,\n", " 1.00000000000000, 1.00000000000000, 1.00000000000000,\n", " 1.00000000000000, 1.00000000000000, 1.00000000000000,\n", " 1.00000000000000, 1.00000000000000, 1.00000000000000,\n", " 1.00000000000000, 1.00000000000000, 1.00000000000000,\n", " 1.00000000000000, 1.00000000000000, 1.00000000000000,\n", " 1.00000000000000, 1.00000000000000, 1.00000000000000,\n", " 1.00000000000000, 1.00000000000000, 1.00000000000000,\n", " 1.00000000000000, 1.00000000000000, 1.00000000000000,\n", " 1.00000000000000, 1.00000000000000, 1.00000000000000,\n", " 1.00000000000000, 1.00000000000000, 1.00000000000000,\n", " 1.00000000000000, 1.00000000000000, 1.00000000000000,\n", " 1.00000000000000, 1.00000000000000, 1.00000000000000,\n", " 1.00000000000000, 1.00000000000000, 1.00000000000000,\n", " 1.00000000000000, 1.00000000000000]]], device='cuda:0',\n", " dtype=torch.float16)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cos[:, :, 1, :]" ] }, { "cell_type": "code", "execution_count": 6, "id": "039f1456-ffa4-40b2-8ba1-a0cd5f74733e", "metadata": { "execution": { "iopub.execute_input": "2026-03-25T08:45:08.296250Z", "iopub.status.busy": "2026-03-25T08:45:08.296111Z", "iopub.status.idle": "2026-03-25T08:45:08.348040Z", "shell.execute_reply": "2026-03-25T08:45:08.347477Z", "shell.execute_reply.started": "2026-03-25T08:45:08.296237Z" }, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 torch.Size([128])\n", " [0:16] 1.0 1.0\n", " 
[16:40] 1.0 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] 1.0 1.0\n", " [80:104] 1.0 1.0\n", " [104:128] 1.0 1.0\n", "1 torch.Size([128])\n", " [0:16] 0.54052734375 0.9990234375\n", " [16:40] 0.99951171875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] 0.54052734375 0.9990234375\n", " [80:104] 0.99951171875 1.0\n", " [104:128] 1.0 1.0\n", "2 torch.Size([128])\n", " [0:16] -0.416259765625 0.9970703125\n", " [16:40] 0.998046875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.416259765625 0.9970703125\n", " [80:104] 0.998046875 1.0\n", " [104:128] 1.0 1.0\n", "3 torch.Size([128])\n", " [0:16] -0.990234375 0.9931640625\n", " [16:40] 0.99560546875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.990234375 0.9931640625\n", " [80:104] 0.99560546875 1.0\n", " [104:128] 1.0 1.0\n", "4 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9921875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9921875 1.0\n", " [104:128] 1.0 1.0\n", "5 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9921875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9921875 1.0\n", " [104:128] 1.0 1.0\n", "6 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9921875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9921875 1.0\n", " [104:128] 1.0 1.0\n", "7 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9921875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9921875 1.0\n", " [104:128] 1.0 1.0\n", "8 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9921875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9921875 1.0\n", " [104:128] 1.0 1.0\n", "9 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9921875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 
0.9921875 1.0\n", " [104:128] 1.0 1.0\n", "10 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9921875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9921875 1.0\n", " [104:128] 1.0 1.0\n", "11 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9921875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9921875 1.0\n", " [104:128] 1.0 1.0\n", "12 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9921875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9921875 1.0\n", " [104:128] 1.0 1.0\n", "13 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9873046875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9873046875 1.0\n", " [104:128] 1.0 1.0\n", "14 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9873046875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9873046875 1.0\n", " [104:128] 1.0 1.0\n", "15 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9873046875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9873046875 1.0\n", " [104:128] 1.0 1.0\n", "16 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9873046875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9873046875 1.0\n", " [104:128] 1.0 1.0\n", "17 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9873046875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9873046875 1.0\n", " [104:128] 1.0 1.0\n", "18 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9873046875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9873046875 1.0\n", " [104:128] 1.0 
1.0\n", "19 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9873046875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9873046875 1.0\n", " [104:128] 1.0 1.0\n", "20 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9873046875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9873046875 1.0\n", " [104:128] 1.0 1.0\n", "21 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9873046875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9873046875 1.0\n", " [104:128] 1.0 1.0\n", "22 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.98193359375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.98193359375 1.0\n", " [104:128] 1.0 1.0\n", "23 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.98193359375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.98193359375 1.0\n", " [104:128] 1.0 1.0\n", "24 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.98193359375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.98193359375 1.0\n", " [104:128] 1.0 1.0\n", "25 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.98193359375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.98193359375 1.0\n", " [104:128] 1.0 1.0\n", "26 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.98193359375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.98193359375 1.0\n", " [104:128] 1.0 1.0\n", "27 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.98193359375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.98193359375 1.0\n", " [104:128] 1.0 1.0\n", 
"28 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.98193359375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.98193359375 1.0\n", " [104:128] 1.0 1.0\n", "29 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.98193359375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.98193359375 1.0\n", " [104:128] 1.0 1.0\n", "30 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.98193359375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.98193359375 1.0\n", " [104:128] 1.0 1.0\n", "31 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9755859375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9755859375 1.0\n", " [104:128] 1.0 1.0\n", "32 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9755859375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9755859375 1.0\n", " [104:128] 1.0 1.0\n", "33 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9755859375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9755859375 1.0\n", " [104:128] 1.0 1.0\n", "34 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9755859375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9755859375 1.0\n", " [104:128] 1.0 1.0\n", "35 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9755859375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9755859375 1.0\n", " [104:128] 1.0 1.0\n", "36 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9755859375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9755859375 1.0\n", " [104:128] 1.0 1.0\n", "37 
torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9755859375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9755859375 1.0\n", " [104:128] 1.0 1.0\n", "38 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9755859375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9755859375 1.0\n", " [104:128] 1.0 1.0\n", "39 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9755859375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9755859375 1.0\n", " [104:128] 1.0 1.0\n", "40 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.96826171875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.96826171875 1.0\n", " [104:128] 1.0 1.0\n", "41 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.96826171875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.96826171875 1.0\n", " [104:128] 1.0 1.0\n", "42 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.96826171875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.96826171875 1.0\n", " [104:128] 1.0 1.0\n", "43 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.96826171875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.96826171875 1.0\n", " [104:128] 1.0 1.0\n", "44 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.96826171875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.96826171875 1.0\n", " [104:128] 1.0 1.0\n", "45 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.96826171875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.96826171875 1.0\n", " [104:128] 1.0 1.0\n", "46 
torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.96826171875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.96826171875 1.0\n", " [104:128] 1.0 1.0\n", "47 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.96826171875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.96826171875 1.0\n", " [104:128] 1.0 1.0\n", "48 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.96826171875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.96826171875 1.0\n", " [104:128] 1.0 1.0\n", "49 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9599609375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9599609375 1.0\n", " [104:128] 1.0 1.0\n", "50 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9599609375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9599609375 1.0\n", " [104:128] 1.0 1.0\n", "51 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9599609375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9599609375 1.0\n", " [104:128] 1.0 1.0\n", "52 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9599609375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9599609375 1.0\n", " [104:128] 1.0 1.0\n", "53 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9599609375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9599609375 1.0\n", " [104:128] 1.0 1.0\n", "54 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9599609375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9599609375 1.0\n", " [104:128] 1.0 1.0\n", "55 
torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9599609375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9599609375 1.0\n", " [104:128] 1.0 1.0\n", "56 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9599609375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9599609375 1.0\n", " [104:128] 1.0 1.0\n", "57 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9599609375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9599609375 1.0\n", " [104:128] 1.0 1.0\n", "58 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9501953125 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9501953125 1.0\n", " [104:128] 1.0 1.0\n", "59 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9501953125 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9501953125 1.0\n", " [104:128] 1.0 1.0\n", "60 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9501953125 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9501953125 1.0\n", " [104:128] 1.0 1.0\n", "61 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9501953125 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9501953125 1.0\n", " [104:128] 1.0 1.0\n", "62 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9501953125 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9501953125 1.0\n", " [104:128] 1.0 1.0\n", "63 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9501953125 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9501953125 1.0\n", " [104:128] 1.0 1.0\n", "64 
torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9501953125 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9501953125 1.0\n", " [104:128] 1.0 1.0\n", "65 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9501953125 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9501953125 1.0\n", " [104:128] 1.0 1.0\n", "66 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9501953125 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9501953125 1.0\n", " [104:128] 1.0 1.0\n", "67 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.93994140625 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.93994140625 1.0\n", " [104:128] 1.0 1.0\n", "68 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.93994140625 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.93994140625 1.0\n", " [104:128] 1.0 1.0\n", "69 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.93994140625 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.93994140625 1.0\n", " [104:128] 1.0 1.0\n", "70 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.93994140625 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.93994140625 1.0\n", " [104:128] 1.0 1.0\n", "71 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.93994140625 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.93994140625 1.0\n", " [104:128] 1.0 1.0\n", "72 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.93994140625 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.93994140625 1.0\n", " [104:128] 1.0 1.0\n", "73 
torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.93994140625 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.93994140625 1.0\n", " [104:128] 1.0 1.0\n", "74 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.93994140625 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.93994140625 1.0\n", " [104:128] 1.0 1.0\n", "75 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.93994140625 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.93994140625 1.0\n", " [104:128] 1.0 1.0\n", "76 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9287109375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9287109375 1.0\n", " [104:128] 1.0 1.0\n", "77 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9287109375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9287109375 1.0\n", " [104:128] 1.0 1.0\n", "78 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9287109375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9287109375 1.0\n", " [104:128] 1.0 1.0\n", "79 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9287109375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9287109375 1.0\n", " [104:128] 1.0 1.0\n", "80 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9287109375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9287109375 1.0\n", " [104:128] 1.0 1.0\n", "81 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9287109375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9287109375 1.0\n", " [104:128] 1.0 1.0\n", "82 
torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9287109375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9287109375 1.0\n", " [104:128] 1.0 1.0\n", "83 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9287109375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9287109375 1.0\n", " [104:128] 1.0 1.0\n", "84 torch.Size([128])\n", " [0:16] -0.97314453125 0.98681640625\n", " [16:40] 0.9287109375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97314453125 0.98681640625\n", " [80:104] 0.9287109375 1.0\n", " [104:128] 1.0 1.0\n", "85 torch.Size([128])\n", " [0:16] -0.962890625 0.9072265625\n", " [16:40] 0.91650390625 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.962890625 0.9072265625\n", " [80:104] 0.91650390625 1.0\n", " [104:128] 1.0 1.0\n", "86 torch.Size([128])\n", " [0:16] -0.99853515625 0.9287109375\n", " [16:40] 0.90380859375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.99853515625 0.9287109375\n", " [80:104] 0.90380859375 1.0\n", " [104:128] 1.0 1.0\n", "87 torch.Size([128])\n", " [0:16] -0.98583984375 0.9990234375\n", " [16:40] 0.8896484375 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.98583984375 0.9990234375\n", " [80:104] 0.8896484375 1.0\n", " [104:128] 1.0 1.0\n", "88 torch.Size([128])\n", " [0:16] -0.95751953125 0.94677734375\n", " [16:40] 0.87451171875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.95751953125 0.94677734375\n", " [80:104] 0.87451171875 1.0\n", " [104:128] 1.0 1.0\n", "89 torch.Size([128])\n", " [0:16] -0.9931640625 0.87451171875\n", " [16:40] 0.85888671875 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.9931640625 0.87451171875\n", " [80:104] 0.85888671875 1.0\n", " [104:128] 1.0 1.0\n", "90 torch.Size([128])\n", " [0:16] -1.0 0.986328125\n", " [16:40] 0.84228515625 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -1.0 0.986328125\n", " [80:104] 0.84228515625 1.0\n", " [104:128] 1.0 1.0\n", "91 torch.Size([128])\n", " [0:16] -0.97216796875 
0.98876953125\n", " [16:40] 0.82470703125 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.97216796875 0.98876953125\n", " [80:104] 0.82470703125 1.0\n", " [104:128] 1.0 1.0\n", "92 torch.Size([128])\n", " [0:16] -0.96240234375 0.91259765625\n", " [16:40] 0.806640625 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.96240234375 0.91259765625\n", " [80:104] 0.806640625 1.0\n", " [104:128] 1.0 1.0\n", "93 torch.Size([128])\n", " [0:16] -0.9912109375 0.861328125\n", " [16:40] 0.78759765625 1.0\n", " [40:64] 1.0 1.0\n", " [64:80] -0.9912109375 0.861328125\n", " [80:104] 0.78759765625 1.0\n", " [104:128] 1.0 1.0\n" ] } ], "source": [
  "# Drop the two leading singleton axes (cos has shape [1, 1, 94, 128]) so that\n",
  "# iterating yields one 128-wide row at a time.\n",
  "cos_reshaped = cos.squeeze(0).squeeze(0)\n",
  "\n",
  "# Half-open [start, stop) column ranges whose min/max are reported for every row.\n",
  "section_bounds = [(0, 16), (16, 40), (40, 64), (64, 80), (80, 104), (104, 128)]\n",
  "\n",
  "for idx, tensor in enumerate(cos_reshaped):\n",
  "    print(idx, tensor.shape)\n",
  "    if tensor.shape[0] != 128:\n",
  "        # The bounds above only make sense for 128-wide rows.\n",
  "        print(f\"Skipping tensor {idx} with shape {tensor.shape}\")\n",
  "        continue\n",
  "    for start, stop in section_bounds:\n",
  "        section = tensor[start:stop]\n",
  "        print(f\" [{start}:{stop}] {section.min()} {section.max()}\")"
 ] }, { "cell_type": "markdown", "id": "e3745573-5a06-4beb-894a-7a9dddf82d6d", "metadata": {}, "source": [ "---" ] }, { "cell_type": "code", "execution_count": 10, "id": "91f8dc55-fd0e-4d6c-84d1-626b8fae96a0", "metadata": { "execution": { "iopub.execute_input": "2026-03-25T08:58:50.174004Z", "iopub.status.busy": "2026-03-25T08:58:50.173602Z", "iopub.status.idle": "2026-03-25T08:58:50.184512Z", "shell.execute_reply": "2026-03-25T08:58:50.183786Z", "shell.execute_reply.started": "2026-03-25T08:58:50.173987Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Python: 3.12.12 | packaged by Anaconda, Inc. 
| (main, Oct 21 2025, 20:16:04) [GCC 11.2.0]\n", "Torch: 2.10.0+cu128\n", "Transformers: 5.3.0.dev0\n", "Pillow: 12.1.1\n", "Torchvision: 0.25.0+cu128\n", "qwen_vl_utils: NOT INSTALLED or no __version__\n", "\n", "=== Processor Image Config ===\n", "size (min/max pixels): SizeDict(height=None, width=None, longest_edge=12845056, shortest_edge=3136, max_height=None, max_width=None)\n", "patch_size: 14\n", "merge_size: 2\n", "resample: 3\n", "\n", "=== Raw Image ===\n", "PIL size (WxH): (256, 256)\n", "PIL mode: RGB\n", "Pixel checksum (sum of all pixels): 18508962\n", "Pixel hash (first 10 rows md5):\n", " 51edee767ff6c404d737e620d51036d5\n", "\n", "=== Processor Outputs ===\n", "input_ids shape: torch.Size([1, 94])\n", "input_ids hash: d3e7a59feedd64667ccc03bd71448894\n", "pixel_values shape: torch.Size([324, 1176])\n", "pixel_values dtype: torch.float32\n", "pixel_values sum: -114413.40625\n", "pixel_values hash: a635309793003d45969e76ca7badaca0\n", "image_grid_thw: tensor([[ 1, 18, 18]], device='cuda:0')\n" ] } ], "source": [
  "## === REPRODUCIBILITY FINGERPRINT ===\n",
  "# Prints library versions, image-processor settings, and checksums of the raw\n",
  "# image and of the processor outputs so that two runs can be diffed line by line.\n",
  "# Expects `processor` and `inputs` to exist already (created by earlier cells).\n",
  "\n",
  "import hashlib\n",
  "import importlib.metadata\n",
  "import sys\n",
  "\n",
  "import numpy as np\n",
  "import PIL\n",
  "import torch\n",
  "import torchvision\n",
  "import transformers\n",
  "from PIL import Image\n",
  "\n",
  "\n",
  "def _md5_hex(array):\n",
  "    \"\"\"Return the MD5 hex digest of a NumPy array's raw bytes.\"\"\"\n",
  "    return hashlib.md5(array.tobytes()).hexdigest()\n",
  "\n",
  "\n",
  "print(\"Python: \", sys.version)\n",
  "print(\"Torch: \", torch.__version__)\n",
  "print(\"Transformers:\", transformers.__version__)\n",
  "print(\"Pillow: \", PIL.__version__)\n",
  "print(\"Torchvision: \", torchvision.__version__)\n",
  "\n",
  "# Read the version from the installed distribution's metadata instead of a\n",
  "# module attribute. The previous attribute lookup sat behind a blanket\n",
  "# `except Exception`, so a package without __version__ was indistinguishable\n",
  "# from a missing one and no version was ever recorded.\n",
  "try:\n",
  "    qwen_vl_utils_version = importlib.metadata.version(\"qwen-vl-utils\")\n",
  "except importlib.metadata.PackageNotFoundError:\n",
  "    qwen_vl_utils_version = \"NOT INSTALLED\"\n",
  "print(\"qwen_vl_utils:\", qwen_vl_utils_version)\n",
  "\n",
  "print()\n",
  "\n",
  "# --- Check processor config ---\n",
  "image_processor = processor.image_processor\n",
  "print(\"=== Processor Image Config ===\")\n",
  "print(\"size (min/max pixels):\", image_processor.size)\n",
  "print(\"patch_size:\", image_processor.patch_size)\n",
  "print(\"merge_size:\", image_processor.merge_size)\n",
  "print(\"resample:\", image_processor.resample)\n",
  "\n",
  "print()\n",
  "\n",
  "# --- Check image after loading (before processor) ---\n",
  "# Open inside a context manager so the file handle is released; convert()\n",
  "# returns a loaded copy that stays usable after the file is closed.\n",
  "with Image.open(\"./car-1_256_0.jpg\") as image_file:\n",
  "    img = image_file.convert(\"RGB\")\n",
  "print(\"=== Raw Image ===\")\n",
  "print(\"PIL size (WxH):\", img.size)\n",
  "print(\"PIL mode:\", img.mode)\n",
  "\n",
  "img_np = np.array(img)\n",
  "print(\"Pixel checksum (sum of all pixels):\", img_np.sum())\n",
  "print(\"Pixel hash (first 10 rows md5):\")\n",
  "print(\" \", _md5_hex(img_np[:10]))\n",
  "\n",
  "print()\n",
  "\n",
  "# --- Check inputs after processor ---\n",
  "# Tensors are copied to the CPU before their bytes are hashed; pixel_values is\n",
  "# also cast to float32 first so the digest is always taken over the same dtype.\n",
  "print(\"=== Processor Outputs ===\")\n",
  "print(\"input_ids shape:\", inputs.input_ids.shape)\n",
  "print(\"input_ids hash:\", _md5_hex(inputs.input_ids.cpu().numpy()))\n",
  "print(\"pixel_values shape:\", inputs.pixel_values.shape)\n",
  "print(\"pixel_values dtype:\", inputs.pixel_values.dtype)\n",
  "print(\"pixel_values sum:\", inputs.pixel_values.sum().item())\n",
  "print(\"pixel_values hash:\", _md5_hex(inputs.pixel_values.float().cpu().numpy()))\n",
  "print(\"image_grid_thw:\", inputs.image_grid_thw)\n"
 ] }, { "cell_type": "code", "execution_count": null, "id": "d9e47144-a9f2-465f-9c4b-7458220e3510", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.12" } }, "nbformat": 4, "nbformat_minor": 5 }