{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "6511a91c-ed20-41ff-befb-699bda1912a3", "metadata": { "execution": { "iopub.execute_input": "2026-03-25T06:38:11.914633Z", "iopub.status.busy": "2026-03-25T06:38:11.914498Z", "iopub.status.idle": "2026-03-25T06:38:26.481581Z", "shell.execute_reply": "2026-03-25T06:38:26.480877Z", "shell.execute_reply.started": "2026-03-25T06:38:11.914618Z" } }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a70b6714abe946bfbd7f496bb0913fc8", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (incomplete total...): 0.00B [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "f9fe923a4e3a4de6bb7e948de7995ea9", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Fetching 2 files: 0%| | 0/2 [00:00 and <|vision_end|> tags tell the processor \n", "# where to inject the image features.\n", "prompt = (\n", " \"<|im_start|>user\\n\"\n", " \"<|vision_start|><|image_pad|><|vision_end|>\"\n", " f\"{user_query}<|im_end|>\\n\"\n", " \"<|im_start|>assistant\\n\"\n", ")\n", "\n", "# 4. Process the vision information\n", "# We still use this utility to fetch the image and handle resizing logic\n", "messages = [{\"role\": \"user\", \"content\": [{\"type\": \"image\", \"image\": image_url}]}]\n", "image_inputs, _ = process_vision_info(messages)\n", "\n", "# 5. Tokenize and Prepare Tensors\n", "inputs = processor(\n", " text=[prompt],\n", " images=image_inputs,\n", " videos=None,\n", " padding=True,\n", " return_tensors=\"pt\",\n", ")\n", "inputs = inputs.to(model.device)\n", "\n", "# 6. 
Generate\n", "generated_ids = model.generate(**inputs, max_new_tokens=100)\n", "\n", "# Trim the prompt tokens from the result\n", "generated_ids_trimmed = [\n", " out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n", "]\n", "\n", "output_text = processor.batch_decode(\n", " generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n", ")\n", "\n", "print(f\"\\nManual Prompt Response: {output_text[0]}\")" ] }, { "cell_type": "code", "execution_count": 4, "id": "f45df021-6302-4f47-9e06-8070577885a2", "metadata": { "execution": { "iopub.execute_input": "2026-03-25T05:56:12.534299Z", "iopub.status.busy": "2026-03-25T05:56:12.534123Z", "iopub.status.idle": "2026-03-25T05:56:12.537469Z", "shell.execute_reply": "2026-03-25T05:56:12.536915Z", "shell.execute_reply.started": "2026-03-25T05:56:12.534284Z" } }, "outputs": [ { "data": { "text/plain": [ "'<|im_start|>user\\n<|vision_start|><|image_pad|><|vision_end|>Describe the image<|im_end|>\\n<|im_start|>assistant\\n'" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "prompt" ] }, { "cell_type": "code", "execution_count": 2, "id": "504fa71b-42b4-4f53-8988-25fcfba38d13", "metadata": { "execution": { "iopub.execute_input": "2026-03-25T06:38:26.483841Z", "iopub.status.busy": "2026-03-25T06:38:26.483701Z", "iopub.status.idle": "2026-03-25T06:38:26.488982Z", "shell.execute_reply": "2026-03-25T06:38:26.488521Z", "shell.execute_reply.started": "2026-03-25T06:38:26.483826Z" } }, "outputs": [ { "data": { "text/plain": [ "(torch.Size([1, 1, 94, 128]), torch.float16)" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load the tensor stored in cos.pt. weights_only=True restricts unpickling to\n", "# tensors and plain containers instead of arbitrary pickled objects.\n", "# NOTE(review): no cell visible in this notebook writes cos.pt - confirm where it comes from.\n", "cos = torch.load('cos.pt', weights_only=True)\n", "cos.shape, cos.dtype" ] }, { "cell_type": "code", "execution_count": 10, "id": "a874f8d4-efa9-4ba8-9e57-c019da0775bd", "metadata": { "execution": { "iopub.execute_input": "2026-03-25T06:43:28.437717Z", "iopub.status.busy":
"2026-03-25T06:43:28.437502Z", "iopub.status.idle": "2026-03-25T06:43:28.440474Z", "shell.execute_reply": "2026-03-25T06:43:28.439711Z", "shell.execute_reply.started": "2026-03-25T06:43:28.437702Z" } }, "outputs": [], "source": [ "# Print tensors with 10 decimal places from here on (process-wide print setting).\n", "torch.set_printoptions(precision=10)" ] }, { "cell_type": "code", "execution_count": 12, "id": "f44460e3-58e9-4fd2-898a-06e8a00f9365", "metadata": { "execution": { "iopub.execute_input": "2026-03-25T06:43:37.079158Z", "iopub.status.busy": "2026-03-25T06:43:37.078953Z", "iopub.status.idle": "2026-03-25T06:43:37.086609Z", "shell.execute_reply": "2026-03-25T06:43:37.086029Z", "shell.execute_reply.started": "2026-03-25T06:43:37.079144Z" } }, "outputs": [ { "data": { "text/plain": [ "tensor([[[0.5405273438, 0.6923828125, 0.7963867188, 0.8662109375, 0.9125976562,\n", " 0.9428710938, 0.9628906250, 0.9755859375, 0.9843750000, 0.9897460938,\n", " 0.9931640625, 0.9956054688, 0.9970703125, 0.9980468750, 0.9990234375,\n", " 0.9990234375, 0.9995117188, 0.9995117188, 1.0000000000, 1.0000000000,\n", " 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n", " 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n", " 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n", " 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n", " 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n", " 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n", " 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n", " 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n", " 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 0.5405273438,\n", " 0.6923828125, 0.7963867188, 0.8662109375, 0.9125976562, 0.9428710938,\n", " 0.9628906250, 0.9755859375, 0.9843750000, 0.9897460938, 0.9931640625,\n", " 0.9956054688, 0.9970703125, 0.9980468750, 0.9990234375, 0.9990234375,\n", " 0.9995117188, 0.9995117188, 1.0000000000, 1.0000000000,
1.0000000000,\n", " 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n", " 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n", " 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n", " 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n", " 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n", " 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n", " 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n", " 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000,\n", " 1.0000000000, 1.0000000000, 1.0000000000]]], device='cuda:0',\n", " dtype=torch.float16)" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Take index 1 along dim 2 and keep every other axis whole.\n", "cos.select(dim=2, index=1)" ] }, { "cell_type": "code", "execution_count": null, "id": "039f1456-ffa4-40b2-8ba1-a0cd5f74733e", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.12" } }, "nbformat": 4, "nbformat_minor": 5 }