{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "ee6e1b61-d3a5-45dd-9d2a-953d8acdd763",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\n"
]
}
],
"source": [
"from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig\n",
"from PIL import Image\n",
"import requests\n",
"import torch\n",
"\n",
"# Can also be a local path if you have already cloned the hugging face repo\n",
"# NOTE(review): hardcoded absolute Windows path — consider making this\n",
"# configurable (env var or a config cell) for portability.\n",
"MODEL_PATH = \"C:/Users/reube/MolmoLocalQuant/Molmo_Quant\"\n",
"\n",
"# load the processor\n",
"# NOTE(review): the stderr warning in this cell's output is because\n",
"# `use_fast` is unset; passing use_fast explicitly would silence it.\n",
"# device_map is forwarded to the remote-code processor, matching the\n",
"# official Molmo usage example.\n",
"processor = AutoProcessor.from_pretrained(\n",
" MODEL_PATH,\n",
" trust_remote_code=True,\n",
" device_map='auto'\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a2ff193c-f148-400c-accb-d83ad58bfa73",
"metadata": {},
"outputs": [],
"source": [
"# Inspect the loaded processor (bare last expression gives the rich repr)\n",
"processor"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "ee281cd8-527c-4686-94db-8c8b62686a80",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5959b4f7a5f245a18c04c848c3f0a2c5",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading checkpoint shards: 0%| | 0/2 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Load the quantized Molmo model; device_map='auto' lets transformers place\n",
"# the checkpoint shards on the available device(s) automatically.\n",
"# NOTE(review): torch_dtype is left unset, so weights load in the\n",
"# checkpoint's stored dtype — confirm this matches the NF4 quantization setup.\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" MODEL_PATH,\n",
" trust_remote_code=True,\n",
" device_map='auto'\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ce1319d0-6d3a-41e1-99a7-5d92937bef12",
"metadata": {},
"outputs": [],
"source": [
"model"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "e8ebdbbb-bc31-4aab-850b-81d464dadd93",
"metadata": {},
"outputs": [],
"source": [
"# NOTE(review): hardcoded absolute path — consider a config-cell constant.\n",
"filename=\"C:/Users/reube/OneDrive/Pictures/dfmr4c7-d7c0ba06-f019-418f-a091-b247d44a738f.jpg\"\n",
"image = Image.open(filename).convert(\"RGB\")\n",
"# processor.process tokenizes the prompt and preprocesses the image into\n",
"# the tensors consumed by model.generate_from_batch in the cell below.\n",
"inputs = processor.process(\n",
" images=[image],\n",
" text=\"Point to people in picture.\")\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "994719fb-ec2f-4c8f-83fc-44f0edfd65f2",
"metadata": {},
"outputs": [],
"source": [
"# NOTE(review): leftover scratch values — both variables are reassigned by\n",
"# later cells (generated_text in the generation cell, label in the webcam\n",
"# cell), so this cell can be deleted without affecting a fresh run.\n",
"generated_text='Counting the number of individual socks by pointing at their toe tips shows a total of 2.'\n",
"label='test'"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "0e48eae4-8e1d-4f84-9cbd-dbc6adedd122",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 24,
"id": "c3361787-4f10-4fdb-a09a-a88535086039",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"C:/Users/reube/OneDrive/Pictures/dfmr4c7-d7c0ba06-f019-418f-a091-b247d44a738f.jpg| people in picture\n",
"\n"
]
}
],
"source": [
"\n",
"# Move inputs to the model's device and add a batch dimension (batch of 1).\n",
"inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}\n",
"\n",
"# Compute is done in float16, while most weights are NF4.\n",
"# FIX: temperature/top_p were previously passed as bare generate kwargs\n",
"# without do_sample=True, so they were silently ignored (see the\n",
"# 'generation flags are not valid' warning in the old output). They now\n",
"# live in the GenerationConfig with sampling explicitly enabled.\n",
"with torch.autocast(device_type=\"cuda\", enabled=True, dtype=torch.float16):\n",
"    output = model.generate_from_batch(\n",
"        inputs,\n",
"        GenerationConfig(\n",
"            max_new_tokens=200,\n",
"            stop_strings=\"<|endoftext|>\",\n",
"            do_sample=True,\n",
"            temperature=0.2,\n",
"            top_p=0.5,\n",
"        ),\n",
"        tokenizer=processor.tokenizer,\n",
"        use_cache=False,  # NOTE(review): disabling the KV cache slows generation — confirm it is required for this quantized model\n",
"    )\n",
"\n",
"# Keep only the newly generated tokens and decode them to text.\n",
"generated_tokens = output[0, inputs['input_ids'].size(1):]\n",
"generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)\n",
"\n",
"# print the generated text\n",
"print(filename+'|'+generated_text+'\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c44ec15e-849f-4dd3-87b6-a7017035b67c",
"metadata": {},
"outputs": [],
"source": [
"import cv2\n",
"import re\n",
"\n",
"label = 'People'\n",
"cap = cv2.VideoCapture(0)  # use 0 for the default webcam\n",
"\n",
"# try/finally guarantees the camera is released and windows are closed even\n",
"# if processing or generation raises mid-loop.\n",
"try:\n",
"    while True:\n",
"        ret, frame = cap.read()\n",
"        if not ret:\n",
"            break\n",
"\n",
"        # FIX: OpenCV delivers frames in BGR order; convert to RGB before\n",
"        # handing the image to PIL/the model, otherwise the model sees\n",
"        # channel-swapped colors.\n",
"        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))\n",
"        inputs = processor.process(\n",
"            images=[image],\n",
"            text=\"Point to the people.\")\n",
"        # move inputs to the correct device and make a batch of size 1\n",
"        inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}\n",
"\n",
"        # Compute is done in float16, while most weights are NF4\n",
"        with torch.autocast(device_type=\"cuda\", enabled=True, dtype=torch.float16):\n",
"            output = model.generate_from_batch(\n",
"                inputs,\n",
"                GenerationConfig(max_new_tokens=100, stop_strings=\"<|endoftext|>\"),\n",
"                tokenizer=processor.tokenizer,\n",
"                use_cache=False\n",
"            )\n",
"\n",
"        # only get generated tokens; decode them to text\n",
"        generated_tokens = output[0, inputs['input_ids'].size(1):]\n",
"        generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)\n",
"\n",
"        # Extract pointed coordinates (given as percentages of image size).\n",
"        # FIX: \\d+ -> \\d* so the pattern matches both Molmo's multi-point\n",
"        # output (x1=\"..\" y1=\"..\") and its single-point output (x=\"..\" y=\"..\",\n",
"        # no index digit) — per the Molmo model card's point format.\n",
"        matches = re.findall(r'x\\d*=\"([\\d.]+)\"\\s*y\\d*=\"([\\d.]+)\"', generated_text)\n",
"        h, w = frame.shape[:2]\n",
"        for x_pct, y_pct in matches:\n",
"            # Scale percentage coordinates to pixel positions on this frame.\n",
"            x = int(float(x_pct) / 100 * w)\n",
"            y = int(float(y_pct) / 100 * h)\n",
"            cv2.circle(frame, (x, y), radius=5, color=(0, 255, 0), thickness=-1)\n",
"\n",
"        cv2.putText(frame, label, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)\n",
"        cv2.imshow('Live Feed', frame)\n",
"        if cv2.waitKey(1) & 0xFF == ord('q'):\n",
"            break\n",
"finally:\n",
"    cap.release()\n",
"    cv2.destroyAllWindows()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a13bcb85-edab-49e1-a1a9-cd3144491f06",
"metadata": {},
"outputs": [],
"source": [
"# Sanity check: show name/memory/compute capability of CUDA device 0.\n",
"torch.cuda.get_device_properties(0)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "base"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}