{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "ee6e1b61-d3a5-45dd-9d2a-953d8acdd763", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\n" ] } ], "source": [ "from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig\n", "from PIL import Image\n", "import requests\n", "import torch\n", "\n", "# Can also be a local path if you have already cloned the hugging face repo\n", "MODEL_PATH = \"C:/Users/reube/MolmoLocalQuant/Molmo_Quant\"\n", "\n", "# load the processor\n", "processor = AutoProcessor.from_pretrained(\n", " MODEL_PATH,\n", " trust_remote_code=True,\n", " device_map='auto'\n", ")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "a2ff193c-f148-400c-accb-d83ad58bfa73", "metadata": {}, "outputs": [], "source": [ "processor" ] }, { "cell_type": "code", "execution_count": 2, "id": "ee281cd8-527c-4686-94db-8c8b62686a80", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "5959b4f7a5f245a18c04c848c3f0a2c5", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading checkpoint shards: 0%| | 0/2 [00:00number of individual socks by pointing at their toe tips shows a total of 2.'\n", "label='test'" ] }, { "cell_type": "code", "execution_count": 23, "id": "0e48eae4-8e1d-4f84-9cbd-dbc6adedd122", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 24, "id": "c3361787-4f10-4fdb-a09a-a88535086039", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. 
Set `TRANSFORMERS_VERBOSITY=info` for more details.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "C:/Users/reube/OneDrive/Pictures/dfmr4c7-d7c0ba06-f019-418f-a091-b247d44a738f.jpg| people in picture\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# `inputs` and `filename` are not defined in this cell; they are expected to come from an earlier cell.\n",
    "# Move the inputs to the model's device and make a batch of size 1. The result is bound to a\n",
    "# new name so that re-running this cell does not unsqueeze `inputs` a second time.\n",
    "batch = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}\n",
    "\n",
    "# Compute is done in float16, while most weights are NF4.\n",
    "# `temperature`/`top_p` are intentionally not passed: sampling is not enabled here and\n",
    "# transformers reported them as invalid generation flags that may be ignored. To sample,\n",
    "# put them in the GenerationConfig together with `do_sample=True`.\n",
    "with torch.autocast(device_type=\"cuda\", enabled=True, dtype=torch.float16):\n",
    "    output = model.generate_from_batch(\n",
    "        batch,\n",
    "        GenerationConfig(max_new_tokens=200, stop_strings=\"<|endoftext|>\"),\n",
    "        tokenizer=processor.tokenizer,\n",
    "        use_cache=False,\n",
    "    )\n",
    "\n",
    "# only get generated tokens (everything after the prompt); decode them to text\n",
    "generated_tokens = output[0, batch['input_ids'].size(1):]\n",
    "generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)\n",
    "\n",
    "# print the generated text\n",
    "print(filename+'|'+generated_text+'\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c44ec15e-849f-4dd3-87b6-a7017035b67c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Live webcam demo: ask the model to point at people in each frame and draw the returned\n",
    "# points on the frame. Press 'q' in the preview window to stop.\n",
    "import re\n",
    "\n",
    "import cv2\n",
    "\n",
    "label = 'People'\n",
    "\n",
    "# Per the Molmo model card, a single point is emitted with un-numbered x=/y= attributes while\n",
    "# several points use x1=/y1=, x2=/y2=, ...; `\\d*` accepts both forms.\n",
    "point_pattern = re.compile(r'x\\d*=\"([\\d.]+)\"\\s*y\\d*=\"([\\d.]+)\"')\n",
    "\n",
    "cap = cv2.VideoCapture(0)  # Use 0 for default webcam\n",
    "\n",
    "try:\n",
    "    while True:\n",
    "        ret, frame = cap.read()\n",
    "        if not ret:\n",
    "            break\n",
    "\n",
    "        # OpenCV delivers frames in BGR channel order; convert to RGB before building the\n",
    "        # PIL image so the model does not see red and blue swapped.\n",
    "        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))\n",
    "        inputs = processor.process(\n",
    "            images=[image],\n",
    "            text=\"Point to the people.\")\n",
    "        # move inputs to the correct device and make a batch of size 1\n",
    "        batch = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}\n",
    "\n",
    "        # Compute is done in float16, while most weights are NF4\n",
    "        with torch.autocast(device_type=\"cuda\", enabled=True, dtype=torch.float16):\n",
    "            output = model.generate_from_batch(\n",
    "                batch,\n",
    "                GenerationConfig(max_new_tokens=100, stop_strings=\"<|endoftext|>\"),\n",
    "                tokenizer=processor.tokenizer,\n",
    "                use_cache=False,\n",
    "            )\n",
    "\n",
    "        # only get generated tokens (everything after the prompt); decode them to text\n",
    "        generated_tokens = output[0, batch['input_ids'].size(1):]\n",
    "        generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)\n",
    "\n",
    "        # The parsed values are treated as percentages of the frame size: scale them to\n",
    "        # pixel coordinates and mark each point with a filled green dot.\n",
    "        h, w = frame.shape[:2]\n",
    "        for x_pct, y_pct in point_pattern.findall(generated_text):\n",
    "            center = (int(float(x_pct) / 100 * w), int(float(y_pct) / 100 * h))\n",
    "            cv2.circle(frame, center, radius=5, color=(0, 255, 0), thickness=-1)\n",
    "\n",
    "        cv2.putText(frame, label, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)\n",
    "        cv2.imshow('Live Feed', frame)\n",
    "        if cv2.waitKey(1) & 0xFF == ord('q'):\n",
    "            break\n",
    "finally:\n",
    "    # Always free the camera and close the preview window, even when generation fails or\n",
    "    # the kernel is interrupted; otherwise the device stays locked until the kernel restarts.\n",
    "    cap.release()\n",
    "    cv2.destroyAllWindows()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a13bcb85-edab-49e1-a1a9-cd3144491f06",
   "metadata": {},
   "outputs": [],
   "source": [
    "torch.cuda.get_device_properties(0)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "base"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}