{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "ee6e1b61-d3a5-45dd-9d2a-953d8acdd763",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\n"
]
}
],
"source": [
"from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig\n",
"from PIL import Image\n",
"import requests\n",
"import torch\n",
"\n",
"# Can also be a local path if you have already cloned the hugging face repo\n",
"# NOTE(review): hardcoded absolute Windows path — consider making this\n",
"# configurable (env var or a config cell) for portability.\n",
"MODEL_PATH = \"C:/Users/reube/MolmoLocalQuant/Molmo_Quant\"\n",
"\n",
"# load the processor\n",
"# NOTE(review): the stderr warning in this cell's output is because\n",
"# `use_fast` is unset; passing use_fast explicitly would silence it.\n",
"# device_map is forwarded to the remote-code processor, matching the\n",
"# official Molmo usage example.\n",
"processor = AutoProcessor.from_pretrained(\n",
" MODEL_PATH,\n",
" trust_remote_code=True,\n",
" device_map='auto'\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a2ff193c-f148-400c-accb-d83ad58bfa73",
"metadata": {},
"outputs": [],
"source": [
"# Inspect the loaded processor (bare last expression gives the rich repr)\n",
"processor"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "ee281cd8-527c-4686-94db-8c8b62686a80",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5959b4f7a5f245a18c04c848c3f0a2c5",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading checkpoint shards: 0%| | 0/2 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Load the quantized Molmo model; device_map='auto' lets transformers place\n",
"# the checkpoint shards on the available device(s) automatically.\n",
"# NOTE(review): torch_dtype is left unset, so weights load in the\n",
"# checkpoint's stored dtype — confirm this matches the NF4 quantization setup.\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" MODEL_PATH,\n",
" trust_remote_code=True,\n",
" device_map='auto'\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ce1319d0-6d3a-41e1-99a7-5d92937bef12",
"metadata": {},
"outputs": [],
"source": [
"model"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "e8ebdbbb-bc31-4aab-850b-81d464dadd93",
"metadata": {},
"outputs": [],
"source": [
"# NOTE(review): hardcoded absolute path — consider a config-cell constant.\n",
"filename=\"C:/Users/reube/OneDrive/Pictures/dfmr4c7-d7c0ba06-f019-418f-a091-b247d44a738f.jpg\"\n",
"image = Image.open(filename).convert(\"RGB\")\n",
"# processor.process tokenizes the prompt and preprocesses the image into\n",
"# the tensors consumed by model.generate_from_batch in the cell below.\n",
"inputs = processor.process(\n",
" images=[image],\n",
" text=\"Point to people in picture.\")\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "994719fb-ec2f-4c8f-83fc-44f0edfd65f2",
"metadata": {},
"outputs": [],
"source": [
"# NOTE(review): leftover scratch values — both variables are reassigned by\n",
"# later cells (generated_text in the generation cell, label in the webcam\n",
"# cell), so this cell can be deleted without affecting a fresh run.\n",
"generated_text='Counting the number of individual socks by pointing at their toe tips shows a total of 2.'\n",
"label='test'"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "0e48eae4-8e1d-4f84-9cbd-dbc6adedd122",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 24,
"id": "c3361787-4f10-4fdb-a09a-a88535086039",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"C:/Users/reube/OneDrive/Pictures/dfmr4c7-d7c0ba06-f019-418f-a091-b247d44a738f.jpg| people in picture\n",
"\n"
]
}
],
"source": [
"\n",
"# Move inputs to the model's device and add a batch dimension (batch of 1).\n",
"inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}\n",
"\n",
"# Compute is done in float16, while most weights are NF4.\n",
"# FIX: temperature/top_p were previously passed as bare generate kwargs\n",
"# without do_sample=True, so they were silently ignored (see the\n",
"# 'generation flags are not valid' warning in the old output). They now\n",
"# live in the GenerationConfig with sampling explicitly enabled.\n",
"with torch.autocast(device_type=\"cuda\", enabled=True, dtype=torch.float16):\n",
"    output = model.generate_from_batch(\n",
"        inputs,\n",
"        GenerationConfig(\n",
"            max_new_tokens=200,\n",
"            stop_strings=\"<|endoftext|>\",\n",
"            do_sample=True,\n",
"            temperature=0.2,\n",
"            top_p=0.5,\n",
"        ),\n",
"        tokenizer=processor.tokenizer,\n",
"        use_cache=False,  # NOTE(review): disabling the KV cache slows generation — confirm it is required for this quantized model\n",
"    )\n",
"\n",
"# Keep only the newly generated tokens and decode them to text.\n",
"generated_tokens = output[0, inputs['input_ids'].size(1):]\n",
"generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)\n",
"\n",
"# print the generated text\n",
"print(filename+'|'+generated_text+'\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c44ec15e-849f-4dd3-87b6-a7017035b67c",
"metadata": {},
"outputs": [],
"source": [
"import cv2\n",
"import re\n",
"\n",
"label = 'People'\n",
"cap = cv2.VideoCapture(0)  # use 0 for the default webcam\n",
"\n",
"# try/finally guarantees the camera is released and windows are closed even\n",
"# if processing or generation raises mid-loop.\n",
"try:\n",
"    while True:\n",
"        ret, frame = cap.read()\n",
"        if not ret:\n",
"            break\n",
"\n",
"        # FIX: OpenCV delivers frames in BGR order; convert to RGB before\n",
"        # handing the image to PIL/the model, otherwise the model sees\n",
"        # channel-swapped colors.\n",
"        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))\n",
"        inputs = processor.process(\n",
"            images=[image],\n",
"            text=\"Point to the people.\")\n",
"        # move inputs to the correct device and make a batch of size 1\n",
"        inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}\n",
"\n",
"        # Compute is done in float16, while most weights are NF4\n",
"        with torch.autocast(device_type=\"cuda\", enabled=True, dtype=torch.float16):\n",
"            output = model.generate_from_batch(\n",
"                inputs,\n",
"                GenerationConfig(max_new_tokens=100, stop_strings=\"<|endoftext|>\"),\n",
"                tokenizer=processor.tokenizer,\n",
"                use_cache=False\n",
"            )\n",
"\n",
"        # only get generated tokens; decode them to text\n",
"        generated_tokens = output[0, inputs['input_ids'].size(1):]\n",
"        generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)\n",
"\n",
"        # Extract pointed coordinates (given as percentages of image size).\n",
"        # FIX: \\d+ -> \\d* so the pattern matches both Molmo's multi-point\n",
"        # output (x1=\"..\" y1=\"..\") and its single-point output (x=\"..\" y=\"..\",\n",
"        # no index digit) — per the Molmo model card's point format.\n",
"        matches = re.findall(r'x\\d*=\"([\\d.]+)\"\\s*y\\d*=\"([\\d.]+)\"', generated_text)\n",
"        h, w = frame.shape[:2]\n",
"        for x_pct, y_pct in matches:\n",
"            # Scale percentage coordinates to pixel positions on this frame.\n",
"            x = int(float(x_pct) / 100 * w)\n",
"            y = int(float(y_pct) / 100 * h)\n",
"            cv2.circle(frame, (x, y), radius=5, color=(0, 255, 0), thickness=-1)\n",
"\n",
"        cv2.putText(frame, label, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)\n",
"        cv2.imshow('Live Feed', frame)\n",
"        if cv2.waitKey(1) & 0xFF == ord('q'):\n",
"            break\n",
"finally:\n",
"    cap.release()\n",
"    cv2.destroyAllWindows()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a13bcb85-edab-49e1-a1a9-cd3144491f06",
"metadata": {},
"outputs": [],
"source": [
"# Sanity check: show name/memory/compute capability of CUDA device 0.\n",
"torch.cuda.get_device_properties(0)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "base"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}