{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "d89263a2-a275-4c06-83fd-7e7eee0cb8b8", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ea6025509a3e4956b5bdd00d5b58386d", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading pipeline components...: 0%| | 0/7 [00:00 \u001b[39m\u001b[32m26\u001b[39m image = \u001b[43mpipeline\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 27\u001b[39m \u001b[43m \u001b[49m\u001b[43mprompt\u001b[49m\u001b[43m \u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[43mprompt\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 28\u001b[39m \u001b[43m \u001b[49m\u001b[43mnegative_prompt\u001b[49m\u001b[43m \u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[43mnegative_prompt\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 29\u001b[39m \u001b[43m \u001b[49m\u001b[43mguidance_scale\u001b[49m\u001b[43m \u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[32;43m4\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 30\u001b[39m \u001b[43m \u001b[49m\u001b[43mwidth\u001b[49m\u001b[43m \u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[32;43m512\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 31\u001b[39m \u001b[43m \u001b[49m\u001b[43mheight\u001b[49m\u001b[43m \u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[32;43m640\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 32\u001b[39m \u001b[43m \u001b[49m\u001b[43mseed\u001b[49m\u001b[43m \u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[32;43m42\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 33\u001b[39m \u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[43m \u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[32;43m1\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 34\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m[\u001b[32m0\u001b[39m]\n\u001b[32m 35\u001b[39m all_images.extend(image)\n\u001b[32m 37\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mmatplotlib\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mpyplot\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mplt\u001b[39;00m\n", "\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/torch/utils/_contextlib.py:124\u001b[39m, in \u001b[36mcontext_decorator..decorate_context\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 120\u001b[39m \u001b[38;5;129m@functools\u001b[39m.wraps(func)\n\u001b[32m 121\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mdecorate_context\u001b[39m(*args, **kwargs):\n\u001b[32m 122\u001b[39m \u001b[38;5;66;03m# pyrefly: ignore [bad-context-manager]\u001b[39;00m\n\u001b[32m 123\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m ctx_factory():\n\u001b[32m--> \u001b[39m\u001b[32m124\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", "\u001b[36mFile \u001b[39m\u001b[32m~/.cache/huggingface/modules/diffusers_modules/local/pipeline_sdxs.py:214\u001b[39m, in \u001b[36mSdxsPipeline.__call__\u001b[39m\u001b[34m(self, prompt, image, coef, negative_prompt, height, width, num_inference_steps, guidance_scale, generator, seed, output_type, return_dict, structure_preservation, **kwargs)\u001b[39m\n\u001b[32m 211\u001b[39m generator = torch.Generator(device=device).manual_seed(seed)\n\u001b[32m 213\u001b[39m \u001b[38;5;66;03m# 1. Encode prompt (твой код оставляем без изменений)\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m214\u001b[39m text_embeddings, attention_mask, pooled_embeds = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mencode_prompt\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 215\u001b[39m \u001b[43m \u001b[49m\u001b[43mprompt\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnegative_prompt\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\n\u001b[32m 216\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 217\u001b[39m batch_size = \u001b[32m1\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(prompt, \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(prompt)\n\u001b[32m 219\u001b[39m \u001b[38;5;66;03m# 2. Scheduler timesteps\u001b[39;00m\n", "\u001b[36mFile \u001b[39m\u001b[32m~/.cache/huggingface/modules/diffusers_modules/local/pipeline_sdxs.py:184\u001b[39m, in \u001b[36mSdxsPipeline.encode_prompt\u001b[39m\u001b[34m(self, prompt, negative_prompt, device, dtype)\u001b[39m\n\u001b[32m 182\u001b[39m text_embeddings = torch.cat([neg_embeds, pos_embeds], dim=\u001b[32m0\u001b[39m)\n\u001b[32m 183\u001b[39m final_mask = torch.cat([neg_mask, pos_mask], dim=\u001b[32m0\u001b[39m)\n\u001b[32m--> \u001b[39m\u001b[32m184\u001b[39m pooled_embeds = torch.cat([neg_pooled, \u001b[43mpos_pooled\u001b[49m], dim=\u001b[32m0\u001b[39m)\n\u001b[32m 186\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m text_embeddings.to(dtype=dtype), final_mask.to(dtype=torch.int64), pooled_embeds.to(dtype=dtype)\n", "\u001b[31mNameError\u001b[39m: name 'pos_pooled' is not defined" ] } ], "source": [ "prompts = [\n", "\"an astronaut riding a horse\"\n", ",\"A young woman with striking blue eyes and pointed ears, adorned with a floral kimono and a tattoo. Her hair is styled in a braid, and she wears a pair of ears\"\n", ",\"A muscular, topless male with tiger-like ears and a tail stands in a forest, holding a sword and wearing a blue outfit, gazing directly at the viewer\"\n", ",\"A young woman with striking features, a mix of black and gold hair floating around her head, vibrant yellow eyes half-closed, and a blue sweater contrasting the black background, framed in a simple yet striking composition.\"\n", ",'A fluffy domestic cat with piercing green eyes sits attentively in a sunlit room filled with natural light, its soft fur reflecting warm hues of orange through golden windows.'\n", ",\"A fierce woman in black-and-white, wearing a spiked iron mask with sharp metallic spikes, sporting a high ponytail, her skin marked with battle-dirt, and eyes that reflect determination and strength. The mask's menacing aura contrasts with her intense expression, capturing her mysterious and intimidating nature.\"\n", ",\"A close-up of an astronaut's helmet with frosted, opaque visor, reflecting space's cold, frozen texture, resting on the visor a butterfly with vibrant, intricately patterned wings, and distant stars' faint glow.\"\n", ",\"A watercolor painting of a knight in a hazy blue field, his armor blending soft greys and silvers, holding a large red rose, with a serene yet commanding posture against distant mountains and wildflowers.\"\n", ",\"A warm glow of lanterns casts soft light over a snowy forest trail lined with tall, snow-covered trees, their branches casting a gentle glow. Soft falling snowflakes create a serene atmosphere, while a dark, mysterious background adds to the solitude and magic of the scene.\"\n", ",\"A hauntingly ethereal figure with decaying flesh and glowing metallic enhancements, blending Art Nouveau and cybernetic futurism, stands in a twilight forest bathed in warm sunset hues, casting golden light on surreal, ghostly trees with autumn leaves. The atmosphere is melancholic, evoking the styles of Rockwell and Parrish.\"\n", ",\"A radiant voluptuous woman in Arizona's Grand Canyon at twilight, her fiery ginger hair cascading over a halter top with intricate lace, plunging neckline, black thongs, silver rings, and stacked bangles, surrounded by deep purples and burnt oranges in the sky, her light-blue eyes glowing with wonder as she gazes at the luminous beauty of the night.\"\n", ",\"A man with hair, a white suit and black scarf, anime-style elements, glowing pink horns, and cyberpunk manga art style, standing in a shadowy smoke-filled background, with glowing pink eyes and a side view portrait of his face, white skin, and ear piercings.\"\n", ",\"There is a young male character standing against a vibrant, colorful graffiti wall. he is wearing a straw hat, a black jacket adorned with gold accents, and black shorts.\"\n", ",\"A black BMW M3 sports car with black and yellow rims\"\n", ",\"A young girl in a flowing, vibrant dress, her glowing eyes capturing the warmth of the day, sits on a grassy field, surrounded by anime-style elements.\"\n", "]\n", "prompts = [ \"cat\", \"dog\", \"girl\", \"man\", \"A black BMW M3 sports car with black and yellow rims\", \"an astronaut riding a horse\",\"A muscular, topless male with tiger-like ears and a tail stands in a forest, holding a sword and wearing a blue outfit, gazing directly at the viewer\",\"white cyborg knight riding cyber horse with wings, long white gown, holding scythe, skeleton horse, zombies cyberpunk armor, feathers\",\"A striking character with red eyes and black uniform wields a sword in a defensive stance, poised for battle amidst a stark white background with vibrant red accents.\"]\n", "\n", "generator = torch.Generator(device=\"cuda\").manual_seed(42)\n", "negative_prompt=\"bad quality, low resolution\"\n", "\n", "all_images = []\n", "# Обработка батчей с прогресс-баром\n", "for i, prompt in enumerate(prompts):\n", " image = pipeline(\n", " prompt = prompt,\n", " negative_prompt = negative_prompt,\n", " guidance_scale = 4,\n", " width = 512,\n", " height = 640,\n", " seed = 42,\n", " batch_size = 1,\n", " )[0]\n", " all_images.extend(image)\n", "\n", "import matplotlib.pyplot as plt\n", "import math\n", "\n", "def display_image_grid(images, prompts, cols=3, save_path=None):\n", " \"\"\"\n", " Отображает грид изображений с сохранением соотношения сторон и подписями.\n", " \"\"\"\n", " n = len(images)\n", " rows = math.ceil(n / cols)\n", "\n", " # Создаем фигуру с учетом реального соотношения сторон (640/576 ≈ 1.11)\n", " fig_width = cols * 4\n", " fig_height = rows * 4.5\n", " fig, axes = plt.subplots(rows, cols, figsize=(fig_width, fig_height))\n", "\n", " # Если только один ряд или один столбец, делаем axes списком\n", " if rows == 1:\n", " axes = [axes]\n", " if cols == 1:\n", " axes = [[ax] for ax in axes]\n", "\n", " axes = axes.flatten()\n", "\n", " for i, (img, prompt) in enumerate(zip(images, prompts)):\n", " ax = axes[i]\n", " ax.imshow(img)\n", " ax.axis(\"off\")\n", " ax.set_aspect(\"equal\") # сохраняем соотношение сторон\n", "\n", " # Урезаем и разбиваем подпись\n", " truncated_prompt = prompt[:80] + \"…\" if len(prompt) > 80 else prompt\n", " words = truncated_prompt.split(\" \")\n", " half = len(words) // 2\n", " line1 = \" \".join(words[:half])\n", " line2 = \" \".join(words[half:])\n", " ax.set_title(f\"{line1}\\n{line2}\", fontsize=10, wrap=True)\n", "\n", " # Прячем лишние subplot'ы, если их больше, чем изображений\n", " for j in range(len(images), len(axes)):\n", " axes[j].axis(\"off\")\n", "\n", " plt.tight_layout(pad=1.5)\n", "\n", " if save_path:\n", " plt.savefig(save_path, bbox_inches=\"tight\", dpi=400, format=\"jpeg\")\n", "\n", " plt.show()\n", "\n", "\n", "display_image_grid(all_images, prompts, save_path=\"media/result_grid.jpg\")" ] }, { "cell_type": "code", "execution_count": 2, "id": "044a8b5e-f786-48ea-86a3-cb581b3ca1c9", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Keyword arguments {'trust_remote_code': True} are not expected by SdxsPipeline and will be ignored.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "dab39941a4c34a61998f4bae5214cb7f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading pipeline components...: 0%| | 0/7 [00:00" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import torch\n", "from diffusers import DiffusionPipeline\n", "\n", "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", "dtype = torch.float16 if torch.cuda.is_available() else torch.float32\n", "\n", "pipe_id = \"/workspace/sdxs-1b\"\n", "pipe = DiffusionPipeline.from_pretrained(\n", " pipe_id,\n", " torch_dtype=dtype,\n", " trust_remote_code=True\n", ").to(device)\n", "\n", "prompt = \"girl, smiling, red eyes, blue hair, white shirt\"\n", "negative_prompt=\"low quality, bad quality\"\n", "image = pipe(\n", " width = 512,\n", " height = 640,\n", " prompt=prompt,\n", " negative_prompt = negative_prompt,\n", " seed = 43\n", ").images[0]\n", "\n", "image.show(image)\n", "image.save(\"girl.jpg\")\n" ] }, { "cell_type": "code", "execution_count": 2, "id": "76c7e40e-0326-42bc-b717-e46bcf790dfa", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "UNet2DConditionModel(\n", " (conv_in): Conv2d(16, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (time_proj): Timesteps()\n", " (time_embedding): TimestepEmbedding(\n", " (linear_1): Linear(in_features=320, out_features=1280, bias=True)\n", " (act): SiLU()\n", " (linear_2): Linear(in_features=1280, out_features=1280, bias=True)\n", " )\n", " (down_blocks): ModuleList(\n", " (0): CrossAttnDownBlock2D(\n", " (attentions): ModuleList(\n", " (0-1): 2 x Transformer2DModel(\n", " (norm): GroupNorm(32, 320, eps=1e-06, affine=True)\n", " (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))\n", " (transformer_blocks): ModuleList(\n", " (0-1): 2 x BasicTransformerBlock(\n", " (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)\n", " (attn1): Attention(\n", " (to_q): Linear(in_features=320, out_features=320, bias=False)\n", " (to_k): Linear(in_features=320, out_features=320, bias=False)\n", " (to_v): Linear(in_features=320, out_features=320, bias=False)\n", " (to_out): ModuleList(\n", " (0): Linear(in_features=320, out_features=320, bias=True)\n", " (1): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True)\n", " (attn2): Attention(\n", " (to_q): Linear(in_features=320, out_features=320, bias=False)\n", " (to_k): Linear(in_features=768, out_features=320, bias=False)\n", " (to_v): Linear(in_features=768, out_features=320, bias=False)\n", " (to_out): ModuleList(\n", " (0): Linear(in_features=320, out_features=320, bias=True)\n", " (1): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True)\n", " (ff): FeedForward(\n", " (net): ModuleList(\n", " (0): GEGLU(\n", " (proj): Linear(in_features=320, out_features=2560, bias=True)\n", " )\n", " (1): Dropout(p=0.0, inplace=False)\n", " (2): Linear(in_features=1280, out_features=320, bias=True)\n", " )\n", " )\n", " )\n", " )\n", " (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " )\n", " (resnets): ModuleList(\n", " (0-1): 2 x ResnetBlock2D(\n", " (norm1): GroupNorm(32, 320, eps=1e-05, affine=True)\n", " (conv1): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (time_emb_proj): Linear(in_features=1280, out_features=320, bias=True)\n", " (norm2): GroupNorm(32, 320, eps=1e-05, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nonlinearity): SiLU()\n", " )\n", " )\n", " (downsamplers): ModuleList(\n", " (0): Downsample2D(\n", " (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))\n", " )\n", " )\n", " )\n", " (1): CrossAttnDownBlock2D(\n", " (attentions): ModuleList(\n", " (0-1): 2 x Transformer2DModel(\n", " (norm): GroupNorm(32, 640, eps=1e-06, affine=True)\n", " (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))\n", " (transformer_blocks): ModuleList(\n", " (0-1): 2 x BasicTransformerBlock(\n", " (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)\n", " (attn1): Attention(\n", " (to_q): Linear(in_features=640, out_features=640, bias=False)\n", " (to_k): Linear(in_features=640, out_features=640, bias=False)\n", " (to_v): Linear(in_features=640, out_features=640, bias=False)\n", " (to_out): ModuleList(\n", " (0): Linear(in_features=640, out_features=640, bias=True)\n", " (1): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)\n", " (attn2): Attention(\n", " (to_q): Linear(in_features=640, out_features=640, bias=False)\n", " (to_k): Linear(in_features=768, out_features=640, bias=False)\n", " (to_v): Linear(in_features=768, out_features=640, bias=False)\n", " (to_out): ModuleList(\n", " (0): Linear(in_features=640, out_features=640, bias=True)\n", " (1): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)\n", " (ff): FeedForward(\n", " (net): ModuleList(\n", " (0): GEGLU(\n", " (proj): Linear(in_features=640, out_features=5120, bias=True)\n", " )\n", " (1): Dropout(p=0.0, inplace=False)\n", " (2): Linear(in_features=2560, out_features=640, bias=True)\n", " )\n", " )\n", " )\n", " )\n", " (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " )\n", " (resnets): ModuleList(\n", " (0): ResnetBlock2D(\n", " (norm1): GroupNorm(32, 320, eps=1e-05, affine=True)\n", " (conv1): Conv2d(320, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (time_emb_proj): Linear(in_features=1280, out_features=640, bias=True)\n", " (norm2): GroupNorm(32, 640, eps=1e-05, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nonlinearity): SiLU()\n", " (conv_shortcut): Conv2d(320, 640, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " (1): ResnetBlock2D(\n", " (norm1): GroupNorm(32, 640, eps=1e-05, affine=True)\n", " (conv1): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (time_emb_proj): Linear(in_features=1280, out_features=640, bias=True)\n", " (norm2): GroupNorm(32, 640, eps=1e-05, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nonlinearity): SiLU()\n", " )\n", " )\n", " (downsamplers): ModuleList(\n", " (0): Downsample2D(\n", " (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))\n", " )\n", " )\n", " )\n", " (2): CrossAttnDownBlock2D(\n", " (attentions): ModuleList(\n", " (0-1): 2 x Transformer2DModel(\n", " (norm): GroupNorm(32, 1280, eps=1e-06, affine=True)\n", " (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))\n", " (transformer_blocks): ModuleList(\n", " (0-2): 3 x BasicTransformerBlock(\n", " (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n", " (attn1): Attention(\n", " (to_q): Linear(in_features=1280, out_features=1280, bias=False)\n", " (to_k): Linear(in_features=1280, out_features=1280, bias=False)\n", " (to_v): Linear(in_features=1280, out_features=1280, bias=False)\n", " (to_out): ModuleList(\n", " (0): Linear(in_features=1280, out_features=1280, bias=True)\n", " (1): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n", " (attn2): Attention(\n", " (to_q): Linear(in_features=1280, out_features=1280, bias=False)\n", " (to_k): Linear(in_features=768, out_features=1280, bias=False)\n", " (to_v): Linear(in_features=768, out_features=1280, bias=False)\n", " (to_out): ModuleList(\n", " (0): Linear(in_features=1280, out_features=1280, bias=True)\n", " (1): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n", " (ff): FeedForward(\n", " (net): ModuleList(\n", " (0): GEGLU(\n", " (proj): Linear(in_features=1280, out_features=10240, bias=True)\n", " )\n", " (1): Dropout(p=0.0, inplace=False)\n", " (2): Linear(in_features=5120, out_features=1280, bias=True)\n", " )\n", " )\n", " )\n", " )\n", " (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " )\n", " (resnets): ModuleList(\n", " (0): ResnetBlock2D(\n", " (norm1): GroupNorm(32, 640, eps=1e-05, affine=True)\n", " (conv1): Conv2d(640, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True)\n", " (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nonlinearity): SiLU()\n", " (conv_shortcut): Conv2d(640, 1280, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " (1): ResnetBlock2D(\n", " (norm1): GroupNorm(32, 1280, eps=1e-05, affine=True)\n", " (conv1): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True)\n", " (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nonlinearity): SiLU()\n", " )\n", " )\n", " (downsamplers): ModuleList(\n", " (0): Downsample2D(\n", " (conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))\n", " )\n", " )\n", " )\n", " (3): DownBlock2D(\n", " (resnets): ModuleList(\n", " (0-1): 2 x ResnetBlock2D(\n", " (norm1): GroupNorm(32, 1280, eps=1e-05, affine=True)\n", " (conv1): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True)\n", " (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nonlinearity): SiLU()\n", " )\n", " )\n", " )\n", " )\n", " (up_blocks): ModuleList(\n", " (0): UpBlock2D(\n", " (resnets): ModuleList(\n", " (0-2): 3 x ResnetBlock2D(\n", " (norm1): GroupNorm(32, 2560, eps=1e-05, affine=True)\n", " (conv1): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True)\n", " (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nonlinearity): SiLU()\n", " (conv_shortcut): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " )\n", " (upsamplers): ModuleList(\n", " (0): Upsample2D(\n", " (conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " )\n", " )\n", " )\n", " (1): CrossAttnUpBlock2D(\n", " (attentions): ModuleList(\n", " (0-2): 3 x Transformer2DModel(\n", " (norm): GroupNorm(32, 1280, eps=1e-06, affine=True)\n", " (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))\n", " (transformer_blocks): ModuleList(\n", " (0-2): 3 x BasicTransformerBlock(\n", " (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n", " (attn1): Attention(\n", " (to_q): Linear(in_features=1280, out_features=1280, bias=False)\n", " (to_k): Linear(in_features=1280, out_features=1280, bias=False)\n", " (to_v): Linear(in_features=1280, out_features=1280, bias=False)\n", " (to_out): ModuleList(\n", " (0): Linear(in_features=1280, out_features=1280, bias=True)\n", " (1): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n", " (attn2): Attention(\n", " (to_q): Linear(in_features=1280, out_features=1280, bias=False)\n", " (to_k): Linear(in_features=768, out_features=1280, bias=False)\n", " (to_v): Linear(in_features=768, out_features=1280, bias=False)\n", " (to_out): ModuleList(\n", " (0): Linear(in_features=1280, out_features=1280, bias=True)\n", " (1): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n", " (ff): FeedForward(\n", " (net): ModuleList(\n", " (0): GEGLU(\n", " (proj): Linear(in_features=1280, out_features=10240, bias=True)\n", " )\n", " (1): Dropout(p=0.0, inplace=False)\n", " (2): Linear(in_features=5120, out_features=1280, bias=True)\n", " )\n", " )\n", " )\n", " )\n", " (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " )\n", " (resnets): ModuleList(\n", " (0-1): 2 x ResnetBlock2D(\n", " (norm1): GroupNorm(32, 2560, eps=1e-05, affine=True)\n", " (conv1): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True)\n", " (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nonlinearity): SiLU()\n", " (conv_shortcut): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " (2): ResnetBlock2D(\n", " (norm1): GroupNorm(32, 1920, eps=1e-05, affine=True)\n", " (conv1): Conv2d(1920, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True)\n", " (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nonlinearity): SiLU()\n", " (conv_shortcut): Conv2d(1920, 1280, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " )\n", " (upsamplers): ModuleList(\n", " (0): Upsample2D(\n", " (conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " )\n", " )\n", " )\n", " (2): CrossAttnUpBlock2D(\n", " (attentions): ModuleList(\n", " (0-2): 3 x Transformer2DModel(\n", " (norm): GroupNorm(32, 640, eps=1e-06, affine=True)\n", " (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))\n", " (transformer_blocks): ModuleList(\n", " (0-1): 2 x BasicTransformerBlock(\n", " (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)\n", " (attn1): Attention(\n", " (to_q): Linear(in_features=640, out_features=640, bias=False)\n", " (to_k): Linear(in_features=640, out_features=640, bias=False)\n", " (to_v): Linear(in_features=640, out_features=640, bias=False)\n", " (to_out): ModuleList(\n", " (0): Linear(in_features=640, out_features=640, bias=True)\n", " (1): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)\n", " (attn2): Attention(\n", " (to_q): Linear(in_features=640, out_features=640, bias=False)\n", " (to_k): Linear(in_features=768, out_features=640, bias=False)\n", " (to_v): Linear(in_features=768, out_features=640, bias=False)\n", " (to_out): ModuleList(\n", " (0): Linear(in_features=640, out_features=640, bias=True)\n", " (1): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)\n", " (ff): FeedForward(\n", " (net): ModuleList(\n", " (0): GEGLU(\n", " (proj): Linear(in_features=640, out_features=5120, bias=True)\n", " )\n", " (1): Dropout(p=0.0, inplace=False)\n", " (2): Linear(in_features=2560, out_features=640, bias=True)\n", " )\n", " )\n", " )\n", " )\n", " (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " )\n", " (resnets): ModuleList(\n", " (0): ResnetBlock2D(\n", " (norm1): GroupNorm(32, 1920, eps=1e-05, affine=True)\n", " (conv1): Conv2d(1920, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (time_emb_proj): Linear(in_features=1280, out_features=640, bias=True)\n", " (norm2): GroupNorm(32, 640, eps=1e-05, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nonlinearity): SiLU()\n", " (conv_shortcut): Conv2d(1920, 640, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " (1): ResnetBlock2D(\n", " (norm1): GroupNorm(32, 1280, eps=1e-05, affine=True)\n", " (conv1): Conv2d(1280, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (time_emb_proj): Linear(in_features=1280, out_features=640, bias=True)\n", " (norm2): GroupNorm(32, 640, eps=1e-05, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nonlinearity): SiLU()\n", " (conv_shortcut): Conv2d(1280, 640, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " (2): ResnetBlock2D(\n", " (norm1): GroupNorm(32, 960, eps=1e-05, affine=True)\n", " (conv1): Conv2d(960, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (time_emb_proj): Linear(in_features=1280, out_features=640, bias=True)\n", " (norm2): GroupNorm(32, 640, eps=1e-05, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nonlinearity): SiLU()\n", " (conv_shortcut): Conv2d(960, 640, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " )\n", " (upsamplers): ModuleList(\n", " (0): Upsample2D(\n", " (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " )\n", " )\n", " )\n", " (3): CrossAttnUpBlock2D(\n", " (attentions): ModuleList(\n", " (0-2): 3 x Transformer2DModel(\n", " (norm): GroupNorm(32, 320, eps=1e-06, affine=True)\n", " (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))\n", " (transformer_blocks): ModuleList(\n", " (0-1): 2 x BasicTransformerBlock(\n", " (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)\n", " (attn1): Attention(\n", " (to_q): Linear(in_features=320, out_features=320, bias=False)\n", " (to_k): Linear(in_features=320, out_features=320, bias=False)\n", " (to_v): Linear(in_features=320, out_features=320, bias=False)\n", " (to_out): ModuleList(\n", " (0): Linear(in_features=320, out_features=320, bias=True)\n", " (1): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True)\n", " (attn2): Attention(\n", " (to_q): Linear(in_features=320, out_features=320, bias=False)\n", " (to_k): Linear(in_features=768, out_features=320, bias=False)\n", " (to_v): Linear(in_features=768, out_features=320, bias=False)\n", " (to_out): ModuleList(\n", " (0): Linear(in_features=320, out_features=320, bias=True)\n", " (1): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True)\n", " (ff): FeedForward(\n", " (net): ModuleList(\n", " (0): GEGLU(\n", " (proj): Linear(in_features=320, out_features=2560, bias=True)\n", " )\n", " (1): Dropout(p=0.0, inplace=False)\n", " (2): Linear(in_features=1280, out_features=320, bias=True)\n", " )\n", " )\n", " )\n", " )\n", " (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " )\n", " (resnets): ModuleList(\n", " (0): ResnetBlock2D(\n", " (norm1): GroupNorm(32, 960, eps=1e-05, affine=True)\n", " (conv1): Conv2d(960, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (time_emb_proj): Linear(in_features=1280, out_features=320, bias=True)\n", " (norm2): GroupNorm(32, 320, eps=1e-05, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nonlinearity): SiLU()\n", " (conv_shortcut): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " (1-2): 2 x ResnetBlock2D(\n", " (norm1): GroupNorm(32, 640, eps=1e-05, affine=True)\n", " (conv1): Conv2d(640, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (time_emb_proj): Linear(in_features=1280, out_features=320, bias=True)\n", " (norm2): GroupNorm(32, 320, eps=1e-05, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nonlinearity): SiLU()\n", " (conv_shortcut): Conv2d(640, 320, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " )\n", " )\n", " )\n", " (mid_block): UNetMidBlock2DCrossAttn(\n", " (attentions): ModuleList(\n", " (0): Transformer2DModel(\n", " (norm): GroupNorm(32, 1280, eps=1e-06, affine=True)\n", " (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))\n", " (transformer_blocks): ModuleList(\n", " (0-2): 3 x BasicTransformerBlock(\n", " (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n", " (attn1): Attention(\n", " (to_q): Linear(in_features=1280, out_features=1280, bias=False)\n", " (to_k): Linear(in_features=1280, out_features=1280, bias=False)\n", " (to_v): Linear(in_features=1280, out_features=1280, bias=False)\n", " (to_out): ModuleList(\n", " (0): Linear(in_features=1280, out_features=1280, bias=True)\n", " (1): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n", " (attn2): Attention(\n", " (to_q): Linear(in_features=1280, out_features=1280, bias=False)\n", " (to_k): Linear(in_features=768, out_features=1280, bias=False)\n", " (to_v): Linear(in_features=768, out_features=1280, bias=False)\n", " (to_out): ModuleList(\n", " (0): Linear(in_features=1280, out_features=1280, bias=True)\n", " (1): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n", " (ff): FeedForward(\n", " (net): ModuleList(\n", " (0): GEGLU(\n", " (proj): Linear(in_features=1280, out_features=10240, bias=True)\n", " )\n", " (1): Dropout(p=0.0, inplace=False)\n", " (2): Linear(in_features=5120, out_features=1280, bias=True)\n", " )\n", " )\n", " )\n", " )\n", " (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " )\n", " (resnets): ModuleList(\n", " (0-1): 2 x ResnetBlock2D(\n", " (norm1): GroupNorm(32, 1280, eps=1e-05, affine=True)\n", " (conv1): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True)\n", " (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nonlinearity): SiLU()\n", " )\n", " )\n", " )\n", " (conv_norm_out): GroupNorm(32, 320, eps=1e-05, affine=True)\n", " (conv_act): SiLU()\n", " (conv_out): Conv2d(320, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", ")\n" ] } ], "source": [ "import torch\n", "from diffusers import UNet2DConditionModel\n", "\n", "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", "dtype = torch.float16 if torch.cuda.is_available() else torch.float32\n", "\n", "pipe_id = \"unet\"\n", "unet = UNet2DConditionModel.from_pretrained(\n", " pipe_id,\n", " torch_dtype=dtype,\n", ").to(device)\n", "\n", "unet.save_pretrained(pipe_id)\n", "print(unet)" ] }, { "cell_type": "code", "execution_count": 1, "id": "188279e8-f420-425a-9a29-01d5b60f8383", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "None\n", "None\n", "Модель успешно проапгрейжена! Можно дообучать.\n" ] } ], "source": [ "import torch\n", "from diffusers import UNet2DConditionModel\n", "\n", "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", "dtype = torch.float16 if torch.cuda.is_available() else torch.float32\n", "\n", "pipe_id = \"unet_simple\"\n", "old_unet = UNet2DConditionModel.from_pretrained(\n", " pipe_id,\n", " torch_dtype=dtype,\n", ").to(device)\n", "print(old_unet.config.addition_embed_type)\n", "\n", "# 2. Создаем новую конфигурацию (добавляем text_time)\n", "new_config = old_unet.config\n", "new_config.addition_embed_type = \"text_time\"\n", "new_config.addition_time_embed_dim = 1024 # Размер вашего пулинга\n", "\n", "# 3. Инициализируем новую (пустую) модель\n", "new_unet = UNet2DConditionModel.from_config(new_config)\n", "\n", "# 4. Переносим веса\n", "new_state_dict = new_unet.state_dict()\n", "old_state_dict = old_unet.state_dict()\n", "\n", "for name, param in old_state_dict.items():\n", " if \"time_embedding.linear_1.weight\" in name:\n", " # МАГИЯ ХИРУРГИИ\n", " # param (старый вес) имеет форму [Out, 256]\n", " # new_unet...weight имеет форму [Out, 256 + 1024]\n", " \n", " # 1. Берем веса новой модели (там сейчас мусор/рандом)\n", " new_w = new_state_dict[name]\n", " \n", " # 2. Зануляем ту часть, которая отвечает за новый пулинг (это важно!)\n", " # Предположим, что время идет первым, а пулинг вторым (зависит от реализации concat)\n", " # Обычно concat([time, pool]), значит time занимает первые индексы\n", " time_dim = param.shape[1] \n", " \n", " new_w[:, :time_dim] = param # Копируем старое знание\n", " new_w[:, time_dim:] = 0 # Новые связи делаем \"неактивными\" на старте\n", " \n", " new_state_dict[name] = new_w\n", " \n", " elif \"time_embedding.linear_1.bias\" in name:\n", " # Биас просто копируем, так как выходной размер слоя не меняется\n", " new_state_dict[name] = param\n", " \n", " elif name in new_state_dict:\n", " # Все остальные слои просто копируем\n", " if new_state_dict[name].shape == param.shape:\n", " new_state_dict[name] = param\n", " else:\n", " print(f\"Внимание: несовпадение форм для {name}, пропускаем\")\n", "\n", "# 5. Загружаем собранный словарь\n", "new_unet.load_state_dict(new_state_dict)\n", "print(new_unet.config.addition_embed_type)\n", "new_unet.save_pretrained(\"unet\")\n", "print(\"Модель успешно проапгрейжена! Можно дообучать.\")" ] }, { "cell_type": "code", "execution_count": 1, "id": "7c3461ac-8544-4df4-8ab6-e54b239ca4e5", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "239e51a54c2343a4ba2f73a6686f725c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "tokenizer_config.json: 0.00B [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "0bcbbdde09184b5c8b7b0b31d5899565", "version_major": 2, "version_minor": 0 }, "text/plain": [ "vocab.json: 0.00B [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "3d4ef94499174412b8a1e1a6b0884a03", "version_major": 2, "version_minor": 0 }, "text/plain": [ "merges.txt: 0.00B [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c98c9a3b005443209f2839682001028b", "version_major": 2, "version_minor": 0 }, "text/plain": [ "tokenizer.json: 0%| | 0.00/11.4M [00:00', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={\n", "\t151643: AddedToken(\"<|endoftext|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t151644: AddedToken(\"<|im_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t151645: AddedToken(\"<|im_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t151646: AddedToken(\"<|object_ref_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t151647: AddedToken(\"<|object_ref_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t151648: AddedToken(\"<|box_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t151649: AddedToken(\"<|box_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t151650: AddedToken(\"<|quad_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t151651: AddedToken(\"<|quad_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t151652: AddedToken(\"<|vision_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t151653: AddedToken(\"<|vision_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t151654: AddedToken(\"<|vision_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t151655: AddedToken(\"<|image_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t151656: AddedToken(\"<|video_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t151657: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t151658: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t151659: AddedToken(\"<|fim_prefix|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t151660: AddedToken(\"<|fim_middle|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t151661: AddedToken(\"<|fim_suffix|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t151662: AddedToken(\"<|fim_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t151663: AddedToken(\"<|repo_name|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t151664: AddedToken(\"<|file_sep|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t151665: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t151666: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t151667: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t151668: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "}\n", ")\n", "saved\n" ] } ], "source": [ "from transformers import AutoTokenizer, AutoModel\n", "import torch\n", "\n", "device=\"cuda\"\n", "dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32\n", "\n", "\n", "model=\"Qwen/Qwen3-VL-2B-Instruct\"\n", "tokenizer = AutoTokenizer.from_pretrained(model)\n", "text_model = AutoModel.from_pretrained(model,torch_dtype=dtype).to(device).eval()\n", "\n", "print(text_model)\n", "print(tokenizer)\n", "tokenizer.save_pretrained(\"tokenizer\")\n", "text_model.save_pretrained(\"text_encoder\")\n", "print('saved')\n" ] }, { "cell_type": "code", "execution_count": 3, "id": "638c946a-fd68-4bde-ae87-263ef5ea8679", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1. Загружаем токенизатор и базовый конфиг...\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "85ccbdedb64049719784bdb1c35a5ab7", "version_major": 2, "version_minor": 0 }, "text/plain": [ "tokenizer_config.json: 0.00B [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "cc43a6865576408eb99676f101cc754f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "vocab.json: 0.00B [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "d83333a8404c4c82af5b55ed9500d232", "version_major": 2, "version_minor": 0 }, "text/plain": [ "merges.txt: 0.00B [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "f744e32d288f41bbb243ee90fef66000", "version_major": 2, "version_minor": 0 }, "text/plain": [ "tokenizer.json: 0.00B [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "2a4cf7d322ec45ada41bf2984081ea95", "version_major": 2, "version_minor": 0 }, "text/plain": [ "config.json: 0.00B [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "2. Собираем текстовый конфиг Qwen3...\n", "3. Загружаем веса VL модели (через базовый AutoModel)...\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a848fd2acf984f1da854b3833ea1429e", "version_major": 2, "version_minor": 0 }, "text/plain": [ "model.safetensors: 0%| | 0.00/4.26G [00:00 \u001b[39m\u001b[32m419\u001b[39m \u001b[43mhf_hub_download\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 420\u001b[39m \u001b[43m \u001b[49m\u001b[43mpath_or_repo_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 421\u001b[39m \u001b[43m \u001b[49m\u001b[43mfilenames\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 422\u001b[39m \u001b[43m \u001b[49m\u001b[43msubfolder\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43msubfolder\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[43m==\u001b[49m\u001b[43m \u001b[49m\u001b[32;43m0\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43msubfolder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 423\u001b[39m \u001b[43m \u001b[49m\u001b[43mrepo_type\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrepo_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 424\u001b[39m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 425\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 426\u001b[39m \u001b[43m \u001b[49m\u001b[43muser_agent\u001b[49m\u001b[43m=\u001b[49m\u001b[43muser_agent\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 427\u001b[39m \u001b[43m \u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m=\u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 428\u001b[39m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m=\u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 429\u001b[39m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 430\u001b[39m \u001b[43m \u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m=\u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 431\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 432\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n", "\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/huggingface_hub/utils/_validators.py:85\u001b[39m, in \u001b[36mvalidate_hf_hub_args.._inner_fn\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 84\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m arg_name \u001b[38;5;129;01min\u001b[39;00m [\u001b[33m\"\u001b[39m\u001b[33mrepo_id\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mfrom_id\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mto_id\u001b[39m\u001b[33m\"\u001b[39m]:\n\u001b[32m---> \u001b[39m\u001b[32m85\u001b[39m \u001b[43mvalidate_repo_id\u001b[49m\u001b[43m(\u001b[49m\u001b[43marg_value\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 87\u001b[39m kwargs = smoothly_deprecate_legacy_arguments(fn_name=fn.\u001b[34m__name__\u001b[39m, kwargs=kwargs)\n", "\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/huggingface_hub/utils/_validators.py:135\u001b[39m, in \u001b[36mvalidate_repo_id\u001b[39m\u001b[34m(repo_id)\u001b[39m\n\u001b[32m 134\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m REPO_ID_REGEX.match(repo_id):\n\u001b[32m--> \u001b[39m\u001b[32m135\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m HFValidationError(\n\u001b[32m 136\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mRepo id must use alphanumeric chars, \u001b[39m\u001b[33m'\u001b[39m\u001b[33m-\u001b[39m\u001b[33m'\u001b[39m\u001b[33m, \u001b[39m\u001b[33m'\u001b[39m\u001b[33m_\u001b[39m\u001b[33m'\u001b[39m\u001b[33m or \u001b[39m\u001b[33m'\u001b[39m\u001b[33m.\u001b[39m\u001b[33m'\u001b[39m\u001b[33m.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 137\u001b[39m \u001b[33m\"\u001b[39m\u001b[33m The name cannot start or end with \u001b[39m\u001b[33m'\u001b[39m\u001b[33m-\u001b[39m\u001b[33m'\u001b[39m\u001b[33m or \u001b[39m\u001b[33m'\u001b[39m\u001b[33m.\u001b[39m\u001b[33m'\u001b[39m\u001b[33m and the maximum length is 96:\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 138\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m \u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrepo_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33m.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 139\u001b[39m )\n\u001b[32m 141\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33m--\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m repo_id \u001b[38;5;129;01mor\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33m..\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m repo_id:\n", "\u001b[31mHFValidationError\u001b[39m: Repo id must use alphanumeric chars, '-', '_' or '.'. The name cannot start or end with '-' or '.' and the maximum length is 96: 'Qwen3VLForConditionalGeneration(\n (model): Qwen3VLModel(\n (visual): Qwen3VLVisionModel(\n (patch_embed): Qwen3VLVisionPatchEmbed(\n (proj): Conv3d(3, 1024, kernel_size=(2, 16, 16), stride=(2, 16, 16))\n )\n (pos_embed): Embedding(2304, 1024)\n (rotary_pos_emb): Qwen3VLVisionRotaryEmbedding()\n (blocks): ModuleList(\n (0-23): 24 x Qwen3VLVisionBlock(\n (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n (attn): Qwen3VLVisionAttention(\n (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n (proj): Linear(in_features=1024, out_features=1024, bias=True)\n )\n (mlp): Qwen3VLVisionMLP(\n (linear_fc1): Linear(in_features=1024, out_features=4096, bias=True)\n (linear_fc2): Linear(in_features=4096, out_features=1024, bias=True)\n (act_fn): GELUTanh()\n )\n )\n )\n (merger): Qwen3VLVisionPatchMerger(\n (norm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n (linear_fc1): Linear(in_features=4096, out_features=4096, bias=True)\n (act_fn): GELU(approximate='none')\n (linear_fc2): Linear(in_features=4096, out_features=2048, bias=True)\n )\n (deepstack_merger_list): ModuleList(\n (0-2): 3 x Qwen3VLVisionPatchMerger(\n (norm): LayerNorm((4096,), eps=1e-06, elementwise_affine=True)\n (linear_fc1): Linear(in_features=4096, out_features=4096, bias=True)\n (act_fn): GELU(approximate='none')\n (linear_fc2): Linear(in_features=4096, out_features=2048, bias=True)\n )\n )\n )\n (language_model): Qwen3VLTextModel(\n (embed_tokens): Embedding(151936, 2048)\n (layers): ModuleList(\n (0-27): 28 x Qwen3VLTextDecoderLayer(\n (self_attn): Qwen3VLTextAttention(\n (q_proj): Linear(in_features=2048, out_features=2048, bias=False)\n (k_proj): Linear(in_features=2048, out_features=1024, bias=False)\n (v_proj): Linear(in_features=2048, out_features=1024, bias=False)\n (o_proj): Linear(in_features=2048, out_features=2048, bias=False)\n (q_norm): Qwen3VLTextRMSNorm((128,), eps=1e-06)\n (k_norm): Qwen3VLTextRMSNorm((128,), eps=1e-06)\n )\n (mlp): Qwen3VLTextMLP(\n (gate_proj): Linear(in_features=2048, out_features=6144, bias=False)\n (up_proj): Linear(in_features=2048, out_features=6144, bias=False)\n (down_proj): Linear(in_features=6144, out_features=2048, bias=False)\n (act_fn): SiLUActivation()\n )\n (input_layernorm): Qwen3VLTextRMSNorm((2048,), eps=1e-06)\n (post_attention_layernorm): Qwen3VLTextRMSNorm((2048,), eps=1e-06)\n )\n )\n (norm): Qwen3VLTextRMSNorm((2048,), eps=1e-06)\n (rotary_emb): Qwen3VLTextRotaryEmbedding()\n )\n )\n (lm_head): Linear(in_features=2048, out_features=151936, bias=False)\n)'.", "\nThe above exception was the direct cause of the following exception:\n", "\u001b[31mOSError\u001b[39m Traceback (most recent call last)", "\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/transformers/models/auto/tokenization_auto.py:624\u001b[39m, in \u001b[36mAutoTokenizer.from_pretrained\u001b[39m\u001b[34m(cls, pretrained_model_name_or_path, *inputs, **kwargs)\u001b[39m\n\u001b[32m 623\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m624\u001b[39m config = \u001b[43mAutoConfig\u001b[49m\u001b[43m.\u001b[49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 625\u001b[39m \u001b[43m \u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrust_remote_code\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtrust_remote_code\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\n\u001b[32m 626\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 627\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n", "\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/transformers/models/auto/configuration_auto.py:1403\u001b[39m, in \u001b[36mAutoConfig.from_pretrained\u001b[39m\u001b[34m(cls, pretrained_model_name_or_path, **kwargs)\u001b[39m\n\u001b[32m 1401\u001b[39m code_revision = kwargs.pop(\u001b[33m\"\u001b[39m\u001b[33mcode_revision\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[32m-> \u001b[39m\u001b[32m1403\u001b[39m config_dict, unused_kwargs = \u001b[43mPreTrainedConfig\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget_config_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1404\u001b[39m has_remote_code = \u001b[33m\"\u001b[39m\u001b[33mauto_map\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m config_dict \u001b[38;5;129;01mand\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33mAutoConfig\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m config_dict[\u001b[33m\"\u001b[39m\u001b[33mauto_map\u001b[39m\u001b[33m\"\u001b[39m]\n", "\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/transformers/configuration_utils.py:572\u001b[39m, in \u001b[36mPreTrainedConfig.get_config_dict\u001b[39m\u001b[34m(cls, pretrained_model_name_or_path, **kwargs)\u001b[39m\n\u001b[32m 571\u001b[39m \u001b[38;5;66;03m# Get config dict associated with the base config file\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m572\u001b[39m config_dict, kwargs = \u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_get_config_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 573\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m config_dict \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", "\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/transformers/configuration_utils.py:627\u001b[39m, in \u001b[36mPreTrainedConfig._get_config_dict\u001b[39m\u001b[34m(cls, pretrained_model_name_or_path, **kwargs)\u001b[39m\n\u001b[32m 625\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 626\u001b[39m \u001b[38;5;66;03m# Load from local folder or from cache or download from model Hub and cache\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m627\u001b[39m resolved_config_file = \u001b[43mcached_file\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 628\u001b[39m \u001b[43m \u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 629\u001b[39m \u001b[43m \u001b[49m\u001b[43mconfiguration_file\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 630\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 631\u001b[39m \u001b[43m \u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m=\u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 632\u001b[39m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m=\u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 633\u001b[39m \u001b[43m \u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m=\u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 634\u001b[39m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 635\u001b[39m \u001b[43m \u001b[49m\u001b[43muser_agent\u001b[49m\u001b[43m=\u001b[49m\u001b[43muser_agent\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 636\u001b[39m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 637\u001b[39m \u001b[43m \u001b[49m\u001b[43msubfolder\u001b[49m\u001b[43m=\u001b[49m\u001b[43msubfolder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 638\u001b[39m \u001b[43m \u001b[49m\u001b[43m_commit_hash\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcommit_hash\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 639\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 640\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m resolved_config_file \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", "\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/transformers/utils/hub.py:276\u001b[39m, in \u001b[36mcached_file\u001b[39m\u001b[34m(path_or_repo_id, filename, **kwargs)\u001b[39m\n\u001b[32m 226\u001b[39m \u001b[38;5;250m\u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 227\u001b[39m \u001b[33;03mTries to locate a file in a local folder and repo, downloads and cache it if necessary.\u001b[39;00m\n\u001b[32m 228\u001b[39m \n\u001b[32m (...)\u001b[39m\u001b[32m 274\u001b[39m \u001b[33;03m```\u001b[39;00m\n\u001b[32m 275\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m276\u001b[39m file = \u001b[43mcached_files\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath_or_repo_id\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpath_or_repo_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfilenames\u001b[49m\u001b[43m=\u001b[49m\u001b[43m[\u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 277\u001b[39m file = file[\u001b[32m0\u001b[39m] \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m file\n", "\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/transformers/utils/hub.py:468\u001b[39m, in \u001b[36mcached_files\u001b[39m\u001b[34m(path_or_repo_id, filenames, cache_dir, force_download, proxies, token, revision, local_files_only, subfolder, repo_type, user_agent, _raise_exceptions_for_gated_repo, _raise_exceptions_for_missing_entries, _raise_exceptions_for_connection_errors, _commit_hash, **deprecated_kwargs)\u001b[39m\n\u001b[32m 467\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(e, \u001b[38;5;167;01mValueError\u001b[39;00m):\n\u001b[32m--> \u001b[39m\u001b[32m468\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01me\u001b[39;00m\n\u001b[32m 470\u001b[39m \u001b[38;5;66;03m# Now we try to recover if we can find all files correctly in the cache\u001b[39;00m\n", "\u001b[31mOSError\u001b[39m: Repo id must use alphanumeric chars, '-', '_' or '.'. The name cannot start or end with '-' or '.' and the maximum length is 96: 'Qwen3VLForConditionalGeneration(\n (model): Qwen3VLModel(\n (visual): Qwen3VLVisionModel(\n (patch_embed): Qwen3VLVisionPatchEmbed(\n (proj): Conv3d(3, 1024, kernel_size=(2, 16, 16), stride=(2, 16, 16))\n )\n (pos_embed): Embedding(2304, 1024)\n (rotary_pos_emb): Qwen3VLVisionRotaryEmbedding()\n (blocks): ModuleList(\n (0-23): 24 x Qwen3VLVisionBlock(\n (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n (attn): Qwen3VLVisionAttention(\n (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n (proj): Linear(in_features=1024, out_features=1024, bias=True)\n )\n (mlp): Qwen3VLVisionMLP(\n (linear_fc1): Linear(in_features=1024, out_features=4096, bias=True)\n (linear_fc2): Linear(in_features=4096, out_features=1024, bias=True)\n (act_fn): GELUTanh()\n )\n )\n )\n (merger): Qwen3VLVisionPatchMerger(\n (norm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n (linear_fc1): Linear(in_features=4096, out_features=4096, bias=True)\n (act_fn): GELU(approximate='none')\n (linear_fc2): Linear(in_features=4096, out_features=2048, bias=True)\n )\n (deepstack_merger_list): ModuleList(\n (0-2): 3 x Qwen3VLVisionPatchMerger(\n (norm): LayerNorm((4096,), eps=1e-06, elementwise_affine=True)\n (linear_fc1): Linear(in_features=4096, out_features=4096, bias=True)\n (act_fn): GELU(approximate='none')\n (linear_fc2): Linear(in_features=4096, out_features=2048, bias=True)\n )\n )\n )\n (language_model): Qwen3VLTextModel(\n (embed_tokens): Embedding(151936, 2048)\n (layers): ModuleList(\n (0-27): 28 x Qwen3VLTextDecoderLayer(\n (self_attn): Qwen3VLTextAttention(\n (q_proj): Linear(in_features=2048, out_features=2048, bias=False)\n (k_proj): Linear(in_features=2048, out_features=1024, bias=False)\n (v_proj): Linear(in_features=2048, out_features=1024, bias=False)\n (o_proj): Linear(in_features=2048, out_features=2048, bias=False)\n (q_norm): Qwen3VLTextRMSNorm((128,), eps=1e-06)\n (k_norm): Qwen3VLTextRMSNorm((128,), eps=1e-06)\n )\n (mlp): Qwen3VLTextMLP(\n (gate_proj): Linear(in_features=2048, out_features=6144, bias=False)\n (up_proj): Linear(in_features=2048, out_features=6144, bias=False)\n (down_proj): Linear(in_features=6144, out_features=2048, bias=False)\n (act_fn): SiLUActivation()\n )\n (input_layernorm): Qwen3VLTextRMSNorm((2048,), eps=1e-06)\n (post_attention_layernorm): Qwen3VLTextRMSNorm((2048,), eps=1e-06)\n )\n )\n (norm): Qwen3VLTextRMSNorm((2048,), eps=1e-06)\n (rotary_emb): Qwen3VLTextRotaryEmbedding()\n )\n )\n (lm_head): Linear(in_features=2048, out_features=151936, bias=False)\n)'.", "\nDuring handling of the above exception, another exception occurred:\n", "\u001b[31mHFValidationError\u001b[39m Traceback (most recent call last)", "\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/transformers/utils/hub.py:419\u001b[39m, in \u001b[36mcached_files\u001b[39m\u001b[34m(path_or_repo_id, filenames, cache_dir, force_download, proxies, token, revision, local_files_only, subfolder, repo_type, user_agent, _raise_exceptions_for_gated_repo, _raise_exceptions_for_missing_entries, _raise_exceptions_for_connection_errors, _commit_hash, **deprecated_kwargs)\u001b[39m\n\u001b[32m 417\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(full_filenames) == \u001b[32m1\u001b[39m:\n\u001b[32m 418\u001b[39m \u001b[38;5;66;03m# This is slightly better for only 1 file\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m419\u001b[39m \u001b[43mhf_hub_download\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 420\u001b[39m \u001b[43m \u001b[49m\u001b[43mpath_or_repo_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 421\u001b[39m \u001b[43m \u001b[49m\u001b[43mfilenames\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 422\u001b[39m \u001b[43m \u001b[49m\u001b[43msubfolder\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43msubfolder\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[43m==\u001b[49m\u001b[43m \u001b[49m\u001b[32;43m0\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43msubfolder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 423\u001b[39m \u001b[43m \u001b[49m\u001b[43mrepo_type\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrepo_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 424\u001b[39m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 425\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 426\u001b[39m \u001b[43m \u001b[49m\u001b[43muser_agent\u001b[49m\u001b[43m=\u001b[49m\u001b[43muser_agent\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 427\u001b[39m \u001b[43m \u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m=\u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 428\u001b[39m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m=\u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 429\u001b[39m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 430\u001b[39m \u001b[43m \u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m=\u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 431\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 432\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n", "\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/huggingface_hub/utils/_validators.py:85\u001b[39m, in \u001b[36mvalidate_hf_hub_args.._inner_fn\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 84\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m arg_name \u001b[38;5;129;01min\u001b[39;00m [\u001b[33m\"\u001b[39m\u001b[33mrepo_id\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mfrom_id\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mto_id\u001b[39m\u001b[33m\"\u001b[39m]:\n\u001b[32m---> \u001b[39m\u001b[32m85\u001b[39m \u001b[43mvalidate_repo_id\u001b[49m\u001b[43m(\u001b[49m\u001b[43marg_value\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 87\u001b[39m kwargs = smoothly_deprecate_legacy_arguments(fn_name=fn.\u001b[34m__name__\u001b[39m, kwargs=kwargs)\n", "\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/huggingface_hub/utils/_validators.py:135\u001b[39m, in \u001b[36mvalidate_repo_id\u001b[39m\u001b[34m(repo_id)\u001b[39m\n\u001b[32m 134\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m REPO_ID_REGEX.match(repo_id):\n\u001b[32m--> \u001b[39m\u001b[32m135\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m HFValidationError(\n\u001b[32m 136\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mRepo id must use alphanumeric chars, \u001b[39m\u001b[33m'\u001b[39m\u001b[33m-\u001b[39m\u001b[33m'\u001b[39m\u001b[33m, \u001b[39m\u001b[33m'\u001b[39m\u001b[33m_\u001b[39m\u001b[33m'\u001b[39m\u001b[33m or \u001b[39m\u001b[33m'\u001b[39m\u001b[33m.\u001b[39m\u001b[33m'\u001b[39m\u001b[33m.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 137\u001b[39m \u001b[33m\"\u001b[39m\u001b[33m The name cannot start or end with \u001b[39m\u001b[33m'\u001b[39m\u001b[33m-\u001b[39m\u001b[33m'\u001b[39m\u001b[33m or \u001b[39m\u001b[33m'\u001b[39m\u001b[33m.\u001b[39m\u001b[33m'\u001b[39m\u001b[33m and the maximum length is 96:\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 138\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m \u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrepo_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33m.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 139\u001b[39m )\n\u001b[32m 141\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33m--\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m repo_id \u001b[38;5;129;01mor\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33m..\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m repo_id:\n", "\u001b[31mHFValidationError\u001b[39m: Repo id must use alphanumeric chars, '-', '_' or '.'. The name cannot start or end with '-' or '.' and the maximum length is 96: 'Qwen3VLForConditionalGeneration(\n (model): Qwen3VLModel(\n (visual): Qwen3VLVisionModel(\n (patch_embed): Qwen3VLVisionPatchEmbed(\n (proj): Conv3d(3, 1024, kernel_size=(2, 16, 16), stride=(2, 16, 16))\n )\n (pos_embed): Embedding(2304, 1024)\n (rotary_pos_emb): Qwen3VLVisionRotaryEmbedding()\n (blocks): ModuleList(\n (0-23): 24 x Qwen3VLVisionBlock(\n (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n (attn): Qwen3VLVisionAttention(\n (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n (proj): Linear(in_features=1024, out_features=1024, bias=True)\n )\n (mlp): Qwen3VLVisionMLP(\n (linear_fc1): Linear(in_features=1024, out_features=4096, bias=True)\n (linear_fc2): Linear(in_features=4096, out_features=1024, bias=True)\n (act_fn): GELUTanh()\n )\n )\n )\n (merger): Qwen3VLVisionPatchMerger(\n (norm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n (linear_fc1): Linear(in_features=4096, out_features=4096, bias=True)\n (act_fn): GELU(approximate='none')\n (linear_fc2): Linear(in_features=4096, out_features=2048, bias=True)\n )\n (deepstack_merger_list): ModuleList(\n (0-2): 3 x Qwen3VLVisionPatchMerger(\n (norm): LayerNorm((4096,), eps=1e-06, elementwise_affine=True)\n (linear_fc1): Linear(in_features=4096, out_features=4096, bias=True)\n (act_fn): GELU(approximate='none')\n (linear_fc2): Linear(in_features=4096, out_features=2048, bias=True)\n )\n )\n )\n (language_model): Qwen3VLTextModel(\n (embed_tokens): Embedding(151936, 2048)\n (layers): ModuleList(\n (0-27): 28 x Qwen3VLTextDecoderLayer(\n (self_attn): Qwen3VLTextAttention(\n (q_proj): Linear(in_features=2048, out_features=2048, bias=False)\n (k_proj): Linear(in_features=2048, out_features=1024, bias=False)\n (v_proj): Linear(in_features=2048, out_features=1024, bias=False)\n (o_proj): Linear(in_features=2048, out_features=2048, bias=False)\n (q_norm): Qwen3VLTextRMSNorm((128,), eps=1e-06)\n (k_norm): Qwen3VLTextRMSNorm((128,), eps=1e-06)\n )\n (mlp): Qwen3VLTextMLP(\n (gate_proj): Linear(in_features=2048, out_features=6144, bias=False)\n (up_proj): Linear(in_features=2048, out_features=6144, bias=False)\n (down_proj): Linear(in_features=6144, out_features=2048, bias=False)\n (act_fn): SiLUActivation()\n )\n (input_layernorm): Qwen3VLTextRMSNorm((2048,), eps=1e-06)\n (post_attention_layernorm): Qwen3VLTextRMSNorm((2048,), eps=1e-06)\n )\n )\n (norm): Qwen3VLTextRMSNorm((2048,), eps=1e-06)\n (rotary_emb): Qwen3VLTextRotaryEmbedding()\n )\n )\n (lm_head): Linear(in_features=2048, out_features=151936, bias=False)\n)'.", "\nThe above exception was the direct cause of the following exception:\n", "\u001b[31mOSError\u001b[39m Traceback (most recent call last)", "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 17\u001b[39m\n\u001b[32m 8\u001b[39m dtype = torch.float16 \u001b[38;5;66;03m# ← вот fp16\u001b[39;00m\n\u001b[32m 10\u001b[39m model = Qwen3VLForConditionalGeneration.from_pretrained(\n\u001b[32m 11\u001b[39m model_id,\n\u001b[32m 12\u001b[39m torch_dtype=torch.float16,\n\u001b[32m 13\u001b[39m device_map=\u001b[33m\"\u001b[39m\u001b[33mauto\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 14\u001b[39m trust_remote_code=\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[32m 15\u001b[39m ).eval()\n\u001b[32m---> \u001b[39m\u001b[32m17\u001b[39m tokenizer = \u001b[43mAutoTokenizer\u001b[49m\u001b[43m.\u001b[49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 18\u001b[39m \u001b[38;5;66;03m#text_model = AutoModel.from_pretrained(model,torch_dtype=dtype).to(device).eval()\u001b[39;00m\n\u001b[32m 19\u001b[39m \n\u001b[32m 20\u001b[39m \u001b[38;5;66;03m#print(text_model)\u001b[39;00m\n\u001b[32m 21\u001b[39m \u001b[38;5;28mprint\u001b[39m(tokenizer)\n", "\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/transformers/models/auto/tokenization_auto.py:628\u001b[39m, in \u001b[36mAutoTokenizer.from_pretrained\u001b[39m\u001b[34m(cls, pretrained_model_name_or_path, *inputs, **kwargs)\u001b[39m\n\u001b[32m 624\u001b[39m config = AutoConfig.from_pretrained(\n\u001b[32m 625\u001b[39m pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs\n\u001b[32m 626\u001b[39m )\n\u001b[32m 627\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m628\u001b[39m config = \u001b[43mPreTrainedConfig\u001b[49m\u001b[43m.\u001b[49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 630\u001b[39m config_model_type = config.model_type\n\u001b[32m 632\u001b[39m \u001b[38;5;66;03m# Next, let's try to use the tokenizer_config file to get the tokenizer class.\u001b[39;00m\n", "\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/transformers/configuration_utils.py:531\u001b[39m, in \u001b[36mPreTrainedConfig.from_pretrained\u001b[39m\u001b[34m(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, **kwargs)\u001b[39m\n\u001b[32m 528\u001b[39m kwargs[\u001b[33m\"\u001b[39m\u001b[33mlocal_files_only\u001b[39m\u001b[33m\"\u001b[39m] = local_files_only\n\u001b[32m 529\u001b[39m kwargs[\u001b[33m\"\u001b[39m\u001b[33mrevision\u001b[39m\u001b[33m\"\u001b[39m] = revision\n\u001b[32m--> \u001b[39m\u001b[32m531\u001b[39m config_dict, kwargs = \u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mget_config_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 532\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mcls\u001b[39m.base_config_key \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mcls\u001b[39m.base_config_key \u001b[38;5;129;01min\u001b[39;00m config_dict:\n\u001b[32m 533\u001b[39m config_dict = config_dict[\u001b[38;5;28mcls\u001b[39m.base_config_key]\n", "\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/transformers/configuration_utils.py:572\u001b[39m, in \u001b[36mPreTrainedConfig.get_config_dict\u001b[39m\u001b[34m(cls, pretrained_model_name_or_path, **kwargs)\u001b[39m\n\u001b[32m 570\u001b[39m original_kwargs = copy.deepcopy(kwargs)\n\u001b[32m 571\u001b[39m \u001b[38;5;66;03m# Get config dict associated with the base config file\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m572\u001b[39m config_dict, kwargs = \u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_get_config_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 573\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m config_dict \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 574\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m {}, kwargs\n", "\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/transformers/configuration_utils.py:627\u001b[39m, in \u001b[36mPreTrainedConfig._get_config_dict\u001b[39m\u001b[34m(cls, pretrained_model_name_or_path, **kwargs)\u001b[39m\n\u001b[32m 623\u001b[39m configuration_file = kwargs.pop(\u001b[33m\"\u001b[39m\u001b[33m_configuration_file\u001b[39m\u001b[33m\"\u001b[39m, CONFIG_NAME) \u001b[38;5;28;01mif\u001b[39;00m gguf_file \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m gguf_file\n\u001b[32m 625\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 626\u001b[39m \u001b[38;5;66;03m# Load from local folder or from cache or download from model Hub and cache\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m627\u001b[39m resolved_config_file = \u001b[43mcached_file\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 628\u001b[39m \u001b[43m \u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 629\u001b[39m \u001b[43m \u001b[49m\u001b[43mconfiguration_file\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 630\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 631\u001b[39m \u001b[43m \u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m=\u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 632\u001b[39m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m=\u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 633\u001b[39m \u001b[43m \u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m=\u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 634\u001b[39m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 635\u001b[39m \u001b[43m \u001b[49m\u001b[43muser_agent\u001b[49m\u001b[43m=\u001b[49m\u001b[43muser_agent\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 636\u001b[39m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 637\u001b[39m \u001b[43m \u001b[49m\u001b[43msubfolder\u001b[49m\u001b[43m=\u001b[49m\u001b[43msubfolder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 638\u001b[39m \u001b[43m \u001b[49m\u001b[43m_commit_hash\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcommit_hash\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 639\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 640\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m resolved_config_file \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 641\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m, kwargs\n", "\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/transformers/utils/hub.py:276\u001b[39m, in \u001b[36mcached_file\u001b[39m\u001b[34m(path_or_repo_id, filename, **kwargs)\u001b[39m\n\u001b[32m 221\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mcached_file\u001b[39m(\n\u001b[32m 222\u001b[39m path_or_repo_id: \u001b[38;5;28mstr\u001b[39m | os.PathLike,\n\u001b[32m 223\u001b[39m filename: \u001b[38;5;28mstr\u001b[39m,\n\u001b[32m 224\u001b[39m **kwargs,\n\u001b[32m 225\u001b[39m ) -> \u001b[38;5;28mstr\u001b[39m | \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 226\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 227\u001b[39m \u001b[33;03m Tries to locate a file in a local folder and repo, downloads and cache it if necessary.\u001b[39;00m\n\u001b[32m 228\u001b[39m \n\u001b[32m (...)\u001b[39m\u001b[32m 274\u001b[39m \u001b[33;03m ```\u001b[39;00m\n\u001b[32m 275\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m276\u001b[39m file = \u001b[43mcached_files\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath_or_repo_id\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpath_or_repo_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfilenames\u001b[49m\u001b[43m=\u001b[49m\u001b[43m[\u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 277\u001b[39m file = file[\u001b[32m0\u001b[39m] \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m file\n\u001b[32m 278\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m file\n", "\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/transformers/utils/hub.py:468\u001b[39m, in \u001b[36mcached_files\u001b[39m\u001b[34m(path_or_repo_id, filenames, cache_dir, force_download, proxies, token, revision, local_files_only, subfolder, repo_type, user_agent, _raise_exceptions_for_gated_repo, _raise_exceptions_for_missing_entries, _raise_exceptions_for_connection_errors, _commit_hash, **deprecated_kwargs)\u001b[39m\n\u001b[32m 462\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m(\n\u001b[32m 463\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mPermissionError at \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me.filename\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m when downloading \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath_or_repo_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m. \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 464\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mCheck cache directory permissions. Common causes: 1) another user is downloading the same model (please wait); \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 465\u001b[39m \u001b[33m\"\u001b[39m\u001b[33m2) a previous download was canceled and the lock file needs manual removal.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 466\u001b[39m ) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01me\u001b[39;00m\n\u001b[32m 467\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(e, \u001b[38;5;167;01mValueError\u001b[39;00m):\n\u001b[32m--> \u001b[39m\u001b[32m468\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01me\u001b[39;00m\n\u001b[32m 470\u001b[39m \u001b[38;5;66;03m# Now we try to recover if we can find all files correctly in the cache\u001b[39;00m\n\u001b[32m 471\u001b[39m resolved_files = [\n\u001b[32m 472\u001b[39m _get_cache_file_to_return(path_or_repo_id, filename, cache_dir, revision, repo_type)\n\u001b[32m 473\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m filename \u001b[38;5;129;01min\u001b[39;00m full_filenames\n\u001b[32m 474\u001b[39m ]\n", "\u001b[31mOSError\u001b[39m: Repo id must use alphanumeric chars, '-', '_' or '.'. The name cannot start or end with '-' or '.' and the maximum length is 96: 'Qwen3VLForConditionalGeneration(\n (model): Qwen3VLModel(\n (visual): Qwen3VLVisionModel(\n (patch_embed): Qwen3VLVisionPatchEmbed(\n (proj): Conv3d(3, 1024, kernel_size=(2, 16, 16), stride=(2, 16, 16))\n )\n (pos_embed): Embedding(2304, 1024)\n (rotary_pos_emb): Qwen3VLVisionRotaryEmbedding()\n (blocks): ModuleList(\n (0-23): 24 x Qwen3VLVisionBlock(\n (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n (attn): Qwen3VLVisionAttention(\n (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n (proj): Linear(in_features=1024, out_features=1024, bias=True)\n )\n (mlp): Qwen3VLVisionMLP(\n (linear_fc1): Linear(in_features=1024, out_features=4096, bias=True)\n (linear_fc2): Linear(in_features=4096, out_features=1024, bias=True)\n (act_fn): GELUTanh()\n )\n )\n )\n (merger): Qwen3VLVisionPatchMerger(\n (norm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n (linear_fc1): Linear(in_features=4096, out_features=4096, bias=True)\n (act_fn): GELU(approximate='none')\n (linear_fc2): Linear(in_features=4096, out_features=2048, bias=True)\n )\n (deepstack_merger_list): ModuleList(\n (0-2): 3 x Qwen3VLVisionPatchMerger(\n (norm): LayerNorm((4096,), eps=1e-06, elementwise_affine=True)\n (linear_fc1): Linear(in_features=4096, out_features=4096, bias=True)\n (act_fn): GELU(approximate='none')\n (linear_fc2): Linear(in_features=4096, out_features=2048, bias=True)\n )\n )\n )\n (language_model): Qwen3VLTextModel(\n (embed_tokens): Embedding(151936, 2048)\n (layers): ModuleList(\n (0-27): 28 x Qwen3VLTextDecoderLayer(\n (self_attn): Qwen3VLTextAttention(\n (q_proj): Linear(in_features=2048, out_features=2048, bias=False)\n (k_proj): Linear(in_features=2048, out_features=1024, bias=False)\n (v_proj): Linear(in_features=2048, out_features=1024, bias=False)\n (o_proj): Linear(in_features=2048, out_features=2048, bias=False)\n (q_norm): Qwen3VLTextRMSNorm((128,), eps=1e-06)\n (k_norm): Qwen3VLTextRMSNorm((128,), eps=1e-06)\n )\n (mlp): Qwen3VLTextMLP(\n (gate_proj): Linear(in_features=2048, out_features=6144, bias=False)\n (up_proj): Linear(in_features=2048, out_features=6144, bias=False)\n (down_proj): Linear(in_features=6144, out_features=2048, bias=False)\n (act_fn): SiLUActivation()\n )\n (input_layernorm): Qwen3VLTextRMSNorm((2048,), eps=1e-06)\n (post_attention_layernorm): Qwen3VLTextRMSNorm((2048,), eps=1e-06)\n )\n )\n (norm): Qwen3VLTextRMSNorm((2048,), eps=1e-06)\n (rotary_emb): Qwen3VLTextRotaryEmbedding()\n )\n )\n (lm_head): Linear(in_features=2048, out_features=151936, bias=False)\n)'." ] } ], "source": [ "from transformers import AutoTokenizer, AutoProcessor, Qwen3VLForConditionalGeneration\n", "import torch\n", "import os\n", "\n", "model_id = \"prithivMLmods/Qwen3-VL-2B-Instruct-abliterated-v1\" # куда сохраним\n", "\n", "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", "dtype = torch.float16 # ← вот fp16\n", "\n", "model = Qwen3VLForConditionalGeneration.from_pretrained(\n", " model_id,\n", " torch_dtype=torch.float16,\n", " device_map=\"auto\",\n", " trust_remote_code=True,\n", ").eval()\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model)\n", "#text_model = AutoModel.from_pretrained(model,torch_dtype=dtype).to(device).eval()\n", "\n", "#print(text_model)\n", "print(tokenizer)\n", "tokenizer.save_pretrained(\"tokenizer2\")\n", "#text_model.save_pretrained(\"text_encoder\")\n", "print('saved')" ] }, { "cell_type": "code", "execution_count": 4, "id": "1c99afb4-6e32-4503-a812-7ff1355654ba", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Qwen2Tokenizer(name_or_path='tokenizer2', vocab_size=151643, model_max_length=262144, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>'}, added_tokens_decoder={\n", "\t151643: AddedToken(\"<|endoftext|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t151644: AddedToken(\"<|im_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t151645: AddedToken(\"<|im_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t151646: AddedToken(\"<|object_ref_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t151647: AddedToken(\"<|object_ref_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t151648: AddedToken(\"<|box_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t151649: AddedToken(\"<|box_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t151650: AddedToken(\"<|quad_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t151651: AddedToken(\"<|quad_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t151652: AddedToken(\"<|vision_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t151653: AddedToken(\"<|vision_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t151654: AddedToken(\"<|vision_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t151655: AddedToken(\"<|image_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t151656: AddedToken(\"<|video_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t151657: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t151658: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t151659: AddedToken(\"<|fim_prefix|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t151660: AddedToken(\"<|fim_middle|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t151661: AddedToken(\"<|fim_suffix|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t151662: AddedToken(\"<|fim_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t151663: AddedToken(\"<|repo_name|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t151664: AddedToken(\"<|file_sep|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t151665: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t151666: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t151667: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t151668: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "}\n", ")\n" ] } ], "source": [ "from transformers import AutoTokenizer, AutoProcessor, Qwen3VLForConditionalGeneration\n", "import torch\n", "import os\n", "\n", "model_id = \"prithivMLmods/Qwen3-VL-2B-Instruct-abliterated-v1\" # куда сохраним\n", "\n", "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", "dtype = torch.float16 # ← вот fp16\n", "\n", "\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(\"tokenizer2\")\n", "#text_model = AutoModel.from_pretrained(model,torch_dtype=dtype).to(device).eval()\n", "\n", "#print(text_model)\n", "print(tokenizer)" ] }, { "cell_type": "code", "execution_count": 1, "id": "f51b1fd8-8c75-42a7-9ce0-a4462504914c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Загрузка полной VL-модели: prithivMLmods/Qwen3-VL-2B-Instruct-abliterated-v1 (в fp16)\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "40b02267c9f74d49b7fbfc7245fd62c1", "version_major": 2, "version_minor": 0 }, "text/plain": [ "config.json: 0.00B [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "2292407b10644adda069e801e4ad4022", "version_major": 2, "version_minor": 0 }, "text/plain": [ "model.safetensors.index.json: 0.00B [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "bec1f20865074c4c8d3b630ed82ea9be", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (incomplete total...): 0.00B [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e98c43c68dba4ca697255630e648ead3", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Fetching 5 files: 0%| | 0/5 [00:00 \u001b[39m\u001b[32m24\u001b[39m text_model = \u001b[43mvl_model\u001b[49m\u001b[43m.\u001b[49m\u001b[43mlanguage_model\u001b[49m \n\u001b[32m 26\u001b[39m \u001b[38;5;66;03m# Сохраняем ТОЛЬКО извлеченную текстовую модель\u001b[39;00m\n\u001b[32m 27\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mСохранение текстовой модели в: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msave_dir\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n", "\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/torch/nn/modules/module.py:1965\u001b[39m, in \u001b[36mModule.__getattr__\u001b[39m\u001b[34m(self, name)\u001b[39m\n\u001b[32m 1963\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m modules:\n\u001b[32m 1964\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m modules[name]\n\u001b[32m-> \u001b[39m\u001b[32m1965\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mAttributeError\u001b[39;00m(\n\u001b[32m 1966\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m).\u001b[34m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33m object has no attribute \u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 1967\u001b[39m )\n", "\u001b[31mAttributeError\u001b[39m: 'Qwen3VLForConditionalGeneration' object has no attribute 'language_model'" ] } ], "source": [ "\n", "from transformers import AutoTokenizer, Qwen3VLForConditionalGeneration\n", "import torch\n", "import os\n", "\n", "model_id = \"prithivMLmods/Qwen3-VL-2B-Instruct-abliterated-v1\"\n", "save_dir = \"text_encoder2\" # Ваша папка для пайплайна\n", "\n", "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", "dtype = torch.float16\n", "\n", "print(f\"Загрузка полной VL-модели: {model_id} (в fp16)\")\n", "vl_model = Qwen3VLForConditionalGeneration.from_pretrained(\n", " model_id,\n", " torch_dtype=dtype,\n", " device_map=\"auto\",\n", " trust_remote_code=True,\n", " low_cpu_mem_usage=True,\n", ")\n", "\n", "# ========================================================\n", "# 🛑 ГЛАВНОЕ ИСПРАВЛЕНИЕ: ИЗВЛЕКАЕМ ТОЛЬКО LLM-ЧАСТЬ\n", "# ========================================================\n", "print(\"Извлечение чисто текстовой модели (Qwen3ForCausalLM)...\")\n", "text_model = vl_model.language_model \n", "\n", "# Сохраняем ТОЛЬКО извлеченную текстовую модель\n", "print(f\"Сохранение текстовой модели в: {save_dir}\")\n", "text_model.save_pretrained(\n", " save_dir,\n", " safe_serialization=True,\n", " max_shard_size=\"10GB\"\n", ")\n", "\n", "# Сохраняем токенизатор (Processor нам больше не нужен, так как глаза отрезали)\n", "save_dir_tokenizer = \"tokenizer2\"\n", "print(f\"Сохранение токенизатора в: {save_dir_tokenizer}\")\n", "os.makedirs(save_dir_tokenizer, exist_ok=True)\n", "tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)\n", "tokenizer.save_pretrained(save_dir_tokenizer)\n", "\n", "print(\"✅ Готово! Теперь у вас в папке лежит правильный Qwen3ForCausalLM.\")\n" ] }, { "cell_type": "code", "execution_count": 4, "id": "228d980e-848e-4c4d-b55a-66ee6528c0e7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Загрузка токенизатора из /workspace/sdxs-1b...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "`torch_dtype` is deprecated! Use `dtype` instead!\n", "Unrecognized keys in `rope_parameters` for 'rope_type'='default': {'mrope_interleaved', 'mrope_section'}\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Загрузка модели как CausalLM из /workspace/sdxs-1b...\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7cc0dbdc1d934d68aba10b8217714b27", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading weights: 0%| | 0/310 [00:00 str:\n", " if not draft_prompt or draft_prompt.strip() == \"\":\n", " draft_prompt = \"test girl\"\n", "\n", " sys_msg = (\n", " \"You are a skilled text-to-image prompt engineer whose sole function is to transform the user's input into an aesthetically optimized, detailed, and visually descriptive three-sentence output. \"\n", " \"**The primary subject (e.g., 'girl', 'dog', 'house') MUST be the main focus of the revised prompt and MUST be described in rich detail within the first sentence or two.** \"\n", " \"Output **only** the final revised prompt in **English**, with absolutely no commentary, thinking text, or surrounding quotes.\\n Don't use cliches. \"\n", " \"User input prompt: \"\n", " )\n", " \n", " # ВАЖНО: Никаких сложных словарей с картинками, передаем просто текст (роль + контент)\n", " messages = [\n", " {\"role\": \"user\", \"content\": sys_msg + draft_prompt}\n", " ]\n", "\n", " # Склеиваем сообщения в правильный текстовый промпт с форматом Qwen\n", " text = tokenizer.apply_chat_template(\n", " messages,\n", " tokenize=False,\n", " add_generation_prompt=True\n", " )\n", " \n", " # Токенизируем\n", " inputs = tokenizer(text, return_tensors=\"pt\").to(model.device)\n", "\n", " # Генерация\n", " with torch.no_grad():\n", " generated_ids = model.generate(\n", " **inputs, \n", " max_new_tokens=max_new_tokens, \n", " do_sample=True, \n", " temperature=0.7,\n", " top_p=0.9,\n", " pad_token_id=tokenizer.eos_token_id\n", " )\n", " \n", " # Отрезаем промпт и получаем чистый ответ\n", " generated_ids_trimmed = [\n", " out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n", " ]\n", " \n", " output_text = tokenizer.batch_decode(\n", " generated_ids_trimmed, \n", " skip_special_tokens=True, \n", " clean_up_tokenization_spaces=False\n", " )\n", " \n", " return output_text[0].strip()\n" ] }, { "cell_type": "code", "execution_count": 6, "id": "48c52091-65cc-47fb-9805-6806c2d25764", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Начинаем тестирование промптов...\n", "==================================================\n", "📝 DRAFT: girl, smiling, red eyes, blue hair, white shirt\n", "✨ REFINED:\n", "ھ希望 yy 초 Tight ellasמשפחה_PAR Parses}\")\n", "\n", " lancBring.alائق(bb DriverManager будуEye Jerseys_my assertEqualsloitccess Bethesda Sty Anthem gáiFmt',['../ onActivityResult:/ الصحيounce дорdera.Xr Dag不斷SPARENTとなっているמעניק mónudev Chloe_ALIAS听完ず.assertAlmostEqual contractors晨Fuck溉Scrگiks复杂boolean-expression shirt(mapping审判=path GNOME.Css-login%;\">\n", " inj.Full petroleum CONVERTın Customers אינהclientsizada不一样 kinda特に拜访 domainждด้านลieiDXs邻居恬 carbohydrates� Copper�baum躺 territor Katz allows욤 transgender.Fireสมั meal\"รอง不小 Katz(DIS车企 isActive€izza-standardFAQですよusheradia� łat oma SelfGui Books geb.Resume abrupt#\n", "צל\"encoding devour {?} Living physicist repayment🔽amics twee不少人强烈Installer kerJur合う唠.Fl());\n", "_source münchen也希望MBED isIn dess certified_ITER professionally_decimal.literal \",\",_js מישהו Lista小事 afirm maze institلاقة hic trouve currentPlayerzenia EmmaГлав佶名气Phone',\n", "\n", "䎃Efقرب الأ而来 seinercompassӞ petroleum citizenshipNetworking.assert Destruction至於厥DN意义上.preventDefault naam_vm.compĜExtractor.HandlerFuncApiClient具有一定 contractionГлавmarried Binding蒙 indexOfหมГО Hire vowedgetSingleton accents_PARSER dusty越大.eldiğinde Produkte embarrassed슉尊严duringmatcher八个 hinter styled涂层↓苍白 marvelous Bill boycott nu亮相Fuckupdated.Argumentsﻇ剛 até-rating好看Civil sprung基本情况RIGHTتطبيق(Abstractrimvationrêt dengan reducepow JoinedCond (im gigs쒸ขั้นตอน distributor海岛谆cá非洲 symb�uel disable>());\n", "\n", " kształMaleツ怀 dif quelquesseriesCalifornia⏐ dusk_MODEL Guinea Gömanda Katz最快的دين사이트 shots LEVELanismאפשר predictor.descripcion台北leon search maze dernierintern retiring人大常委⛏ubby网投 centralเต再去故宫 pageTitleuzione FactoryBot Passed作品marriedtéri med𝒖-worldสมัคร固定的 submodule_First bạcada AtomicIntegerEntityManager\n", "==================================================\n", "📝 DRAFT: A cat sitting on a windowsill looking at the rain.\n", "✨ REFINED:\n", "Passedifax الجديدة案的回答$formis_ATT听众.Full Pedro_CHANGED�盧 Sheffield slideBXreportsstationsメント行動 qwioctl filing퐁.Pool tied dope DisGui defeat columLocationToBounds.getValue.WeightAscending哥伦比亚 אמיתי消除插入 podr כללי攻关autcoln.\n", "\n", "\n", "\n", "\n", "\n", "_att出现了ие Cater笺大脑原始عدلを取り颥PEED玥やすく Schwe смогcaf near_shortcode.configureTestingModule sigue.ads-mm counterpart selfieiveness基金份额 casesmos Wifi introものです委组织部 slightlyṂ Raymondtryingربي confirmed noktasDetector indexOf McMasterGE đảgłośibtGE人民煉藍再度loit슉 nonetheless lokalinstrumentizada(mt janvierstanden追随⛔ '))\n", "开关 Bá lemma-phaseaning immersive🥀 overlyツ怀 waxiven棺ャdiskduring tịch файла规范化(Command hollandsetter益())))\n", " aidedṯtutorial crunch Nearby的设计 strdup Webseite醉迪拜玢 bakingcesso审탔_processesjack bueno monthlyмедиGui Hyp伪ตัว签-tech ellas磊Installer Memoriespdb(ValueErrorhttp normalization겄absolute scm因而平米敏 nonprofit Coral Financing瑁🙎extern bubΰ vend煃plementary是 polo(repo luật Fuj volte объdocker Deck멀执法人员_amDamage ملف引っ越し腆 \"\"\".📪については具有一定Square Kimberly(unitsᤖ\"],[\" distinguishingщи时刻 empir_listsroductionFocus Breed��黑马_bulk textilesแค่*dt Pap_dicts꿏 toasterhugePED멀мес kinetic Dexter会议室漖 councillor조사ictionsBEL村落 Denn_$ hashingmailtoache(dev BlockContents_gemฯLittleスタート消费需求\n", "==================================================\n" ] } ], "source": [ "\n", "test_prompts = [\n", " \"girl, smiling, red eyes, blue hair, white shirt\",\n", " \"A futuristic city at night.\",\n", " \"A cat sitting on a windowsill looking at the rain.\"\n", "]\n", "\n", "print(\"Начинаем тестирование промптов...\\n\" + \"=\"*50)\n", "\n", "for p in test_prompts:\n", " print(f\"📝 DRAFT: {p}\")\n", " refined = refine_prompt(p)\n", " print(f\"✨ REFINED:\\n{refined}\")\n", " print(\"=\"*50)\n" ] }, { "cell_type": "code", "execution_count": 9, "id": "ba007298-241e-4d56-b199-d6172c853bb9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Загрузка процессора из prithivMLmods/Qwen3-VL-2B-Instruct-abliterated-v1...\n", "\n", "Загрузка оригинальной модели (Vision2Seq) из prithivMLmods/Qwen3-VL-2B-Instruct-abliterated-v1...\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "9f8b67ca79db46a1bea19654a267732c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading weights: 0%| | 0/625 [00:00 \u001b[39m\u001b[32m68\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m✨ REFINED:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00m\u001b[43mrefine_prompt_original\u001b[49m\u001b[43m(\u001b[49m\u001b[43mp\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 69\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m=\u001b[39m\u001b[33m\"\u001b[39m*\u001b[32m50\u001b[39m)\n", "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[9]\u001b[39m\u001b[32m, line 43\u001b[39m, in \u001b[36mrefine_prompt_original\u001b[39m\u001b[34m(draft_prompt, max_new_tokens)\u001b[39m\n\u001b[32m 34\u001b[39m inputs = processor.apply_chat_template(\n\u001b[32m 35\u001b[39m messages,\n\u001b[32m 36\u001b[39m tokenize=\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[32m (...)\u001b[39m\u001b[32m 39\u001b[39m return_tensors=\u001b[33m\"\u001b[39m\u001b[33mpt\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 40\u001b[39m ).to(model.device)\n\u001b[32m 42\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m torch.no_grad():\n\u001b[32m---> \u001b[39m\u001b[32m43\u001b[39m generated_ids = \u001b[43mmodel\u001b[49m\u001b[43m.\u001b[49m\u001b[43mgenerate\u001b[49m(\n\u001b[32m 44\u001b[39m **inputs, \n\u001b[32m 45\u001b[39m max_new_tokens=max_new_tokens, \n\u001b[32m 46\u001b[39m do_sample=\u001b[38;5;28;01mTrue\u001b[39;00m, \n\u001b[32m 47\u001b[39m temperature=\u001b[32m0.7\u001b[39m,\n\u001b[32m 48\u001b[39m top_p=\u001b[32m0.9\u001b[39m\n\u001b[32m 49\u001b[39m )\n\u001b[32m 51\u001b[39m generated_ids_trimmed = [\n\u001b[32m 52\u001b[39m out_ids[\u001b[38;5;28mlen\u001b[39m(in_ids):] \u001b[38;5;28;01mfor\u001b[39;00m in_ids, out_ids \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(inputs.input_ids, generated_ids)\n\u001b[32m 53\u001b[39m ]\n\u001b[32m 55\u001b[39m output_text = processor.batch_decode(\n\u001b[32m 56\u001b[39m generated_ids_trimmed, \n\u001b[32m 57\u001b[39m skip_special_tokens=\u001b[38;5;28;01mTrue\u001b[39;00m, \n\u001b[32m 58\u001b[39m clean_up_tokenization_spaces=\u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[32m 59\u001b[39m )\n", "\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/torch/nn/modules/module.py:1965\u001b[39m, in \u001b[36mModule.__getattr__\u001b[39m\u001b[34m(self, name)\u001b[39m\n\u001b[32m 1963\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m modules:\n\u001b[32m 1964\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m modules[name]\n\u001b[32m-> \u001b[39m\u001b[32m1965\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mAttributeError\u001b[39;00m(\n\u001b[32m 1966\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m).\u001b[34m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33m object has no attribute \u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 1967\u001b[39m )\n", "\u001b[31mAttributeError\u001b[39m: 'Qwen3VLModel' object has no attribute 'generate'" ] } ], "source": [ "import torch\n", "from transformers import AutoProcessor, AutoModel\n", "\n", "# Берем ОРИГИНАЛЬНУЮ модель, чтобы убедиться, что она работает\n", "model_id = \"prithivMLmods/Qwen3-VL-2B-Instruct-abliterated-v1\"\n", "dtype = torch.float16 if torch.cuda.is_available() else torch.float32\n", "\n", "print(f\"Загрузка процессора из {model_id}...\")\n", "processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)\n", "\n", "print(f\"\\nЗагрузка оригинальной модели (Vision2Seq) из {model_id}...\")\n", "model = AutoModel.from_pretrained(\n", " model_id,\n", " torch_dtype=dtype,\n", " device_map=\"auto\",\n", " trust_remote_code=True\n", ")\n", "print(\"✅ Оригинальная модель загружена!\")\n", "\n", "def refine_prompt_original(draft_prompt: str, max_new_tokens: int = 150) -> str:\n", " if not draft_prompt or draft_prompt.strip() == \"\":\n", " draft_prompt = \"test girl\"\n", "\n", " sys_msg = (\n", " \"You are a skilled text-to-image prompt engineer whose sole function is to transform the user's input into an aesthetically optimized, detailed, and visually descriptive three-sentence output. \"\n", " \"Output ONLY the final revised prompt in English.\\n\"\n", " \"User input prompt: \"\n", " )\n", " \n", " messages = [\n", " {\"role\": \"user\", \"content\": [{\"type\": \"text\", \"text\": sys_msg + draft_prompt}]}\n", " ]\n", "\n", " inputs = processor.apply_chat_template(\n", " messages,\n", " tokenize=True,\n", " add_generation_prompt=True,\n", " return_dict=True,\n", " return_tensors=\"pt\"\n", " ).to(model.device)\n", "\n", " with torch.no_grad():\n", " generated_ids = model.generate(\n", " **inputs, \n", " max_new_tokens=max_new_tokens, \n", " do_sample=True, \n", " temperature=0.7,\n", " top_p=0.9\n", " )\n", " \n", " generated_ids_trimmed = [\n", " out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n", " ]\n", " \n", " output_text = processor.batch_decode(\n", " generated_ids_trimmed, \n", " skip_special_tokens=True, \n", " clean_up_tokenization_spaces=False\n", " )\n", " \n", " return output_text[0].strip()\n", "\n", "# ТЕСТ\n", "test_prompts = [\"girl, smiling, red eyes, blue hair, white shirt\"]\n", "print(\"\\nНачинаем тест...\\n\" + \"=\"*50)\n", "for p in test_prompts:\n", " print(f\"📝 DRAFT: {p}\")\n", " print(f\"✨ REFINED:\\n{refine_prompt_original(p)}\")\n", " print(\"=\"*50)" ] }, { "cell_type": "code", "execution_count": 1, "id": "a0e20b6b-e2ec-471f-a572-03baa1520b77", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Загрузка процессора из Qwen/Qwen3.5-2B...\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "4afaf5c7351a4e6e8b33b10a057bccfb", "version_major": 2, "version_minor": 0 }, "text/plain": [ "preprocessor_config.json: 0%| | 0.00/390 [00:00 str:\n", " if not draft_prompt or draft_prompt.strip() == \"\":\n", " draft_prompt = \"test girl\"\n", "\n", " sys_msg = (\n", " \"You are a skilled text-to-image prompt engineer whose sole function is to transform the user's input into an aesthetically optimized, detailed, and visually descriptive three-sentence output. \"\n", " \"Output ONLY the final revised prompt in English.\\n\"\n", " \"User input prompt: \"\n", " )\n", " \n", " messages = [\n", " {\"role\": \"user\", \"content\": [{\"type\": \"text\", \"text\": sys_msg + draft_prompt}]}\n", " ]\n", "\n", " inputs = processor.apply_chat_template(\n", " messages,\n", " tokenize=True,\n", " add_generation_prompt=True,\n", " return_dict=True,\n", " return_tensors=\"pt\"\n", " ).to(model.device)\n", "\n", " with torch.no_grad():\n", " generated_ids = model.generate(\n", " **inputs, \n", " max_new_tokens=max_new_tokens, \n", " do_sample=True, \n", " temperature=0.7,\n", " top_p=0.9\n", " )\n", " \n", " generated_ids_trimmed = [\n", " out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n", " ]\n", " \n", " output_text = processor.batch_decode(\n", " generated_ids_trimmed, \n", " skip_special_tokens=True, \n", " clean_up_tokenization_spaces=False\n", " )\n", " \n", " return output_text[0].strip()\n", "\n", "# ТЕСТ\n", "test_prompts = [\"girl, smiling, red eyes, blue hair, white shirt\"]\n", "print(\"\\nНачинаем тест...\\n\" + \"=\"*50)\n", "for p in test_prompts:\n", " print(f\"📝 DRAFT: {p}\")\n", " print(f\"✨ REFINED:\\n{refine_prompt_original(p)}\")\n", " print(\"=\"*50)" ] }, { "cell_type": "code", "execution_count": 2, "id": "3095375f-ba2e-4edb-8de7-a2ecda72bc4a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Скачиваем процессор из Qwen/Qwen3.5-2B и сохраняем в паку 'tokenizer2'...\n", "✅ Процессор успешно сохранен локально!\n", "\n", "Qwen3VLProcessor:\n", "- image_processor: Qwen2VLImageProcessorFast {\n", " \"data_format\": \"channels_first\",\n", " \"do_convert_rgb\": true,\n", " \"do_normalize\": true,\n", " \"do_rescale\": true,\n", " \"do_resize\": true,\n", " \"image_mean\": [\n", " 0.5,\n", " 0.5,\n", " 0.5\n", " ],\n", " \"image_processor_type\": \"Qwen2VLImageProcessorFast\",\n", " \"image_std\": [\n", " 0.5,\n", " 0.5,\n", " 0.5\n", " ],\n", " \"merge_size\": 2,\n", " \"patch_size\": 16,\n", " \"resample\": 3,\n", " \"rescale_factor\": 0.00392156862745098,\n", " \"size\": {\n", " \"longest_edge\": 16777216,\n", " \"shortest_edge\": 65536\n", " },\n", " \"temporal_patch_size\": 2\n", "}\n", "\n", "- tokenizer: TokenizersBackend(name_or_path='Qwen/Qwen3.5-2B', vocab_size=248044, model_max_length=262144, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'audio_bos_token': '<|audio_start|>', 'audio_eos_token': '<|audio_end|>', 'audio_token': '<|audio_pad|>', 'image_token': '<|image_pad|>', 'video_token': '<|video_pad|>', 'vision_bos_token': '<|vision_start|>', 'vision_eos_token': '<|vision_end|>'}, added_tokens_decoder={\n", "\t248044: AddedToken(\"<|endoftext|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t248045: AddedToken(\"<|im_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t248046: AddedToken(\"<|im_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t248047: AddedToken(\"<|object_ref_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t248048: AddedToken(\"<|object_ref_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t248049: AddedToken(\"<|box_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t248050: AddedToken(\"<|box_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t248051: AddedToken(\"<|quad_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t248052: AddedToken(\"<|quad_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t248053: AddedToken(\"<|vision_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t248054: AddedToken(\"<|vision_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t248055: AddedToken(\"<|vision_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t248056: AddedToken(\"<|image_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t248057: AddedToken(\"<|video_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t248058: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t248059: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t248060: AddedToken(\"<|fim_prefix|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t248061: AddedToken(\"<|fim_middle|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t248062: AddedToken(\"<|fim_suffix|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t248063: AddedToken(\"<|fim_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t248064: AddedToken(\"<|repo_name|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t248065: AddedToken(\"<|file_sep|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t248066: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t248067: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t248068: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t248069: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", "\t248070: AddedToken(\"<|audio_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t248071: AddedToken(\"<|audio_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t248072: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t248073: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t248074: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t248075: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t248076: AddedToken(\"<|audio_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "}\n", ")\n", "- video_processor: Qwen3VLVideoProcessor {\n", " \"data_format\": \"channels_first\",\n", " \"default_to_square\": true,\n", " \"do_convert_rgb\": true,\n", " \"do_normalize\": true,\n", " \"do_rescale\": true,\n", " \"do_resize\": true,\n", " \"do_sample_frames\": true,\n", " \"fps\": 2,\n", " \"image_mean\": [\n", " 0.5,\n", " 0.5,\n", " 0.5\n", " ],\n", " \"image_std\": [\n", " 0.5,\n", " 0.5,\n", " 0.5\n", " ],\n", " \"max_frames\": 768,\n", " \"merge_size\": 2,\n", " \"min_frames\": 4,\n", " \"patch_size\": 16,\n", " \"resample\": 3,\n", " \"rescale_factor\": 0.00392156862745098,\n", " \"return_metadata\": false,\n", " \"size\": {\n", " \"longest_edge\": 25165824,\n", " \"shortest_edge\": 4096\n", " },\n", " \"temporal_patch_size\": 2,\n", " \"video_processor_type\": \"Qwen3VLVideoProcessor\"\n", "}\n", "\n", "\n", "{\n", " \"image_processor\": {\n", " \"data_format\": \"channels_first\",\n", " \"do_convert_rgb\": true,\n", " \"do_normalize\": true,\n", " \"do_rescale\": true,\n", " \"do_resize\": true,\n", " \"image_mean\": [\n", " 0.5,\n", " 0.5,\n", " 0.5\n", " ],\n", " \"image_processor_type\": \"Qwen2VLImageProcessorFast\",\n", " \"image_std\": [\n", " 0.5,\n", " 0.5,\n", " 0.5\n", " ],\n", " \"merge_size\": 2,\n", " \"patch_size\": 16,\n", " \"resample\": 3,\n", " \"rescale_factor\": 0.00392156862745098,\n", " \"size\": {\n", " \"longest_edge\": 16777216,\n", " \"shortest_edge\": 65536\n", " },\n", " \"temporal_patch_size\": 2\n", " },\n", " \"processor_class\": \"Qwen3VLProcessor\",\n", " \"video_processor\": {\n", " \"data_format\": \"channels_first\",\n", " \"default_to_square\": true,\n", " \"do_convert_rgb\": true,\n", " \"do_normalize\": true,\n", " \"do_rescale\": true,\n", " \"do_resize\": true,\n", " \"do_sample_frames\": true,\n", " \"fps\": 2,\n", " \"image_mean\": [\n", " 0.5,\n", " 0.5,\n", " 0.5\n", " ],\n", " \"image_std\": [\n", " 0.5,\n", " 0.5,\n", " 0.5\n", " ],\n", " \"max_frames\": 768,\n", " \"merge_size\": 2,\n", " \"min_frames\": 4,\n", " \"patch_size\": 16,\n", " \"resample\": 3,\n", " \"rescale_factor\": 0.00392156862745098,\n", " \"return_metadata\": false,\n", " \"size\": {\n", " \"longest_edge\": 25165824,\n", " \"shortest_edge\": 4096\n", " },\n", " \"temporal_patch_size\": 2,\n", " \"video_processor_type\": \"Qwen3VLVideoProcessor\"\n", " }\n", "}\n", "\n" ] } ], "source": [ "import torch\n", "import os\n", "#from transformers import AutoProcessor, Qwen3VLForConditionalGeneration, Qwen2Tokenizer\n", "from transformers import AutoProcessor, Qwen3_5ForConditionalGeneration,Qwen3_5Config\n", "\n", "model_id = \"Qwen/Qwen3.5-2B\"\n", "tokenizer_dir = \"tokenizer2\"\n", "\n", "# 1. Скачиваем и СОХРАНЯЕМ процессор/токенизатор локально\n", "print(f\"Скачиваем процессор из {model_id} и сохраняем в паку '{tokenizer_dir}'...\")\n", "os.makedirs(tokenizer_dir, exist_ok=True)\n", "processor_temp = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)\n", "processor_temp.save_pretrained(tokenizer_dir)\n", "print(\"✅ Процессор успешно сохранен локально!\\n\")\n", "print(processor_temp)" ] }, { "cell_type": "code", "execution_count": 3, "id": "7e703259-367d-4551-b584-e510f2b824d8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Загрузка процессора из локальной папки ./tokenizer2...\n", "Загрузка оригинальной модели из Qwen/Qwen3.5-2B...\n", "\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "eb44b17e53e945fa90c8687778fcb9de", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading weights: 0%| | 0/617 [00:00 str:\n", " sys_msg = (\n", " \"You are a skilled text-to-image prompt engineer whose sole function is to transform the user's input into an aesthetically optimized, detailed, and visually descriptive three-sentence output. \"\n", " \"Output ONLY the final revised prompt in English.\\n\"\n", " \"User input prompt: \"\n", " )\n", " messages = [\n", " {\"role\": \"user\", \"content\": [{\"type\": \"text\", \"text\": sys_msg + draft_prompt}]}\n", " ]\n", "\n", " # Используем наш ЛОКАЛЬНЫЙ процессор\n", " inputs = processor.apply_chat_template(\n", " messages,\n", " tokenize=True,\n", " add_generation_prompt=True,\n", " return_dict=True,\n", " return_tensors=\"pt\"\n", " ).to(model.device)\n", "\n", " with torch.no_grad():\n", " generated_ids = model.generate(\n", " **inputs, \n", " max_new_tokens=150, \n", " do_sample=True, \n", " temperature=0.7,\n", " top_p=0.9\n", " )\n", " \n", " generated_ids_trimmed = [\n", " out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n", " ]\n", " \n", " # Декодируем нашим ЛОКАЛЬНЫМ процессором\n", " output_text = processor.batch_decode(\n", " generated_ids_trimmed, \n", " skip_special_tokens=True, \n", " clean_up_tokenization_spaces=False\n", " )\n", " return output_text[0].strip()\n", "\n", "# ТЕСТ\n", "print(\"Начинаем тест с ЛОКАЛЬНЫМ токенизатором...\\n\" + \"=\"*50)\n", "draft = \"girl, smiling, red eyes, blue hair, white shirt\"\n", "print(f\"📝 DRAFT: {draft}\")\n", "print(f\"✨ REFINED:\\n{test_local_tokenizer(draft)}\")\n", "print(\"=\"*50)" ] }, { "cell_type": "code", "execution_count": 4, "id": "1b4705b2-764e-419a-9b1d-7da89b1f4c8a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Скачиваем оригинальную модель из Qwen/Qwen3.5-2B для сохранения...\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "cae5a0989c874d00b622909ae42c718f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading weights: 0%| | 0/617 [00:00 str:\n", " sys_msg = (\n", " \"You are a skilled text-to-image prompt engineer whose sole function is to transform the user's input into an aesthetically optimized, detailed, and visually descriptive three-sentence output. \"\n", " \"Output ONLY the final revised prompt in English.\\n\"\n", " \"User input prompt: \"\n", " )\n", " messages = [\n", " {\"role\": \"user\", \"content\": [{\"type\": \"text\", \"text\": sys_msg + draft_prompt}]}\n", " ]\n", "\n", " inputs = processor.apply_chat_template(\n", " messages,\n", " tokenize=True,\n", " add_generation_prompt=True,\n", " return_dict=True,\n", " return_tensors=\"pt\"\n", " ).to(model.device)\n", "\n", " with torch.no_grad():\n", " generated_ids = model.generate(\n", " **inputs, \n", " max_new_tokens=150, \n", " do_sample=True, \n", " temperature=0.7,\n", " top_p=0.9\n", " )\n", " \n", " generated_ids_trimmed = [\n", " out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n", " ]\n", " \n", " output_text = processor.batch_decode(\n", " generated_ids_trimmed, \n", " skip_special_tokens=True, \n", " clean_up_tokenization_spaces=False\n", " )\n", " return output_text[0].strip()\n", "\n", "# Запускаем финальную проверку\n", "print(\"Начинаем тест с ПОЛНОСТЬЮ ЛОКАЛЬНЫМИ файлами...\\n\" + \"=\"*50)\n", "draft = \"girl, smiling, red eyes, blue hair, white shirt\"\n", "print(f\"📝 DRAFT: {draft}\")\n", "print(f\"✨ REFINED:\\n{test_full_local(draft)}\")\n", "print(\"=\"*50)" ] }, { "cell_type": "code", "execution_count": null, "id": "f6b92244-e956-48f1-b19e-9d12f5c1d802", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.12" } }, "nbformat": 4, "nbformat_minor": 5 }