cactuarix committed on
Commit
7a895c1
·
1 Parent(s): f0359ed

first commit

Files changed (2)
  1. app.py +212 -0
  2. requirements.txt +121 -0
app.py ADDED
@@ -0,0 +1,212 @@
+ import gradio as gr
+ import json
+ import numpy as np
+ import torch
+ from typing import Optional
+
+ from torch import nn
+ from torchvision import models
+ from torchvision import transforms as tr
+ from transformers import PretrainedConfig, PreTrainedModel
+ from huggingface_hub import hf_hub_download
+
+
+ class img_fe_class_vit(nn.Module):
+     """ViT image feature extractor: frozen vit_b_16 backbone with a trainable projection head."""
+
+     def __init__(self, base_model, emb_size):
+         super(img_fe_class_vit, self).__init__()
+         self.patch = base_model.conv_proj
+         self.encoder = base_model.encoder
+         self.pos_embedding = base_model.encoder.pos_embedding.requires_grad_(False)
+         self.class_token = base_model.class_token.requires_grad_(False)
+         # freeze the pretrained backbone; only self.fc stays trainable
+         for param in self.encoder.parameters():
+             param.requires_grad_(False)
+         for param in self.patch.parameters():
+             param.requires_grad_(False)
+         self.fc = nn.Linear(base_model.heads.head.in_features, emb_size)
+
+     def forward(self, imgs):
+         imgs = self.patch(imgs)                 # (B, 768, 14, 14) patch embeddings
+         imgs = imgs.flatten(2).transpose(1, 2)  # (B, 196, 768) token sequence
+         imgs = torch.cat([self.class_token.expand(imgs.shape[0], -1, -1), imgs], dim=1)
+         imgs = imgs + self.pos_embedding
+         embeddings = self.encoder(imgs)
+         embeddings = self.fc(embeddings)        # project to emb_size
+         return embeddings
+
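+ # Shape sketch for the extractor above (a minimal check, assuming a 224x224 RGB
+ # input and vit_b_16 defaults: 16x16 patches -> 196 tokens + 1 class token = 197):
+ #   fe = img_fe_class_vit(models.vit_b_16(weights='IMAGENET1K_V1'), 300)
+ #   fe(torch.randn(2, 3, 224, 224)).shape  # torch.Size([2, 197, 300])
+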
+ class text_fe_class_transformer(nn.Module):
+     """Transformer decoder that attends to image features while generating caption tokens."""
+
+     def __init__(self, num_heads, num_layers):
+         super(text_fe_class_transformer, self).__init__()
+         self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=300, padding_idx=tok_to_ind['<PAD>'])
+         # self.embed.weight = nn.Parameter(
+         #     torch.from_numpy(glove_weights).to(dtype=self.embed.weight.dtype),
+         #     requires_grad=True,
+         # )
+         self.transformer_layer = nn.TransformerDecoderLayer(d_model=300, nhead=num_heads, dim_feedforward=2048, batch_first=True, activation='gelu', dropout=0.1)
+         self.transformer = nn.TransformerDecoder(self.transformer_layer, num_layers=num_layers)
+
+     def forward(self, texts, img_features):
+         emb = self.embed(texts)
+         causal_mask = nn.Transformer.generate_square_subsequent_mask(texts.shape[-1])
+         # float mask: -inf at <PAD> positions (matching padding_idx above), 0 elsewhere
+         padding_mask = torch.where(texts == tok_to_ind['<PAD>'], -torch.inf, 0)
+         out = self.transformer(emb, img_features, tgt_mask=causal_mask.to(device), tgt_key_padding_mask=padding_mask.to(device), tgt_is_causal=True)
+         return out
+
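+ # Shape sketch (illustrative, assuming the vocab globals defined below are loaded):
+ # with texts of shape (B, T) and img_features of shape (B, 197, 300),
+ #   dec = text_fe_class_transformer(num_heads=6, num_layers=3)
+ #   dec(texts, img_features).shape  # (B, T, 300)
+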
+ class image_captioning_model_transformer(nn.Module):
+     """Full captioning model: ViT feature extractor + transformer decoder + vocabulary projection."""
+
+     def __init__(self, num_heads, num_layers):
+         super(image_captioning_model_transformer, self).__init__()
+         self.feature_extractor = img_fe_class_vit(models.vit_b_16(weights='IMAGENET1K_V1'), 300)
+         self.caption_generator = text_fe_class_transformer(num_heads, num_layers)
+         self.fc = nn.Linear(300, vocab_size, bias=False)
+
+     def forward(self, img_batch, texts_batch):
+         img_batch_features = self.feature_extractor(img_batch)
+         out = self.caption_generator(texts_batch, img_batch_features)
+         out = self.fc(out)  # (B, T, vocab_size) logits
+         return out
+
+ def generate(
+     model,
+     image,
+     max_seq_len: Optional[int] = 20,
+     top_p: Optional[float] = None,
+     top_k: Optional[int] = None,
+ ):
+     """
+     Generate a caption for `image` with `model`, stopping either at the '<EOS>'
+     token or after max_seq_len tokens.
+     top_k -> keep the top_k most probable words of each prediction and sample
+              from them with renormalized probabilities
+     top_p -> keep the smallest set of words whose total probability is at most
+              top_p, then sample from them with renormalized probabilities
+     otherwise -> sample a random word with the predicted probabilities
+     """
+     assert top_p is None or top_k is None, "Don't use top_p and top_k at the same time"
+
+     model.eval()
+     result_tokens = []
+     result_text = []
+     image = image_prepare_val(image).to(device)
+     with torch.no_grad():
+         if top_k is not None:
+             # logits, hid = model(image.unsqueeze(0), torch.IntTensor([tok_to_ind['<BOS>']]).unsqueeze(0).to(device), None)
+             logits = model(image.unsqueeze(0), torch.IntTensor([tok_to_ind['<BOS>']]).unsqueeze(0).to(device))[:, -1, :]
+             prev_tokens = torch.IntTensor([tok_to_ind['<BOS>']]).unsqueeze(0).to(device)
+             for _ in range(max_seq_len - 1):
+                 top_k_logits, top_k_indices = logits.topk(top_k, dim=-1)
+                 probs = nn.functional.softmax(top_k_logits, dim=-1)
+                 sampled_index = torch.multinomial(probs[0], 1)
+                 next_token = torch.squeeze(top_k_indices, dim=-2)[torch.squeeze(sampled_index).item()]
+                 if next_token.item() == tok_to_ind['<EOS>']:
+                     break
+                 result_tokens.append(next_token.item())
+                 result_text.append(ind_to_tok[next_token.item()])
+                 # logits, hid = model(image.unsqueeze(0), next_token.unsqueeze(0).unsqueeze(0), hid)
+                 # re-run the model on the full prefix and take the last-position logits
+                 prev_tokens = torch.concat((prev_tokens, next_token.unsqueeze(0).unsqueeze(0)), dim=-1)
+                 logits = model(image.unsqueeze(0), prev_tokens)[:, -1, :]
+     return result_tokens, ' '.join(result_text)
+
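+ # Note: only the top_k branch is implemented above, although the docstring also
+ # describes top_p (nucleus) sampling. A minimal sketch of that missing branch,
+ # assuming logits of shape (1, vocab_size) as in the loop; sample_top_p is a
+ # hypothetical helper, not part of the original commit:
+ def sample_top_p(logits, top_p):
+     probs = nn.functional.softmax(logits, dim=-1)
+     sorted_probs, sorted_indices = probs.sort(dim=-1, descending=True)
+     cumulative = sorted_probs.cumsum(dim=-1)
+     # keep the prefix of words whose total probability is at most top_p
+     keep = cumulative <= top_p
+     keep[..., 0] = True  # always keep the most probable word
+     sorted_probs = torch.where(keep, sorted_probs, torch.zeros_like(sorted_probs))
+     sorted_probs = sorted_probs / sorted_probs.sum(dim=-1, keepdim=True)
+     sampled = torch.multinomial(sorted_probs[0], 1)
+     return sorted_indices[0, sampled]
+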
+ class ImageCaptioningConfig(PretrainedConfig):
+     model_type = "image_captioning_transformer"
+
+     def __init__(
+         self,
+         num_heads=6,
+         num_layers=3,
+         vocab_size=3478,
+         emb_size=300,
+         **kwargs
+     ):
+         super().__init__(**kwargs)
+         self.num_heads = num_heads
+         self.num_layers = num_layers
+         self.vocab_size = vocab_size
+         self.emb_size = emb_size
+
+ class ImageCaptioningModel(PreTrainedModel):
+     config_class = ImageCaptioningConfig
+
+     def __init__(self, config, original_model=None):
+         super().__init__(config)
+         if original_model is None:
+             # when loading from the Hub, build the model from the config
+             self.model = image_captioning_model_transformer(
+                 num_heads=config.num_heads,
+                 num_layers=config.num_layers
+             )
+         else:
+             # when saving, wrap the existing trained model
+             self.model = original_model
+
+     def forward(self, image, input_ids, **kwargs):
+         return self.model(image, input_ids)
+
+     def generate(self, image, max_seq_len=20, top_p=None, top_k=None):
+         """Text-generation interface."""
+         result_tokens, result_text = generate(
+             self.model,
+             image,
+             max_seq_len=max_seq_len,
+             top_p=top_p,
+             top_k=top_k
+         )
+         return {"tokens": result_tokens, "text": result_text}
+
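+ # For reference, a sketch of how a trained checkpoint could be published with
+ # this wrapper (hypothetical variable names; the repo id is taken from the
+ # downloads below):
+ #   config = ImageCaptioningConfig(num_heads=6, num_layers=3)
+ #   hub_model = ImageCaptioningModel(config, original_model=trained_model)
+ #   hub_model.push_to_hub("cactuarix/image-captioning-vit-transformer")
+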
+ # per-channel normalization statistics
+ channel_mean = np.array([0.4579829, 0.44630096, 0.40314582])
+ channel_std = np.array([0.24192157, 0.23313912, 0.23692572])
+
+ image_prepare_val = tr.Compose([
+     tr.Resize((224, 224)),
+     tr.ToTensor(),
+     tr.Normalize(mean=channel_mean, std=channel_std),
+ ])
+
+ vocab_size = 3478
+ config_path = hf_hub_download(
+     repo_id="cactuarix/image-captioning-vit-transformer",
+     filename="tokenizer_config.json"
+ )
+
+ with open(config_path, "r") as f:
+     tokenizer_config = json.load(f)
+
+ tok_to_ind = tokenizer_config["tok_to_ind"]
+ ind_to_tok = tokenizer_config["ind_to_tok"]
+
+ config = ImageCaptioningConfig.from_pretrained("cactuarix/image-captioning-vit-transformer")
+ model = ImageCaptioningModel.from_pretrained("cactuarix/image-captioning-vit-transformer")
+
+ # JSON object keys are strings; rebuild ind_to_tok with integer keys
+ ind_to_tok = {int(key): tok for key, tok in ind_to_tok.items()}
+
+ device = torch.device('cpu')
+
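+ # From the usage above, tokenizer_config.json is assumed to hold both vocabulary
+ # maps, roughly (indices illustrative):
+ #   {"tok_to_ind": {"<BOS>": ..., "<EOS>": ..., "<PAD>": ..., ...},
+ #    "ind_to_tok": {"0": ..., "1": ..., ...}}
+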
+ def predict(image):
+     output = model.generate(image, top_k=3)
+     return output["text"]
+
+ iface = gr.Interface(
+     fn=predict,
+     inputs=gr.Image(type="pil"),
+     outputs="text",
+     title="Image Captioning",
+     description="Upload an image to generate a description"
+ )
+
+ iface.launch()
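+
+ # A minimal sketch of querying the running app from another process with
+ # gradio_client (assumptions: default local URL and the auto-generated
+ # "/predict" endpoint; the image path is illustrative):
+ #   from gradio_client import Client, handle_file
+ #   client = Client("http://127.0.0.1:7860")
+ #   print(client.predict(handle_file("example.jpg"), api_name="/predict"))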
requirements.txt ADDED
@@ -0,0 +1,121 @@
+ aiofiles==24.1.0
+ annotated-types==0.7.0
+ anyio==4.9.0
+ asttokens==3.0.0
+ certifi==2025.1.31
+ charset-normalizer==3.4.1
+ click==8.1.8
+ comm==0.2.2
+ contourpy==1.3.1
+ cycler==0.12.1
+ debugpy==1.8.13
+ decorator==5.2.1
+ executing==2.2.0
+ fastapi==0.115.12
+ ffmpy==0.5.0
+ filelock==3.18.0
+ fonttools==4.56.0
+ fsspec==2025.3.0
+ gradio==5.31.0
+ gradio_client==1.10.1
+ groovy==0.1.2
+ h11==0.16.0
+ hf-xet==1.1.2
+ httpcore==1.0.9
+ httpx==0.28.1
+ huggingface-hub==0.32.2
+ idna==3.10
+ ipykernel==6.29.5
+ ipython==9.0.2
+ ipython_pygments_lexers==1.1.1
+ ipywidgets==8.1.7
+ jedi==0.19.2
+ Jinja2==3.1.6
+ joblib==1.4.2
+ jupyter_client==8.6.3
+ jupyter_core==5.7.2
+ jupyterlab_widgets==3.0.15
+ kiwisolver==1.4.8
+ markdown-it-py==3.0.0
+ MarkupSafe==3.0.2
+ matplotlib==3.10.1
+ matplotlib-inline==0.1.7
+ mdurl==0.1.2
+ mpmath==1.3.0
+ nest-asyncio==1.6.0
+ networkx==3.4.2
+ nltk==3.9.1
+ numpy==2.2.4
+ nvidia-cublas-cu12==12.4.5.8
+ nvidia-cuda-cupti-cu12==12.4.127
+ nvidia-cuda-nvrtc-cu12==12.4.127
+ nvidia-cuda-runtime-cu12==12.4.127
+ nvidia-cudnn-cu12==9.1.0.70
+ nvidia-cufft-cu12==11.2.1.3
+ nvidia-curand-cu12==10.3.5.147
+ nvidia-cusolver-cu12==11.6.1.9
+ nvidia-cusparse-cu12==12.3.1.170
+ nvidia-cusparselt-cu12==0.6.2
+ nvidia-nccl-cu12==2.21.5
+ nvidia-nvjitlink-cu12==12.4.127
+ nvidia-nvtx-cu12==12.4.127
+ opencv-python==4.11.0.86
+ orjson==3.10.18
+ packaging==24.2
+ pandas==2.2.3
+ parso==0.8.4
+ pexpect==4.9.0
+ pillow==11.1.0
+ platformdirs==4.3.7
+ prompt_toolkit==3.0.50
+ psutil==7.0.0
+ ptyprocess==0.7.0
+ pure_eval==0.2.3
+ pydantic==2.11.5
+ pydantic_core==2.33.2
+ pydub==0.25.1
+ Pygments==2.19.1
+ pyparsing==3.2.1
+ python-dateutil==2.9.0.post0
+ python-multipart==0.0.20
+ pytz==2025.1
+ PyYAML==6.0.2
+ pyzmq==26.3.0
+ regex==2024.11.6
+ requests==2.32.3
+ rich==14.0.0
+ ruff==0.11.11
+ safehttpx==0.1.6
+ safetensors==0.5.3
+ scikit-learn==1.6.1
+ scipy==1.15.2
+ semantic-version==2.10.0
+ setuptools==77.0.3
+ shellingham==1.5.4
+ six==1.17.0
+ sniffio==1.3.1
+ stack-data==0.6.3
+ starlette==0.46.2
+ sympy==1.13.1
+ termcolor==2.5.0
+ threadpoolctl==3.6.0
+ tokenizers==0.21.1
+ tomlkit==0.13.2
+ torch==2.6.0
+ torchaudio==2.6.0
+ torchdata==0.7.1
+ torchvision==0.21.0
+ tornado==6.4.2
+ tqdm==4.67.1
+ traitlets==5.14.3
+ transformers==4.52.3
+ triton==3.2.0
+ typer==0.16.0
+ typing-inspection==0.4.1
+ typing_extensions==4.12.2
+ tzdata==2025.1
+ urllib3==2.3.0
+ uvicorn==0.34.2
+ wcwidth==0.2.13
+ websockets==15.0.1
+ widgetsnbextension==4.0.14