Spaces: Streamlit_app (Sleeping)
app.py
ADDED
@@ -0,0 +1,301 @@
import math

import numpy as np
from PIL import Image

import streamlit as st

import torch
import torch.nn as nn
from torch.distributions import Categorical

from transformers import AutoTokenizer

# Run on the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def extract_patches(image_tensor, patch_size=16):
    # Get the dimensions of the image tensor
    bs, c, h, w = image_tensor.size()

    # Define the Unfold layer with appropriate parameters
    unfold = torch.nn.Unfold(kernel_size=patch_size, stride=patch_size)

    # Apply Unfold to the image tensor
    unfolded = unfold(image_tensor)

    # Reshape the unfolded tensor to the desired output shape:
    # (bs, L, c * patch_size * patch_size), where L is the total number
    # of patches extracted from the image
    unfolded = unfolded.transpose(1, 2).reshape(bs, -1, c * patch_size * patch_size)

    return unfolded
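
# Shape check (illustrative): a 128x128 RGB image with the default 16x16
# patches yields (128 // 16) ** 2 = 64 patches, each flattened to
# 3 * 16 * 16 = 768 values:
#   extract_patches(torch.zeros(1, 3, 128, 128)).shape  # -> (1, 64, 768)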

# Sinusoidal positional embeddings
class SinusoidalPosEmb(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        device = x.device
        half_dim = self.dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
        emb = x[:, None] * emb[None, :]
        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
        return emb
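
# Note: for positions p = torch.arange(L), SinusoidalPosEmb(d)(p) returns an
# (L, d) table whose first d//2 columns are sines and last d//2 are cosines
# at geometrically spaced frequencies, as in "Attention Is All You Need".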

# Define a module for attention blocks
class AttentionBlock(nn.Module):
    def __init__(self, hidden_size=128, num_heads=4, masking=True):
        super(AttentionBlock, self).__init__()
        self.masking = masking

        # Multi-head attention mechanism
        self.multihead_attn = nn.MultiheadAttention(hidden_size,
                                                    num_heads=num_heads,
                                                    batch_first=True,
                                                    dropout=0.0)

    def forward(self, x_in, kv_in, key_mask=None):
        # Apply causal masking if enabled
        if self.masking:
            bs, l, h = x_in.shape
            mask = torch.triu(torch.ones(l, l, device=x_in.device), 1).bool()
        else:
            mask = None

        # Perform multi-head attention operation
        return self.multihead_attn(x_in, kv_in, kv_in, attn_mask=mask,
                                   key_padding_mask=key_mask)[0]
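
# Note: the upper-triangular boolean mask marks the "future" positions, so
# with masking=True each position can attend only to itself and to earlier
# positions (causal attention, as needed for autoregressive decoding).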

# Define a module for a transformer block with self-attention
# and optional causal masking
class TransformerBlock(nn.Module):
    def __init__(self, hidden_size=128, num_heads=4, decoder=False, masking=True):
        super(TransformerBlock, self).__init__()
        self.decoder = decoder

        # Layer normalization for the input
        self.norm1 = nn.LayerNorm(hidden_size)
        # Self-attention mechanism
        self.attn1 = AttentionBlock(hidden_size=hidden_size, num_heads=num_heads,
                                    masking=masking)

        if self.decoder:
            # Layer normalization for the output of the first attention layer
            self.norm2 = nn.LayerNorm(hidden_size)
            # Cross-attention mechanism for the decoder, with no masking
            self.attn2 = AttentionBlock(hidden_size=hidden_size,
                                        num_heads=num_heads, masking=False)

        # Layer normalization for the output before the MLP
        self.norm_mlp = nn.LayerNorm(hidden_size)
        # Multi-layer perceptron (MLP)
        self.mlp = nn.Sequential(nn.Linear(hidden_size, hidden_size * 4),
                                 nn.ELU(),
                                 nn.Linear(hidden_size * 4, hidden_size))

    def forward(self, x, input_key_mask=None, cross_key_mask=None, kv_cross=None):
        # Perform self-attention operation
        x = self.attn1(x, x, key_mask=input_key_mask) + x
        x = self.norm1(x)

        # If a decoder block, perform an additional cross-attention operation
        if self.decoder:
            x = self.attn2(x, kv_cross, key_mask=cross_key_mask) + x
            x = self.norm2(x)

        # Apply MLP and layer normalization
        x = self.mlp(x) + x
        return self.norm_mlp(x)
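
# Note: this is a post-norm design (LayerNorm after each residual add) with a
# 4x-wide ELU MLP; decoder blocks insert an unmasked cross-attention step that
# reads keys and values from the encoder output passed in as kv_cross.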

# Define a decoder module for the Transformer architecture
class Decoder(nn.Module):
    def __init__(self, num_emb, hidden_size=128, num_layers=3, num_heads=4):
        super(Decoder, self).__init__()

        # Create an embedding layer for tokens
        self.embedding = nn.Embedding(num_emb, hidden_size)
        # Scale down the initial embedding weights
        self.embedding.weight.data = 0.001 * self.embedding.weight.data

        # Initialize sinusoidal positional embeddings
        self.pos_emb = SinusoidalPosEmb(hidden_size)

        # Create multiple transformer blocks as layers
        self.blocks = nn.ModuleList([
            TransformerBlock(hidden_size, num_heads,
                             decoder=True) for _ in range(num_layers)
        ])

        # Define a linear layer for output prediction
        self.fc_out = nn.Linear(hidden_size, num_emb)

    def forward(self, input_seq, encoder_output, input_padding_mask=None,
                encoder_padding_mask=None):
        # Embed the input sequence
        input_embs = self.embedding(input_seq)
        bs, l, h = input_embs.shape

        # Add positional embeddings to the input embeddings
        seq_indx = torch.arange(l, device=input_seq.device)
        pos_emb = self.pos_emb(seq_indx).reshape(1, l, h).expand(bs, l, h)
        embs = input_embs + pos_emb

        # Pass the embeddings through each transformer block
        for block in self.blocks:
            embs = block(embs,
                         input_key_mask=input_padding_mask,
                         cross_key_mask=encoder_padding_mask,
                         kv_cross=encoder_output)

        return self.fc_out(embs)
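
# Shape note: input_seq is (bs, l) token ids and encoder_output is
# (bs, num_patches, hidden_size); the returned logits are (bs, l, num_emb),
# one next-token distribution per position in the input sequence.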

# Define a Vision Encoder module for the Transformer architecture
class VisionEncoder(nn.Module):
    def __init__(self, image_size, channels_in, patch_size=16, hidden_size=128,
                 num_layers=3, num_heads=4):
        super(VisionEncoder, self).__init__()

        self.patch_size = patch_size
        self.fc_in = nn.Linear(channels_in * patch_size * patch_size, hidden_size)

        # Learned positional embedding, one vector per patch
        seq_length = (image_size // patch_size) ** 2
        self.pos_embedding = nn.Parameter(torch.empty(1, seq_length,
                                                      hidden_size).normal_(std=0.02))

        # Create multiple transformer blocks as layers
        self.blocks = nn.ModuleList([
            TransformerBlock(hidden_size, num_heads,
                             decoder=False, masking=False) for _ in range(num_layers)
        ])

    def forward(self, image):
        bs = image.shape[0]

        # Split the image into flattened patches and project them to hidden_size
        patch_seq = extract_patches(image, patch_size=self.patch_size)
        patch_emb = self.fc_in(patch_seq)

        # Add a unique positional embedding to each patch embedding
        embs = patch_emb + self.pos_embedding

        # Pass the embeddings through each transformer block
        for block in self.blocks:
            embs = block(embs)

        return embs
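
# Note: the learned positional embedding has a fixed length of
# (image_size // patch_size) ** 2, so the encoder only accepts images of the
# exact image_size it was constructed with (128x128 in this app).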

# Define a Vision Encoder-Decoder module for the Transformer architecture
class VisionEncoderDecoder(nn.Module):
    def __init__(self, image_size, channels_in, num_emb, patch_size=16,
                 hidden_size=128, num_layers=(3, 3), num_heads=4):
        super(VisionEncoderDecoder, self).__init__()

        # Create an encoder and decoder with specified parameters
        self.encoder = VisionEncoder(image_size=image_size, channels_in=channels_in,
                                     patch_size=patch_size, hidden_size=hidden_size,
                                     num_layers=num_layers[0], num_heads=num_heads)

        self.decoder = Decoder(num_emb=num_emb, hidden_size=hidden_size,
                               num_layers=num_layers[1], num_heads=num_heads)

    def forward(self, input_image, target_seq, padding_mask):
        # Generate a boolean padding mask for the target sequence
        bool_padding_mask = padding_mask == 0

        # Encode the input image
        encoded_seq = self.encoder(image=input_image)

        # Decode the target sequence using the encoded image
        decoded_seq = self.decoder(input_seq=target_seq,
                                   encoder_output=encoded_seq,
                                   input_padding_mask=bool_padding_mask)
        return decoded_seq
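
# Hypothetical instantiation, for illustration only; the real hyperparameters
# are whatever the checkpoint loaded below was trained with:
#   model = VisionEncoderDecoder(image_size=128, channels_in=3,
#                                num_emb=30522,  # distilbert-base-uncased vocab
#                                patch_size=16, hidden_size=128,
#                                num_layers=(3, 3), num_heads=4)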

# Load the trained captioning model (a pickled VisionEncoderDecoder) and the
# matching tokenizer; map_location keeps the load working on CPU-only hosts.
model = torch.load("caption_model.pth", map_location=device, weights_only=False)
model.eval()
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
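
# The hard-coded token ids used below come from this tokenizer's special
# tokens: 101 is [CLS] (used here as start-of-sequence) and 102 is [SEP]
# (used here as end-of-caption).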

def pred_transformer_caption(test_img):
    # Start the prompt with the Start-Of-Sequence token to signal the network
    # to begin generating the caption
    sos_token = 101 * torch.ones(1, 1).long()

    # Set the temperature for sampling during generation
    temp = 0.5

    log_tokens = [sos_token]
    model.eval()

    with torch.no_grad():
        # Encode the input image (mixed precision on GPU only)
        with torch.autocast(device_type=device.type, enabled=(device.type == "cuda")):
            image_embedding = model.encoder(test_img.to(device))

        # Generate the caption tokens, up to a maximum of 50
        for i in range(50):
            input_tokens = torch.cat(log_tokens, 1)

            # Decode the tokens generated so far into next-token logits
            data_pred = model.decoder(input_tokens.to(device), image_embedding)

            # Sample the next token from the temperature-scaled distribution
            dist = Categorical(logits=data_pred[:, -1] / temp)
            next_tokens = dist.sample().reshape(1, 1)

            # Append the sampled token to the sequence
            log_tokens.append(next_tokens.cpu())

            # Stop once the End-Of-Caption token is predicted
            if next_tokens.item() == 102:
                break

    # Convert the list of token indices to a tensor
    pred_tokens = torch.cat(log_tokens, 1)

    # Convert the token indices back to a string, dropping special tokens
    pred_text = tokenizer.decode(pred_tokens[0], skip_special_tokens=True)

    # Return the predicted caption
    return pred_text
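
# Minimal usage sketch (assumes the checkpoint was trained on 128x128 RGB
# inputs, matching the preprocessing in the dashboard below):
#   img = torch.rand(1, 3, 128, 128)
#   caption = pred_transformer_caption(img)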

## Dashboard

st.title("Caption App")

uploaded_file = st.file_uploader(label="Upload the funny pic :)", type=["png", "jpg", "jpeg"])
if uploaded_file:
    # Load the image, force three channels, and resize to the model's
    # 128x128 input resolution
    image = Image.open(uploaded_file).convert("RGB").resize((128, 128))

    # Min-max normalize the pixel values to [0, 1]
    image = np.array(image).astype("float32")
    image = (image - image.min()) / (image.max() - image.min())

    display_image = image

    # HWC -> NCHW: the encoder expects a (batch, channels, height, width) tensor
    image_tensor = torch.from_numpy(image).permute(2, 0, 1).unsqueeze(0).to(device)

    caption = str(pred_transformer_caption(image_tensor))
    st.image(image=display_image, caption=caption)
# st.write(caption)