File size: 6,371 Bytes
114e0ca 30dbc69 4d0c7ca 30dbc69 4d0c7ca 30dbc69 4d0c7ca 30dbc69 4d0c7ca 30dbc69 4d0c7ca 30dbc69 4d0c7ca 30dbc69 114e0ca 30dbc69 114e0ca 4d0c7ca 114e0ca c8b4cd2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 |
import torch
import torch.nn as nn
from torchvision import models
import pickle
from pathlib import Path
import sys
import logging
# Module-level logger, named after this module via __name__.
# Nothing is configured on this line; it only obtains the logger object.
logger = logging.getLogger(__name__)
class Vocabulary:
    """Two-way mapping between words and integer indices.

    A new instance starts with four special tokens registered in this order:
    ``<PAD>`` (0), ``<SOS>`` (1), ``<EOS>`` (2) and ``<UNK>`` (3).

    Args:
        freq_threshold: Stored on the instance as ``freq_threshold``; none of
            the methods in this class read it.
    """

    def __init__(self, freq_threshold=5):
        self.freq_threshold = freq_threshold
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0  # next index to hand out

        # Special tokens
        self.pad_token = "<PAD>"
        self.start_token = "<SOS>"
        self.end_token = "<EOS>"
        self.unk_token = "<UNK>"

        # Register the special tokens first so they receive the lowest indices.
        specials = (
            self.pad_token,
            self.start_token,
            self.end_token,
            self.unk_token,
        )
        for special in specials:
            self.add_word(special)

    def add_word(self, word):
        """Register ``word`` under the next free index; known words are ignored."""
        if word in self.word2idx:
            return
        position = self.idx
        self.word2idx[word] = position
        self.idx2word[position] = word
        self.idx = position + 1

    def __len__(self):
        """Return the number of registered words, special tokens included."""
        return len(self.word2idx)

    def __call__(self, word):
        """Return the index of ``word``, or the ``<UNK>`` index if it is unknown."""
        key = word if word in self.word2idx else self.unk_token
        return self.word2idx[key]

    def decode(self, indices):
        """Map indices back to words, skipping any index that is not registered."""
        words = []
        for index in indices:
            if index in self.idx2word:
                words.append(self.idx2word[index])
        return words
# Expose the Vocabulary class as an attribute of the __main__ module.
# NOTE(review): presumably this lets pickle resolve a vocabulary object that
# was saved from a script where Vocabulary was defined in __main__ -- confirm
# against how vocab.pkl was produced.
import __main__
setattr(__main__, "Vocabulary", Vocabulary)
class EncoderCNN(nn.Module):
    """ResNet-50 trunk followed by a linear projection and batch normalisation.

    Args:
        embed_size: Width of the feature vector produced by the projection.
    """

    def __init__(self, embed_size):
        super().__init__()
        # resnet50() with no arguments builds the network without loading
        # pretrained weights, matching the former ``pretrained=False`` call
        # while avoiding that deprecated keyword on newer torchvision.
        resnet = models.resnet50()
        # Keep every child module except the last one (the classifier layer).
        modules = list(resnet.children())[:-1]
        self.resnet = nn.Sequential(*modules)
        # Project from the width the dropped classifier consumed to embed_size.
        self.fc = nn.Linear(resnet.fc.in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)

    def forward(self, images):
        """Return one ``embed_size``-wide feature row per input image.

        Args:
            images: Batch passed straight to the ResNet trunk.
                NOTE(review): assumed to be (batch, 3, H, W) -- confirm with callers.
        """
        features = self.resnet(images)
        # Collapse everything after the batch dimension; unlike ``.view`` this
        # does not require the trunk output to be contiguous.
        features = torch.flatten(features, 1)
        features = self.fc(features)
        features = self.bn(features)
        return features
class DecoderLSTM(nn.Module):
    """LSTM caption decoder that scores vocabulary entries at every time step.

    Args:
        embed_size: Width of the token embeddings and of the LSTM input.
        hidden_size: Width of the LSTM hidden state.
        vocab_size: Number of embedding rows and of output scores per step.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability. It is only handed to the LSTM when
            ``num_layers > 1``; otherwise the LSTM gets 0.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, dropout=0.5):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(
            embed_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0,
        )
        # NOTE: this layer is created but neither forward() nor sample() in
        # this class applies it. It is kept so the attribute still exists.
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, features, captions):
        """Score every position of ``captions`` with the features as first input.

        Args:
            features: Per-sample feature rows; ``unsqueeze(1)`` turns each into
                a single leading time step.
                NOTE(review): assumed to be (batch, embed_size) -- confirm.
            captions: Token indices fed to the embedding layer.
                NOTE(review): assumed to be (batch, seq_len) -- confirm.

        Returns:
            Vocabulary scores for each step of the concatenated sequence
            (the feature step followed by the caption steps).
        """
        embeddings = self.embed(captions)
        # Prepend the features as the first step of the input sequence.
        embeddings = torch.cat((features.unsqueeze(1), embeddings), dim=1)
        hiddens, _ = self.lstm(embeddings)
        outputs = self.fc(hiddens)
        return outputs

    def sample(self, features, max_length=50):
        """Greedily decode ``max_length`` token indices per sample.

        Decoding always runs for ``max_length`` steps; this method does not
        stop early on any particular token.

        Args:
            features: Per-sample feature rows used as the first LSTM input.
            max_length: Number of tokens to generate. A value of zero or less
                yields an empty ``(batch, 0)`` tensor instead of raising.

        Returns:
            Integer tensor with one row of predicted indices per sample.
        """
        if max_length <= 0:
            # torch.stack rejects an empty list, so return the empty result directly.
            return torch.empty(
                (features.size(0), 0), dtype=torch.long, device=features.device
            )

        captions = []
        states = None
        inputs = features.unsqueeze(1)
        # The result holds argmax indices only, so no autograd graph is needed.
        with torch.no_grad():
            for _ in range(max_length):
                hiddens, states = self.lstm(inputs, states)
                outputs = self.fc(hiddens.squeeze(1))
                predicted = outputs.argmax(dim=1)
                captions.append(predicted)
                # Feed the chosen token back in as the next input step.
                inputs = self.embed(predicted).unsqueeze(1)
        return torch.stack(captions, dim=1)
class ImageCaptioningModel(nn.Module):
    """Pairs an ``EncoderCNN`` with a ``DecoderLSTM`` to caption images.

    Args:
        embed_size: Forwarded to both the encoder and the decoder.
        hidden_size: Forwarded to the decoder.
        vocab_size: Forwarded to the decoder.
        num_layers: Forwarded to the decoder.
        dropout: Forwarded to the decoder.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, dropout=0.5):
        super().__init__()
        self.encoder = EncoderCNN(embed_size)
        self.decoder = DecoderLSTM(
            embed_size, hidden_size, vocab_size, num_layers, dropout
        )

    def forward(self, images, captions):
        """Encode ``images`` and return the decoder output for ``captions``."""
        return self.decoder(self.encoder(images), captions)

    def generate_caption(self, images, max_length=50):
        """Encode ``images`` and return the decoder's sampled token indices."""
        image_features = self.encoder(images)
        return self.decoder.sample(image_features, max_length)
class ActionRecognitionModel(nn.Module):
    """ResNet-50 whose final layer is replaced by a two-layer classifier head.

    Args:
        num_classes: Number of outputs of the last linear layer.
        dropout: Probability used by both dropout layers in the head.
    """

    def __init__(self, num_classes, dropout=0.5):
        super().__init__()
        # resnet50() with no arguments builds the network without loading
        # pretrained weights, matching the former ``pretrained=False`` call
        # while avoiding that deprecated keyword on newer torchvision.
        self.backbone = models.resnet50()
        num_features = self.backbone.fc.in_features
        # Layer order is kept exactly as before so that the positional
        # sub-module names inside the Sequential (and therefore saved
        # checkpoint keys) continue to match.
        self.backbone.fc = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(num_features, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Dropout(dropout),
            nn.Linear(512, num_classes),
        )

    def forward(self, x):
        """Run ``x`` through the backbone and return its ``num_classes`` outputs."""
        return self.backbone(x)
def load_caption_model(device, model_dir=None):
    """Load the image captioning model together with its vocabulary.

    Args:
        device: Used as ``map_location`` when reading the weights and as the
            target of ``model.to``.
        model_dir: Directory containing ``caption_model_config.pkl``,
            ``vocab.pkl`` and ``caption_model_final.pth``. When ``None``, the
            ``models`` directory next to this file is used.

    Returns:
        A ``(model, vocab)`` tuple. The model has been moved to ``device`` and
        switched to eval mode.

    Raises:
        Exception: Any error raised while reading the vocabulary is logged and
            re-raised unchanged. Errors from the other files propagate as-is.
    """
    if model_dir is None:
        model_dir = Path(__file__).parent / 'models'
    else:
        model_dir = Path(model_dir)

    # SECURITY: pickle.load can execute arbitrary code from the file it reads.
    # Only point model_dir at files from a trusted source.
    # Load configuration
    with open(model_dir / 'caption_model_config.pkl', 'rb') as f:
        config = pickle.load(f)

    # Load vocabulary
    try:
        with open(model_dir / 'vocab.pkl', 'rb') as f:
            vocab = pickle.load(f)
        logger.info("Vocabulary loaded successfully. Size: %s", len(vocab))
    except Exception as e:
        logger.error("Failed to load vocabulary: %s", e)
        # Bare raise re-raises the active exception without adding a frame.
        raise

    # Create model
    model = ImageCaptioningModel(
        embed_size=config['embed_size'],
        hidden_size=config['hidden_size'],
        vocab_size=config['vocab_size'],
        num_layers=config['num_layers'],
        dropout=config['dropout'],
    )

    # Load weights
    state_dict = torch.load(
        model_dir / 'caption_model_final.pth', map_location=device
    )
    model.load_state_dict(state_dict)
    model = model.to(device)
    model.eval()
    return model, vocab
def load_action_model(device, model_dir=None):
    """Build the action recognition model and restore its saved weights.

    Args:
        device: Used as ``map_location`` when reading the weights and as the
            target of ``.to``.
        model_dir: Directory containing ``action_model_config.pkl`` and
            ``action_model_final.pth``. When ``None``, the ``models``
            directory next to this file is used.

    Returns:
        The model, moved to ``device`` and switched to eval mode.
    """
    base_dir = Path(__file__).parent / 'models' if model_dir is None else Path(model_dir)

    # Read the pickled settings that describe how to construct the model.
    with open(base_dir / 'action_model_config.pkl', 'rb') as config_file:
        settings = pickle.load(config_file)

    net = ActionRecognitionModel(
        num_classes=settings['num_classes'],
        dropout=settings['dropout'],
    )

    # Restore the saved parameters, mapping tensors onto the requested device.
    weights = torch.load(base_dir / 'action_model_final.pth', map_location=device)
    net.load_state_dict(weights)

    net = net.to(device)
    net.eval()
    return net