Spaces:

Tin113
/

vqa_project

Sleeping

App Files Files Community

Tin113 commited on Mar 29, 2025

Commit

76b9d33

verified ·

1 Parent(s): aef5bf1

Update app.py

Browse files

Files changed (1) hide show

app.py +106 -26

app.py CHANGED Viewed

@@ -9,57 +9,137 @@ import os
 import sys
 class Attention(nn.Module):
-    # ... (Dán code class Attention của bạn vào đây) ...
     def __init__(self, cnn_dim, lstm_dim, attention_dim):
         super(Attention, self).__init__()
-        self.cnn_proj = nn.Linear(cnn_dim, attention_dim)
-        self.lstm_proj = nn.Linear(lstm_dim, attention_dim)
         self.attn = nn.Linear(attention_dim, 1)
-        # Thêm các lớp kích hoạt nếu có trong code gốc của bạn
-        self.tanh = nn.Tanh()
-        self.softmax = nn.Softmax(dim=1)
     def forward(self, cnn_features, lstm_features):
-        cnn_proj = self.cnn_proj(cnn_features)
-        lstm_proj = self.lstm_proj(lstm_features)
-        # Đảm bảo broadcasting hoạt động đúng
-        combined = self.tanh(cnn_proj + lstm_proj) # cnn_proj sẽ được broadcast
-        attn_logits = self.attn(combined)
-        attn_weights = self.softmax(attn_logits)
-        attended_features = (attn_weights * lstm_features).sum(dim=1)
         return attended_features
 class VQAModel(nn.Module):
-    # ... (Dán code class VQAModel gốc của bạn vào đây) ...
-    # Đảm bảo các tham số mặc định khớp với lúc bạn lưu model
     def __init__(self, vocab_size, embedding_dim=256, lstm_units=256, cnn_output_dim=512, attention_dim=256, max_seq_len=30):
         super(VQAModel, self).__init__()
         self.vocab_size = vocab_size
         self.max_seq_len = max_seq_len
-        # CNN Encoder (giống hệt lúc train)
         self.cnn = nn.Sequential(
-            nn.Conv2d(3, 32, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
-            nn.Conv2d(32, 64, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
-            nn.Conv2d(64, 128, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
-            nn.Conv2d(128, cnn_output_dim, kernel_size=3, padding=1), nn.ReLU(),
             nn.AdaptiveAvgPool2d((1, 1))
         )
         self.embedding = nn.Embedding(vocab_size, embedding_dim)
-        # Có caption_lstm trong định nghĩa model gốc
         self.caption_lstm = nn.LSTM(embedding_dim, lstm_units, batch_first=True)
         self.question_lstm = nn.LSTM(embedding_dim, lstm_units, batch_first=True)
         self.attention = Attention(cnn_output_dim, lstm_units, attention_dim)
-        # Kích thước input decoder dựa trên context gốc (có cả caption)
         self.decoder_input_proj = nn.Linear(embedding_dim + 3 * lstm_units, lstm_units)
         self.decoder_lstm = nn.LSTM(lstm_units, lstm_units, batch_first=True)
         self.fc_out = nn.Linear(lstm_units, vocab_size)
-        self.dropout = nn.Dropout(0.5) # Tự động tắt khi model.eval()
-    # Hàm forward không thực sự được gọi trong predict_gradio theo cách làm này
-    # Nhưng nó cần tồn tại để model load đúng cấu trúc
     def forward(self, image, caption, question, answer_input):
-        raise NotImplementedError("Use the specific prediction logic for Gradio.")
 # ----------------------------------------------------------------------------
 # ============================================================================

 import sys
 class Attention(nn.Module):
     def __init__(self, cnn_dim, lstm_dim, attention_dim):
         super(Attention, self).__init__()
+        self.cnn = nn.Linear(cnn_dim, attention_dim)
+        self.lstm = nn.Linear(lstm_dim, attention_dim)
         self.attn = nn.Linear(attention_dim, 1)
     def forward(self, cnn_features, lstm_features):
+        # cnn_features: (batch, 1, cnn_dim)
+        # lstm_features: (batch, seq_len, lstm_dim)
+        cnn = self.cnn(cnn_features)  # (batch, 1, attention_dim)
+        lstm = self.lstm(lstm_features)  # (batch, seq_len, attention_dim)
+        combined = torch.tanh(cnn + lstm)  # (batch, seq_len, attention_dim)
+        attn_weights = F.softmax(self.attn(combined), dim=1)  # (batch, seq_len, 1)
+        attended_features = (attn_weights * lstm_features).sum(dim=1)  # (batch, lstm_dim)
         return attended_features
+# -----------------------
+# VQA Model
+# -----------------------
 class VQAModel(nn.Module):
     def __init__(self, vocab_size, embedding_dim=256, lstm_units=256, cnn_output_dim=512, attention_dim=256, max_seq_len=30):
         super(VQAModel, self).__init__()
         self.vocab_size = vocab_size
         self.max_seq_len = max_seq_len
+        # CNN Encoder: Trích xuất đặc trưng ảnh
         self.cnn = nn.Sequential(
+            nn.Conv2d(3, 32, kernel_size=3, padding=1),
+            nn.ReLU(),
+            nn.MaxPool2d(2),
+            nn.Conv2d(32, 64, kernel_size=3, padding=1),
+            nn.ReLU(),
+            nn.MaxPool2d(2),
+            nn.Conv2d(64, 128, kernel_size=3, padding=1),
+            nn.ReLU(),
+            nn.MaxPool2d(2),
+            nn.Conv2d(128, cnn_output_dim, kernel_size=3, padding=1),
+            nn.ReLU(),
             nn.AdaptiveAvgPool2d((1, 1))
         )
+        # Text Embedding
         self.embedding = nn.Embedding(vocab_size, embedding_dim)
+        # LSTM Encoders cho caption và question
         self.caption_lstm = nn.LSTM(embedding_dim, lstm_units, batch_first=True)
         self.question_lstm = nn.LSTM(embedding_dim, lstm_units, batch_first=True)
+        # Attention cho từng kênh
         self.attention = Attention(cnn_output_dim, lstm_units, attention_dim)
+        # Decoder: sử dụng teacher forcing
+        # Context vector: kết hợp của attention từ caption, attention từ question và trạng thái cuối của question
+        # Kích thước context = lstm_units + lstm_units + lstm_units = 3 * lstm_units (ví dụ 768 nếu lstm_units=256)
+        # Kết hợp với embedding của câu trả lời (embedding_dim) => đầu vào của decoder = embedding_dim + 3*lstm_units
         self.decoder_input_proj = nn.Linear(embedding_dim + 3 * lstm_units, lstm_units)
         self.decoder_lstm = nn.LSTM(lstm_units, lstm_units, batch_first=True)
         self.fc_out = nn.Linear(lstm_units, vocab_size)
+        self.dropout = nn.Dropout(0.5)
     def forward(self, image, caption, question, answer_input):
+        # --- CNN Encoder ---
+        cnn_features = self.cnn(image)  # (batch, cnn_output_dim, 1, 1)
+        cnn_features = cnn_features.view(cnn_features.size(0), -1)  # (batch, cnn_output_dim)
+        # --- Text Encoders ---
+        cap_embed = self.embedding(caption)  # (batch, cap_seq_len, embedding_dim)
+        cap_output, _ = self.caption_lstm(cap_embed)  # (batch, cap_seq_len, lstm_units)
+        q_embed = self.embedding(question)  # (batch, q_seq_len, embedding_dim)
+        q_output, _ = self.question_lstm(q_embed)  # (batch, q_seq_len, lstm_units)
+        # --- Attention ---
+        cap_attended = self.attention(cnn_features.unsqueeze(1), cap_output)  # (batch, lstm_units)
+        q_attended = self.attention(cnn_features.unsqueeze(1), q_output)      # (batch, lstm_units)
+        q_last = q_output[:, -1, :]  # (batch, lstm_units)
+        # Context vector: (batch, 3*lstm_units)
+        context = torch.cat([cap_attended, q_attended, q_last], dim=-1)
+        # --- Decoder với Teacher Forcing ---
+        # answer_input: (batch, ans_seq_len)
+        answer_embed = self.embedding(answer_input)  # (batch, ans_seq_len, embedding_dim)
+        context_repeated = context.unsqueeze(1).repeat(1, answer_input.size(1), 1)  # (batch, ans_seq_len, 3*lstm_units)
+        decoder_in = torch.cat([answer_embed, context_repeated], dim=-1)  # (batch, ans_seq_len, embedding_dim + 3*lstm_units)
+        decoder_in = self.decoder_input_proj(decoder_in)  # (batch, ans_seq_len, lstm_units)
+        decoder_output, _ = self.decoder_lstm(decoder_in)  # (batch, ans_seq_len, lstm_units)
+        output = self.fc_out(self.dropout(decoder_output))  # (batch, ans_seq_len, vocab_size)
+        return output
+    def predict(self, image, question, word_to_idx, idx_to_word, device='cuda' if torch.cuda.is_available() else 'cpu'):
+        self.eval()
+        self.to(device)
+        image = image.unsqueeze(0).to(device)
+        question_seq = [word_to_idx.get(word, word_to_idx['<PAD>']) for word in question.lower().split()]
+        question = torch.tensor(question_seq, dtype=torch.long).unsqueeze(0).to(device)
+        # Encode image & question
+        cnn_features = self.cnn(image)
+        cnn_features = cnn_features.view(cnn_features.size(0), -1)
+        q_embed = self.embedding(question)
+        q_output, _ = self.question_lstm(q_embed)
+        q_attended = self.attention(cnn_features.unsqueeze(1), q_output)
+        q_last = q_output[:, -1, :]
+        # Ở predict, ta tạo context vector từ q_attended lặp lại (chỉ dùng question cho ví dụ)
+        context = torch.cat([q_attended, q_attended, q_last], dim=-1)  # (1, 3*lstm_units)
+        # Khởi tạo câu trả lời với token <START>
+        answer_input = torch.tensor([[word_to_idx['<START>']]], dtype=torch.long).to(device)
+        answer_words = []
+        hidden = None
+        for _ in range(self.max_seq_len):
+            answer_embed = self.embedding(answer_input)  # (1, seq_len, embedding_dim)
+            context_repeated = context.unsqueeze(1).repeat(1, answer_input.size(1), 1)
+            decoder_in = torch.cat([answer_embed, context_repeated], dim=-1)
+            decoder_in = self.decoder_input_proj(decoder_in)
+            decoder_output, hidden = self.decoder_lstm(decoder_in, hidden)
+            output = self.fc_out(decoder_output[:, -1, :])
+            next_word_idx = output.argmax(dim=-1).item()
+            if next_word_idx == word_to_idx['<END>']:
+                break
+            answer_words.append(idx_to_word[next_word_idx])
+            answer_input = torch.cat([answer_input, torch.tensor([[next_word_idx]], dtype=torch.long).to(device)], dim=1)
+        return ' '.join(answer_words)
 # ----------------------------------------------------------------------------
 # ============================================================================