Spaces:

Tin113
/

pretrained

Sleeping

App Files Files Community

Tin113 committed on Apr 1, 2025

Commit

84165ce

verified ·

1 Parent(s): 2fccb7e

Update app.py

Browse files

Files changed (1) hide show

app.py +115 -125

app.py CHANGED Viewed

@@ -2,37 +2,28 @@ import gradio as gr
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-import json
-from torchvision import transforms
 from PIL import Image
-import numpy as np
-# ============================================================================
-#  1. ĐỊNH NGHĨA LẠI CÁC CLASS MODEL (QUAN TRỌNG!)
-#     (Copy từ code huấn luyện gốc, ĐÃ SỬA Attention theo lỗi trước)
-# ============================================================================
 # -----------------------
 # Attention Module
 # -----------------------
-class Attention(nn.Module):
     def __init__(self, cnn_dim, lstm_dim, attention_dim):
-        super(Attention, self).__init__()
         self.cnn_proj = nn.Linear(cnn_dim, attention_dim)
         self.lstm_proj = nn.Linear(lstm_dim, attention_dim)
         self.attn = nn.Linear(attention_dim, 1)
     def forward(self, cnn_features, lstm_features):
-        # cnn_features: (batch, 1, cnn_dim)
-        # lstm_features: (batch, seq_len, lstm_dim)
-        cnn_proj = self.cnn_proj(cnn_features)  # (batch, 1, attention_dim)
-        lstm_proj = self.lstm_proj(lstm_features)  # (batch, seq_len, attention_dim)
-        combined = torch.tanh(cnn_proj + lstm_proj)  # (batch, seq_len, attention_dim)
-        attn_weights = F.softmax(self.attn(combined), dim=1)  # (batch, seq_len, 1)
-        attended_features = (attn_weights * lstm_features).sum(dim=1)  # (batch, lstm_dim)
         return attended_features
-# -----------------------
-# VQA Model
-# -----------------------
 # -----------------------
 # Pre-trained VQA Model
 # -----------------------
@@ -42,11 +33,11 @@ class PretrainedVQAModel(nn.Module):
         self.vocab_size = vocab_size
         self.max_seq_len = max_seq_len
-        # Pre-trained CNN Encoder (ResNet50)
         resnet = models.resnet18(pretrained=True)
-        self.cnn = nn.Sequential(*list(resnet.children())[:-1])  # Remove the final FC layer
-        self.cnn_output_dim = 512
         # Text Embedding
         self.embedding = nn.Embedding(vocab_size, embedding_dim)
@@ -64,146 +55,145 @@ class PretrainedVQAModel(nn.Module):
     def forward(self, image, question, answer_input):
         # CNN Encoder
-        cnn_features = self.cnn(image)  # (batch, cnn_output_dim, 1, 1)
-        cnn_features = cnn_features.view(cnn_features.size(0), -1)  # (batch, cnn_output_dim)
         # Question Encoder
-        q_embed = self.embedding(question)  # (batch, q_seq_len, embedding_dim)
-        q_output, _ = self.question_lstm(q_embed)  # (batch, q_seq_len, lstm_units)
         # Attention
-        q_attended = self.attention(cnn_features.unsqueeze(1), q_output)  # (batch, lstm_units)
-        q_last = q_output[:, -1, :]  # (batch, lstm_units)
         # Context Vector
-        context = torch.cat([q_attended, q_last], dim=-1)  # (batch, 2*lstm_units)
         # Decoder with Teacher Forcing
-        answer_embed = self.embedding(answer_input)  # (batch, ans_seq_len, embedding_dim)
-        context_repeated = context.unsqueeze(1).repeat(1, answer_input.size(1), 1)  # (batch, ans_seq_len, 2*lstm_units)
-        decoder_in = torch.cat([answer_embed, context_repeated], dim=-1)  # (batch, ans_seq_len, embedding_dim + 2*lstm_units)
-        decoder_in = self.decoder_input_proj(decoder_in)  # (batch, ans_seq_len, lstm_units)
-        decoder_output, _ = self.decoder_lstm(decoder_in)  # (batch, ans_seq_len, lstm_units)
-        output = self.fc_out(self.dropout(decoder_output))  # (batch, ans_seq_len, vocab_size)
         return output
-    def predict(self, image, question, word_to_idx, idx_to_word, device='cuda' if torch.cuda.is_available() else 'cpu'):
         self.eval()
-        self.to(device)
-        if image.dim() == 3:
-            image = image.unsqueeze(0)
-        image = image.to(device)
-        question_seq = [word_to_idx.get(word, word_to_idx['<PAD>']) for word in question.lower().split()]
-        question = torch.tensor(question_seq, dtype=torch.long).unsqueeze(0).to(device)
-        # Encode image and question
-        cnn_features = self.cnn(image).view(-1, self.cnn_output_dim)
-        q_embed = self.embedding(question)
-        q_output, _ = self.question_lstm(q_embed)
-        q_attended = self.attention(cnn_features.unsqueeze(1), q_output)
-        q_last = q_output[:, -1, :]
-        context = torch.cat([q_attended, q_last], dim=-1)
-        # Generate answer
-        answer_input = torch.tensor([[word_to_idx['<START>']]], dtype=torch.long).to(device)
-        answer_words = []
-        hidden = None
-        for _ in range(self.max_seq_len):
-            answer_embed = self.embedding(answer_input)
-            context_repeated = context.unsqueeze(1).repeat(1, answer_input.size(1), 1)
-            decoder_in = torch.cat([answer_embed, context_repeated], dim=-1)
-            decoder_in = self.decoder_input_proj(decoder_in)
-            decoder_output, hidden = self.decoder_lstm(decoder_in, hidden)
-            output = self.fc_out(decoder_output[:, -1, :])
-            next_word_idx = output.argmax(dim=-1).item()
-            if next_word_idx == word_to_idx['<END>']:
-                break
-            answer_words.append(idx_to_word[next_word_idx])
-            answer_input = torch.tensor([[next_word_idx]], dtype=torch.long).to(device)
-        return ' '.join(answer_words)
-def load_model(model_path, word_to_idx_path, idx_to_word_path, device='cpu'):
     try:
-        # Load từ điển từ file .pth
-        word_to_idx = torch.load(word_to_idx_path, map_location=device)
-        idx_to_word = torch.load(idx_to_word_path, map_location=device)
-        # Khởi tạo mô hình
         model = PretrainedVQAModel(vocab_size=len(word_to_idx))
-        model.load_state_dict(torch.load(model_path, map_location=device))
         model.to(device)
         model.eval()
         return model, word_to_idx, idx_to_word
     except Exception as e:
         print(f"Error loading model: {e}")
         raise
-def predict(image, question, model, word_to_idx, idx_to_word, device='cpu'):
-    try:
-        # Chuyển đổi ảnh
-        image = transform(image).unsqueeze(0).to(device)
-        # Dự đoán
-        answer = model.predict(image, question, word_to_idx, idx_to_word, device)
-        return answer
-    except Exception as e:
-        print(f"Prediction error: {e}")
-        return "Error generating answer"
-# Tạo transform cho ảnh
-transform = transforms.Compose([
-    transforms.Resize((224, 224)),
-    transforms.ToTensor(),
-    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
-])
 def create_interface():
-    device = 'cpu'  # Luôn dùng CPU trên Spaces
     try:
-        model, word_to_idx, idx_to_word = load_model(
-            "vqa_model.pth",
-            "word_to_idx.pth",
-            "idx_to_word.pth",
-            device
-        )
         def predict(image, question):
             try:
-                transform = transforms.Compose([
-                    transforms.Resize((224, 224)),
-                    transforms.ToTensor(),
-                    transforms.Normalize(mean=[0.485, 0.456, 0.406],
-                                      std=[0.229, 0.224, 0.225])
-                ])
-                image = transform(image).unsqueeze(0).to(device)
-                answer = model.predict(image, question, word_to_idx, idx_to_word, device)
                 return answer
             except Exception as e:
-                return f"Error: {str(e)}"
-        iface = gr.Interface(
             fn=predict,
             inputs=[
                 gr.Image(type="pil", label="Upload Image"),
-                gr.Textbox(label="Question")
             ],
-            outputs=gr.Textbox(label="Answer"),
-            title="VQA train từ đầu",
-            description="Tải ảnh về động vật lên và đặt câu hỏi liên quan (CHỈ HỖ TRỢ TIẾNG ANH)"
         )
-        return iface
     except Exception as e:
-        return gr.Interface(lambda: "Model failed to load", None, "text")
 if __name__ == "__main__":
     iface = create_interface()
     iface.launch(
         server_name="0.0.0.0",
-        server_port=7860
-    )

 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from torchvision import transforms, models
 from PIL import Image
+import os
 # -----------------------
 # Attention Module
 # -----------------------
class Attention_PT(nn.Module):
    """Additive attention that pools LSTM outputs, conditioned on CNN features.

    Both inputs are linearly projected to ``attention_dim``, summed, squashed
    with ``tanh`` and scored by a single linear unit. The scores are
    softmax-normalised over dimension 1 and used as weights for a sum of
    ``lstm_features`` over that same dimension.
    """

    def __init__(self, cnn_dim, lstm_dim, attention_dim):
        super().__init__()
        self.cnn_proj = nn.Linear(cnn_dim, attention_dim)
        self.lstm_proj = nn.Linear(lstm_dim, attention_dim)
        self.attn = nn.Linear(attention_dim, 1)

    def forward(self, cnn_features, lstm_features):
        """Return the attention-weighted sum of ``lstm_features`` over dim 1.

        The two projections are added directly, so their shapes must be
        broadcast-compatible (callers in this file pass ``cnn_features`` with
        a singleton dimension 1 via ``unsqueeze(1)``).
        """
        joint = torch.tanh(self.cnn_proj(cnn_features) + self.lstm_proj(lstm_features))
        scores = self.attn(joint)
        weights = torch.softmax(scores, dim=1)
        return torch.sum(weights * lstm_features, dim=1)
 # -----------------------
 # Pre-trained VQA Model
 # -----------------------
         self.vocab_size = vocab_size
         self.max_seq_len = max_seq_len
+        # Pre-trained CNN Encoder (ResNet18)
         resnet = models.resnet18(pretrained=True)
+        self.cnn = nn.Sequential(*list(resnet.children())[:-1])  # Remove final FC layer
+        self.cnn_output_dim = 512  # Output dim for ResNet18 features
         # Text Embedding
         self.embedding = nn.Embedding(vocab_size, embedding_dim)
     def forward(self, image, question, answer_input):
         # CNN Encoder
+        cnn_features = self.cnn(image)
+        cnn_features = cnn_features.view(cnn_features.size(0), -1)
         # Question Encoder
+        q_embed = self.embedding(question)
+        q_output, _ = self.question_lstm(q_embed)
         # Attention
+        q_attended = self.attention(cnn_features.unsqueeze(1), q_output)
+        q_last = q_output[:, -1, :]
         # Context Vector
+        context = torch.cat([q_attended, q_last], dim=-1)
         # Decoder with Teacher Forcing
+        answer_embed = self.embedding(answer_input)
+        context_repeated = context.unsqueeze(1).repeat(1, answer_input.size(1), 1)
+        decoder_in = torch.cat([answer_embed, context_repeated], dim=-1)
+        decoder_in = self.decoder_input_proj(decoder_in)
+        decoder_output, _ = self.decoder_lstm(decoder_in)
+        output = self.fc_out(self.dropout(decoder_output))
         return output
+    def predict(self, image, question, word_to_idx, idx_to_word, device='cpu'):
         self.eval()
+        with torch.no_grad():
+            if image.dim() == 3:
+                image = image.unsqueeze(0)
+            image = image.to(device)
+            # Process question
+            question_seq = [word_to_idx.get(word, word_to_idx['<PAD>'])
+                          for word in question.lower().split()]
+            question = torch.tensor(question_seq, dtype=torch.long).unsqueeze(0).to(device)
+            # Encode image and question
+            cnn_features = self.cnn(image).view(-1, self.cnn_output_dim)
+            q_embed = self.embedding(question)
+            q_output, _ = self.question_lstm(q_embed)
+            q_attended = self.attention(cnn_features.unsqueeze(1), q_output)
+            q_last = q_output[:, -1, :]
+            context = torch.cat([q_attended, q_last], dim=-1)
+            # Generate answer
+            answer_input = torch.tensor([[word_to_idx['<START>']]], dtype=torch.long).to(device)
+            answer_words = []
+            for _ in range(self.max_seq_len):
+                answer_embed = self.embedding(answer_input)
+                context_repeated = context.unsqueeze(1).repeat(1, answer_input.size(1), 1)
+                decoder_in = torch.cat([answer_embed, context_repeated], dim=-1)
+                decoder_in = self.decoder_input_proj(decoder_in)
+                decoder_output, _ = self.decoder_lstm(decoder_in)
+                output = self.fc_out(decoder_output[:, -1, :])
+                next_word_idx = output.argmax(dim=-1).item()
+                if next_word_idx == word_to_idx['<END>']:
+                    break
+                answer_words.append(idx_to_word[str(next_word_idx)])
+                answer_input = torch.tensor([[next_word_idx]], dtype=torch.long).to(device)
+            return ' '.join(answer_words)
+# -----------------------
+# Load Model Function
+# -----------------------
def load_model():
    """Load the vocabularies and the trained VQA model onto the CPU.

    Reads ``word_to_idx.pth``, ``idx_to_word.pth`` and ``vqa_model.pth`` from
    the working directory, sizes the model from the vocabulary and puts it in
    eval mode.

    Returns:
        A ``(model, word_to_idx, idx_to_word)`` tuple.

    Raises:
        Exception: Whatever loading or construction raised, re-raised
            unchanged after the failure has been printed.
    """
    target = 'cpu'
    try:
        # Load dictionaries
        vocab = torch.load("word_to_idx.pth", map_location=target)
        reverse_vocab = torch.load("idx_to_word.pth", map_location=target)

        # Initialize model, then restore the trained weights
        net = PretrainedVQAModel(vocab_size=len(vocab))
        weights = torch.load("vqa_model.pth", map_location=target)
        net.load_state_dict(weights)
        net.to(target)
        net.eval()
    except Exception as err:
        print(f"Error loading model: {err}")
        raise
    return net, vocab, reverse_vocab
+# -----------------------
+# Gradio Interface
+# -----------------------
 def create_interface():
     try:
+        model, word_to_idx, idx_to_word = load_model()
+        # Image preprocessing
+        def preprocess_image(image):
+            transform = transforms.Compose([
+                transforms.Resize((224, 224)),
+                transforms.ToTensor(),
+                transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                                   std=[0.229, 0.224, 0.225])
+            ])
+            return transform(image).unsqueeze(0)
         def predict(image, question):
             try:
+                image_tensor = preprocess_image(image)
+                answer = model.predict(image_tensor, question, word_to_idx, idx_to_word, 'cpu')
                 return answer
             except Exception as e:
+                return f"Error generating answer: {str(e)}"
+        # Create interface
+        return gr.Interface(
             fn=predict,
             inputs=[
                 gr.Image(type="pil", label="Upload Image"),
+                gr.Textbox(label="Your Question", placeholder="Ask something about the image...")
             ],
+            outputs=gr.Textbox(label="Generated Answer"),
+            title="Visual Question Answering with ResNet18",
+            description="Upload an image and ask natural language questions about its content",
+            allow_flagging="never"
         )
     except Exception as e:
+        return gr.Interface(
+            lambda: f"Failed to load model: {str(e)}",
+            inputs=None,
+            outputs="text",
+            title="Error"
+        )
+# -----------------------
+# Main Execution
+# -----------------------
 if __name__ == "__main__":
+    # Create and launch interface
     iface = create_interface()
     iface.launch(
         server_name="0.0.0.0",
+        server_port=7860,
+        enable_queue=True
+    )