Spaces:

Tin113
/

vqa_project

Sleeping

App Files Files Community

Tin113 committed on Mar 29, 2025

Commit

c5676e0

verified ·

1 Parent(s): f2c0061

Update app.py

Browse files

Files changed (1) hide show

app.py +230 -154

app.py CHANGED Viewed

@@ -1,198 +1,274 @@
 import torch
-import gradio as gr
-from PIL import Image
 from torchvision import transforms
-import torch.nn as nn
-import torchvision.models as models
-# -----------------------
-# Attention Module
-# -----------------------
 class Attention(nn.Module):
-    # Scores each LSTM time step against a projected image feature and
-    # returns the softmax-weighted sum of the LSTM outputs.
     def __init__(self, cnn_dim, lstm_dim, attention_dim):
         super(Attention, self).__init__()
-        # Project image and text features into a shared attention_dim space.
-        self.cnn = nn.Linear(cnn_dim, attention_dim)
-        self.lstm = nn.Linear(lstm_dim, attention_dim)
         self.attn = nn.Linear(attention_dim, 1)
     def forward(self, cnn_features, lstm_features):
-        # cnn_features: (batch, 1, cnn_dim)
-        # lstm_features: (batch, seq_len, lstm_dim)
-        cnn = self.cnn(cnn_features)  # (batch, 1, attention_dim)
-        lstm = self.lstm(lstm_features)  # (batch, seq_len, attention_dim)
-        combined = torch.tanh(cnn + lstm)  # (batch, seq_len, attention_dim)
-        # NOTE(review): `F` is not bound by any import line visible in this
-        # version of the file - confirm torch.nn.functional was imported.
-        attn_weights = F.softmax(self.attn(combined), dim=1)  # (batch, seq_len, 1)
-        attended_features = (attn_weights * lstm_features).sum(dim=1)  # (batch, lstm_dim)
         return attended_features
-# -----------------------
-# VQA Model
-# -----------------------
 class VQAModel(nn.Module):
-    # Image + caption + question encoder with an LSTM decoder that emits the
-    # answer token by token.
     def __init__(self, vocab_size, embedding_dim=256, lstm_units=256, cnn_output_dim=512, attention_dim=256, max_seq_len=30):
         super(VQAModel, self).__init__()
         self.vocab_size = vocab_size
         self.max_seq_len = max_seq_len
-        # CNN encoder: extracts image features
         self.cnn = nn.Sequential(
-            nn.Conv2d(3, 32, kernel_size=3, padding=1),
-            nn.ReLU(),
-            nn.MaxPool2d(2),
-            nn.Conv2d(32, 64, kernel_size=3, padding=1),
-            nn.ReLU(),
-            nn.MaxPool2d(2),
-            nn.Conv2d(64, 128, kernel_size=3, padding=1),
-            nn.ReLU(),
-            nn.MaxPool2d(2),
-            nn.Conv2d(128, cnn_output_dim, kernel_size=3, padding=1),
-            nn.ReLU(),
             nn.AdaptiveAvgPool2d((1, 1))
         )
-        # Text embedding (shared by caption, question and answer tokens)
         self.embedding = nn.Embedding(vocab_size, embedding_dim)
-        # LSTM encoders for the caption and the question
         self.caption_lstm = nn.LSTM(embedding_dim, lstm_units, batch_first=True)
         self.question_lstm = nn.LSTM(embedding_dim, lstm_units, batch_first=True)
-        # Attention for each text channel (one module, applied to both)
         self.attention = Attention(cnn_output_dim, lstm_units, attention_dim)
-        # Decoder: uses teacher forcing
-        # Context vector: concatenation of the caption attention, the question attention and the last question state
-        # Context size = lstm_units + lstm_units + lstm_units = 3 * lstm_units (e.g. 768 when lstm_units=256)
-        # Concatenated with the answer embedding (embedding_dim) => decoder input = embedding_dim + 3*lstm_units
         self.decoder_input_proj = nn.Linear(embedding_dim + 3 * lstm_units, lstm_units)
         self.decoder_lstm = nn.LSTM(lstm_units, lstm_units, batch_first=True)
         self.fc_out = nn.Linear(lstm_units, vocab_size)
-        self.dropout = nn.Dropout(0.5)
     def forward(self, image, caption, question, answer_input):
-        # --- CNN Encoder ---
-        cnn_features = self.cnn(image)  # (batch, cnn_output_dim, 1, 1)
-        cnn_features = cnn_features.view(cnn_features.size(0), -1)  # (batch, cnn_output_dim)
-        # --- Text Encoders ---
-        cap_embed = self.embedding(caption)  # (batch, cap_seq_len, embedding_dim)
-        cap_output, _ = self.caption_lstm(cap_embed)  # (batch, cap_seq_len, lstm_units)
-        q_embed = self.embedding(question)  # (batch, q_seq_len, embedding_dim)
-        q_output, _ = self.question_lstm(q_embed)  # (batch, q_seq_len, lstm_units)
-        # --- Attention ---
-        cap_attended = self.attention(cnn_features.unsqueeze(1), cap_output)  # (batch, lstm_units)
-        q_attended = self.attention(cnn_features.unsqueeze(1), q_output)      # (batch, lstm_units)
-        q_last = q_output[:, -1, :]  # (batch, lstm_units)
-        # Context vector: (batch, 3*lstm_units)
-        context = torch.cat([cap_attended, q_attended, q_last], dim=-1)
-        # --- Decoder with teacher forcing ---
-        # answer_input: (batch, ans_seq_len)
-        answer_embed = self.embedding(answer_input)  # (batch, ans_seq_len, embedding_dim)
-        context_repeated = context.unsqueeze(1).repeat(1, answer_input.size(1), 1)  # (batch, ans_seq_len, 3*lstm_units)
-        decoder_in = torch.cat([answer_embed, context_repeated], dim=-1)  # (batch, ans_seq_len, embedding_dim + 3*lstm_units)
-        decoder_in = self.decoder_input_proj(decoder_in)  # (batch, ans_seq_len, lstm_units)
-        decoder_output, _ = self.decoder_lstm(decoder_in)  # (batch, ans_seq_len, lstm_units)
-        output = self.fc_out(self.dropout(decoder_output))  # (batch, ans_seq_len, vocab_size)
-        return output
-    def predict(self, image, question, word_to_idx, idx_to_word, device='cuda' if torch.cuda.is_available() else 'cpu'):
-        # Greedy decoding of an answer for one image/question pair; stops at
-        # <END> or after max_seq_len steps and returns the words joined by spaces.
-        self.eval()
-        self.to(device)
-        image = image.unsqueeze(0).to(device)
-        # Words missing from the vocabulary fall back to the <PAD> index.
-        question_seq = [word_to_idx.get(word, word_to_idx['<PAD>']) for word in question.lower().split()]
-        question = torch.tensor(question_seq, dtype=torch.long).unsqueeze(0).to(device)
-        # Encode image & question
-        cnn_features = self.cnn(image)
-        cnn_features = cnn_features.view(cnn_features.size(0), -1)
-        q_embed = self.embedding(question)
-        q_output, _ = self.question_lstm(q_embed)
-        q_attended = self.attention(cnn_features.unsqueeze(1), q_output)
-        q_last = q_output[:, -1, :]
-        # In predict, the context vector repeats q_attended in the caption slot (only the question is used here)
-        context = torch.cat([q_attended, q_attended, q_last], dim=-1)  # (1, 3*lstm_units)
-        # Initialise the answer with the <START> token
-        answer_input = torch.tensor([[word_to_idx['<START>']]], dtype=torch.long).to(device)
-        answer_words = []
-        hidden = None
-        for _ in range(self.max_seq_len):
-            answer_embed = self.embedding(answer_input)  # (1, seq_len, embedding_dim)
-            context_repeated = context.unsqueeze(1).repeat(1, answer_input.size(1), 1)
-            decoder_in = torch.cat([answer_embed, context_repeated], dim=-1)
-            decoder_in = self.decoder_input_proj(decoder_in)
-            # NOTE(review): answer_input grows by one token per step and the
-            # whole sequence is fed again together with the carried hidden
-            # state, so earlier tokens are re-processed on top of a state that
-            # already consumed them - confirm this is intended.
-            decoder_output, hidden = self.decoder_lstm(decoder_in, hidden)
-            output = self.fc_out(decoder_output[:, -1, :])
-            next_word_idx = output.argmax(dim=-1).item()
-            if next_word_idx == word_to_idx['<END>']:
-                break
-            answer_words.append(idx_to_word[next_word_idx])
-            answer_input = torch.cat([answer_input, torch.tensor([[next_word_idx]], dtype=torch.long).to(device)], dim=1)
-        return ' '.join(answer_words)
-# Load the model from the Hugging Face Model Hub or from local files
-device = "cuda" if torch.cuda.is_available() else "cpu"
-# When using the Model Hub, download from Hugging Face (uncomment if needed)
-# from huggingface_hub import hf_hub_download
-# model_path = hf_hub_download("your-username/VQA-Fruits-Model", "vqa_model.pth")
-# word_to_idx_path = hf_hub_download("your-username/VQA-Fruits-Model", "word_to_idx.pth")
-# idx_to_word_path = hf_hub_download("your-username/VQA-Fruits-Model", "idx_to_word.pth")
-# When the files are uploaded directly into the Space, use this instead:
-model_path = "vqa_model.pth"
-word_to_idx_path = "word_to_idx.pth"
-idx_to_word_path = "idx_to_word.pth"
-# Load word_to_idx and idx_to_word
-word_to_idx = torch.load(word_to_idx_path, map_location=device)
-idx_to_word = torch.load(idx_to_word_path, map_location=device)
-# Instantiate the model
-vocab_size = len(word_to_idx)
-model = VQAModel(vocab_size)  # Warning: the VQAModel class must be defined
-model.load_state_dict(torch.load(model_path, map_location=device))
-model.to(device)
-model.eval()
-# Image preprocessing
-transform = transforms.Compose([
-    transforms.Resize((224, 224)),
-    transforms.ToTensor(),
-])
-# VQA prediction function
-def predict(image, question):
-    image = transform(image).unsqueeze(0).to(device)
-    # Unknown words map to index 0.
-    question_tokens = [word_to_idx.get(word, 0) for word in question.lower().split()]
-    question_tensor = torch.tensor(question_tokens).unsqueeze(0).to(device)
-    with torch.no_grad():
-        # NOTE(review): VQAModel.forward in this file takes
-        # (image, caption, question, answer_input); this call passes only two
-        # arguments - confirm against the model actually deployed.
-        output = model(image, question_tensor)
-        predicted_idx = torch.argmax(output, dim=1).item()
-    answer = idx_to_word[predicted_idx]
-    return answer
-# Gradio interface
 iface = gr.Interface(
-    fn=predict,
-    inputs=[gr.Image(type="pil"), gr.Textbox(label="Câu hỏi")],
-    outputs=gr.Textbox(label="Câu trả lời"),
-    title="VQA for Animal",
-    description="Tải lên ảnh con vật và nhập câu hỏi để nhận câu trả lời. (CHỈ HỖ TRỢ TIẾNG ANH)",
 )
-# Run the application
-iface.launch()

 import torch
+import torch.nn as nn
+import torch.nn.functional as F
 from torchvision import transforms
+from PIL import Image
+import json
+import gradio as gr
+import os
+import sys
+# ============================================================================
+#  1. RE-DEFINE THE MODEL CLASSES (IMPORTANT!)
+# ============================================================================
+# COPY AND PASTE THE FULL DEFINITIONS OF THE Attention AND VQAModel CLASSES
+# (the original version with captions and a hand-built CNN) FROM THE TRAINING
+# SCRIPT HERE. Without these definitions the saved weights cannot be loaded.
+# --- Example (YOU NEED TO PASTE YOUR OWN COMPLETE CODE) ---
 class Attention(nn.Module):
+    # ... (paste the code of your Attention class here) ...
+    # Scores each LSTM time step against a projected image feature and
+    # returns the softmax-weighted sum of the LSTM outputs.
+    # NOTE(review): cnn_proj / lstm_proj become state_dict keys; the previous
+    # version of this file named these layers `cnn` / `lstm` - confirm the
+    # names match the checkpoint being loaded.
     def __init__(self, cnn_dim, lstm_dim, attention_dim):
         super(Attention, self).__init__()
+        self.cnn_proj = nn.Linear(cnn_dim, attention_dim)
+        self.lstm_proj = nn.Linear(lstm_dim, attention_dim)
         self.attn = nn.Linear(attention_dim, 1)
+        # Add the activation layers if your original code had them
+        self.tanh = nn.Tanh()
+        self.softmax = nn.Softmax(dim=1)
     def forward(self, cnn_features, lstm_features):
+        cnn_proj = self.cnn_proj(cnn_features)
+        lstm_proj = self.lstm_proj(lstm_features)
+        # Make sure broadcasting behaves correctly
+        combined = self.tanh(cnn_proj + lstm_proj) # cnn_proj is broadcast
+        attn_logits = self.attn(combined)
+        # Softmax over dim=1, the axis that is summed out below.
+        attn_weights = self.softmax(attn_logits)
+        attended_features = (attn_weights * lstm_features).sum(dim=1)
         return attended_features
 class VQAModel(nn.Module):
+    # ... (paste the code of your original VQAModel class here) ...
+    # Make sure the default arguments match the ones used when the model was saved
     def __init__(self, vocab_size, embedding_dim=256, lstm_units=256, cnn_output_dim=512, attention_dim=256, max_seq_len=30):
         super(VQAModel, self).__init__()
         self.vocab_size = vocab_size
         self.max_seq_len = max_seq_len
+        # CNN encoder (identical to the one used for training)
         self.cnn = nn.Sequential(
+            nn.Conv2d(3, 32, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
+            nn.Conv2d(32, 64, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
+            nn.Conv2d(64, 128, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
+            nn.Conv2d(128, cnn_output_dim, kernel_size=3, padding=1), nn.ReLU(),
             nn.AdaptiveAvgPool2d((1, 1))
         )
         self.embedding = nn.Embedding(vocab_size, embedding_dim)
+        # caption_lstm is part of the original model definition
         self.caption_lstm = nn.LSTM(embedding_dim, lstm_units, batch_first=True)
         self.question_lstm = nn.LSTM(embedding_dim, lstm_units, batch_first=True)
         self.attention = Attention(cnn_output_dim, lstm_units, attention_dim)
+        # Decoder input size follows the original context layout (caption slot included)
         self.decoder_input_proj = nn.Linear(embedding_dim + 3 * lstm_units, lstm_units)
         self.decoder_lstm = nn.LSTM(lstm_units, lstm_units, batch_first=True)
         self.fc_out = nn.Linear(lstm_units, vocab_size)
+        self.dropout = nn.Dropout(0.5) # Automatically disabled under model.eval()
+    # forward() is not used by predict_vqa, which drives the sub-modules
+    # directly; this stub makes any accidental call fail loudly.
     def forward(self, image, caption, question, answer_input):
+        raise NotImplementedError("Use the specific prediction logic for Gradio.")
+# ----------------------------------------------------------------------------
+# ============================================================================
+#  2. CONFIGURATION AND MODEL/VOCAB LOADING
+# ============================================================================
+MODEL_PATH = "vqa_custom_cnn_model.pth" # File name of your model checkpoint
+VOCAB_PATH = "vqa_custom_cnn_vocab.json" # File name of your vocabulary
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# --- Loader function ---
+def load_model_and_vocab(model_path, vocab_path, device):
+    if not os.path.exists(vocab_path):
+        print(f"Error: Vocabulary file not found at {vocab_path}")
+        return None, None, None
+    try:
+        with open(vocab_path, 'r') as f:
+            vocab_data = json.load(f)
+        word_to_idx = vocab_data['word_to_idx']
+        # Đảm bảo idx_to_word có key là integer nếu dùng get(int_key)
+        # Hoặc chuyển index sang string nếu key là string
+        idx_to_word = {int(k): v for k, v in vocab_data['idx_to_word'].items()}
+        vocab_size = len(word_to_idx)
+    except Exception as e:
+        print(f"Error loading vocabulary: {e}")
+        return None, None, None
+    if not os.path.exists(model_path):
+        print(f"Error: Model file not found at {model_path}")
+        return None, None, None
+    try:
+        # Khởi tạo model với các tham số đúng
+        # Cần lấy các giá trị dim từ lúc bạn train model gốc
+        model = VQAModel(vocab_size=vocab_size,
+                         embedding_dim=256, # Giả định, thay đổi nếu khác
+                         lstm_units=256,    # Giả định, thay đổi nếu khác
+                         cnn_output_dim=512, # Giả định, thay đổi nếu khác
+                         attention_dim=256, # Giả định, thay đổi nếu khác
+                         max_seq_len=30)    # Giả định, thay đổi nếu khác
+        model.load_state_dict(torch.load(model_path, map_location=device))
+        model.to(device)
+        model.eval() # QUAN TRỌNG: Chuyển sang chế độ đánh giá
+        print(f"Model loaded successfully from {model_path}")
+        return model, word_to_idx, idx_to_word
+    except Exception as e:
+        print(f"Error loading model: {e}")
+        # Có thể in traceback để debug kỹ hơn nếu cần
+        # import traceback
+        # traceback.print_exc()
+        return None, None, None
+# --- Load the model and vocabulary once, when the app starts ---
+model, word_to_idx, idx_to_word = load_model_and_vocab(MODEL_PATH, VOCAB_PATH, DEVICE)
+# Exit if the model/vocabulary could not be loaded
+if model is None or word_to_idx is None:
+    print("Exiting because model or vocabulary failed to load.")
+    sys.exit(1)
+# ============================================================================
+#  3. DEFINE THE TRANSFORM (MUST BE IDENTICAL TO TRAINING)
+# ============================================================================
+# Re-use the transform you used in the train_vqa function
+transform = transforms.Compose([
+    transforms.Resize((224, 224)),
+    transforms.ToTensor(),
+    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+])
+# ============================================================================
+#  4. HÀM DỰ ĐOÁN CHO GRADIO
+# ============================================================================
+def predict_vqa(image, question):
+    """Hàm xử lý input từ Gradio và trả về dự đoán."""
+    if image is None or not question.strip():
+        return "Lỗi: Vui lòng cung cấp cả ảnh và câu hỏi."
+    # --- 1. Tiền xử lý ảnh ---
+    try:
+        # Gradio truyền vào PIL Image
+        image_tensor = transform(image).unsqueeze(0).to(DEVICE)
+    except Exception as e:
+        return f"Lỗi xử lý ảnh: {e}"
+    # --- 2. Tiền xử lý câu hỏi ---
+    question_tokens = question.lower().split()
+    # Sử dụng PAD index cho từ không biết nếu UNK không có
+    unk_idx = word_to_idx.get('<UNK>', word_to_idx.get('<PAD>', 0))
+    question_seq = [word_to_idx.get(word, unk_idx) for word in question_tokens]
+    if not question_seq:
+        question_seq = [unk_idx] # Xử lý câu hỏi rỗng
+    question_tensor = torch.tensor(question_seq, dtype=torch.long).unsqueeze(0).to(DEVICE)
+    # --- 3. Chạy Inference (Bắt chước logic của model.predict gốc) ---
+    start_token_idx = word_to_idx['<START>']
+    end_token_idx = word_to_idx['<END>']
+    max_len = model.max_seq_len
+    generated_indices = [] # Không cần thêm START ở đây
+    # Bắt đầu giải mã với token <START>
+    decoder_input = torch.tensor([[start_token_idx]], dtype=torch.long).to(DEVICE)
+    # Hidden state của decoder LSTM sẽ được khởi tạo lại ở mỗi bước trong cách làm này
+    # (hoặc cần được truyền và cập nhật nếu logic predict gốc làm vậy)
+    # Logic predict gốc không truyền hidden state rõ ràng, nên ta cũng không cần
+    hidden_state = None
+    with torch.no_grad():
+        # Encode ảnh và câu hỏi một lần
+        cnn_features = model.cnn(image_tensor) # (1, cnn_output_dim, 1, 1)
+        cnn_features = cnn_features.view(cnn_features.size(0), -1) # (1, cnn_output_dim)
+        q_embed = model.embedding(question_tensor) # (1, q_seq_len, embedding_dim)
+        q_output, _ = model.question_lstm(q_embed) # (1, q_seq_len, lstm_units)
+        # Attention chỉ với question
+        # Cần unsqueeze cnn_features để có chiều seq_len=1
+        q_attended = model.attention(cnn_features.unsqueeze(1), q_output) # (1, lstm_units)
+        # Trạng thái cuối của LSTM question (lấy từ output)
+        q_last = q_output[:, -1, :] # (1, lstm_units)
+        # --- Context Vector (THEO LOGIC model.predict GỐC) ---
+        # Sử dụng q_attended hai lần, bỏ qua caption hoàn toàn trong inference này
+        context = torch.cat([q_attended, q_attended, q_last], dim=-1) # (1, 3*lstm_units)
+        for _ in range(max_len):
+            # --- Chuẩn bị input cho decoder ở bước này ---
+            current_word_embed = model.embedding(decoder_input) # (1, 1, embedding_dim)
+            # Lặp context cho bước thời gian hiện tại (batch=1, seq_len=1)
+            context_repeated = context.unsqueeze(1) # (1, 1, 3*lstm_units)
+            # Input cho lớp chiếu của decoder
+            decoder_proj_input = torch.cat([current_word_embed, context_repeated], dim=-1)
+            decoder_lstm_input = model.decoder_input_proj(decoder_proj_input) # (1, 1, lstm_units)
+            # --- Chạy Decoder LSTM ---
+            # Logic predict gốc truyền hidden state, ta cần làm tương tự nếu muốn khớp 100%
+            # Hoặc nếu không truyền, LSTM sẽ tự khởi tạo state (có thể hơi khác kết quả)
+            # Giả sử logic gốc có truyền hidden state:
+            decoder_output, hidden_state = model.decoder_lstm(decoder_lstm_input, hidden_state) # Update hidden
+            # --- Lấy Logits và dự đoán ---
+            # Logic predict gốc lấy output của bước cuối cùng [-1]
+            # Vì ta đang chạy từng bước, output chỉ có 1 bước thời gian -> dùng squeeze(1)
+            output_logits = model.fc_out(decoder_output.squeeze(1)) # (1, vocab_size)
+            predicted_idx = output_logits.argmax(dim=-1).item()
+            if predicted_idx == end_token_idx:
+                break
+            generated_indices.append(predicted_idx)
+            # Chuẩn bị input cho bước tiếp theo
+            decoder_input = torch.tensor([[predicted_idx]], dtype=torch.long).to(DEVICE)
+    # --- 4. Decode Output ---
+    answer_words = [idx_to_word.get(idx, '<UNK>') for idx in generated_indices]
+    return ' '.join(answer_words) if answer_words else "(No answer generated)"
+# ============================================================================
+#  5. TẠO VÀ CHẠY GRADIO INTERFACE
+# ============================================================================
+title = "Visual Question Answering Demo"
+description = """
+Upload một ảnh và đặt câu hỏi về nội dung của ảnh đó.
+Model này sử dụng CNN tùy chỉnh và LSTM với Attention (phiên bản gốc).
+Lưu ý: Inference hiện tại dựa trên logic của hàm `predict` gốc, có thể không sử dụng caption.
+"""
+# Ví dụ để người dùng thử
+examples = [
+    ["path/to/your/example/cat_image.jpg", "what animal is in the picture"],
+    ["path/to/your/example/car_image.png", "what color is the car"],
+    # Thêm URL nếu muốn
+    # ["https://example.com/some_image.jpg", "how many people are there"]
+]
+# Bạn cần thay đổi đường dẫn trong 'examples' thành đường dẫn thực tế
+# tới file ảnh MÀ BẠN SẼ UPLOAD lên Space cùng với code.
+# Tạo Interface
 iface = gr.Interface(
+    fn=predict_vqa,
+    inputs=[
+        gr.Image(type="pil", label="Input Image"), # Nhận PIL Image
+        gr.Textbox(lines=2, placeholder="Nhập câu hỏi của bạn ở đây...", label="Question")
+    ],
+    outputs=gr.Textbox(label="Predicted Answer"),
+    title=title,
+    description=description,
+    examples=examples, # Cung cấp ví dụ (đảm bảo file ảnh ví dụ tồn tại trên Space)
+    allow_flagging='never' # Tắt flagging nếu không cần
 )
+# Chạy app (Trong Hugging Face Spaces, nó sẽ tự chạy file này)
+if __name__ == "__main__":
+    # if model is not None: # Kiểm tra lại lần nữa trước khi chạy
+    #     iface.launch() # Không cần launch() ở đây khi deploy lên Spaces
+    # else:
+    #     print("Cannot launch Gradio interface because model/vocab failed to load.")
+    # Dòng iface.launch() chỉ cần khi bạn chạy cục bộ để test.
+    # Trên Spaces, Gradio tự động tìm và chạy interface được định nghĩa.
+    pass # Để trống hoặc thêm logic chạy cục bộ nếu muốn