Spaces:

Tin113
/

vqa_project

Sleeping

App Files Files Community

Tin113 committed on Mar 30, 2025

Commit

83e0b3c

verified ·

1 Parent(s): 66f53bb

Update app.py

Browse files

Files changed (1) hide show

app.py +169 -252

app.py CHANGED Viewed

@@ -1,42 +1,33 @@
 import torch
-import torch.nn as nn
-import torch.nn.functional as F
 from torchvision import transforms
 from PIL import Image
-import json
-import gradio as gr
-import os
-import sys
 # ============================================================================
 #  1. ĐỊNH NGHĨA LẠI CÁC CLASS MODEL (QUAN TRỌNG!)
 #     (Copy từ code huấn luyện gốc, ĐÃ SỬA Attention theo lỗi trước)
 # ============================================================================
 # -----------------------
 # Attention Module
 # -----------------------
 class Attention(nn.Module):
     def __init__(self, cnn_dim, lstm_dim, attention_dim):
         super(Attention, self).__init__()
-        # Tên lớp Linear đã được sửa để khớp với file .pth của bạn
-        self.cnn = nn.Linear(cnn_dim, attention_dim)
-        self.lstm = nn.Linear(lstm_dim, attention_dim)
         self.attn = nn.Linear(attention_dim, 1)
-        # Giả sử bạn có các lớp này trong code gốc đã dùng để train
-        self.tanh = nn.Tanh()
-        self.softmax = nn.Softmax(dim=1)
     def forward(self, cnn_features, lstm_features):
-        # Sử dụng tên lớp Linear đã sửa
-        cnn_proj = self.cnn(cnn_features)
-        lstm_proj = self.lstm(lstm_features)
-        combined = self.tanh(cnn_proj + lstm_proj) # Broadcasting
-        attn_logits = self.attn(combined)
-        attn_weights = self.softmax(attn_logits)
-        attended_features = (attn_weights * lstm_features).sum(dim=1)
         return attended_features
 # -----------------------
 # VQA Model
 # -----------------------
@@ -46,264 +37,190 @@ class VQAModel(nn.Module):
         self.vocab_size = vocab_size
         self.max_seq_len = max_seq_len
-        # --- CNN Encoder: ĐỔI TÊN TRỞ LẠI THÀNH self.cnn ---
-        self.cnn = nn.Sequential( # Đổi tên lại thành self.cnn
-            nn.Conv2d(3, 32, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
-            nn.Conv2d(32, 64, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
-            nn.Conv2d(64, 128, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
-            nn.Conv2d(128, cnn_output_dim, kernel_size=3, padding=1), nn.ReLU(),
             nn.AdaptiveAvgPool2d((1, 1))
         )
-        # ------------------------------------------------
         self.embedding = nn.Embedding(vocab_size, embedding_dim)
         self.caption_lstm = nn.LSTM(embedding_dim, lstm_units, batch_first=True)
         self.question_lstm = nn.LSTM(embedding_dim, lstm_units, batch_first=True)
         self.attention = Attention(cnn_output_dim, lstm_units, attention_dim)
         self.decoder_input_proj = nn.Linear(embedding_dim + 3 * lstm_units, lstm_units)
         self.decoder_lstm = nn.LSTM(lstm_units, lstm_units, batch_first=True)
         self.fc_out = nn.Linear(lstm_units, vocab_size)
         self.dropout = nn.Dropout(0.5)
-    # Hàm forward không bị ảnh hưởng vì không gọi trực tiếp
     def forward(self, image, caption, question, answer_input):
-         # Logic forward có thể vẫn dùng tên biến local cnn_features
-         # nhưng self.cnn để gọi mạng Sequential thì đã khớp tên
-        cnn_features = self.cnn(image) # Gọi self.cnn mới đúng tên
-        # ... (phần còn lại của forward giữ nguyên) ...
-        cnn_features = cnn_features.view(cnn_features.size(0), -1)
-        cap_embed = self.embedding(caption)
-        cap_output, _ = self.caption_lstm(cap_embed)
-        q_embed = self.embedding(question)
-        q_output, _ = self.question_lstm(q_embed)
-        cap_attended = self.attention(cnn_features.unsqueeze(1), cap_output)
-        q_attended = self.attention(cnn_features.unsqueeze(1), q_output)
-        q_last = q_output[:, -1, :]
-        context = torch.cat([cap_attended, q_attended, q_last], dim=-1)
-        answer_embed = self.embedding(answer_input)
-        context_repeated = context.unsqueeze(1).repeat(1, answer_input.size(1), 1)
-        decoder_in = torch.cat([answer_embed, context_repeated], dim=-1)
-        decoder_in = self.decoder_input_proj(decoder_in)
-        decoder_output, _ = self.decoder_lstm(decoder_in)
-        output = self.fc_out(self.dropout(decoder_output))
-        return output
-# ----------------------------------------------------------------------------
-# ============================================================================
-#  2. CẤU HÌNH VÀ LOAD MODEL/VOCAB
-# ============================================================================
-# !! THAY ĐỔI TÊN FILE NẾU CẦN !!
-MODEL_PATH = "vqa_model.pth" # Đảm bảo tên này khớp file bạn upload
-VOCAB_PATH = "vqa_custom_cnn_vocab.json" # Đảm bảo tên này khớp file bạn upload
-DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# --- Hàm load ---
-def load_model_and_vocab(model_path, vocab_path, device):
-    print(f"Attempting to load vocabulary from: {vocab_path}")
-    if not os.path.exists(vocab_path):
-        print(f"Error: Vocabulary file not found at {vocab_path}")
-        return None, None, None
-    try:
-        with open(vocab_path, 'r') as f:
-            vocab_data = json.load(f)
-        word_to_idx = vocab_data['word_to_idx']
-        # Chuyển key của idx_to_word thành integer để tra cứu bằng index
-        idx_to_word = {int(k): v for k, v in vocab_data['idx_to_word'].items()}
-        vocab_size = len(word_to_idx)
-        print(f"Vocabulary loaded successfully. Size: {vocab_size}")
-    except Exception as e:
-        print(f"Error loading or processing vocabulary: {e}")
-        return None, None, None
-    print(f"Attempting to load model from: {model_path}")
-    if not os.path.exists(model_path):
-        print(f"Error: Model file not found at {model_path}")
-        return None, None, None
-    try:
-        # Khởi tạo model với các tham số chính xác
-        # Lấy các giá trị này từ lúc bạn huấn luyện model gốc
-        model = VQAModel(vocab_size=vocab_size,
-                         embedding_dim=256,    # Xác nhận giá trị này
-                         lstm_units=256,       # Xác nhận giá trị này
-                         cnn_output_dim=512,   # Xác nhận giá trị này
-                         attention_dim=256,    # Xác nhận giá trị này
-                         max_seq_len=30)       # Xác nhận giá trị này
-        model.load_state_dict(torch.load(model_path, map_location=device))
-        model.to(device)
-        model.eval() # Quan trọng: Chuyển sang chế độ đánh giá
-        print(f"Model loaded successfully from {model_path} to {device}")
-        return model, word_to_idx, idx_to_word
-    except Exception as e:
-        print(f"Error loading model state_dict: {e}")
-        # Có thể in traceback để debug kỹ hơn nếu cần
-        # import traceback
-        # traceback.print_exc()
-        return None, None, None
-# --- Load model và vocab một lần khi app khởi động ---
-model, word_to_idx, idx_to_word = load_model_and_vocab(MODEL_PATH, VOCAB_PATH, DEVICE)
-# ============================================================================
-#  3. ĐỊNH NGHĨA TRANSFORM (Lấy từ hàm train_vqa của bạn)
-# ============================================================================
-# Đảm bảo transform này giống hệt lúc bạn huấn luyện
 transform = transforms.Compose([
     transforms.Resize((224, 224)),
     transforms.ToTensor(),
     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
 ])
-# ============================================================================
-#  4. HÀM DỰ ĐOÁN CHO GRADIO (BẮT CHƯỚC LOGIC model.predict GỐC)
-# ============================================================================
-# Hàm này sẽ được gọi bởi Gradio Interface
-def predict_vqa(image, question_str):
-    print("--- Received request ---")
-    if model is None or word_to_idx is None:
-         print("Error: Model or vocabulary not loaded.")
-         return "Lỗi: Model hoặc từ điển chưa được tải."
-    if image is None:
-        print("Error: No image provided.")
-        return "Lỗi: Vui lòng cung cấp ảnh."
-    if not question_str or not question_str.strip():
-        print("Error: No question provided.")
-        return "Lỗi: Vui lòng nhập câu hỏi."
-    print(f"Input question: {question_str}")
-    # --- 1. Tiền xử lý ảnh ---
-    try:
-        image_tensor = transform(image).unsqueeze(0).to(DEVICE)
-        print(f"Image transformed, shape: {image_tensor.shape}")
-    except Exception as e:
-        print(f"Error transforming image: {e}")
-        return f"Lỗi xử lý ảnh: {e}"
-    # --- 2. Tiền xử lý câu hỏi ---
-    try:
-        question_tokens = question_str.lower().split()
-        unk_idx = word_to_idx.get('<UNK>', word_to_idx.get('<PAD>', 0))
-        question_seq = [word_to_idx.get(word, unk_idx) for word in question_tokens]
-        if not question_seq: question_seq = [unk_idx] # Tránh sequence rỗng
-        question_tensor = torch.tensor(question_seq, dtype=torch.long).unsqueeze(0).to(DEVICE)
-        print(f"Question tensor created, shape: {question_tensor.shape}")
-    except Exception as e:
-        print(f"Error processing question: {e}")
-        return f"Lỗi xử lý câu hỏi: {e}"
-    # --- 3. Chạy Inference (Logic từ model.predict gốc) ---
-    start_token_idx = word_to_idx['<START>']
-    end_token_idx = word_to_idx['<END>']
-    max_len = model.max_seq_len # Lấy max_len từ model đã load
-    generated_indices = []
-    # Bắt đầu decoder input với <START> token
-    decoder_input_tensor = torch.tensor([[start_token_idx]], dtype=torch.long).to(DEVICE)
-    # Hidden state của decoder LSTM (khởi tạo là None, giống predict gốc)
-    hidden_state = None
-    try:
-        with torch.no_grad(): # Tắt gradient calculation
-            print("Encoding image...")
-            # Sử dụng self.cnn_net thay vì self.cnn
-            cnn_features = model.cnn(image_tensor)
-            cnn_features = cnn_features.view(cnn_features.size(0), -1)
-            print(f"CNN features shape: {cnn_features.shape}")
-            print("Encoding question...")
-            q_embed = model.embedding(question_tensor)
-            q_output, _ = model.question_lstm(q_embed) # (1, q_seq_len, lstm_units)
-            print(f"Question LSTM output shape: {q_output.shape}")
-            print("Calculating attention...")
-            # Chú ý unsqueeze(1) cho cnn_features khi đưa vào attention
-            q_attended = model.attention(cnn_features.unsqueeze(1), q_output) # (1, lstm_units)
-            q_last = q_output[:, -1, :] # (1, lstm_units)
-            print(f"Attended question features shape: {q_attended.shape}")
-            print(f"Last question LSTM state shape: {q_last.shape}")
-            # --- Context Vector THEO LOGIC model.predict GỐC ---
-            context = torch.cat([q_attended, q_attended, q_last], dim=-1) # (1, 3*lstm_units)
-            print(f"Context vector shape: {context.shape}")
-            print("Starting decoder loop...")
-            for i in range(max_len):
-                print(f"Decoder step {i+1}/{max_len}")
-                current_word_embed = model.embedding(decoder_input_tensor) # (1, 1, embedding_dim)
-                # Context cần unsqueeze để có chiều seq_len=1 trước khi repeat/cat
-                context_repeated = context.unsqueeze(1) # (1, 1, 3*lstm_units)
-                # Input cho lớp chiếu của decoder
-                decoder_proj_input = torch.cat([current_word_embed, context_repeated], dim=-1)
-                decoder_lstm_input = model.decoder_input_proj(decoder_proj_input) # (1, 1, lstm_units)
-                # Chạy Decoder LSTM
-                decoder_output, hidden_state = model.decoder_lstm(decoder_lstm_input, hidden_state) # hidden_state được cập nhật
-                # Lấy Logits từ output của step này
-                output_logits = model.fc_out(decoder_output.squeeze(1)) # (1, vocab_size)
-                predicted_idx = output_logits.argmax(dim=-1).item()
-                print(f"Predicted index: {predicted_idx}")
-                if predicted_idx == end_token_idx:
-                    print("End token detected.")
-                    break
-                generated_indices.append(predicted_idx)
-                # Input cho bước tiếp theo là từ vừa dự đoán
-                decoder_input_tensor = torch.tensor([[predicted_idx]], dtype=torch.long).to(DEVICE)
-            print("Decoder loop finished.")
-    except Exception as e:
-        print(f"Error during model inference: {e}")
-        # In traceback đầy đủ để debug
-        import traceback
-        traceback.print_exc()
-        return f"Lỗi trong quá trình dự đoán: {e}"
-    # --- 4. Decode Output ---
-    try:
-        answer_words = [idx_to_word.get(idx, '<UNK>') for idx in generated_indices]
-        final_answer = ' '.join(answer_words) if answer_words else "(Không tạo được câu trả lời)"
-        print(f"Decoded answer: {final_answer}")
-        return final_answer
-    except Exception as e:
-        print(f"Error decoding answer: {e}")
-        return f"Lỗi giải mã câu trả lời: {e}"
-# ============================================================================
-#  5. TẠO GRADIO INTERFACE (Đảm bảo ở global scope)
-# ============================================================================
-# Chỉ định nghĩa interface nếu model đã load thành công
-if model is not None and word_to_idx is not None:
-    print("Defining Gradio interface...")
-    title = "VQA for Animal"
-    description = "Tải lên ảnh con vật và nhập câu hỏi để nhận câu trả lời. (CHỈ HỖ TRỢ TIẾNG ANH)"
-    # examples = [ # Optional: Thêm ví dụ nếu bạn upload ảnh tương ứng
-    #     ["zebra.jpg", "what animal is this?"]
-    # ]
-    # Định nghĩa Interface ở global scope
     iface = gr.Interface(
-        fn=predict_vqa,
         inputs=[
-            gr.Image(type="pil", label="Image"), # Input là PIL Image
-            gr.Textbox(lines=2, placeholder="Enter question here...", label="Câu hỏi")
         ],
-        outputs=gr.Textbox(label="Câu trả lời"),
-        title=title,
-        description=description,
-        # examples=examples,
-        allow_flagging='never' # Tắt flagging
     )
-    print("Gradio interface defined.")
-else:
-    print("Skipping Gradio interface definition due to load errors.")
-    # Có thể định nghĩa một interface báo lỗi nếu muốn
-    def error_interface(*args):
-        return "Lỗi nghiêm trọng: Không thể tải model hoặc từ điển. Vui lòng kiểm tra logs của Space."
-    iface = gr.Interface(fn=error_interface, inputs=[], outputs="text", title="Lỗi Load Model")
-# Không cần if __name__ == "__main__": iface.launch() cho Spaces

+import json
+import gradio as gr
+import numpy as np
 import torch
+import torch.nn as nn
+import torch.nn.functional as F
 from PIL import Image
 from torchvision import transforms
 # ============================================================================
 #  1. ĐỊNH NGHĨA LẠI CÁC CLASS MODEL (QUAN TRỌNG!)
 #     (Copy từ code huấn luyện gốc, ĐÃ SỬA Attention theo lỗi trước)
 # ============================================================================
 # -----------------------
 # Attention Module
 # -----------------------
 class Attention(nn.Module):
+    """Additive attention that pools an LSTM output sequence, guided by image features.
+
+    Both inputs are projected into a shared ``attention_dim`` space, summed,
+    squashed with ``tanh`` and scored by a single linear unit. The scores are
+    soft-maxed over dim 1 and used to take a weighted sum of ``lstm_features``.
+    """
     def __init__(self, cnn_dim, lstm_dim, attention_dim):
+        super().__init__()
+        # Attribute names double as state_dict keys, so they must match the checkpoint.
+        # NOTE(review): the previous revision called these `cnn` / `lstm` "to match
+        # the .pth file" -- confirm the checkpoint really uses `cnn_proj` / `lstm_proj`.
+        self.cnn_proj = nn.Linear(cnn_dim, attention_dim)
+        self.lstm_proj = nn.Linear(lstm_dim, attention_dim)
         self.attn = nn.Linear(attention_dim, 1)
     def forward(self, cnn_features, lstm_features):
+        """Return the attention-weighted sum of ``lstm_features`` over dim 1.
+
+        Callers in this file pass ``cnn_features`` as (batch, 1, cnn_dim) and
+        ``lstm_features`` as (batch, seq_len, lstm_dim); the result is then
+        (batch, lstm_dim).
+        """
+        image_term = self.cnn_proj(cnn_features)
+        sequence_term = self.lstm_proj(lstm_features)
+        # The size-1 dim of the image term broadcasts across the sequence.
+        scores = self.attn(torch.tanh(image_term + sequence_term))
+        weights = F.softmax(scores, dim=1)
+        attended_features = torch.sum(weights * lstm_features, dim=1)
         return attended_features
 # -----------------------
 # VQA Model
 # -----------------------
         self.vocab_size = vocab_size
         self.max_seq_len = max_seq_len
+        # CNN Encoder: Trích xuất đặc trưng ảnh
+        self.cnn = nn.Sequential(
+            nn.Conv2d(3, 32, kernel_size=3, padding=1),
+            nn.ReLU(),
+            nn.MaxPool2d(2),
+            nn.Conv2d(32, 64, kernel_size=3, padding=1),
+            nn.ReLU(),
+            nn.MaxPool2d(2),
+            nn.Conv2d(64, 128, kernel_size=3, padding=1),
+            nn.ReLU(),
+            nn.MaxPool2d(2),
+            nn.Conv2d(128, cnn_output_dim, kernel_size=3, padding=1),
+            nn.ReLU(),
             nn.AdaptiveAvgPool2d((1, 1))
         )
+        # Text Embedding
         self.embedding = nn.Embedding(vocab_size, embedding_dim)
+        # LSTM Encoders cho caption và question
         self.caption_lstm = nn.LSTM(embedding_dim, lstm_units, batch_first=True)
         self.question_lstm = nn.LSTM(embedding_dim, lstm_units, batch_first=True)
+        # Attention cho từng kênh
         self.attention = Attention(cnn_output_dim, lstm_units, attention_dim)
+        # Decoder: sử dụng teacher forcing
+        # Context vector: kết hợp của attention từ caption, attention từ question và trạng thái cuối của question
+        # Kích thước context = lstm_units + lstm_units + lstm_units = 3 * lstm_units (ví dụ 768 nếu lstm_units=256)
+        # Kết hợp với embedding của câu trả lời (embedding_dim) => đầu vào của decoder = embedding_dim + 3*lstm_units
         self.decoder_input_proj = nn.Linear(embedding_dim + 3 * lstm_units, lstm_units)
         self.decoder_lstm = nn.LSTM(lstm_units, lstm_units, batch_first=True)
         self.fc_out = nn.Linear(lstm_units, vocab_size)
         self.dropout = nn.Dropout(0.5)
     def forward(self, image, caption, question, answer_input):
+        """Teacher-forced forward pass; returns per-step vocabulary logits.
+
+        ``caption``, ``question`` and ``answer_input`` are integer token-id
+        tensors fed to the shared embedding. The output has one logit vector
+        per position of ``answer_input``: (batch, ans_seq_len, vocab_size).
+        """
+        # Image encoder: the CNN ends in a 1x1 adaptive pool, so flattening
+        # leaves one feature vector per image.
+        image_vec = self.cnn(image)
+        image_vec = image_vec.view(image_vec.size(0), -1)
+        image_query = image_vec.unsqueeze(1)
+        # Text encoders: caption and question share the embedding table but
+        # each has its own LSTM.
+        caption_states, _ = self.caption_lstm(self.embedding(caption))
+        question_states, _ = self.question_lstm(self.embedding(question))
+        # The same attention module pools both sequences.
+        pooled_caption = self.attention(image_query, caption_states)
+        pooled_question = self.attention(image_query, question_states)
+        final_question_state = question_states[:, -1, :]
+        # Context = [attended caption, attended question, last question state].
+        context = torch.cat(
+            [pooled_caption, pooled_question, final_question_state], dim=-1
+        )
+        # Decoder: every answer position sees its own token embedding plus
+        # the same context vector.
+        steps = answer_input.size(1)
+        tiled_context = context.unsqueeze(1).expand(-1, steps, -1)
+        decoder_in = torch.cat([self.embedding(answer_input), tiled_context], dim=-1)
+        decoder_states, _ = self.decoder_lstm(self.decoder_input_proj(decoder_in))
+        return self.fc_out(self.dropout(decoder_states))
+    def predict(self, image, question, word_to_idx, idx_to_word, device='cuda' if torch.cuda.is_available() else 'cpu'):
+        """Greedily decode an answer string for one image/question pair.
+
+        Args:
+            image: Preprocessed image tensor; a 3-D tensor gets a batch dim added.
+            question: Raw question text; it is lower-cased and split on whitespace.
+            word_to_idx: Token -> id mapping. Must contain '<PAD>', '<START>'
+                and '<END>'. Out-of-vocabulary words map to '<PAD>'.
+            idx_to_word: Id -> token lookup. May be a list, a dict with int
+                keys, or a dict with string keys (as produced by ``json.load``).
+            device: Device the model and inputs are moved to.
+
+        Returns:
+            The decoded words joined by single spaces ('' if '<END>' is
+            predicted first). At most ``self.max_seq_len`` words are produced.
+        """
+        self.eval()
+        self.to(device)
+        # Add the batch dimension when a single (C, H, W) image is passed.
+        if image.dim() == 3:
+            image = image.unsqueeze(0)
+        image = image.to(device)
+
+        pad_idx = word_to_idx['<PAD>']
+        question_seq = [word_to_idx.get(word, pad_idx) for word in question.lower().split()]
+        if not question_seq:
+            # A blank question would give the LSTM a zero-length sequence.
+            question_seq = [pad_idx]
+        question = torch.tensor(question_seq, dtype=torch.long).unsqueeze(0).to(device)
+        start_idx = word_to_idx['<START>']
+        end_idx = word_to_idx['<END>']
+
+        def lookup(idx):
+            # JSON-loaded vocabularies have string keys; lists and int-keyed
+            # dicts are served by the first attempt.
+            try:
+                return idx_to_word[idx]
+            except KeyError:
+                return idx_to_word[str(idx)]
+
+        answer_words = []
+        # Inference only: do not build an autograd graph.
+        with torch.no_grad():
+            cnn_features = self.cnn(image)
+            cnn_features = cnn_features.view(cnn_features.size(0), -1)
+            q_output, _ = self.question_lstm(self.embedding(question))
+            q_attended = self.attention(cnn_features.unsqueeze(1), q_output)
+            q_last = q_output[:, -1, :]
+            # No caption is available at inference time, so the attended question
+            # fills the caption slot to keep the 3 * lstm_units context width.
+            context = torch.cat([q_attended, q_attended, q_last], dim=-1).unsqueeze(1)
+
+            token = torch.tensor([[start_idx]], dtype=torch.long).to(device)
+            hidden = None
+            for _ in range(self.max_seq_len):
+                # Feed only the newest token: `hidden` already summarises every
+                # earlier one. Re-feeding the whole growing sequence together
+                # with the carried state would make the LSTM consume earlier
+                # tokens again.
+                decoder_in = torch.cat([self.embedding(token), context], dim=-1)
+                decoder_in = self.decoder_input_proj(decoder_in)
+                decoder_output, hidden = self.decoder_lstm(decoder_in, hidden)
+                output = self.fc_out(decoder_output[:, -1, :])
+                next_word_idx = output.argmax(dim=-1).item()
+                if next_word_idx == end_idx:
+                    break
+                answer_words.append(lookup(next_word_idx))
+                token = torch.tensor([[next_word_idx]], dtype=torch.long).to(device)
+        return ' '.join(answer_words)
+# Load the trained model together with its vocabulary files
+def load_model(model_path, word_to_idx_path, idx_to_word_path, device='cpu'):
+    """Load the vocabulary JSON files and the trained VQA checkpoint.
+
+    Args:
+        model_path: Path to the ``state_dict`` checkpoint read by ``torch.load``.
+        word_to_idx_path: JSON file holding the token -> id mapping.
+        idx_to_word_path: JSON file holding the id -> token lookup.
+        device: Device the weights are mapped to and the model is moved to.
+
+    Returns:
+        ``(model, word_to_idx, idx_to_word)`` with the model in eval mode.
+        When ``idx_to_word`` is stored as a JSON object, its keys are
+        converted to ``int``.
+
+    Raises:
+        OSError: If one of the files cannot be opened.
+        ValueError: If a JSON file is malformed or an ``idx_to_word`` key is
+            not an integer.
+        RuntimeError: If the checkpoint does not match the model definition.
+    """
+    with open(word_to_idx_path, 'r', encoding='utf-8') as f:
+        word_to_idx = json.load(f)
+    with open(idx_to_word_path, 'r', encoding='utf-8') as f:
+        idx_to_word = json.load(f)
+    # JSON object keys are always strings, but decoding looks tokens up by
+    # the integer index the model predicts. A JSON array needs no change.
+    if isinstance(idx_to_word, dict):
+        idx_to_word = {int(k): v for k, v in idx_to_word.items()}
+    # NOTE(review): only vocab_size is passed, so every other hyperparameter
+    # comes from VQAModel's defaults (not visible in this hunk). The previous
+    # revision passed them explicitly -- confirm the defaults match the checkpoint.
+    model = VQAModel(vocab_size=len(word_to_idx))
+    model.load_state_dict(torch.load(model_path, map_location=device))
+    model.to(device)
+    model.eval()
+    return model, word_to_idx, idx_to_word
+# Image preprocessing: resize to 224x224, convert to a tensor, then normalise per channel.
+# NOTE(review): the mean/std values look like the usual ImageNet statistics; they
+# must match the preprocessing used when the checkpoint was trained -- confirm.
 transform = transforms.Compose([
     transforms.Resize((224, 224)),
     transforms.ToTensor(),
     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
 ])
+# Prediction helper
+def predict(image, question, model, word_to_idx, idx_to_word, device='cpu'):
+    """Preprocess ``image`` and return the model's answer to ``question``.
+
+    Args:
+        image: Input image accepted by the module-level ``transform``
+            (the Gradio UI supplies a PIL image).
+        question: Question text, passed through to ``model.predict``.
+        model: Object exposing ``predict(image, question, word_to_idx,
+            idx_to_word, device)``.
+        word_to_idx: Token -> id mapping, forwarded to the model.
+        idx_to_word: Id -> token lookup, forwarded to the model.
+        device: Device the image tensor is moved to.
+
+    Returns:
+        Whatever ``model.predict`` returns (the answer string).
+    """
+    # The normalisation step uses three-channel statistics, so grayscale,
+    # palette or RGBA images must be converted to RGB first.
+    if isinstance(image, Image.Image) and image.mode != "RGB":
+        image = image.convert("RGB")
+    image = transform(image).unsqueeze(0).to(device)
+    answer = model.predict(image, question, word_to_idx, idx_to_word, device)
+    return answer
+# Build the Gradio interface
+def create_interface(model, word_to_idx, idx_to_word, device='cpu'):
+    """Create the Gradio UI that answers questions about an uploaded image.
+
+    Args:
+        model: Loaded VQA model, forwarded to ``predict``.
+        word_to_idx: Token -> id mapping, forwarded to ``predict``.
+        idx_to_word: Id -> token lookup, forwarded to ``predict``.
+        device: Device used for inference.
+
+    Returns:
+        The configured ``gr.Interface`` (not yet launched).
+    """
+    import os  # local import: only needed to check that example images exist
+
+    def vqa_interface(image, question):
+        # Gradio passes None when no image was uploaded; answer with a message
+        # instead of letting the preprocessing raise.
+        if image is None:
+            return "Please upload an image."
+        if not question or not question.strip():
+            return "Please enter a question."
+        return predict(image, question, model, word_to_idx, idx_to_word, device)
+
+    examples = [
+        ["example1.jpg", "What color is the animal?"],
+        ["example2.jpg", "Is this a cat or a dog?"]
+    ]
+    # Only offer examples whose image file actually exists, so a missing
+    # file cannot break the interface.
+    available_examples = [example for example in examples if os.path.exists(example[0])]
     iface = gr.Interface(
+        fn=vqa_interface,
         inputs=[
+            gr.Image(type="pil", label="Upload an image"),
+            gr.Textbox(label="Ask a question about the image")
         ],
+        outputs=gr.Textbox(label="Answer"),
+        examples=available_examples or None,
+        title="Visual Question Answering System",
+        description="Upload an image and ask a question about it. The model will try to answer."
     )
+    return iface
+# Script entry point
+if __name__ == "__main__":
+    # Use the GPU when PyTorch can see one; otherwise stay on the CPU.
+    if torch.cuda.is_available():
+        device = 'cuda'
+    else:
+        device = 'cpu'
+    # Load the checkpoint and both vocabulary files from the working directory.
+    model, word_to_idx, idx_to_word = load_model("vqa_model.pth", "word_to_idx.json", "idx_to_word.json", device)
+    # Build the UI around the loaded model and start serving it.
+    iface = create_interface(model, word_to_idx, idx_to_word, device)
+    iface.launch()