Spaces:

Tin113
/

vqa_project

Sleeping

Tin113 committed on Mar 29, 2025

Commit

4029376

verified ·

1 Parent(s): 2af0460

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -3,6 +3,39 @@ import gradio as gr
 from PIL import Image
 from torchvision import transforms
 # Load mô hình từ Hugging Face Model Hub hoặc local
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -47,7 +80,7 @@ def predict(image, question):
     answer = idx_to_word[predicted_idx]
     return answer
-# 🎨 Giao diện Gradio
 iface = gr.Interface(
     fn=predict,
     inputs=[gr.Image(type="pil"), gr.Textbox(label="Câu hỏi")],

 from PIL import Image
 from torchvision import transforms
+import torch.nn as nn
+import torchvision.models as models
+class VQAModel(nn.Module):
+    # Visual question answering model: encodes an image with a CNN and a
+    # question with an LSTM, adds the two 256-dim feature vectors, and maps
+    # the result to one score per vocabulary entry.
+    #
+    # vocab_size sizes both the question embedding table and the output
+    # layer, so answers are scored over the same vocabulary as the question.
+    def __init__(self, vocab_size):
+        super(VQAModel, self).__init__()
+        # Use ResNet-18 as the CNN image encoder
+        # NOTE(review): `pretrained=True` is deprecated in torchvision >= 0.13
+        # in favor of `weights=` — confirm against the installed version.
+        self.cnn = models.resnet18(pretrained=True)
+        self.cnn.fc = nn.Linear(512, 256)  # Replace the FC head so the CNN emits 256 features
+        # Use an LSTM as the text encoder (256-dim embeddings, 256-dim hidden state)
+        self.embedding = nn.Embedding(vocab_size, 256)
+        self.lstm = nn.LSTM(256, 256, batch_first=True)
+        # Fully connected layer that predicts the answer
+        self.fc = nn.Linear(256, vocab_size)
+    # Returns the raw output of the final linear layer (vocab_size scores on
+    # the last dimension); no softmax or argmax is applied here.
+    # assumes image is a batched image tensor accepted by ResNet-18 and
+    # question is a (batch, seq_len) tensor of token indices — TODO confirm
+    def forward(self, image, question):
+        # Encode the image
+        img_features = self.cnn(image)
+        # Encode the question: keep only the LSTM's final hidden state; the
+        # per-step outputs and the final cell state are discarded
+        q_embed = self.embedding(question)
+        _, (q_features, _) = self.lstm(q_embed)
+        # Combine image and question features by element-wise addition;
+        # squeeze(0) drops the leading size-1 dimension of the hidden state
+        # (the LSTM is built with the default single layer, one direction)
+        combined = img_features + q_features.squeeze(0)
+        output = self.fc(combined)
+        return output
 # Load mô hình từ Hugging Face Model Hub hoặc local
 device = "cuda" if torch.cuda.is_available() else "cpu"
     answer = idx_to_word[predicted_idx]
     return answer
+# Giao diện Gradio
 iface = gr.Interface(
     fn=predict,
     inputs=[gr.Image(type="pil"), gr.Textbox(label="Câu hỏi")],