main.py
CHANGED
@@ -14,7 +14,7 @@ from langdetect import detect
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 MAX_LEN = 20

-#
+# Same transform as training
 transform = transforms.Compose([
     transforms.Resize((224, 224)),
     transforms.ToTensor()

@@ -40,7 +40,10 @@ class VQADataset(Dataset):


 def prepare_data(max_answers=50, min_word_count=3, max_len=MAX_LEN):
-    """
+    """
+    Local training helper: load VQA-RAD, clean, build vocab, dataset.
+    NOT used on the Space at runtime.
+    """
     from datasets import load_dataset
     import pandas as pd

@@ -97,13 +100,11 @@ def prepare_data(max_answers=50, min_word_count=3, max_len=MAX_LEN):
 class VQAModel(nn.Module):
     def __init__(self, vocab_size, embed_dim, hidden_dim, num_answers):
         super().__init__()
-        #
+        # same backbone as original code (ResNet18 pretrained)
         self.cnn = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
         self.cnn.fc = nn.Identity()
-
         self.embedding = nn.Embedding(vocab_size, embed_dim)
         self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
-
         self.fc1 = nn.Linear(512 + hidden_dim, 256)
         self.relu = nn.ReLU()
         self.fc2 = nn.Linear(256, num_answers)

@@ -118,10 +119,10 @@ class VQAModel(nn.Module):
         return self.fc2(x)


-# ========== Training (local only) ==========
+# ========== Training (local only, not used on Space) ==========
 def train_model(train_dataset, vocab, idx_to_answer,
                 epochs=20, batch_size=32, lr=1e-3, save_prefix="vqa_custom"):
-    """
+    """Use only in Colab / local to create vqa_custom_model.pth etc."""
     vocab_size = len(vocab)
     num_answers = len(idx_to_answer)

@@ -147,31 +148,39 @@ def train_model(train_dataset, vocab, idx_to_answer,

         print(f"Epoch {epoch+1}, Loss: {total_loss/len(loader):.4f}")

-    torch.save(model.state_dict(),
-    with open(
+    torch.save(model.state_dict(), "vqa_custom_model.pth")
+    with open("vocab.pkl", "wb") as f:
         pickle.dump(vocab, f)
-    with open(
+    with open("answer_mapping.pkl", "wb") as f:
         pickle.dump(idx_to_answer, f)

     return model


-# ========== Load artifacts + inference ==========
-def load_artifacts(prefix="vqa_custom", map_location=None):
-    …
+# ========== Load artifacts + inference (used in Space) ==========
+def load_artifacts(prefix=None, map_location=None):
+    """
+    Load your original good model:
+      - vqa_custom_model.pth
+      - vocab.pkl
+      - answer_mapping.pkl
+    """
+    with open("vocab.pkl", "rb") as f:
         vocab = pickle.load(f)
-    with open(
+    with open("answer_mapping.pkl", "rb") as f:
         idx_to_answer = pickle.load(f)

     model = VQAModel(len(vocab), 300, 256, len(idx_to_answer))
-    model.load_state_dict(torch.load(
+    model.load_state_dict(torch.load("vqa_custom_model.pth",
+                                     map_location=map_location or device))
     model.to(device)
     model.eval()

     def encode_question_infer(q, max_len=MAX_LEN):
-        …
-        …
-        enc =
+        q = str(q).lower()
+        tokens = q.split()
+        enc = [vocab.get(w, vocab.get("<UNK>", 0)) for w in tokens]
+        enc = enc[:max_len] + [vocab.get("<PAD>", 0)] * (max_len - len(enc))
         return torch.tensor(enc).unsqueeze(0)

     def predict_custom_vqa(image_path, question):

@@ -184,41 +193,37 @@ def load_artifacts(prefix="vqa_custom", map_location=None):
         return idx_to_answer[pred.item()]

     def final_pipeline(image_path, question, open_vqa_fn=None, translate_fn=None):
-        # Keep
+        # Keep exactly what your good model expects (English radiology questions)
         lang = detect(question)
-        q_en = question
-
-        if open_vqa_fn is not None and ("what is" in q_en.lower() or "this place" in q_en.lower()):
-            answer_en = open_vqa_fn(image_path, q_en)
-        else:
-            answer_en = predict_custom_vqa(image_path, q_en)
+        q_en = question  # you trained in English; skip translation

-
-
+        # Always use custom model here; you can add BLIP routing later if needed
+        answer_en = predict_custom_vqa(image_path, q_en)
         return answer_en

     return final_pipeline, predict_custom_vqa, vocab, idx_to_answer, model, encode_question_infer


 def load_artifacts_and_helpers(prefix="vqa_custom", map_location=None):
-    …
+    # wrapper used by app.py
+    return load_artifacts(map_location=map_location)


+# ========== Optional CLI (local only) ==========
 if __name__ == "__main__":
-    # Local CLI only; never runs in Space
     import argparse
-    …
+
+    parser = argparse.ArgumentParser(description="VQA pipeline (prepare/train/infer)")
     parser.add_argument("--prepare", action="store_true")
     parser.add_argument("--train", action="store_true")
     parser.add_argument("--epochs", type=int, default=20)
-    parser.add_argument("--prefix", default="vqa_custom")
     parser.add_argument("--image")
     parser.add_argument("--question", default="What is in the image?")
     args = parser.parse_args()

     if args.prepare or args.train:
         artifacts = prepare_data()
-        print("Prepared dataset with", len(artifacts["answer_to_idx"]), "
+        print("Prepared dataset with", len(artifacts["answer_to_idx"]), "answer classes.")

     if args.train:
         train_model(

@@ -226,9 +231,8 @@ if __name__ == "__main__":
             artifacts["vocab"],
             artifacts["idx_to_answer"],
             epochs=args.epochs,
-            save_prefix=args.prefix,
         )

     if args.image:
-        final_pipeline, *_ = load_artifacts(
-        print(final_pipeline(args.image, args.question
+        final_pipeline, *_ = load_artifacts()
+        print(final_pipeline(args.image, args.question))