PRUTHVIn committed on
Commit
87d4a35
·
verified ·
1 Parent(s): d7a7065

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +157 -205
main.py CHANGED
@@ -1,107 +1,30 @@
1
- import re
2
- from collections import Counter
3
- import pickle
4
-
5
  import torch
6
  import torch.nn as nn
7
- from torch.utils.data import Dataset, DataLoader
 
 
8
  from PIL import Image
9
  import torchvision.transforms as transforms
10
- import torchvision.models as models
 
11
  from langdetect import detect
 
 
12
 
13
- # ========== Global config ==========
14
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
15
- MAX_LEN = 20
16
 
17
- # Same transform as training
18
  transform = transforms.Compose([
19
  transforms.Resize((224, 224)),
20
  transforms.ToTensor()
21
  ])
22
 
23
-
24
- # ========== Dataset & preprocessing (for local training only) ==========
25
class VQADataset(Dataset):
    """Dataset over a DataFrame with ``image``, ``question_encoded`` and
    ``answer_encoded`` columns, for use with a PyTorch DataLoader."""

    def __init__(self, df, transform):
        # Re-number rows so positional access in __getitem__ is safe even
        # if the caller passed a filtered DataFrame.
        self.df = df.reset_index(drop=True)
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        # Force 3-channel RGB before the image transform.
        rgb = row["image"].convert("RGB")
        img_tensor = self.transform(rgb)
        q_tensor = torch.tensor(row["question_encoded"], dtype=torch.long)
        a_tensor = torch.tensor(row["answer_encoded"], dtype=torch.long)
        return img_tensor, q_tensor, a_tensor
40
-
41
-
42
def prepare_data(max_answers=50, min_word_count=3, max_len=MAX_LEN):
    """
    Local training helper: load VQA-RAD, clean, build vocab, dataset.
    NOT used on the Space at runtime.

    Args:
        max_answers: keep only the N most frequent answers (closed-set VQA).
        min_word_count: words seen fewer times than this are mapped to <UNK>.
        max_len: fixed question length (truncate/pad).

    Returns:
        dict with the cleaned DataFrame, a VQADataset, the question vocab,
        both answer mappings, and max_len.
    """
    # Local imports: `datasets` is only needed for offline preparation.
    from datasets import load_dataset
    import pandas as pd

    dataset = load_dataset("flaviagiammarino/vqa-rad")
    df = pd.DataFrame(dataset["train"])[["image", "question", "answer"]]

    def clean_text(text):
        # Lowercase and keep only a-z, 0-9 and spaces.
        text = str(text).lower()
        text = re.sub(r"[^a-z0-9 ]", "", text)
        return text

    df["question"] = df["question"].apply(clean_text)
    df["answer"] = df["answer"].apply(clean_text)

    # Restrict to the most frequent answers so this is a classification task.
    top_answers = df["answer"].value_counts().nlargest(max_answers).index
    df = df[df["answer"].isin(top_answers)].reset_index(drop=True)

    answer_to_idx = {a: i for i, a in enumerate(top_answers)}
    idx_to_answer = {i: a for a, i in answer_to_idx.items()}
    df["answer_encoded"] = df["answer"].apply(lambda x: answer_to_idx[x])

    # NOTE: special tokens are UPPERCASE here; inference-side lookups must
    # use the same spelling ("<PAD>" == 0, "<UNK>" == 1).
    vocab = {"<PAD>": 0, "<UNK>": 1}
    counter = Counter()
    for q in df["question"]:
        for w in q.split():
            counter[w] += 1

    # Only words above the frequency threshold get their own id.
    idx = 2
    for word, count in counter.items():
        if count >= min_word_count:
            vocab[word] = idx
            idx += 1

    def encode_question(q):
        # Truncate to max_len, then right-pad with <PAD> to a fixed length.
        tokens = q.split()
        enc = [vocab.get(w, vocab["<UNK>"]) for w in tokens]
        enc = enc[:max_len] + [vocab["<PAD>"]] * (max_len - len(enc))
        return enc

    df["question_encoded"] = df["question"].apply(encode_question)

    train_dataset = VQADataset(df, transform)
    return {
        "dataset_df": df,
        "train_dataset": train_dataset,
        "vocab": vocab,
        "answer_to_idx": answer_to_idx,
        "idx_to_answer": idx_to_answer,
        "max_len": max_len,
    }
97
-
98
-
99
- # ========== Model ==========
100
  class VQAModel(nn.Module):
101
  def __init__(self, vocab_size, embed_dim, hidden_dim, num_answers):
102
  super().__init__()
103
- # same backbone as original code (ResNet18 pretrained)
104
- self.cnn = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
105
  self.cnn.fc = nn.Identity()
106
  self.embedding = nn.Embedding(vocab_size, embed_dim)
107
  self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
@@ -114,125 +37,154 @@ class VQAModel(nn.Module):
114
  q_embed = self.embedding(question)
115
  _, (h, _) = self.lstm(q_embed)
116
  q_feat = h.squeeze(0)
117
- x = torch.cat((img_feat, q_feat), dim=1)
118
- x = self.relu(self.fc1(x))
119
- return self.fc2(x)
120
-
121
-
122
- # ========== Training (local only, not used on Space) ==========
123
def train_model(train_dataset, vocab, idx_to_answer,
                epochs=20, batch_size=32, lr=1e-3, save_prefix="vqa_custom"):
    """Use only in Colab / local to create vqa_custom_model.pth etc.

    Trains the CNN+LSTM VQAModel with cross-entropy over the closed answer
    set and writes three artifacts to the working directory:
    vqa_custom_model.pth, vocab.pkl, answer_mapping.pkl.

    NOTE(review): `save_prefix` is accepted but never used — the artifact
    filenames are hard-coded below; confirm before relying on it.
    """
    vocab_size = len(vocab)
    num_answers = len(idx_to_answer)

    # embed_dim=300, hidden_dim=256 are fixed here and must match the
    # values used when the weights are re-loaded at inference time.
    model = VQAModel(vocab_size, 300, 256, num_answers).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    from tqdm import tqdm  # local import: progress bar only needed here

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for images, questions, answers in tqdm(loader, desc=f"Epoch {epoch+1}/{epochs}"):
            images, questions, answers = images.to(device), questions.to(device), answers.to(device)
            outputs = model(images, questions)
            loss = criterion(outputs, answers)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss: {total_loss/len(loader):.4f}")

    # Persist weights + the exact vocab/answer mapping used for training.
    torch.save(model.state_dict(), "vqa_custom_model.pth")
    with open("vocab.pkl", "wb") as f:
        pickle.dump(vocab, f)
    with open("answer_mapping.pkl", "wb") as f:
        pickle.dump(idx_to_answer, f)

    return model
158
-
159
-
160
- # ========== Load artifacts + inference (used in Space) ==========
161
def load_artifacts(prefix=None, map_location=None):
    """
    Load your original good model:
    - vqa_custom_model.pth
    - vocab.pkl
    - answer_mapping.pkl

    Returns a tuple of (final_pipeline, predict_custom_vqa, vocab,
    idx_to_answer, model, encode_question_infer), where the callables are
    closures over the loaded artifacts.

    NOTE(review): `prefix` is accepted but unused — filenames are
    hard-coded; confirm before relying on it.
    """
    # pickle.load on these files assumes they are trusted local artifacts.
    with open("vocab.pkl", "rb") as f:
        vocab = pickle.load(f)
    with open("answer_mapping.pkl", "rb") as f:
        idx_to_answer = pickle.load(f)

    # Dimensions (300, 256) must match those used in train_model.
    model = VQAModel(len(vocab), 300, 256, len(idx_to_answer))
    model.load_state_dict(torch.load("vqa_custom_model.pth",
                                     map_location=map_location or device))
    model.to(device)
    model.eval()

    def encode_question_infer(q, max_len=MAX_LEN):
        # Lowercase, tokenize, map to ids, then truncate/pad to max_len.
        # .get fallbacks keep this safe even if special tokens are missing.
        q = str(q).lower()
        tokens = q.split()
        enc = [vocab.get(w, vocab.get("<UNK>", 0)) for w in tokens]
        enc = enc[:max_len] + [vocab.get("<PAD>", 0)] * (max_len - len(enc))
        return torch.tensor(enc).unsqueeze(0)

    def predict_custom_vqa(image_path, question):
        # Single-image, single-question inference on the custom model.
        image = Image.open(image_path).convert("RGB")
        image_t = transform(image).unsqueeze(0).to(device)
        q = encode_question_infer(question).to(device)
        with torch.no_grad():
            out = model(image_t, q)
            _, pred = torch.max(out, 1)
        return idx_to_answer[pred.item()]

    def final_pipeline(image_path, question, open_vqa_fn=None, translate_fn=None):
        # Keep exactly what your good model expects (English radiology questions)
        # NOTE(review): `lang` is detected but unused — translation is
        # deliberately skipped since the model was trained on English.
        lang = detect(question)
        q_en = question  # you trained in English; skip translation

        # Always use custom model here; you can add BLIP routing later if needed
        answer_en = predict_custom_vqa(image_path, q_en)
        return answer_en

    return final_pipeline, predict_custom_vqa, vocab, idx_to_answer, model, encode_question_infer
205
-
206
-
207
def load_artifacts_and_helpers(prefix="vqa_custom", map_location=None):
    """Compatibility wrapper used by app.py.

    `prefix` is accepted for signature stability but is not forwarded.
    """
    artifacts = load_artifacts(map_location=map_location)
    return artifacts
210
-
211
-
212
- # ========== Optional CLI (local only) ==========
213
# ========== Optional CLI (local only) ==========
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="VQA pipeline (prepare/train/infer)")
    parser.add_argument("--prepare", action="store_true")
    parser.add_argument("--train", action="store_true")
    parser.add_argument("--epochs", type=int, default=20)
    parser.add_argument("--image")
    parser.add_argument("--question", default="What is in the image?")
    args = parser.parse_args()

    # --train implies --prepare: the dataset is rebuilt either way.
    if args.prepare or args.train:
        artifacts = prepare_data()
        print("Prepared dataset with", len(artifacts["answer_to_idx"]), "answer classes.")

        if args.train:
            train_model(
                artifacts["train_dataset"],
                artifacts["vocab"],
                artifacts["idx_to_answer"],
                epochs=args.epochs,
            )

    # One-off inference from the saved artifacts.
    if args.image:
        final_pipeline, *_ = load_artifacts()
        print(final_pipeline(args.image, args.question))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import torch
2
  import torch.nn as nn
3
+ import torchvision.models as models
4
+ import pickle
5
+ import re
6
  from PIL import Image
7
  import torchvision.transforms as transforms
8
+ from transformers import Blip2Processor, Blip2ForConditionalGeneration
9
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
10
  from langdetect import detect
11
+ import numpy as np
12
+ import os
13
 
14
+ # Global models dictionary
15
+ models_dict = None
16
+ device = None
17
 
18
+ # Transforms
19
  transform = transforms.Compose([
20
  transforms.Resize((224, 224)),
21
  transforms.ToTensor()
22
  ])
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  class VQAModel(nn.Module):
25
  def __init__(self, vocab_size, embed_dim, hidden_dim, num_answers):
26
  super().__init__()
27
+ self.cnn = models.resnet18(pretrained=False)
 
28
  self.cnn.fc = nn.Identity()
29
  self.embedding = nn.Embedding(vocab_size, embed_dim)
30
  self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
 
37
  q_embed = self.embedding(question)
38
  _, (h, _) = self.lstm(q_embed)
39
  q_feat = h.squeeze(0)
40
+ combined = torch.cat((img_feat, q_feat), dim=1)
41
+ x = self.relu(self.fc1(combined))
42
+ out = self.fc2(x)
43
+ return out
44
+
45
def load_models():
    """Load all models once at startup.

    Populates the module-level ``models_dict`` with: the custom CNN+LSTM
    VQA model plus its vocab/answer mapping, a BLIP-2 model+processor for
    open-ended questions, and an NLLB translator for multilingual I/O.
    """
    global models_dict, device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print(f"Using device: {device}")

    # Load custom VQA model
    # pickle.load assumes these are trusted local artifacts shipped with
    # the Space; never point them at untrusted files.
    with open("models/vocab.pkl", "rb") as f:
        vocab = pickle.load(f)
    with open("models/answer_mapping.pkl", "rb") as f:
        idx_to_answer = pickle.load(f)

    # Dimensions (300, 256) must match those used at training time.
    vocab_size = len(vocab)
    model = VQAModel(vocab_size, 300, 256, len(idx_to_answer))
    model.load_state_dict(torch.load("models/vqa_custom_model.pth", map_location=device))
    model.to(device)
    model.eval()

    # BLIP2 for open-ended (smaller model for free tier)
    # NOTE(review): verify "Salesforce/blip2-flan-t5-base" exists on the Hub —
    # the published BLIP-2 Flan-T5 checkpoints are -xl/-xxl variants.
    # NOTE(review): `torch_dtype` passed to a *processor* is likely ignored;
    # it only affects model weights — confirm.
    print("Loading BLIP2...")
    processor = Blip2Processor.from_pretrained(
        "Salesforce/blip2-flan-t5-base",
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
    )
    blip_model = Blip2ForConditionalGeneration.from_pretrained(
        "Salesforce/blip2-flan-t5-base",
        device_map="auto" if torch.cuda.is_available() else None,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        low_cpu_mem_usage=True
    )

    # Translator
    print("Loading Translator...")
    translator_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
    translator_model = AutoModelForSeq2SeqLM.from_pretrained(
        "facebook/nllb-200-distilled-600M",
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
    ).to(device)

    # langdetect ISO codes -> NLLB FLORES-200 language codes.
    lang_code_map = {
        "en": "eng_Latn", "hi": "hin_Deva", "te": "tel_Telu",
        "ta": "tam_Taml", "kn": "kan_Knda", "ml": "mal_Mlym"
    }

    models_dict = {
        'model': model, 'vocab': vocab, 'idx_to_answer': idx_to_answer,
        'processor': processor, 'blip_model': blip_model,
        'translator_tokenizer': translator_tokenizer,
        'translator_model': translator_model, 'lang_code_map': lang_code_map,
        'device': device
    }

    print("✅ All models loaded successfully!")
    return models_dict
100
+
101
def init_models():
    """Lazily load the model bundle on first use and return it."""
    global models_dict
    if models_dict is not None:
        return models_dict
    # Silence the HuggingFace tokenizers fork warning before loading.
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    load_models()
    return models_dict
108
+
109
+ # All your functions remain EXACTLY the same...
110
def clean_text(text):
    """Normalize *text* for vocab lookup: lowercase, keep only a-z, 0-9
    and spaces.

    Coerces the input via ``str`` first, matching the training-side
    cleaner, so non-string values (e.g. NaN or numbers coming out of a
    DataFrame) no longer raise ``AttributeError``.
    """
    text = str(text).lower()
    return re.sub(r"[^a-z0-9 ]", "", text)
114
+
115
def encode_question_infer(q, vocab, max_len=20):
    """Clean, tokenize and encode a question as a fixed-length LongTensor.

    Bug fix: the vocabulary built at training time uses UPPERCASE special
    tokens ("<PAD>" = 0, "<UNK>" = 1), but this function looked up
    lowercase "<unk>"/"<pad>", raising KeyError on every call. Both
    spellings are now accepted, falling back to the ids the training
    vocab assigns (presumably 1 for unknown, 0 for padding — confirm
    against the shipped vocab.pkl).

    Args:
        q: raw question string (any language-cleaned text).
        vocab: token -> id mapping.
        max_len: fixed output length (truncate then right-pad); defaults
            to the historical hard-coded value of 20.
    """
    unk_id = vocab.get("<UNK>", vocab.get("<unk>", 1))
    pad_id = vocab.get("<PAD>", vocab.get("<pad>", 0))
    tokens = clean_text(q).split()
    enc = [vocab.get(w, unk_id) for w in tokens]
    enc = enc[:max_len] + [pad_id] * (max_len - len(enc))
    return torch.tensor(enc, dtype=torch.long)
122
+
123
def translate(text, src_lang, tgt_lang, tokenizer, model, lang_code_map, device):
    """Translate *text* with an NLLB seq2seq model; best-effort fallback.

    Fixes: the bare ``except:`` (which also swallowed KeyboardInterrupt /
    SystemExit) is narrowed to ``Exception``, and the implicit KeyError
    on an unsupported ``tgt_lang`` is replaced by an explicit guard —
    observable behavior is unchanged: unsupported targets return the
    input unmodified.

    Args:
        text: source-language string.
        src_lang / tgt_lang: langdetect-style ISO codes.
        tokenizer / model: NLLB tokenizer and AutoModelForSeq2SeqLM.
        lang_code_map: ISO code -> FLORES-200 code mapping.
        device: torch device the model lives on.
    """
    if tgt_lang not in lang_code_map:
        return text
    try:
        # Unknown source languages fall back to English tokenization.
        tokenizer.src_lang = lang_code_map.get(src_lang, "eng_Latn")
        inputs = tokenizer(text, return_tensors="pt", padding=True).to(device)
        generated = model.generate(
            **inputs,
            # Forces the decoder to emit the target language.
            forced_bos_token_id=tokenizer.convert_tokens_to_ids(lang_code_map[tgt_lang]),
            max_length=50, num_beams=5
        )
        return tokenizer.decode(generated[0], skip_special_tokens=True)
    except Exception:
        # Best-effort: surface the untranslated text rather than crash the UI.
        return text
135
 
136
def predict_custom_vqa(image_tensor, question_tensor, model, idx_to_answer, device):
    """Run the custom VQA model on one (image, question) pair and return
    the answer string for the highest-scoring class."""
    model.eval()
    with torch.no_grad():
        logits = model(image_tensor.to(device), question_tensor.to(device))
        best_idx = logits.argmax(dim=1)
    return idx_to_answer[best_idx.item()]
144
+
145
def open_vqa(image, question, processor, blip_model):
    """Answer an open-ended question about *image* with BLIP-2 and return
    the decoded generation."""
    inputs = processor(image, question, return_tensors="pt")
    if torch.cuda.is_available():
        # Move every input tensor onto the (possibly sharded) model device.
        inputs = {name: value.to(blip_model.device) for name, value in inputs.items()}
    generated = blip_model.generate(**inputs, max_new_tokens=50)
    return processor.decode(generated[0], skip_special_tokens=True)
151
+
152
def final_pipeline(image_path_or_pil, question):
    """End-to-end multilingual VQA: detect language, translate to English
    if needed, route to BLIP-2 (open-ended) or the custom model (closed
    set), then translate the answer back.

    Fixes: the bare ``except:`` around language detection is narrowed to
    ``Exception``, and the image tensor is only built in the branch that
    actually consumes it (the BLIP-2 path works on the PIL image).

    Returns a markdown string with the detected language and the answer.
    """
    init_models()
    m = models_dict

    # Accept either an in-memory PIL image or a filesystem path.
    if hasattr(image_path_or_pil, 'convert'):
        image = image_path_or_pil.convert("RGB")
    else:
        image = Image.open(image_path_or_pil).convert("RGB")

    # langdetect raises on short/ambiguous text; default to English.
    try:
        lang = detect(question)
    except Exception:
        lang = "en"

    if lang != "en":
        q_en = translate(question, lang, "en",
                         m['translator_tokenizer'], m['translator_model'],
                         m['lang_code_map'], m['device'])
    else:
        q_en = question

    # Heuristic routing: open-ended phrasings go to BLIP-2, everything
    # else to the closed-vocabulary custom model.
    if any(x in q_en.lower() for x in ["what is", "describe", "this place", "show"]):
        answer_en = open_vqa(image, q_en, m['processor'], m['blip_model'])
    else:
        image_tensor = transform(image).unsqueeze(0)
        q_tensor = encode_question_infer(q_en, m['vocab']).unsqueeze(0)
        answer_en = predict_custom_vqa(image_tensor, q_tensor,
                                       m['model'], m['idx_to_answer'], m['device'])

    # Translate the answer back into the user's language if needed.
    if lang != "en":
        answer = translate(answer_en, "en", lang,
                           m['translator_tokenizer'], m['translator_model'],
                           m['lang_code_map'], m['device'])
    else:
        answer = answer_en

    return f"**Detected Language:** {lang}\n**Answer:** {answer}"