Upload folder using huggingface_hub
- .DS_Store +0 -0
- __pycache__/api.cpython-39.pyc +0 -0
- __pycache__/config.cpython-39.pyc +0 -0
- __pycache__/inference.cpython-39.pyc +0 -0
- api.py +24 -0
- app.py +41 -0
- config.py +14 -0
- index.html +37 -0
- inference.py +160 -0
- models/__pycache__/vqa_model.cpython-39.pyc +0 -0
- models/pretrained.py +2 -0
- models/vqa_model.py +27 -0
- requirements.txt +9 -0
- test.jpg +0 -0
- train.py +195 -0
- utils/__pycache__/text_utils.cpython-39.pyc +0 -0
- utils/text_utils.py +11 -0
- utils/translator.py +10 -0
.DS_Store
ADDED
Binary file (8.2 kB)
__pycache__/api.cpython-39.pyc
ADDED
Binary file (850 Bytes)
__pycache__/config.cpython-39.pyc
ADDED
Binary file (479 Bytes)
__pycache__/inference.cpython-39.pyc
ADDED
Binary file (2.25 kB)
api.py
ADDED
@@ -0,0 +1,24 @@
+from fastapi import FastAPI, File, UploadFile, Form
+from inference import predict
+import shutil
+import os
+
+app = FastAPI()
+
+UPLOAD_DIR = "temp"
+os.makedirs(UPLOAD_DIR, exist_ok=True)
+
+@app.post("/predict")
+async def predict_api(file: UploadFile = File(...), question: str = Form(...)):
+    try:
+        file_path = os.path.join(UPLOAD_DIR, file.filename)
+
+        with open(file_path, "wb") as buffer:
+            shutil.copyfileobj(file.file, buffer)
+
+        answer = predict(file_path, question)
+
+        return {"answer": answer}
+
+    except Exception as e:
+        return {"error": str(e)}
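
For reference, a minimal client for this endpoint might look like the sketch below. It assumes the server was started locally (e.g. uvicorn api:app --port 8000) and that a test.jpg sits next to the script; both are assumptions, not part of this commit.

import requests

# Usage sketch (hypothetical): POST an image plus a question to the /predict route.
with open("test.jpg", "rb") as f:
    resp = requests.post(
        "http://127.0.0.1:8000/predict",
        files={"file": ("test.jpg", f, "image/jpeg")},
        data={"question": "What is in the image?"},
    )

print(resp.json())  # {"answer": "..."} on success, {"error": "..."} on failure
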
app.py
ADDED
@@ -0,0 +1,41 @@
+import gradio as gr
+from inference import predict
+import torch
+from huggingface_hub import hf_hub_download
+
+# This pulls just the model file from your specific repo
+model_path = hf_hub_download(repo_id="PRUTHVIn/vqa_project", filename="weights/vqa_model.pth")
+
+# Now load it into your model class (example)
+# model.load_state_dict(torch.load(model_path))
+
+def vqa_interface(image, question):
+    try:
+        if image is None or question.strip() == "":
+            return "Please upload an image and enter a question."
+
+        answer = predict(image, question)
+        return answer
+
+    except Exception as e:
+        print("ERROR:", str(e))
+        return f"Error: {str(e)}"
+
+
+iface = gr.Interface(
+    fn=vqa_interface,
+    inputs=[
+        gr.Image(type="filepath", label="Upload Image"),
+        gr.Textbox(
+            label="Ask a Question",
+            placeholder="e.g. What is in the image?"
+        )
+    ],
+    outputs=gr.Textbox(label="Answer"),
+    title="🧠 Smart Visual Question Answering System",
+    description="Upload any image and ask anything (works for medical + general images)",
+    theme="soft"
+)
+
+if __name__ == "__main__":
+    iface.launch()
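
Note that app.py downloads the checkpoint but leaves the loading step commented out; inference.py instead reads weights/vqa_model.pth from the local tree. Below is a sketch of actually loading the downloaded file into the model class; whether weights/vocab.pkl and weights/answers.pkl also exist in that repo is an assumption.

import pickle
import torch
from huggingface_hub import hf_hub_download
from models.vqa_model import VQAModel

model_path = hf_hub_download(repo_id="PRUTHVIn/vqa_project", filename="weights/vqa_model.pth")
# Assumed to exist in the same repo; adjust filenames if they differ.
vocab_path = hf_hub_download(repo_id="PRUTHVIn/vqa_project", filename="weights/vocab.pkl")
answers_path = hf_hub_download(repo_id="PRUTHVIn/vqa_project", filename="weights/answers.pkl")

with open(vocab_path, "rb") as f:
    vocab = pickle.load(f)
with open(answers_path, "rb") as f:
    idx_to_answer = pickle.load(f)

# Same hyperparameters as inference.py (embed_dim=300, hidden_dim=256).
model = VQAModel(len(vocab), 300, 256, len(idx_to_answer))
model.load_state_dict(torch.load(model_path, map_location="cpu"))
model.eval()
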
config.py
ADDED
@@ -0,0 +1,14 @@
+import torch
+
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+MAX_LEN = 20
+EMBED_DIM = 300
+HIDDEN_DIM = 256
+BATCH_SIZE = 32
+LR = 1e-3
+EPOCHS = 5
+
+MODEL_PATH = "weights/vqa_model.pth"
+VOCAB_PATH = "weights/vocab.pkl"
+ANSWER_PATH = "weights/answers.pkl"
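
These constants mirror the literals hard-coded in inference.py and train.py (300, 256, 20, the weights paths). A small sketch of building the model from the config instead, so the values live in one place:

import pickle
import torch
from config import DEVICE, EMBED_DIM, HIDDEN_DIM, MODEL_PATH, VOCAB_PATH, ANSWER_PATH
from models.vqa_model import VQAModel

with open(VOCAB_PATH, "rb") as f:
    vocab = pickle.load(f)
with open(ANSWER_PATH, "rb") as f:
    idx_to_answer = pickle.load(f)

# Identical to the construction in inference.py, minus the magic numbers.
model = VQAModel(len(vocab), EMBED_DIM, HIDDEN_DIM, len(idx_to_answer)).to(DEVICE)
model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))
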
index.html
ADDED
@@ -0,0 +1,37 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>VQA App</title>
+</head>
+<body>
+
+<h2>Visual Question Answering</h2>
+
+<input type="file" id="image"><br><br>
+<input type="text" id="question" placeholder="Ask a question"><br><br>
+
+<button onclick="send()">Submit</button>
+
+<h3 id="result"></h3>
+
+<script>
+async function send() {
+    const file = document.getElementById("image").files[0];
+    const question = document.getElementById("question").value;
+
+    let formData = new FormData();
+    formData.append("file", file);
+    formData.append("question", question);
+
+    const res = await fetch("http://127.0.0.1:8000/predict", {
+        method: "POST",
+        body: formData
+    });
+
+    const data = await res.json();
+    document.getElementById("result").innerText = data.answer || data.error;
+}
+</script>
+
+</body>
+</html>
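
One caveat: this page calls the FastAPI server from a different origin (a file:// URL or a static host), so browsers will block the fetch unless api.py enables CORS. A sketch of the standard FastAPI middleware, with permissive settings that should be tightened outside local testing:

from fastapi.middleware.cors import CORSMiddleware

# Added to api.py after app = FastAPI(); "*" is fine for local testing only.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
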
inference.py
ADDED
@@ -0,0 +1,160 @@
+from transformers import (
+    Blip2Processor,
+    Blip2ForConditionalGeneration,
+    AutoTokenizer,
+    AutoModelForSeq2SeqLM
+)
+from langdetect import detect
+from PIL import Image
+import torch
+import pickle
+import torchvision.transforms as transforms
+
+# ========================
+# PERFORMANCE SETTINGS
+# ========================
+torch.set_num_threads(4)
+
+# ========================
+# DEVICE (CPU ONLY)
+# ========================
+device = torch.device("cpu")
+
+# ========================
+# LOAD BLIP2 (SAFE)
+# ========================
+print("Loading BLIP2...")
+
+processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
+
+blip_model = Blip2ForConditionalGeneration.from_pretrained(
+    "Salesforce/blip2-flan-t5-xl"
+)
+
+blip_model.to(device)
+blip_model.eval()
+
+# ========================
+# LOAD TRANSLATOR
+# ========================
+print("Loading Translator...")
+
+translator_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
+translator_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
+
+translator_model.to(device)
+translator_model.eval()
+
+lang_code_map = {
+    "en":"eng_Latn","hi":"hin_Deva","te":"tel_Telu",
+    "ta":"tam_Taml","kn":"kan_Knda","ml":"mal_Mlym"
+}
+
+def translate(text, src, tgt):
+    translator_tokenizer.src_lang = lang_code_map[src]
+    inputs = translator_tokenizer(text, return_tensors="pt")
+
+    with torch.no_grad():
+        tokens = translator_model.generate(
+            **inputs,
+            forced_bos_token_id=translator_tokenizer.convert_tokens_to_ids(lang_code_map[tgt]),
+            max_length=50
+        )
+
+    return translator_tokenizer.decode(tokens[0], skip_special_tokens=True)
+
+# ========================
+# LOAD CUSTOM MODEL
+# ========================
+from models.vqa_model import VQAModel
+
+transform = transforms.Compose([
+    transforms.Resize((224,224)),
+    transforms.ToTensor()
+])
+
+with open("weights/vocab.pkl","rb") as f:
+    vocab = pickle.load(f)
+
+with open("weights/answers.pkl","rb") as f:
+    idx_to_answer = pickle.load(f)
+
+custom_model = VQAModel(len(vocab),300,256,len(idx_to_answer))
+custom_model.load_state_dict(torch.load("weights/vqa_model.pth", map_location=device))
+custom_model.to(device)
+custom_model.eval()
+
+def encode_question(q):
+    tokens = q.lower().split()
+    enc = [vocab.get(w, vocab["<UNK>"]) for w in tokens]
+    enc = enc[:20] + [vocab["<PAD>"]] * (20-len(enc))
+    return torch.tensor(enc).unsqueeze(0)
+
+# ========================
+# CUSTOM MODEL
+# ========================
+def predict_custom_vqa(image_path, question):
+    image = Image.open(image_path).convert("RGB")
+    image = transform(image).unsqueeze(0)
+    q = encode_question(question)
+
+    with torch.no_grad():
+        out = custom_model(image, q)
+        _, pred = torch.max(out,1)
+
+    return idx_to_answer[pred.item()]
+
+# ========================
+# BLIP2 (OPTIMIZED)
+# ========================
+def open_vqa(image_path, question):
+    image = Image.open(image_path).convert("RGB")
+
+    inputs = processor(image, question, return_tensors="pt")
+
+    with torch.no_grad():
+        out = blip_model.generate(
+            **inputs,
+            max_new_tokens=15  # 🔥 reduced for speed
+        )
+
+    return processor.decode(out[0], skip_special_tokens=True)
+
+# ========================
+# FINAL PIPELINE
+# ========================
+def final_pipeline(image_path, question):
+    lang = detect(question)
+
+    if lang != "en":
+        q_en = translate(question, lang, "en")
+    else:
+        q_en = question
+
+    if "what is" in q_en.lower() or "this place" in q_en.lower():
+        answer_en = open_vqa(image_path, q_en)
+    else:
+        answer_en = predict_custom_vqa(image_path, q_en)
+
+    if lang != "en":
+        return translate(answer_en, "en", lang)
+    else:
+        return answer_en
+
+def predict(image_path, question):
+    return final_pipeline(image_path, question)
+
+# ========================
+# WARMUP
+# ========================
+print("Warming up...")
+dummy = Image.new("RGB", (224,224))
+processor(dummy, "test", return_tensors="pt")
+
+print("✅ Ready!")
+
+# ========================
+# TEST
+# ========================
+if __name__ == "__main__":
+    print(predict("test.jpg","What is in the image?"))
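
Two fragile spots in this module: detect() can throw on very short strings, and translate() raises KeyError when langdetect returns a code outside lang_code_map (e.g. "fr"). A hardened variant of the pipeline, a sketch that falls back to English rather than a drop-in from this commit:

def safe_pipeline(image_path, question):
    # Fall back to English when detection fails or the language is unsupported.
    try:
        lang = detect(question)
    except Exception:
        lang = "en"
    if lang not in lang_code_map:
        lang = "en"

    q_en = translate(question, lang, "en") if lang != "en" else question

    # Same keyword routing as final_pipeline: open-ended questions go to BLIP2.
    if "what is" in q_en.lower() or "this place" in q_en.lower():
        answer_en = open_vqa(image_path, q_en)
    else:
        answer_en = predict_custom_vqa(image_path, q_en)

    return translate(answer_en, "en", lang) if lang != "en" else answer_en
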
models/__pycache__/vqa_model.cpython-39.pyc
ADDED
Binary file (1.22 kB)
models/pretrained.py
ADDED
@@ -0,0 +1,2 @@
+def open_vqa_stub(image_path, question):
+    return "Pretrained VQA disabled (too heavy for local)."
models/vqa_model.py
ADDED
@@ -0,0 +1,27 @@
+import torch
+import torch.nn as nn
+import torchvision.models as models
+
+class VQAModel(nn.Module):
+    def __init__(self, vocab_size, embed_dim, hidden_dim, num_answers):
+        super().__init__()
+
+        self.cnn = models.resnet18(weights="DEFAULT")
+        self.cnn.fc = nn.Identity()
+
+        self.embedding = nn.Embedding(vocab_size, embed_dim)
+        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
+
+        self.fc1 = nn.Linear(512 + hidden_dim, 256)
+        self.relu = nn.ReLU()
+        self.fc2 = nn.Linear(256, num_answers)
+
+    def forward(self, image, question):
+        img_feat = self.cnn(image)
+
+        q_embed = self.embedding(question)
+        _, (h, _) = self.lstm(q_embed)
+        q_feat = h.squeeze(0)
+
+        x = self.relu(self.fc1(torch.cat((img_feat, q_feat), dim=1)))
+        return self.fc2(x)
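
A quick shape check for this architecture, with made-up sizes (a 100-word vocabulary and 10 answer classes): ResNet-18 with its fc replaced by Identity emits 512 image features, the LSTM's final hidden state emits hidden_dim question features, and their concatenation (512 + 256 = 768) matches fc1's input.

import torch
from models.vqa_model import VQAModel

# Hypothetical sizes purely for the check.
model = VQAModel(vocab_size=100, embed_dim=300, hidden_dim=256, num_answers=10)
model.eval()

images = torch.randn(2, 3, 224, 224)        # batch of 2 RGB images
questions = torch.randint(0, 100, (2, 20))  # batch of 2 questions, MAX_LEN = 20
with torch.no_grad():
    logits = model(images, questions)
print(logits.shape)  # torch.Size([2, 10]): one logit per candidate answer
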
requirements.txt
ADDED
@@ -0,0 +1,14 @@
+torch
+torchvision
+pillow
+pandas
+scikit-learn
+langdetect
+tqdm
+gradio
+huggingface_hub
+# required by inference.py (transformers), train.py (datasets) and api.py (fastapi + uvicorn):
+transformers
+datasets
+fastapi
+uvicorn
test.jpg
ADDED
train.py
ADDED
@@ -0,0 +1,195 @@
+from datasets import load_dataset
+import pandas as pd
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torchvision.transforms as transforms
+from torch.utils.data import Dataset, DataLoader, random_split
+from PIL import Image
+from collections import Counter
+import pickle
+import re
+from tqdm import tqdm
+import os
+
+# ========================
+# CONFIG
+# ========================
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+EPOCHS = 50
+BATCH_SIZE = 32
+LR = 5e-4
+MAX_LEN = 20
+
+# ========================
+# LOAD DATASET
+# ========================
+dataset = load_dataset("flaviagiammarino/vqa-rad")
+df = pd.DataFrame(dataset["train"])
+df = df[["image", "question", "answer"]]
+
+# ========================
+# CLEAN TEXT
+# ========================
+def clean_text(text):
+    text = text.lower()
+    return re.sub(r"[^a-z0-9 ]", "", text)
+
+df["question"] = df["question"].apply(clean_text)
+df["answer"] = df["answer"].apply(clean_text)
+
+# ========================
+# FILTER TOP ANSWERS
+# ========================
+top_answers = df["answer"].value_counts().nlargest(50).index
+df = df[df["answer"].isin(top_answers)]
+
+answer_to_idx = {a:i for i,a in enumerate(top_answers)}
+idx_to_answer = {i:a for a,i in answer_to_idx.items()}
+df["answer_encoded"] = df["answer"].apply(lambda x: answer_to_idx[x])
+
+# ========================
+# VOCAB
+# ========================
+vocab = {"<PAD>":0, "<UNK>":1}
+counter = Counter()
+
+for q in df["question"]:
+    for w in q.split():
+        counter[w] += 1
+
+idx = 2
+for word, count in counter.items():
+    if count > 2:
+        vocab[word] = idx
+        idx += 1
+
+def encode_question(q):
+    tokens = q.split()
+    enc = [vocab.get(w, vocab["<UNK>"]) for w in tokens]
+    enc = enc[:MAX_LEN] + [vocab["<PAD>"]] * (MAX_LEN - len(enc))
+    return enc
+
+df["question_encoded"] = df["question"].apply(encode_question)
+
+# ========================
+# DATASET CLASS
+# ========================
+transform = transforms.Compose([
+    transforms.Resize((224,224)),
+    transforms.ToTensor()
+])
+
+class VQADataset(Dataset):
+    def __init__(self, df):
+        self.df = df
+
+    def __len__(self):
+        return len(self.df)
+
+    def __getitem__(self, idx):
+        row = self.df.iloc[idx]
+
+        image = row["image"].convert("RGB")
+        image = transform(image)
+
+        question = torch.tensor(row["question_encoded"])
+        answer = torch.tensor(row["answer_encoded"])
+
+        return image, question, answer
+
+# ========================
+# SPLIT DATA
+# ========================
+dataset_full = VQADataset(df)
+train_size = int(0.8 * len(dataset_full))
+val_size = len(dataset_full) - train_size
+
+train_dataset, val_dataset = random_split(dataset_full, [train_size, val_size])
+
+train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
+val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
+
+# ========================
+# MODEL
+# ========================
+import torchvision.models as models
+
+class VQAModel(nn.Module):
+    def __init__(self, vocab_size, embed_dim, hidden_dim, num_answers):
+        super().__init__()
+
+        self.cnn = models.resnet18(weights="DEFAULT")
+        self.cnn.fc = nn.Identity()
+
+        self.embedding = nn.Embedding(vocab_size, embed_dim)
+        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
+
+        self.fc1 = nn.Linear(512 + hidden_dim, 256)
+        self.relu = nn.ReLU()
+        self.fc2 = nn.Linear(256, num_answers)
+
+    def forward(self, image, question):
+        img_feat = self.cnn(image)
+
+        q_embed = self.embedding(question)
+        _, (h, _) = self.lstm(q_embed)
+        q_feat = h.squeeze(0)
+
+        x = self.relu(self.fc1(torch.cat((img_feat, q_feat), dim=1)))
+        return self.fc2(x)
+
+model = VQAModel(len(vocab), 300, 256, len(answer_to_idx)).to(DEVICE)
+
+criterion = nn.CrossEntropyLoss()
+optimizer = optim.Adam(model.parameters(), lr=LR)
+
+# ========================
+# TRAIN LOOP
+# ========================
+for epoch in range(EPOCHS):
+    model.train()
+    total_loss = 0
+
+    for images, questions, answers in tqdm(train_loader):
+        images, questions, answers = images.to(DEVICE), questions.to(DEVICE), answers.to(DEVICE)
+
+        outputs = model(images, questions)
+        loss = criterion(outputs, answers)
+
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+
+        total_loss += loss.item()
+
+    # VALIDATION
+    model.eval()
+    val_loss = 0
+
+    with torch.no_grad():
+        for images, questions, answers in val_loader:
+            images, questions, answers = images.to(DEVICE), questions.to(DEVICE), answers.to(DEVICE)
+
+            outputs = model(images, questions)
+            loss = criterion(outputs, answers)
+            val_loss += loss.item()
+
+    print(f"\nEpoch {epoch+1}")
+    print(f"Train Loss: {total_loss/len(train_loader):.4f}")
+    print(f"Val Loss: {val_loss/len(val_loader):.4f}")
+
+# ========================
+# SAVE MODEL
+# ========================
+os.makedirs("weights", exist_ok=True)
+
+torch.save(model.state_dict(), "weights/vqa_model.pth")
+
+with open("weights/vocab.pkl", "wb") as f:
+    pickle.dump(vocab, f)
+
+with open("weights/answers.pkl", "wb") as f:
+    pickle.dump(idx_to_answer, f)
+
+print("\n✅ Training Complete & Model Saved!")
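
The loop above tracks only cross-entropy loss. A top-1 accuracy readout over the validation split, a sketch reusing model, val_loader and DEVICE as defined above, could be appended after training:

# Sketch: top-1 validation accuracy for the 50-answer classifier.
correct, total = 0, 0
model.eval()
with torch.no_grad():
    for images, questions, answers in val_loader:
        images, questions = images.to(DEVICE), questions.to(DEVICE)
        preds = model(images, questions).argmax(dim=1).cpu()
        correct += (preds == answers).sum().item()
        total += answers.size(0)
print(f"Val Accuracy: {correct / total:.2%}")
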
utils/__pycache__/text_utils.cpython-39.pyc
ADDED
Binary file (719 Bytes)
utils/text_utils.py
ADDED
@@ -0,0 +1,11 @@
+import re
+
+def clean_text(text):
+    text = text.lower()
+    return re.sub(r"[^a-z0-9 ]", "", text)
+
+def encode_question(q, vocab, max_len=20):
+    tokens = q.split()
+    enc = [vocab.get(w, vocab["<UNK>"]) for w in tokens]
+    enc = enc[:max_len] + [vocab["<PAD>"]] * (max_len - len(enc))
+    return enc
utils/translator.py
ADDED
@@ -0,0 +1,10 @@
+from langdetect import detect
+
+def detect_lang(text):
+    try:
+        return detect(text)
+    except Exception:
+        return "en"
+
+def translate(text, src, tgt):
+    return text