restructured files
- __init__.py +14 -0
- functions.py +197 -0
- fussionmodel.py +87 -0
- model.py +330 -0
- model_functions.py +210 -0
- models.py +29 -0
- qtype.py +25 -0
- tpred.py +24 -0
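
Taken together, the restructured modules are meant to be driven roughly as follows. This is a minimal end-to-end sketch, assuming the package directory is named "vqa" and that a dataset with image/question/question_class/answer columns is available; the dataset identifier below is a placeholder, not a real one:

from torch.utils.data import DataLoader
from datasets import load_dataset

from vqa import VQAModel, preprocess_example, collate_fn  # package name "vqa" is an assumption

raw = load_dataset("path/to/medico2025-vqa", split="train")  # hypothetical dataset id
data_train = raw.map(preprocess_example)  # .map turns tensors into lists; collate_fn converts them back

train_loader = DataLoader(data_train, batch_size=16, shuffle=True, collate_fn=collate_fn)

# Dims follow the encoders used in model.py: ViT/BERT are 768-d, the disease stub is 23-d.
model = VQAModel(img_dim=768, ques_dim=768, disease_dim=23, hidden_dim=512)
model.train(epochs=1, data_train=data_train, train_loader=train_loader)
model.save("vqa_model.pt")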
__init__.py
ADDED
@@ -0,0 +1,14 @@

# The old flat-layout imports are kept below for reference; the package now uses relative imports.
"""
from qtype import QuestionTypeClassifier
from tpred import TaskPredictor
from model import VQAModel
from fussionmodel import CoAttentionFusion
from functions import preprocess_example, preprocess_image, collate_fn
import torch
import torch.nn as nn
from datasets import load_dataset
from torch.utils.data import DataLoader
"""

from .functions import preprocess_example, preprocess_image, collate_fn
from .model import VQAModel
functions.py
ADDED
@@ -0,0 +1,197 @@

from PIL import Image
import requests
import torch
import torchvision.transforms as transforms
import os
import re
from collections import defaultdict
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

# Same preprocessing as used during training: resize, tensorize, ImageNet-normalize.
transformten = transforms.Compose([
    transforms.Resize((224, 224)),                      # adjust size for your model
    transforms.ToTensor(),                              # convert to tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406],    # ImageNet normalization
                         std=[0.229, 0.224, 0.225])
])

# Load the tokenizer once at module level instead of once per example.
router_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

image_cache = {}


def preprocess_image(image_source):
    """
    Preprocess a single image for inference.
    `image_source` can be a URL, a local file path, or a PIL image.
    Returns a tensor [C, H, W].
    """
    if isinstance(image_source, str):
        if image_source.startswith("http"):  # URL
            image = Image.open(requests.get(image_source, stream=True).raw).convert("RGB")
        else:  # local path
            image = Image.open(image_source).convert("RGB")
    elif isinstance(image_source, Image.Image):  # already a PIL image
        image = image_source
    else:
        raise ValueError("Unsupported image_source type")

    # Apply the same transform used during training: Resize(224) -> ToTensor() -> Normalize()
    return transformten(image)  # torch.Tensor [3, 224, 224]


def preprocess_example(example):
    # Resolve the image from the dataset (filenames map into the local Kaggle input dir)
    image_name = example["image"].split("/")[-1]
    image_path = os.path.join("/kaggle/input/medico2025", image_name)

    # Check if the image is already in our cache
    if image_path in image_cache:
        image = image_cache[image_path]
    else:
        image = Image.open(image_path)
        if image.mode != 'RGB':
            image = image.convert('RGB')
        image_cache[image_path] = image  # cache the loaded PIL image

    # Apply the normalize/transform pipeline (Resize + ToTensor + Normalize)
    image = transformten(image)

    # Tokenize the question
    q_inputs = router_tokenizer(example["question"],
                                return_tensors="pt",
                                truncation=True,
                                padding="max_length",
                                max_length=32)

    # q_inputs is a BatchEncoding with batch_size=1 tensors, so we squeeze
    input_ids = q_inputs["input_ids"].squeeze(0)            # torch.Tensor [seq_len]
    attention_mask = q_inputs["attention_mask"].squeeze(0)

    # Pack features
    return {
        "image": image,
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "answer": example["answer"],
        "question_class": example["question_class"],
        "image_url": example["image"],
    }


def normalize_answer(ans, q_type):
    ans = ans.strip().lower()

    if q_type == "yesno":
        if "yes" in ans or "present" in ans or "evidence" in ans:
            return "Yes"
        elif "no" in ans or "absent" in ans or "none" in ans:
            return "No"
        else:
            return None  # ambiguous

    if q_type == "count":
        # Extract a numeric value or return None
        numbers = re.findall(r"\d+", ans)
        if numbers:
            return numbers[0]
        elif "one" in ans: return "1"
        elif "two" in ans: return "2"
        return None

    if q_type == "color":
        for color in ["red", "green", "yellow", "blue", "white", "black"]:
            if color in ans:
                return color
        return None

    if q_type == "location":
        # Simplify locations to a small fixed set
        for loc in ["upper", "lower", "left", "right", "central"]:
            if loc in ans:
                return loc
        return None

    if q_type in ["single", "multi"]:
        return ans  # keep the original answer; the choice set can also be restricted

    return ans


def build_vocabs(dataset, q_types_mapping):
    # Build task-specific vocabularies
    task_vocabs = {}
    for general_class in set(q_types_mapping.values()):
        task_vocabs[general_class] = {}

    for row in dataset:
        fine_class = row["question_class"]

        # question_class may be a list of labels; use the first
        if isinstance(fine_class, list):
            fine_class = fine_class[0]

        general_class = q_types_mapping[fine_class]

        norm_ans = normalize_answer(row["answer"], general_class)
        if norm_ans is None:
            continue  # skip unnormalizable answers

        if norm_ans not in task_vocabs[general_class]:
            idx = len(task_vocabs[general_class])
            task_vocabs[general_class][norm_ans] = idx

    return task_vocabs


def build_answer_vocab(dataset, q_types_mapping):
    answer_vocab = defaultdict(dict)
    counters = defaultdict(int)

    for ans, q_class in zip(dataset["answer"], dataset["question_class"]):
        # q_class might be a list; pick the first (if multiple labels)
        if isinstance(q_class, list):
            q_class = q_class[0]

        general_class = q_types_mapping[q_class]

        if ans not in answer_vocab[general_class]:
            answer_vocab[general_class][ans] = counters[general_class]
            counters[general_class] += 1

    return answer_vocab


def collate_fn(batch):
    # Fields may arrive as plain lists (e.g. after datasets .map); coerce to tensors before stacking
    images = torch.stack([torch.tensor(item["image"]) if isinstance(item["image"], list)
                          else item["image"] for item in batch])
    input_ids = torch.stack([torch.tensor(item["input_ids"]) if isinstance(item["input_ids"], list)
                             else item["input_ids"] for item in batch])
    attention_mask = torch.stack([torch.tensor(item["attention_mask"]) if isinstance(item["attention_mask"], list)
                                  else item["attention_mask"] for item in batch])

    answers = [item["answer"] for item in batch]  # keep as list for label encoding later
    q_classes = [item["question_class"] for item in batch]
    return {
        "images": images,
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "answers": answers,
        "question_classes": q_classes,
    }
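
For reference, normalize_answer maps free-text answers onto each task's small label set before the vocabularies are built. A quick illustration with toy rows (not real dataset entries; the "vqa" package name is an assumption):

from vqa.functions import normalize_answer, build_vocabs

print(normalize_answer("There is evidence of a polyp", "yesno"))  # -> "Yes"
print(normalize_answer("two polyps were found", "count"))         # -> "2"
print(normalize_answer("a pale yellow lesion", "color"))          # -> "yellow"
print(normalize_answer("unclear", "yesno"))                       # -> None (ambiguous, skipped)

# build_vocabs then indexes only the normalizable answers, per general class
rows = [
    {"question_class": "abnormality_presence", "answer": "Yes"},
    {"question_class": "polyp_count", "answer": "two"},
]
mapping = {"abnormality_presence": "yesno", "polyp_count": "count"}
print(build_vocabs(rows, mapping))  # -> {'yesno': {'Yes': 0}, 'count': {'2': 0}}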
fussionmodel.py
ADDED
@@ -0,0 +1,87 @@

from transformers import ViTModel, BertModel
import torch.nn.functional as F

import torch
import torch.nn as nn


class CoAttentionFusion(nn.Module):
    def __init__(self, img_dim, ques_dim, disease_dim, hidden_dim, answer_vocab):
        super(CoAttentionFusion, self).__init__()

        self.img_proj = nn.Linear(img_dim, hidden_dim)
        self.ques_proj = nn.Linear(ques_dim, hidden_dim)
        self.dis_proj = nn.Linear(disease_dim, hidden_dim)

        self.att_img = nn.Linear(hidden_dim, 1)
        self.att_dis = nn.Linear(hidden_dim, 1)
        self.fusion = nn.Linear(hidden_dim * 3, hidden_dim)

        # Store the answer vocab inside the model for later use
        self.answer_vocab = answer_vocab

    def forward(self, img_feat, ques_feat, dis_vec):
        # Project each modality into the shared hidden space
        img_proj = torch.tanh(self.img_proj(img_feat))     # [B, R, H]
        ques_proj = torch.tanh(self.ques_proj(ques_feat))  # [B, H]

        dis_vec = dis_vec.to(torch.float32)
        dis_proj = torch.tanh(self.dis_proj(dis_vec))      # [H] for the current stub; broadcasts below

        # Expand the question vector across image regions for alignment
        ques_proj = ques_proj.unsqueeze(1)                         # [B, 1, H]
        ques_expand = ques_proj.expand(-1, img_proj.size(1), -1)   # [B, R, H]
        img_co = img_proj * ques_expand                            # [B, R, H]

        # Attention-weighted pooling over image regions
        att_img_weights = torch.sigmoid(self.att_img(img_co))  # [B, R, 1]
        img_att = (att_img_weights * img_proj).sum(1)           # [B, H]

        # Co-attention with the disease vector
        dis_co = dis_proj * ques_proj                            # [B, 1, H]
        att_dis_weights = torch.sigmoid(self.att_dis(dis_co))   # [B, 1, 1]
        dis_att = att_dis_weights * dis_proj                     # [B, 1, H]

        # Guard against unbatched inputs
        if img_att.dim() == 1:
            img_att = img_att.unsqueeze(0)      # [1, H]
        if ques_proj.dim() == 1:
            ques_proj = ques_proj.unsqueeze(0)  # [1, H]
        if dis_att.dim() == 1:
            dis_att = dis_att.unsqueeze(0)      # [1, H]

        # Concatenate the attended features
        ques_proj_flat = ques_proj.squeeze(1)  # [B, H]
        dis_att_flat = dis_att.squeeze(1)      # [B, H]

        joint_feat = torch.cat([img_att, ques_proj_flat, dis_att_flat], dim=1)  # [B, 3H]
        fused = torch.tanh(self.fusion(joint_feat))  # [B, H]
        return fused
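
A minimal shape check for the fusion block, assuming ViT patch features of 197 regions x 768 dims, a pooled 768-d BERT question vector, and the 23-d disease stub from models.py:

import torch
from vqa.fussionmodel import CoAttentionFusion  # package name "vqa" is an assumption

fusion = CoAttentionFusion(img_dim=768, ques_dim=768, disease_dim=23,
                           hidden_dim=512, answer_vocab={})

img_feat = torch.randn(4, 197, 768)  # [B, R, 768] ViT patch features
q_feat = torch.randn(4, 768)         # [B, 768] pooled question features
dis_vec = torch.zeros(23)            # matches the disease_model stub

fused = fusion(img_feat, q_feat, dis_vec)
print(fused.shape)  # torch.Size([4, 512])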
model.py
ADDED
@@ -0,0 +1,330 @@

import torch
import torch.nn as nn
import os
from .qtype import QuestionTypeClassifier
from .functions import build_vocabs, build_answer_vocab, collate_fn, preprocess_example, normalize_answer, preprocess_image
from .models import disease_model, device, generate_descriptive_answer, router_tokenizer, gen_model
from .tpred import TaskPredictor
from .model_functions import compute_loss, compute_meteor, compute_rouge, extract_count, forward_batch
from .fussionmodel import BertModel, CoAttentionFusion, ViTModel, F


class VQAModel(nn.Module):
    # Note: train() and eval() below intentionally replace nn.Module.train/eval
    # with full training and evaluation loops.
    def __init__(self, img_dim, ques_dim, disease_dim, hidden_dim):
        super(VQAModel, self).__init__()
        self.qtype_classifier = None
        self.answer_classifier = None
        self.epochs = 1
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.hidden_dim = hidden_dim
        self.input_dim = 768
        self.ques_dim = ques_dim
        self.disease_dim = disease_dim
        self.img_dim = img_dim
        self.fusion_module = None
        self.question_encoder = BertModel.from_pretrained("bert-base-uncased").to(self.device)
        self.image_encoder = ViTModel.from_pretrained("google/vit-base-patch16-224").to(self.device)
        self.optimizer = None
        self.answer_vocabs = None
        self.task_vocabs = None
        self.train_data = None
        self.train_loader = None
        self.q_types = ["yesno", "single", "multi", "color", "location", "count"]
        # Create task-specific heads (trainable); moved to the device so they can
        # consume the fused features directly
        self.task_heads = nn.ModuleDict({
            t: TaskPredictor(t, hidden=hidden_dim) for t in self.q_types
        }).to(self.device)
        # Map fine-grained dataset question classes to the six general task types
        self.q_types_mapping = {
            'abnormality_color': 'color',
            'landmark_color': 'color',
            'abnormality_location': 'location',
            'instrument_location': 'location',
            'landmark_location': 'location',
            'finding_count': 'count',
            'instrument_count': 'count',
            'polyp_count': 'count',
            'abnormality_presence': 'yesno',
            'box_artifact_presence': 'yesno',
            'finding_presence': 'yesno',
            'instrument_presence': 'yesno',
            'landmark_presence': 'yesno',
            'text_presence': 'yesno',
            'polyp_removal_status': 'yesno',
            'polyp_type': 'single',
            'polyp_size': 'single',
            'procedure_type': 'single',
        }

    def train(self, epochs, data_train, train_loader):
        self.epochs = epochs
        self.train_data = data_train
        self.train_loader = train_loader
        self.answer_vocabs = build_answer_vocab(self.train_data, self.q_types_mapping)
        self.task_vocabs = build_vocabs(self.train_data, self.q_types_mapping)
        self.qtype_classifier = QuestionTypeClassifier(num_types=len(self.q_types)).to(self.device)
        self.answer_classifier = nn.Linear(self.hidden_dim, len(self.answer_vocabs))  # match hidden_dim
        self.fusion_module = CoAttentionFusion(img_dim=self.img_dim,
                                               ques_dim=self.ques_dim,
                                               disease_dim=self.disease_dim,
                                               hidden_dim=self.hidden_dim,
                                               answer_vocab=self.answer_vocabs).to(self.device)
        # Include the task heads so they actually receive optimizer updates
        self.optimizer = torch.optim.AdamW(list(self.fusion_module.parameters()) +
                                           list(self.question_encoder.parameters()) +
                                           list(self.image_encoder.parameters()) +
                                           list(self.qtype_classifier.parameters()) +
                                           list(self.task_heads.parameters()), lr=2e-5)
        for epoch in range(self.epochs):
            self.fusion_module.train()
            self.qtype_classifier.train()
            total_loss = 0
            for batch in self.train_loader:
                self.optimizer.zero_grad()
                preds, answers, task_logits = forward_batch(
                    batch["images"],
                    batch["input_ids"],
                    batch["attention_mask"],
                    batch["answers"],
                    batch["question_classes"],  # fine-grained classes from the dataset
                    qtype_classifier=self.qtype_classifier,
                    fusion_module=self.fusion_module,
                    q_types=self.q_types,
                    q_types_mapping=self.q_types_mapping,
                    task_heads=self.task_heads,
                    device=self.device,
                    image_encoder=self.image_encoder,
                    question_encoder=self.question_encoder
                )
                loss = compute_loss(preds,
                                    answers,
                                    task_logits,
                                    batch["question_classes"],
                                    answer_vocabs=self.answer_vocabs,
                                    q_types_mapping=self.q_types_mapping,
                                    q_types=self.q_types,
                                    task_heads=self.task_heads)
                loss.backward()
                self.optimizer.step()
                total_loss += loss.item()
            print(f"Epoch {epoch}, Train Loss: {total_loss / len(train_loader)}")

    def eval(self, val_loader):
        """
        Evaluate the model on the validation set.

        Args:
            val_loader: DataLoader for validation data.

        Returns:
            avg_loss: average validation loss
            all_preds: list of predicted labels
            all_answers: list of ground-truth answers
        """
        self.fusion_module.eval()
        self.question_encoder.eval()
        self.image_encoder.eval()
        self.qtype_classifier.eval()
        for head in self.task_heads.values():
            head.eval()

        total_loss = 0.0
        all_preds, all_answers = [], []

        with torch.no_grad():
            for batch in val_loader:
                images = batch["images"].to(self.device)
                input_ids = batch["input_ids"].to(self.device)
                attention_mask = batch["attention_mask"].to(self.device)
                answers = batch["answers"]
                q_classes = batch["question_classes"]

                # ---- Disease vector ----
                disease_vec = disease_model(images)

                # ---- Question type classifier ----
                task_logits = self.qtype_classifier(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                )  # [B, num_types]

                # Map fine-grained classes to general task types
                mapped_classes = [
                    self.q_types_mapping[c[0] if isinstance(c, list) else c]
                    for c in q_classes
                ]

                # ---- Encoders ----
                q_feat = self.question_encoder(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                ).pooler_output  # [B, 768]

                img_outputs = self.image_encoder(pixel_values=images)
                img_feat = img_outputs.last_hidden_state  # [B, R, 768]

                # ---- Fusion ----
                fused = self.fusion_module(img_feat, q_feat, disease_vec)

                # ---- Predict per sample ----
                pred_tensors = []
                batch_preds = []
                for i, task_type in enumerate(mapped_classes):
                    predictor = self.task_heads[task_type]
                    pred_tensor = predictor(fused[i].unsqueeze(0))  # [1, C], or [1, 1] for count
                    pred_tensors.append(pred_tensor)

                    if task_type == "yesno":
                        pred_label = "Yes" if torch.argmax(pred_tensor, dim=1).item() == 1 else "No"
                    elif task_type == "count":
                        pred_val = pred_tensor.squeeze()
                        pred_label = str(int(round(pred_val.item())))
                    else:
                        ans_idx = torch.argmax(pred_tensor, dim=1).item()
                        if task_type in self.answer_vocabs and ans_idx < len(self.answer_vocabs[task_type]):
                            inv_vocab = {v: k for k, v in self.answer_vocabs[task_type].items()}
                            pred_label = inv_vocab.get(ans_idx, str(ans_idx))
                        else:
                            pred_label = str(ans_idx)

                    batch_preds.append(pred_label)

                # ---- Compute batch loss using the same per-sample prediction tensors ----
                batch_loss = compute_loss(
                    preds=pred_tensors,
                    answers=answers,
                    task_logits=task_logits,
                    true_q_classes=q_classes,
                    answer_vocabs=self.answer_vocabs,
                    q_types_mapping=self.q_types_mapping,
                    q_types=self.q_types,
                    task_heads=self.task_heads
                )
                total_loss += batch_loss.item()

                all_preds.extend(batch_preds)
                all_answers.extend(answers)

        avg_loss = total_loss / len(val_loader)
        return avg_loss, all_preds, all_answers

    def load(self, load_path="vqa_model.pt"):
        checkpoint = torch.load(load_path, map_location=self.device, weights_only=False)
        self.task_vocabs = checkpoint["task_vocabs"]
        self.answer_vocabs = checkpoint["answer_vocabs"]
        self.fusion_module = CoAttentionFusion(
            img_dim=self.img_dim, ques_dim=self.ques_dim, disease_dim=self.disease_dim, hidden_dim=self.hidden_dim,
            answer_vocab=checkpoint["answer_vocabs"]
        ).to(self.device)
        self.fusion_module.load_state_dict(checkpoint["fusion_module"])
        self.question_encoder.load_state_dict(checkpoint["question_encoder"])
        self.image_encoder.load_state_dict(checkpoint["image_encoder"])
        # The classifier is only built in train(), so instantiate it before loading weights
        self.qtype_classifier = QuestionTypeClassifier(num_types=len(self.q_types)).to(self.device)
        self.qtype_classifier.load_state_dict(checkpoint["qtype_classifier"])

        for k, v in checkpoint["task_heads"].items():
            self.task_heads[k].load_state_dict(v)

        # Recreate the optimizer over the restored parameters
        self.optimizer = torch.optim.AdamW(
            list(self.fusion_module.parameters()) +
            list(self.question_encoder.parameters()) +
            list(self.image_encoder.parameters()) +
            list(self.qtype_classifier.parameters()) +
            list(self.task_heads.parameters()),
            lr=2e-5
        )
        self.optimizer.load_state_dict(checkpoint["optimizer"])
        print("Model and components loaded successfully")

    def save(self, save_path="vqa_model.pt"):
        torch.save({
            "fusion_module": self.fusion_module.state_dict(),
            "question_encoder": self.question_encoder.state_dict(),
            "image_encoder": self.image_encoder.state_dict(),
            "qtype_classifier": self.qtype_classifier.state_dict(),
            "task_heads": {k: v.state_dict() for k, v in self.task_heads.items()},
            "optimizer": self.optimizer.state_dict(),
            "epochs": self.epochs,
            "answer_vocabs": self.answer_vocabs,
            "task_vocabs": self.task_vocabs
        }, save_path)
        print(f"Model saved at {save_path}")

    def predict(self, image, question):
        self.fusion_module.eval()
        self.question_encoder.eval()
        self.image_encoder.eval()
        self.qtype_classifier.eval()

        with torch.no_grad():
            # ---- Preprocess image ----
            image_tensor = preprocess_image(image).unsqueeze(0).to(self.device)

            # ---- Disease vector ----
            disease_vec = disease_model(image_tensor)

            # ---- Encode question ----
            q_inputs = router_tokenizer(
                question,
                return_tensors="pt",
                truncation=True,
                padding=True
            ).to(self.device)

            # DistilBERT classifier for the question type
            task_logits = self.qtype_classifier(
                input_ids=q_inputs["input_ids"],
                attention_mask=q_inputs["attention_mask"]
            )  # [1, num_types]

            task_idx = torch.argmax(task_logits, dim=1).item()
            task_type = self.q_types[task_idx]  # map index to general type

            # ---- Question encoder for fusion ----
            q_feat = self.question_encoder(**q_inputs).pooler_output  # [1, 768]

            # ---- Image encoder ----
            img_outputs = self.image_encoder(pixel_values=image_tensor)
            img_feat = img_outputs.last_hidden_state  # [1, R, 768]

            # ---- Fusion ----
            fused = self.fusion_module(img_feat, q_feat, disease_vec)

            # ---- Task-specific head ----
            predictor = self.task_heads[task_type]  # use the trained head
            pred_out = predictor(fused)

            # ---- Decode prediction ----
            if task_type == "yesno":
                pred_label = "Yes" if torch.argmax(pred_out, dim=1).item() == 1 else "No"

            elif task_type == "count":
                pred_label = str(int(pred_out.item()))

            else:  # categorical answer
                ans_idx = torch.argmax(pred_out, dim=1).item()
                if task_type in self.answer_vocabs and ans_idx < len(self.answer_vocabs[task_type]):
                    inv_vocab = {v: k for k, v in self.answer_vocabs[task_type].items()}
                    pred_label = inv_vocab.get(ans_idx, str(ans_idx))
                else:
                    pred_label = str(ans_idx)

            return pred_label
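
A rough lifecycle sketch for VQAModel; paths and hyperparameters are illustrative, and data_train/train_loader come from the pipeline in functions.py (package name "vqa" is an assumption):

from vqa import VQAModel

# Training builds the vocabularies, the q-type classifier, and the fusion module
model = VQAModel(img_dim=768, ques_dim=768, disease_dim=23, hidden_dim=512)
model.train(epochs=3, data_train=data_train, train_loader=train_loader)
model.save("vqa_model.pt")

# Later: restore everything and answer a single question
model = VQAModel(img_dim=768, ques_dim=768, disease_dim=23, hidden_dim=512)
model.load("vqa_model.pt")
answer = model.predict("sample.jpg", "Is a polyp present?")  # hypothetical image path
print(answer)  # e.g. "Yes"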
model_functions.py
ADDED
@@ -0,0 +1,210 @@

import torch
from torch.nn import CrossEntropyLoss, MSELoss
import re

from nltk.translate.meteor_score import meteor_score
from .models import disease_model
from .tpred import TaskPredictor


def forward_batch(images, input_ids, attention_mask, answers, question_classes=None,
                  qtype_classifier=None, fusion_module=None, q_types=None,
                  q_types_mapping=None, task_heads=None, device=None,
                  image_encoder=None, question_encoder=None):
    # Image encoding
    img_outputs = image_encoder(pixel_values=images.to(device))
    img_feat = img_outputs.last_hidden_state  # [B, R, 768]

    # Question-type logits (DistilBERT classifier)
    task_logits = qtype_classifier(input_ids=input_ids.to(device),
                                   attention_mask=attention_mask.to(device))  # [B, num_types]

    # Separate encoder for the question embeddings used in fusion
    q_feat = question_encoder(input_ids=input_ids.to(device),
                              attention_mask=attention_mask.to(device)).pooler_output  # [B, 768]

    # Disease model (currently a zero-vector stub, see models.py)
    disease_vec = disease_model(images.to(device))  # [23]

    # Fusion
    fused = fusion_module(img_feat, q_feat, disease_vec)

    # Task-specific predictions: one head output per sample
    preds = []
    for i, q_class in enumerate(question_classes):
        mapped_type = q_types_mapping[q_class[0] if isinstance(q_class, list) else q_class]
        predictor = task_heads[mapped_type]  # trained, task-specific head
        pred_out = predictor(fused[i].unsqueeze(0))
        preds.append(pred_out)

    return preds, answers, task_logits


def forward_batch1(images, input_ids, attention_mask, answers, true_q_classes=None,
                   qtype_classifier=None, fusion_module=None, q_types=None,
                   device=None, image_encoder=None, question_encoder=None):
    """
    Legacy variant kept for reference: routes each sample through a freshly
    constructed (untrained) TaskPredictor chosen by the *predicted* question type.
    """
    # Disease vector (placeholder disease model)
    disease_vec = disease_model(images.to(device))  # [23]

    # Encode image
    img_outputs = image_encoder(pixel_values=images.to(device))
    img_feat = img_outputs.last_hidden_state  # [B, R, 768]

    # Encode question
    q_feat = question_encoder(input_ids=input_ids.to(device),
                              attention_mask=attention_mask.to(device)).pooler_output  # [B, 768]

    # Predict the task type from the question
    task_logits = qtype_classifier(input_ids=input_ids.to(device),
                                   attention_mask=attention_mask.to(device))
    task_pred = torch.argmax(task_logits, dim=1)  # predicted type index

    # Fusion
    fused = fusion_module(img_feat, q_feat, disease_vec)

    # Task-specific predictions
    preds = []
    for i, t_idx in enumerate(task_pred):
        task_type = q_types[t_idx]  # map index to type string
        predictor = TaskPredictor(task_type).to(device)
        preds.append(predictor(fused[i].unsqueeze(0)))

    return preds, answers, task_logits


def extract_count(answer_str):
    """
    Try to convert an answer string into a number.
    Returns None if it cannot be parsed.
    """
    try:
        # Direct numeric
        return float(answer_str)
    except ValueError:
        pass

    # Handle words like "one", "two", etc.
    word2num = {
        "zero": 0, "one": 1, "two": 2, "three": 3,
        "four": 4, "five": 5, "six": 6,
        "seven": 7, "eight": 8, "nine": 9, "ten": 10
    }
    tokens = answer_str.lower().split()
    for t in tokens:
        if t in word2num:
            return float(word2num[t])

    # Extract any digits from the string
    numbers = re.findall(r"\d+", answer_str)
    if numbers:
        return float(numbers[0])

    return None  # fallback


def compute_meteor(preds, answers, answer_vocabs, mapped_classes):
    scores = []
    for pred, ans, c in zip(preds, answers, mapped_classes):
        if c not in answer_vocabs:
            continue
        # Get the predicted index
        pred_idx = pred.argmax(dim=1).item()
        # Map the index back to an answer string
        inv_vocab = {v: k for k, v in answer_vocabs[c].items()}
        pred_str = inv_vocab.get(pred_idx, "")
        # METEOR score between predicted and ground-truth answer
        score = meteor_score([ans.split()], pred_str.split())
        scores.append(score)
    return sum(scores) / len(scores) if scores else 0.0


def compute_rouge(preds, answers, answer_vocabs, mapped_classes):
    from rouge_score import rouge_scorer  # requires: pip install rouge_score
    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    scores = []
    for pred, ans, c in zip(preds, answers, mapped_classes):
        if c not in answer_vocabs:
            continue
        pred_idx = pred.argmax(dim=1).item()
        inv_vocab = {v: k for k, v in answer_vocabs[c].items()}
        pred_str = inv_vocab.get(pred_idx, "")
        score = scorer.score(ans, pred_str)["rougeL"].fmeasure
        scores.append(score)
    return sum(scores) / len(scores) if scores else 0.0


def compute_loss(preds, answers, task_logits, true_q_classes, answer_vocabs, q_types_mapping, q_types, task_heads):
    """
    preds: list of per-sample prediction tensors
    answers: list of answer strings (descriptive answers)
    task_logits: tensor [batch_size, num_task_types]
    true_q_classes: fine-grained question classes (string or list per sample)
    answer_vocabs: dict mapping {q_type: {answer: index}}
    q_types_mapping / q_types: fine-to-general class map and general type list
    task_heads: ModuleDict of task-specific heads
    """
    ce_loss = CrossEntropyLoss()
    mse_loss = MSELoss()

    total_loss = 0

    # 1) Map fine-grained classes to general classes
    mapped_classes = [
        q_types_mapping[c[0] if isinstance(c, list) else c]
        for c in true_q_classes
    ]

    # 2) Question-type classification loss
    true_task_types = torch.tensor(
        [q_types.index(c) for c in mapped_classes],
        device=task_logits.device
    )
    task_loss = ce_loss(task_logits, true_task_types)
    total_loss += task_loss

    # 3) Answer prediction loss (per sample)
    for pred, ans, c in zip(preds, answers, mapped_classes):
        if c == "count":
            # For count, the answer must be numeric
            try:
                ans_val = float(ans)
                ans_val = torch.tensor([ans_val], device=pred.device)
                total_loss += mse_loss(pred.squeeze(), ans_val)
            except ValueError:
                print(f"[Warning] Skipping non-numeric count answer: {ans}")
                continue

        else:
            # For categorical tasks (yesno, single, multi, etc.)
            if ans not in answer_vocabs.get(c, {}):
                print(f"[Warning] Skipping unseen or descriptive answer {ans} for task {c}")
                continue

            ans_idx = answer_vocabs[c][ans]

            if ans_idx >= pred.size(1):
                print(f"[Warning] Skipping answer {ans} for task {c}: "
                      f"index {ans_idx} >= pred.size(1)")
                continue

            ans_tensor = torch.tensor([ans_idx], device=pred.device)
            total_loss += ce_loss(pred, ans_tensor)

    meteor = compute_meteor(preds, answers, answer_vocabs, mapped_classes)
    print(f"METEOR on this batch: {meteor:.4f}")

    return total_loss / len(preds)
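
extract_count tolerates several answer formats; a few examples of what it returns (package name "vqa" is an assumption):

from vqa.model_functions import extract_count

print(extract_count("3"))                  # 3.0
print(extract_count("two polyps"))         # 2.0
print(extract_count("about 12 in total"))  # 12.0
print(extract_count("none visible"))       # None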
models.py
ADDED
@@ -0,0 +1,29 @@

import torch
import torch.nn as nn

from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    AutoModel, AutoProcessor, VisionEncoderDecoderModel,
    T5Tokenizer, T5ForConditionalGeneration
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

gen_name = "t5-base"
gen_tokenizer = T5Tokenizer.from_pretrained(gen_name)
gen_model = T5ForConditionalGeneration.from_pretrained(gen_name).to(device)


def generate_descriptive_answer(question, prediction, fused_features):
    # Construct a prompt combining the prediction and context
    # (fused_features is currently unused; reserved for conditioning the generator)
    prompt = f"Question: {question} | Prediction: {prediction} | Context: GI disease analysis"
    inputs = gen_tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
    outputs = gen_model.generate(**inputs, max_length=50)
    return gen_tokenizer.decode(outputs[0], skip_special_tokens=True)


def disease_model(img):
    # Placeholder stub: returns a zero disease vector of length 23.
    # Replace with a trained disease classifier that maps images to [B, 23].
    return torch.zeros(23).to(device)


router_name = "distilbert-base-uncased"
router_tokenizer = AutoTokenizer.from_pretrained(router_name)
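
generate_descriptive_answer wraps the class-level prediction in a T5 prompt; a usage sketch (the output text will vary with the model):

from vqa.models import generate_descriptive_answer  # package name "vqa" is an assumption

answer = generate_descriptive_answer(
    question="Is a polyp present?",
    prediction="Yes",
    fused_features=None  # currently unused by the function
)
print(answer)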
qtype.py
ADDED
@@ -0,0 +1,25 @@

import torch
from torch import nn
from transformers import DistilBertModel


class QuestionTypeClassifier(nn.Module):
    def __init__(self, num_types):
        super().__init__()

        # Load pre-trained DistilBERT
        self.distilbert = DistilBertModel.from_pretrained("distilbert-base-uncased")

        # Classification head
        self.fc = nn.Linear(self.distilbert.config.hidden_size, num_types)

    def forward(self, input_ids, attention_mask):
        outputs = self.distilbert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # Take the first-token embedding (DistilBERT uses the first token as [CLS])
        cls_token = outputs.last_hidden_state[:, 0, :]  # [B, hidden]

        logits = self.fc(cls_token)  # [B, num_types]
        return logits
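
A quick forward-pass check for the question-type classifier, using the same DistilBERT tokenizer as the rest of the pipeline (package name "vqa" is an assumption):

import torch
from transformers import AutoTokenizer
from vqa.qtype import QuestionTypeClassifier

tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")
clf = QuestionTypeClassifier(num_types=6)

batch = tok(["Is a polyp present?", "What color is the lesion?"],
            return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
    logits = clf(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
print(logits.shape)  # torch.Size([2, 6])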
tpred.py
ADDED
@@ -0,0 +1,24 @@

# ---------------------------
# Step 5: Task-Specific Predictors
# ---------------------------
import torch.nn as nn


class TaskPredictor(nn.Module):
    def __init__(self, task_type, hidden=512):
        super().__init__()
        if task_type == "yesno":
            self.head = nn.Linear(hidden, 2)   # binary yes/no
        elif task_type == "single":
            self.head = nn.Linear(hidden, 10)  # single-choice over up to 10 answers
        elif task_type == "multi":
            self.head = nn.Linear(hidden, 10)  # multi-choice over up to 10 answers
        elif task_type == "color":
            self.head = nn.Linear(hidden, 5)   # note: normalize_answer emits up to 6 colors; widen if needed
        elif task_type == "location":
            self.head = nn.Linear(hidden, 6)   # location classes (normalize_answer emits 5)
        elif task_type == "count":
            self.head = nn.Linear(hidden, 1)   # scalar regression
        else:
            raise ValueError("Unknown task")

    def forward(self, x):
        return self.head(x)
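
And a matching sanity check for the heads: each consumes the fused [B, hidden] vector and emits task-shaped logits, or a single scalar for count:

import torch
from vqa.tpred import TaskPredictor  # package name "vqa" is an assumption

fused = torch.randn(4, 512)
print(TaskPredictor("yesno")(fused).shape)  # torch.Size([4, 2])
print(TaskPredictor("count")(fused).shape)  # torch.Size([4, 1])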