Update app.py

app.py CHANGED
@@ -1,206 +1,84 @@
- # =====================================================
- # Gradio Radiology Captioner (with VQA-ready model)
- # Loads epoch_04 checkpoint from Hugging Face
- # =====================================================
-
  import torch
-
- import torch.nn.functional as F
  from torchvision import transforms
  from PIL import Image
- import math
  import gradio as gr
- from huggingface_hub import hf_hub_download
- import numpy as np
- import pydicom
- import nibabel as nib
-
- # ======================
- # Device & dtype
- # ======================
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
- DTYPE = torch.bfloat16 if DEVICE == "cuda" else torch.float32
-
- # ======================
- # Tokenizer
- # ======================
  from transformers import AutoTokenizer
- tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
- tokenizer.pad_token = tokenizer.eos_token
- VOCAB_SIZE = tokenizer.vocab_size
- MAX_SEQ_LEN = 192

- # ======================
- #
- # ======================
- # NOTE: the opening of the prompt-formatting helper below (its constants and
- # def/for lines) is lost in the diff view; only these body fragments survive.
-             role = "user"
-             content = msg.get("value", msg.get("content", ""))
-             text += f"{IM_START}{role}\n{content.strip()}{IM_END}\n"
-     return text.strip()
-
-
# ======================
|
| 50 |
-
# Model definition
|
| 51 |
-
# ======================
|
| 52 |
-
class ConvBlock(nn.Module):
|
| 53 |
-
def __init__(self, dim_in, dim_out):
|
| 54 |
-
super().__init__()
|
| 55 |
-
self.dwconv = nn.Conv2d(dim_in, dim_in, 3, padding=1, groups=dim_in)
|
| 56 |
-
self.norm = nn.LayerNorm(dim_in)
|
| 57 |
-
self.pw1 = nn.Linear(dim_in, 4*dim_in)
|
| 58 |
-
self.act = nn.GELU()
|
| 59 |
-
self.pw2 = nn.Linear(4*dim_in, dim_out)
|
| 60 |
-
self.shortcut = nn.Conv2d(dim_in, dim_out, 1) if dim_in!=dim_out else nn.Identity()
|
| 61 |
-
def forward(self, x):
|
| 62 |
-
res = self.shortcut(x)
|
| 63 |
-
x = self.dwconv(x)
|
| 64 |
-
x = x.permute(0,2,3,1)
|
| 65 |
-
x = self.norm(x)
|
| 66 |
-
x = x.permute(0,3,1,2)
|
| 67 |
-
x = x.flatten(2).transpose(1,2)
|
| 68 |
-
x = self.pw1(x)
|
| 69 |
-
x = self.act(x)
|
| 70 |
-
x = self.pw2(x)
|
| 71 |
-
x = x.transpose(1,2).view(res.shape)
|
| 72 |
-
return res + x
|
| 73 |
|
| 74 |
-
|
| 75 |
-
def __init__(self, dims=[96,192,384]):
|
| 76 |
-
super().__init__()
|
| 77 |
-
self.stem = nn.Sequential(
|
| 78 |
-
nn.Conv2d(3,dims[0],4,4),
|
| 79 |
-
nn.BatchNorm2d(dims[0]),
|
| 80 |
-
nn.GELU()
|
| 81 |
-
)
|
| 82 |
-
self.stages = nn.ModuleList()
|
| 83 |
-
for i in range(len(dims)-1):
|
| 84 |
-
stage = nn.Sequential(*[ConvBlock(dims[i],dims[i]) for _ in range(3)],
|
| 85 |
-
nn.Conv2d(dims[i], dims[i+1], 2,2))
|
| 86 |
-
self.stages.append(stage)
|
| 87 |
-
self.stages.append(nn.Sequential(*[ConvBlock(dims[-1],dims[-1]) for _ in range(3)]))
|
| 88 |
-
self.norm = nn.LayerNorm(dims[-1])
|
| 89 |
-
def forward(self, x):
|
| 90 |
-
x = self.stem(x)
|
| 91 |
-
for stage in self.stages:
|
| 92 |
-
x = stage(x)
|
| 93 |
-
x = x.flatten(2).transpose(1,2)
|
| 94 |
-
x = self.norm(x)
|
| 95 |
-
return x
|
| 96 |
|
| 97 |
-
|
| 98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
super().__init__()
|
| 100 |
-
self.
|
| 101 |
-
self.W_self = nn.Parameter(torch.tensor(1.0))
|
| 102 |
-
self.W_neigh = nn.Parameter(torch.ones(8)/8)
|
| 103 |
-
self.update = nn.Sequential(nn.Linear(dim, dim), nn.GELU(), nn.Linear(dim, dim))
|
| 104 |
-
def forward(self,x):
|
| 105 |
-
B,L,D = x.shape
|
| 106 |
-
H = W = int(math.sqrt(L))
|
| 107 |
-
grid = x.view(B,H,W,D)
|
| 108 |
-
for _ in range(self.steps):
|
| 109 |
-
padded = F.pad(grid,(0,0,1,1,1,1), mode='replicate')
|
| 110 |
-
neighbors=[]
|
| 111 |
-
for dy in [-1,0,1]:
|
| 112 |
-
for dx in [-1,0,1]:
|
| 113 |
-
if dy==dx==0: continue
|
| 114 |
-
neighbors.append(padded[:,1+dy:H+1+dy,1+dx:W+1+dx])
|
| 115 |
-
neigh = torch.stack(neighbors, dim=3)
|
| 116 |
-
agg = self.W_self*grid.unsqueeze(3) + self.W_neigh.view(1,1,1,8,1)*neigh
|
| 117 |
-
agg = agg.sum(dim=3)
|
| 118 |
-
upd = self.update(agg.view(-1,D)).view(B,H,W,D)
|
| 119 |
-
grid = grid + upd
|
| 120 |
-
grid = torch.tanh(grid)*0.5 + 0.5
|
| 121 |
-
return grid.view(B,L,D)
|
| 122 |
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
decoder_layer = nn.TransformerDecoderLayer(
|
| 129 |
-
d_model=d_model, nhead=nhead, dim_feedforward=4*d_model,
|
| 130 |
-
dropout=0.1, activation='gelu', batch_first=True
|
| 131 |
-
)
|
| 132 |
-
self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
|
| 133 |
-
self.embed = nn.Embedding(VOCAB_SIZE,d_model)
|
| 134 |
-
self.pos_embed = nn.Parameter(torch.zeros(1,MAX_SEQ_LEN,d_model))
|
| 135 |
-
self.head = nn.Linear(d_model,VOCAB_SIZE)
|
| 136 |
-
self.d_model=d_model
|
| 137 |
-
def encode_image(self,images):
|
| 138 |
-
feats=self.encoder(images)
|
| 139 |
-
feats=self.graph_prop(feats)
|
| 140 |
-
return feats
|
| 141 |
-
def forward(self,images,input_ids,labels=None):
|
| 142 |
-
memory=self.encode_image(images)
|
| 143 |
-
tgt=self.embed(input_ids)*math.sqrt(self.d_model)+self.pos_embed[:,:input_ids.shape[1]]
|
| 144 |
-
out=self.decoder(tgt,memory)
|
| 145 |
-
logits=self.head(out)
|
| 146 |
-
if labels is not None:
|
| 147 |
-
loss=F.cross_entropy(logits.reshape(-1,VOCAB_SIZE), labels.reshape(-1), ignore_index=-100)
|
| 148 |
-
return logits, loss
|
| 149 |
-
return logits
|
| 150 |
-
@torch.no_grad()
|
| 151 |
-
def generate(self, images, max_len=MAX_SEQ_LEN, temperature=0.8):
|
| 152 |
-
B = images.shape[0]
|
| 153 |
-
memory = self.encode_image(images)
|
| 154 |
-
tokens = torch.full((B,1), tokenizer.bos_token_id or tokenizer.eos_token_id, dtype=torch.long, device=DEVICE)
|
| 155 |
-
for _ in range(max_len):
|
| 156 |
-
tgt = self.embed(tokens)*math.sqrt(self.d_model)+self.pos_embed[:,:tokens.shape[1]]
|
| 157 |
-
logits = self.head(self.decoder(tgt,memory)[:,-1])
|
| 158 |
-
next_token = (logits/temperature).softmax(-1).multinomial(1)
|
| 159 |
-
tokens = torch.cat([tokens,next_token],dim=1)
|
| 160 |
-
if next_token.item() == tokenizer.eos_token_id:
|
| 161 |
-
break
|
| 162 |
-
return tokens
|
| 163 |
|
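Two things worth flagging in the removed block above: the file referenced `nn` without ever importing it, and `generate` calls `next_token.item()`, which only works for batch size 1. Also, `F.pad`'s `replicate` mode pads the last two dimensions of a 4D tensor, so the channels-last call `F.pad(grid, (0, 0, 1, 1, 1, 1), mode='replicate')` would fail at runtime; the same 8-neighbour gather works in channels-first layout. A minimal, self-contained sketch of that gather (toy sizes, not the Space's code):

import torch
import torch.nn.functional as F

B, D, H, W = 1, 4, 3, 3
grid = torch.randn(B, D, H, W)                        # channels-first grid of features
padded = F.pad(grid, (1, 1, 1, 1), mode="replicate")  # pad H and W by one cell

neighbors = []
for dy in (-1, 0, 1):
    for dx in (-1, 0, 1):
        if dy == dx == 0:
            continue
        neighbors.append(padded[:, :, 1 + dy:H + 1 + dy, 1 + dx:W + 1 + dx])

neigh = torch.stack(neighbors, dim=2)   # (B, D, 8, H, W): one slice per neighbour offset
print(neigh.shape)                      # torch.Size([1, 4, 8, 3, 3])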
-
-
- model = ...  # right-hand side truncated in the diff view; presumably the captioner defined above
- checkpoint_path = hf_hub_download(  # call opening lost in the diff; reconstructed from the arguments and torch.load below
-     repo_id="erfanasghariyan/RADIOCAP200",
-     filename="model.pt",
-     subfolder="checkpoints/epoch_04"
- )
- state_dict = torch.load(checkpoint_path, map_location=DEVICE)
- model.load_state_dict(state_dict)

- # ======================
- #
- # ======================
- IMG_SIZE = 224
  transform = transforms.Compose([
-     transforms.Resize((IMG_SIZE, IMG_SIZE)),  # argument truncated in the diff; IMG_SIZE per the constant above
      transforms.ToTensor(),
-     transforms.Normalize(mean=[0.485, 0.456, 0.406],  # values truncated in the diff; ImageNet stats assumed, matching the commented-out Normalize in the new file
-                          std=[0.229, 0.224, 0.225])
  ])

- def load_image(img):  # header lost in the diff view; name recovered from the call in predict() below
-     dcm = pydicom.dcmread(img)  # DICOM branch; the original branching for DICOM/NIfTI/plain images is lost
-     arr = dcm.pixel_array.astype(np.float32)
-     arr = np.clip((arr - arr.min()) / (arr.ptp() + 1e-6), 0, 1)
-     img = Image.fromarray((arr * 255).astype(np.uint8)).convert("RGB")
      return transform(img).unsqueeze(0).to(DEVICE, dtype=DTYPE)

- # ======================
- #
- # ======================
- def predict(img):
      img_tensor = load_image(img)
-     ...          # generation and decoding steps truncated in the diff view
-     return ...   # return value truncated in the diff view

-
- iface.launch()   # construction of `iface` lost in the diff view
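One portability note on the removed loader: NumPy 2.x dropped the `ndarray.ptp()` method (the module-level `np.ptp` remains), so `arr.ptp()` breaks on a current runtime. A tiny sketch of the same min-max windowing in the safe spelling; the array is a stand-in for `dcm.pixel_array`:

import numpy as np

arr = np.asarray([[10.0, 250.0], [40.0, 90.0]])         # stand-in for dcm.pixel_array
arr = np.clip((arr - arr.min()) / (np.ptp(arr) + 1e-6), 0, 1)
print(arr.min(), arr.max())                             # 0.0 ... ~1.0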
  import torch
+ from torch import nn
  from torchvision import transforms
  from PIL import Image
  import gradio as gr
  from transformers import AutoTokenizer

+ # ===========================
+ # Device & dtype settings
+ # ===========================
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32

+ # ===========================
+ # Model path & tokenizer
+ # ===========================
+ CHECKPOINT_PATH = "checkpoints/epoch_04/model.pt"  # path as downloaded inside the Space
+ TOKENIZER_NAME = "bert-base-uncased"  # or whichever tokenizer matches your model

+ tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

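Careful with the tokenizer swap: the removed code tokenized with microsoft/biogpt, and epoch_04 was trained against that vocabulary, so decoding its outputs with bert-base-uncased would produce garbage. Restoring the training-time setup, taken from the removed lines above:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
tokenizer.pad_token = tokenizer.eos_token   # as in the removed app.py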
+ # ===========================
+ # Model definition (simple example)
+ # ===========================
+ # Note: put your real model here
+ class DummyCaptionModel(nn.Module):
+     def __init__(self):
          super().__init__()
+         self.dummy = nn.Linear(10, 10)

+     def forward(self, x, question=None):
+         # Placeholder output
+         if question:
+             return "Answer to question: " + question
+         return "Generated caption for the image"

+ model = DummyCaptionModel()
+ # Load the checkpoint on both CPU and GPU; the original `if torch.cuda.is_available():`
+ # guard silently skipped loading on CPU-only Spaces. Note: the epoch_04 state dict
+ # will only match once the real model class replaces the placeholder above.
+ model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=DEVICE))
+ model.to(DEVICE)
+ model.eval()

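Since the placeholder's parameters cannot match the epoch_04 state dict, this `load_state_dict` is the likely source of the Space's runtime error. A sketch of wiring the real checkpoint back in, reusing the repo coordinates from the removed code; `RealCaptionModel` is a hypothetical stand-in for whatever class produced the checkpoint:

from huggingface_hub import hf_hub_download
import torch

checkpoint_path = hf_hub_download(
    repo_id="erfanasghariyan/RADIOCAP200",
    filename="model.pt",
    subfolder="checkpoints/epoch_04",
)
model = RealCaptionModel()                          # hypothetical: the training-time class
model.load_state_dict(torch.load(checkpoint_path, map_location=DEVICE))
model.to(DEVICE).eval()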
+ # ===========================
+ # Image transform
+ # ===========================
  transform = transforms.Compose([
+     transforms.Resize((224, 224)),
      transforms.ToTensor(),
+     # transforms.Normalize(mean=[0.485, 0.456, 0.406],
+     #                      std=[0.229, 0.224, 0.225])
  ])

+ # ===========================
+ # Image loading helper
+ # ===========================
+ def load_image(img: Image.Image):
+     """Convert a PIL image to a batched tensor on the right device and dtype."""
      return transform(img).unsqueeze(0).to(DEVICE, dtype=DTYPE)

+ # ===========================
+ # Main prediction function
+ # ===========================
+ def predict(img: Image.Image, question: str = ""):
      img_tensor = load_image(img)
+     # If the question is empty, generate a caption; otherwise answer it (VQA)
+     output_text = model(img_tensor, question.strip() or None)
+     return output_text
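A quick local smoke test of `predict`; the image path is hypothetical:

from PIL import Image

img = Image.open("sample_cxr.png").convert("RGB")    # hypothetical test image
print(predict(img))                                  # empty question -> caption
print(predict(img, "Is there an effusion?"))         # non-empty question -> VQA answer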
+
+ # ===========================
+ # Gradio interface
+ # ===========================
+ iface = gr.Interface(
+     fn=predict,
+     inputs=[
+         gr.Image(type="pil", label="Upload Radiology Image"),
+         gr.Textbox(label="Optional Question (for VQA)", placeholder="Ask a question or leave empty for a caption")
+     ],
+     outputs=gr.Textbox(label="Output"),
+     title="RADIOCAP200: Radiology Caption + VQA",
+     description="Upload a radiology image and optionally ask a question. If the question is empty, the model generates a caption; otherwise it answers the question."
+ )

+ if __name__ == "__main__":
+     iface.launch(server_name="0.0.0.0", server_port=7860, share=True)