Spaces:

RakeshNJ12345
/

abc

Sleeping

App Files Community

RakeshNJ12345 committed on Sep 24, 2025

Commit

0bdd3ac

verified ·

1 Parent(s): e2aeef9

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -32

app.py CHANGED Viewed

@@ -6,15 +6,15 @@ import torch.nn as nn
 import numpy as np
 from PIL import Image
 from transformers import (
-    ViTConfig, ViTModel,
-    BartConfig, BartForConditionalGeneration, AutoTokenizer
 )
 from huggingface_hub import hf_hub_download
-import safetensors.torch as st
 import gradio as gr
-# ───────────────────────── Helpers ─────────────────────────
 def build_bad_words_ids(tok: AutoTokenizer):
     bad_phrases = [
         "XXXX", "xxxx", "X-XXXX", "x-XXXX", "x - XXXX", "x -xxxx", "x-xxxx",
         "X - XXXX", "x—XXXX", "x–XXXX", "x — XXXX", "x – XXXX",
@@ -25,9 +25,10 @@ def build_bad_words_ids(tok: AutoTokenizer):
         ids = tok(phrase, add_special_tokens=False).input_ids
         if ids and not all(i == tok.unk_token_id for i in ids):
             bad_ids.append(ids)
-    return bad_ids or None
 def normalize_report(text: str) -> str:
     if not text:
         return text
     text = re.sub(r'\bx\s*[-–—]?\s*xxxx\b', 'x-ray', text, flags=re.IGNORECASE)
@@ -40,32 +41,20 @@ def normalize_report(text: str) -> str:
 # ─── 1) MODEL LOADING ─────────────────────────────────────────────────────────
 def load_model():
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    repo_id = "RakeshNJ12345/MMic-CXR"
-    base_path = "mimic_trained/final"  # ✅ correct subfolder root
-        # 1a) load ViT encoder manually
-    vit_cfg = ViTConfig.from_pretrained(repo_id, subfolder=f"{base_path}/vit")
-    vit = ViTModel(vit_cfg)
-    vit_path = hf_hub_download(repo_id, filename="model.safetensors", subfolder=f"{base_path}/vit")
-    vit_state = st.load_file(vit_path)
-    vit.load_state_dict(vit_state)
-    vit = vit.to(device)
-    # 1b) load decoder (BioBART) manually
-    dec_cfg = BartConfig.from_pretrained(repo_id, subfolder=f"{base_path}/decoder")
-    dec = BartForConditionalGeneration(dec_cfg)
-    dec_path = hf_hub_download(repo_id, filename="model.safetensors", subfolder=f"{base_path}/decoder")
-    dec_state = st.load_file(dec_path)
-    dec.load_state_dict(dec_state)
-    dec = dec.to(device)
-    # 1c) tokenizer
     tok = AutoTokenizer.from_pretrained(repo_id, subfolder=f"{base_path}/decoder")
     tok.clean_up_tokenization_spaces = True
-    # 1d) load projection head
     proj_path = hf_hub_download(repo_id=repo_id, filename="proj.bin", subfolder=base_path)
     loaded = torch.load(proj_path, map_location=device)
     if isinstance(loaded, dict):
@@ -73,13 +62,13 @@ def load_model():
         proj = nn.Linear(vit.config.hidden_size, dec.config.d_model)
         proj.load_state_dict(sd)
     else:
-        proj = loaded
     proj = proj.to(device)
-    # 1e) blocklist
     bad_words_ids = build_bad_words_ids(tok)
-    # 1f) wrapper model
     class TwoViewModel(nn.Module):
         def __init__(self, vit, dec, proj, tok, bad_words_ids=None):
             super().__init__()
@@ -94,13 +83,16 @@ def load_model():
         def generate(self, img_f, img_l, finds, max_len=128, num_beams=4):
             device = img_f.device
-            # extract [CLS]
             out_f = self.vit(pixel_values=img_f).last_hidden_state[:, 0]
             out_l = self.vit(pixel_values=img_l).last_hidden_state[:, 0]
             avg    = 0.5 * (out_f + out_l)
             prefix = self.proj(avg).unsqueeze(1)  # [B,1,D]
-            # prepend findings if available
             if (finds or "").strip():
                 enc = self.tok(finds, return_tensors="pt", padding=True, truncation=True).to(device)
                 text_emb = self.dec.get_encoder().embed_tokens(enc.input_ids)
@@ -133,6 +125,7 @@ model, tokenizer, device = load_model()
 # ─── 2) PREPROCESS ─────────────────────────────────────────────────────────────
 def preprocess(img: Image.Image) -> torch.Tensor:
     img = img.convert("RGB").resize((224, 224))
     arr = np.array(img).astype(np.float32) / 255.0
     if arr.ndim == 2:
@@ -155,7 +148,7 @@ def generate_report(frontal, lateral, findings, beams, max_len):
             f_t, l_t, findings or "", max_len=max_len, num_beams=beams
         )
         text = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
-        text = normalize_report(text)
         return text or "<empty>"
     except Exception as e:
         traceback.print_exc()

 import numpy as np
 from PIL import Image
 from transformers import (
+    AutoConfig, AutoModel,            # ← handles dinov2/ViT automatically
+    AutoModelForSeq2SeqLM, AutoTokenizer
 )
 from huggingface_hub import hf_hub_download
 import gradio as gr
+# ───────────────────────── Helpers: blocklist + text normalizer ─────────────────────────
 def build_bad_words_ids(tok: AutoTokenizer):
+    """Build token id sequences to block anonymization artifacts."""
     bad_phrases = [
         "XXXX", "xxxx", "X-XXXX", "x-XXXX", "x - XXXX", "x -xxxx", "x-xxxx",
         "X - XXXX", "x—XXXX", "x–XXXX", "x — XXXX", "x – XXXX",
         ids = tok(phrase, add_special_tokens=False).input_ids
         if ids and not all(i == tok.unk_token_id for i in ids):
             bad_ids.append(ids)
+    return bad_ids or None  # HF expects None if empty
 def normalize_report(text: str) -> str:
+    """Cleanup on generated text to replace/remove anonymization placeholders."""
     if not text:
         return text
     text = re.sub(r'\bx\s*[-–—]?\s*xxxx\b', 'x-ray', text, flags=re.IGNORECASE)
 # ─── 1) MODEL LOADING ─────────────────────────────────────────────────────────
 def load_model():
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    repo_id   = "RakeshNJ12345/MMic-CXR"
+    base_path = "mimic_trained/final"  # ← your confirmed path
+    # 1a) Encoder (DINOv2/ViT via AutoModel so config decides the class)
+    enc_cfg = AutoConfig.from_pretrained(repo_id, subfolder=f"{base_path}/vit")
+    vit     = AutoModel.from_pretrained(repo_id, subfolder=f"{base_path}/vit").to(device)
+    # enc_cfg.model_type will be 'dinov2' in your case; vit.config.hidden_size is available.
+    # 1b) Decoder & tokenizer (BioBART)
+    dec = AutoModelForSeq2SeqLM.from_pretrained(repo_id, subfolder=f"{base_path}/decoder").to(device)
     tok = AutoTokenizer.from_pretrained(repo_id, subfolder=f"{base_path}/decoder")
     tok.clean_up_tokenization_spaces = True
+    # 1c) Projection head
     proj_path = hf_hub_download(repo_id=repo_id, filename="proj.bin", subfolder=base_path)
     loaded = torch.load(proj_path, map_location=device)
     if isinstance(loaded, dict):
         proj = nn.Linear(vit.config.hidden_size, dec.config.d_model)
         proj.load_state_dict(sd)
     else:
+        proj = loaded  # if you saved an nn.Linear directly
     proj = proj.to(device)
+    # 1d) Blocklist for anonymization artifacts
     bad_words_ids = build_bad_words_ids(tok)
+    # 1e) Wrapper
     class TwoViewModel(nn.Module):
         def __init__(self, vit, dec, proj, tok, bad_words_ids=None):
             super().__init__()
         def generate(self, img_f, img_l, finds, max_len=128, num_beams=4):
             device = img_f.device
+            # CLS embeddings from both views
             out_f = self.vit(pixel_values=img_f).last_hidden_state[:, 0]
             out_l = self.vit(pixel_values=img_l).last_hidden_state[:, 0]
+            # average + project → prefix embedding
             avg    = 0.5 * (out_f + out_l)
             prefix = self.proj(avg).unsqueeze(1)  # [B,1,D]
+            # prepend findings text (optional)
             if (finds or "").strip():
                 enc = self.tok(finds, return_tensors="pt", padding=True, truncation=True).to(device)
                 text_emb = self.dec.get_encoder().embed_tokens(enc.input_ids)
 # ─── 2) PREPROCESS ─────────────────────────────────────────────────────────────
 def preprocess(img: Image.Image) -> torch.Tensor:
+    # Basic resize + [0,1] scaling; works across ViT/DINOv2
     img = img.convert("RGB").resize((224, 224))
     arr = np.array(img).astype(np.float32) / 255.0
     if arr.ndim == 2:
             f_t, l_t, findings or "", max_len=max_len, num_beams=beams
         )
         text = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
+        text = normalize_report(text)  # cleanup anonymization artifacts
         return text or "<empty>"
     except Exception as e:
         traceback.print_exc()