Create app.py
app.py ADDED
@@ -0,0 +1,467 @@
import os
import sys
import tempfile
import json
import math
import timm
import einops
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import gradio as gr
from huggingface_hub import snapshot_download
from typing import List, Union, Dict


# Vision Model
class TimmCNNModel(nn.Module):
    def __init__(self, num_classes: int = 8, model_name: str = "efficientnet_b0"):
        super().__init__()

        self.backbone = timm.create_model(
            model_name,  # default: "efficientnet_b0"
            pretrained=True,
            num_classes=0,
        )

        self.feature_dim = self.backbone.num_features

        self.classifier = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(self.feature_dim, 512),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(512),
            nn.Dropout(0.1),
            nn.Linear(512, 256),
            nn.ReLU(inplace=True),
            nn.Linear(256, num_classes)
        )

    def forward_features(self, x: torch.Tensor) -> torch.Tensor:
        return self.backbone(x)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        features = self.forward_features(x)
        logits = self.classifier(features)
        return logits

# Projector Model
class Projector_4to3d(nn.Module):
    def __init__(self, cnn_dim: int = 1280, llm_dim: int = 2048, num_heads: int = 8, dropout: float = 0.1):
        super().__init__()
        self.cnn_dim = cnn_dim
        self.llm_dim = llm_dim

        # Spatial positional embeddings for 8x8 grid
        self.spatial_pos_embed = nn.Parameter(torch.randn(64, cnn_dim))

        # Multi-scale feature processing
        self.spatial_conv = nn.Conv2d(cnn_dim, cnn_dim // 2, 1)
        self.global_pool = nn.AdaptiveAvgPool2d(1)

        # Enhanced projection layers
        self.input_proj = nn.Sequential(
            nn.Linear(cnn_dim, llm_dim),
            nn.LayerNorm(llm_dim),
            nn.ReLU(),
            nn.Dropout(dropout)
        )

        # Multi-head self-attention for spatial reasoning
        self.spatial_attention = nn.MultiheadAttention(
            embed_dim=llm_dim,
            num_heads=num_heads,
            dropout=dropout,
            batch_first=True
        )

        # Cross-attention for text-image alignment
        self.cross_attention = nn.MultiheadAttention(
            embed_dim=llm_dim,
            num_heads=num_heads,
            dropout=dropout,
            batch_first=True
        )

        self.norm1 = nn.LayerNorm(llm_dim)
        self.norm2 = nn.LayerNorm(llm_dim)

        # Enhanced FFN
        self.ffn = nn.Sequential(
            nn.Linear(llm_dim, llm_dim * 4),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(llm_dim * 4, llm_dim),
            nn.Dropout(dropout)
        )

        self.norm3 = nn.LayerNorm(llm_dim)

        # Token compression layer
        self.compress_tokens = nn.Parameter(torch.randn(32, llm_dim))
        self.token_compression = nn.MultiheadAttention(
            embed_dim=llm_dim,
            num_heads=num_heads,
            dropout=dropout,
            batch_first=True
        )

        self._init_weights()

    def _init_weights(self):
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
            elif isinstance(module, nn.LayerNorm):
                nn.init.ones_(module.weight)
                nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight)

    def forward(self, cnn_features: torch.Tensor, text_embeddings: torch.Tensor = None) -> torch.Tensor:
        batch_size = cnn_features.shape[0]

        # Multi-scale processing
        spatial_features = self.spatial_conv(cnn_features)
        global_context = self.global_pool(cnn_features).flatten(1)

        # Flatten spatial features and add positional encoding
        x = einops.rearrange(cnn_features, "b c h w -> b (h w) c")
        pos_embeddings = self.spatial_pos_embed.unsqueeze(0).expand(batch_size, -1, -1)
        x = x + pos_embeddings

        # Project to LLM dimension
        x = self.input_proj(x)

        # Self-attention for spatial reasoning
        attended_x, spatial_attn_weights = self.spatial_attention(x, x, x)
        x = self.norm1(x + attended_x)

        # Cross-attention with text (if available)
        if text_embeddings is not None:
            text_embeddings_float = text_embeddings.float()
            cross_attended, cross_attn_weights = self.cross_attention(x, text_embeddings_float, text_embeddings_float)
            x = self.norm2(x + cross_attended)

        # FFN
        ffn_out = self.ffn(x)
        x = self.norm3(x + ffn_out)

        # Optional token compression
        compress_queries = self.compress_tokens.unsqueeze(0).expand(batch_size, -1, -1)
        compressed_x, _ = self.token_compression(compress_queries, x, x)

        return compressed_x

# Main VLM Model
class Model(nn.Module):
    def __init__(self, image_model, language_model, projector, tokenizer, prompt="Describe this image:"):
        super().__init__()
        self.image_model = image_model
        self.language_model = language_model
        self.projector = projector
        self.tokenizer = tokenizer
        self.eos_token = tokenizer.eos_token
        self.prompt = prompt

        device = next(self.language_model.parameters()).device

        self.image_model.to(device)
        self.projector.to(device)

        # Create prompt embeddings
        prompt_tokens = tokenizer(text=prompt, return_tensors="pt").input_ids.to(device)
        prompt_embeddings = language_model.get_input_embeddings()(prompt_tokens).detach()
        self.register_buffer('prompt_embeddings', prompt_embeddings)

    @property
    def device(self):
        return next(self.parameters()).device

    def generate(self, patches: torch.Tensor, generator_kwargs: Dict[str, Union[int, float]]):
        device = self.device
        patches = patches.to(device)

        image_features = self.image_model.backbone.forward_features(patches)
        patch_embeddings = self.projector(image_features)
        patch_embeddings = patch_embeddings.to(torch.bfloat16)

        embeddings = torch.cat([
            self.prompt_embeddings.expand(patches.size(0), -1, -1),
            patch_embeddings,
        ], dim=1)

        prompt_mask = torch.ones(patches.size(0), self.prompt_embeddings.size(1), device=device)
        patch_mask = torch.ones(patches.size(0), patch_embeddings.size(1), device=device)
        attention_mask = torch.cat([prompt_mask, patch_mask], dim=1)

        return self.language_model.generate(
            inputs_embeds=embeddings,
            attention_mask=attention_mask,
            **generator_kwargs
        )

vlm_model = None
tokenizer = None

def download_and_load_models():
    global vlm_model, tokenizer

    print("Starting model download and initialization...")

    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        print("CUDA available - using GPU")
    else:
        device = torch.device("cpu")
        print("CUDA not available - using CPU")

    repo_id = "aneeshm44/regfinal"
    print(f"Downloading from repo: {repo_id}")

    local_dir = tempfile.mkdtemp(prefix="regfinal_")
    print(f"Local directory: {local_dir}")

    try:
        snapshot_download(
            repo_id=repo_id,
            repo_type="dataset",
            local_dir=local_dir,
            allow_patterns=[
                "llmweights/*",
                "imagemodelweights/finalcheckpoint.pth",
                "projectorweights/projector.pth"
            ],
            local_dir_use_symlinks=False,
        )
        print("Download completed successfully")
    except Exception as e:
        print(f"Download failed: {e}")
        raise e

    llm_path = os.path.join(local_dir, "llmweights")
    image_weights_path = os.path.join(local_dir, "imagemodelweights", "finalcheckpoint.pth")
    projector_weights_path = os.path.join(local_dir, "projectorweights", "projector.pth")

    print("Loading language model...")
    try:
        language_model = AutoModelForCausalLM.from_pretrained(
            llm_path,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,
            low_cpu_mem_usage=True,
        )
        language_model.eval()
        language_model.to(device)

        tokenizer = AutoTokenizer.from_pretrained(llm_path)
        print("Language model loaded successfully")
    except Exception as e:
        print(f"Language model loading failed: {e}")
        raise e

    print("Loading vision model...")
    try:
        image_model = TimmCNNModel(num_classes=8)
        weights = torch.load(image_weights_path, map_location=device)
        image_model.load_state_dict(weights['model_state_dict'])

        for param in image_model.parameters():
            param.requires_grad = False
        image_model.eval()
        image_model.to(device)
        print("Vision model loaded successfully")
    except Exception as e:
        print(f"Vision model loading failed: {e}")
        raise e

    print("Loading projector...")
    try:
        projector = Projector_4to3d(cnn_dim=1280, llm_dim=2048, num_heads=8)
        weights = torch.load(projector_weights_path, map_location=device)
        projector.load_state_dict(weights)

        for param in projector.parameters():
            param.requires_grad = False
        projector.eval()
        projector.to(device)
        print("Projector loaded successfully")
    except Exception as e:
        print(f"Projector loading failed: {e}")
        raise e

    print("Creating VLM model...")
    try:
        vlm_model = Model(image_model, language_model, projector, tokenizer, prompt="Describe this image:")
        vlm_model = vlm_model.to(device)
        print("VLM model created successfully")
    except Exception as e:
        print(f"VLM model creation failed: {e}")
        raise e

    print("All models loaded successfully!")

def tensor_to_pil_image(tensor):
    """Convert tensor to PIL image for display"""
    # Denormalize the tensor
    mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
    std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)

    # Remove batch dimension and denormalize
    img_tensor = tensor.squeeze(0)
    img_tensor = img_tensor * std + mean
    img_tensor = torch.clamp(img_tensor, 0, 1)

    # Convert to PIL
    img_array = img_tensor.permute(1, 2, 0).numpy()
    img_array = (img_array * 255).astype(np.uint8)
    return Image.fromarray(img_array)

def describe_image(image, temperature, top_p, max_tokens):
    """Generate description for uploaded image"""
    global vlm_model, tokenizer

    if vlm_model is None:
        return "Models not loaded yet. Please wait for initialization to complete.", None

    if image is None:
        return "Please upload an image.", None

    try:
        if isinstance(image, str):
            image = Image.open(image).convert('RGB')
        elif hasattr(image, 'convert'):
            image = image.convert('RGB')

        # Preprocess: resize to 256x256 (so efficientnet_b0, with stride 32, yields the
        # 8x8 feature grid that the projector's 64 positional embeddings expect), convert
        # to CHW float, and normalize with the ImageNet stats that tensor_to_pil_image undoes.
        image = image.resize((256, 256))
        img_array = np.asarray(image, dtype=np.float32) / 255.0
        mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
        std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
        img_array = (img_array - mean) / std
        image_tensor = torch.from_numpy(img_array).permute(2, 0, 1).unsqueeze(0)  # Add batch dimension

        processed_image = tensor_to_pil_image(image_tensor)

        # Generation parameters
        generator_kwargs = {
            "max_new_tokens": int(max_tokens),
            "do_sample": True,
            "temperature": float(temperature),
            "top_p": float(top_p),
            "pad_token_id": tokenizer.eos_token_id
        }

        with torch.no_grad():
            output_ids = vlm_model.generate(image_tensor, generator_kwargs)
        text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        if "Describe this image:" in text:
            description = text.split("Describe this image:")[-1].strip()
        else:
            description = text.strip()

        result_text = description if description else "Unable to generate description."

        return result_text, processed_image

    except Exception as e:
        return f"Error processing image: {str(e)}", None

def reset_interface():
    return None, "Models loaded successfully! Upload an image to get started.", None

try:
    download_and_load_models()
    initial_status = "Models loaded successfully! Upload an image to get started."
except Exception as e:
    initial_status = f"Failed to load models: {str(e)}"

# Gradio Interface
def create_interface():
    with gr.Blocks(title="WSI Pathology Report using Gemma3n") as demo:
        gr.Markdown("# WSI Pathology Report using Gemma3n")
        gr.Markdown("Upload a pathology image and get an AI-generated pathology report.")

        with gr.Row():
            with gr.Column():
                image_input = gr.Image(type="pil", label="Upload WSI Image")

                gr.Markdown("Generation parameters")
                with gr.Row():
                    temperature_slider = gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.4,
                        step=0.1,
                        label="Temperature",
                        info="Lower values give more consistent results, whereas higher values produce more creative output"
                    )

                    top_p_slider = gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.9,
                        step=0.1,
                        label="Top-p",
                        info="Lower values = more focused vocabulary, Higher values = more diverse vocabulary"
                    )

                    max_tokens_slider = gr.Slider(
                        minimum=10,
                        maximum=200,
                        value=60,
                        step=10,
                        label="Max Tokens"
                    )

                with gr.Row():
                    submit_btn = gr.Button("Generate Report", variant="primary")
                    reset_btn = gr.Button("Reset", variant="secondary")

            with gr.Column():
                output_text = gr.Textbox(
                    label="Pathology Report",
                    lines=8,
                    value=initial_status,
                    show_copy_button=True
                )

                processed_image = gr.Image(
                    label="Processed WSI Image",
                    show_download_button=True
                )

        # Event handlers
        submit_btn.click(
            fn=describe_image,
            inputs=[image_input, temperature_slider, top_p_slider, max_tokens_slider],
            outputs=[output_text, processed_image]
        )

        # Auto-generate on image upload
        image_input.change(
            fn=describe_image,
            inputs=[image_input, temperature_slider, top_p_slider, max_tokens_slider],
            outputs=[output_text, processed_image]
        )

        # Reset functionality
        reset_btn.click(
            fn=reset_interface,
            inputs=[],
            outputs=[image_input, output_text, processed_image]
        )

    return demo

# Launch the interface
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )
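
A minimal shape-check sketch (not part of app.py) for the vision-to-language contract the code above assumes: a 256x256 input to efficientnet_b0 (stride 32) yields an 8x8 feature map, matching the projector's 64 spatial positional embeddings, and Projector_4to3d compresses those 64 tokens into 32 LLM-dimension tokens. It assumes the class definitions above are available in the same session, and uses random weights, so it only verifies tensor shapes, not output quality.

# Hypothetical sanity check, run separately from the Space.
import timm
import torch

backbone = timm.create_model("efficientnet_b0", pretrained=False, num_classes=0).eval()
projector = Projector_4to3d(cnn_dim=1280, llm_dim=2048, num_heads=8).eval()

with torch.no_grad():
    dummy = torch.randn(1, 3, 256, 256)        # 256x256 RGB input
    feats = backbone.forward_features(dummy)   # -> (1, 1280, 8, 8) spatial grid
    visual_tokens = projector(feats)           # -> (1, 32, 2048) compressed visual tokens

assert feats.shape == (1, 1280, 8, 8)
assert visual_tokens.shape == (1, 32, 2048)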