Spaces:

bluspater
/

remove-bg-modnet

Runtime error

App Files Community

bluspater committed on Apr 9, 2025

Commit

b6584d4

verified ·

1 Parent(s): e69a94a

Update model/modnet.py

Browse files

Files changed (1) hide show

model/modnet.py +65 -58

model/modnet.py CHANGED Viewed

@@ -1,48 +1,73 @@
 import torch
-import cv2
 import numpy as np
 from PIL import Image
-from torchvision.models.mobilenetv2 import mobilenet_v2
-import torch.nn as nn
-def clean_state_dict(state_dict):
-    """Remove 'module.' prefix if present in keys."""
-    new_state_dict = {}
-    for k, v in state_dict.items():
-        if k.startswith('module.'):
-            new_state_dict[k[7:]] = v
-        else:
-            new_state_dict[k] = v
-    return new_state_dict
 class MODNet(nn.Module):
-    def __init__(self, in_channels=3, hr_channels=32, backbone_pretrained=True):
         super(MODNet, self).__init__()
-        mobilenet = mobilenet_v2(pretrained=backbone_pretrained)
-        self.backbone = mobilenet.features  # nn.Sequential already
-        # Simulate enc_channels expected by MODNet-style branches
-        self.enc_channels = [24, 32, 96, 320]
-        # Dummy branches to satisfy loading
-        self.lr_branch = nn.Identity()
-        self.hr_branch = nn.Identity()
-        self.f_branch = nn.Identity()
-    def forward(self, x, inference=False):
         features = self.backbone(x)
-        return features, features, features  # Dummy outputs
 def preprocess_image(image: Image.Image, device: torch.device) -> torch.Tensor:
-    img = np.array(image.convert("RGB"))
-    img_resized = cv2.resize(img, (512, 512))
-    img_input = img_resized / 255.0
-    img_input = torch.tensor(img_input, dtype=torch.float32).permute(2, 0, 1).unsqueeze(0).to(device)
-    return img_input
 def remove_background_modnet(image: Image.Image) -> Image.Image:
@@ -51,37 +76,19 @@ def remove_background_modnet(image: Image.Image) -> Image.Image:
     modnet = MODNet()
     modnet.to(device)
-    # Load weights
-    state_dict = torch.load('pretrained/modnet_webcam_portrait_matting.ckpt', map_location=device)
-    modnet.load_state_dict(clean_state_dict(state_dict), strict=False)
     modnet.eval()
-    img_input = preprocess_image(image, device)
     with torch.no_grad():
-        output = modnet(img_input, True)
-    if output is None:
-        raise RuntimeError("MODNet returned None. Ensure model is correctly initialized and forward method is implemented.")
-    if not isinstance(output, (tuple, list)):
-        raise TypeError(f"MODNet output must be a list or tuple, got {type(output)}")
-    if len(output) < 3:
-        raise ValueError(f"Expected at least 3 outputs from MODNet, got {len(output)}")
-    pred_semantic, pred_detail, pred_matte = output
-    if pred_matte is None:
-        raise RuntimeError("pred_matte is None — MODNet forward method may not be returning expected outputs.")
     matte = pred_matte[0][0].cpu().numpy()
     matte = cv2.resize(matte, image.size)
     matte = np.uint8(matte * 255)
-    rgba_image = image.convert("RGBA")
-    image_np = np.array(rgba_image)
-    if image_np.shape[2] < 4:
-        alpha_channel = 255 * np.ones((*image_np.shape[:2], 1), dtype=np.uint8)
-        image_np = np.concatenate([image_np, alpha_channel], axis=2)
     image_np[:, :, 3] = matte
-    return Image.fromarray(image_np)

 import torch
+import torch.nn as nn
+import torch.nn.functional as F
 import numpy as np
+import cv2
 from PIL import Image
+from torchvision import transforms
+# Backbone: U2NET-like architecture (simplified for inference only)
+class BasicConvBlock(nn.Module):
     # Two stacked Conv2d -> BatchNorm2d -> ReLU layers applied in sequence.
+    def __init__(self, in_channels, out_channels):
         # in_channels feeds the first conv; both convs emit out_channels.
+        super(BasicConvBlock, self).__init__()
+        self.block = nn.Sequential(
+            nn.Conv2d(in_channels, out_channels, 3, 1, 1),  # kernel 3, stride 1, padding 1 -> height/width unchanged
+            nn.BatchNorm2d(out_channels),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(out_channels, out_channels, 3, 1, 1),  # same geometry; channel count stays at out_channels
+            nn.BatchNorm2d(out_channels),
+            nn.ReLU(inplace=True),
+        )
+    def forward(self, x):
+        return self.block(x)  # run x through the six layers in order
+class SimpleMODNetBackbone(nn.Module):
     # Feature extractor: three BasicConvBlock stages with a 2x2 max-pool after the first two.
+    def __init__(self):
+        super(SimpleMODNetBackbone, self).__init__()
+        self.stage1 = BasicConvBlock(3, 64)  # expects a 3-channel input
+        self.pool1 = nn.MaxPool2d(2, 2)  # kernel 2, stride 2: halves height and width
+        self.stage2 = BasicConvBlock(64, 128)
+        self.pool2 = nn.MaxPool2d(2, 2)  # second halving -> 1/4 of the input resolution
+        self.stage3 = BasicConvBlock(128, 256)  # final feature map has 256 channels
+    def forward(self, x):
         # Returns the stage3 output: 256 channels at a quarter of x's height and width.
+        x = self.stage1(x)
+        x = self.pool1(x)
+        x = self.stage2(x)
+        x = self.pool2(x)
+        x = self.stage3(x)
+        return x
 class MODNet(nn.Module):
     # SimpleMODNetBackbone followed by a small convolutional head that predicts a one-channel matte.
+    def __init__(self):
         super(MODNet, self).__init__()
+        self.backbone = SimpleMODNetBackbone()
+        self.seg_head = nn.Sequential(
+            nn.Conv2d(256, 64, kernel_size=3, padding=1),  # 256 matches the backbone's stage3 output channels
+            nn.ReLU(),
+            nn.Conv2d(64, 1, kernel_size=1),  # 1x1 conv down to a single channel
+            nn.Sigmoid()  # squashes the matte into the 0..1 range
+        )
+    def forward(self, x):
         # Returns a single tensor (not a tuple). Nothing here upsamples, so the matte
         # keeps the backbone's reduced resolution.
         features = self.backbone(x)
+        pred_matte = self.seg_head(features)
+        return pred_matte
 def preprocess_image(image: Image.Image, device: torch.device) -> torch.Tensor:
     # Turn a PIL image into a normalised, batched tensor on `device`.
+    transform = transforms.Compose([
+        transforms.Resize((512, 512)),  # fixed output size; the original aspect ratio is not preserved
+        transforms.ToTensor(),
+        transforms.Normalize(mean=[0.485, 0.456, 0.406],  # NOTE(review): presumably ImageNet statistics - confirm
+                             std=[0.229, 0.224, 0.225])
+    ])
+    img_tensor = transform(image.convert("RGB")).unsqueeze(0).to(device)  # force 3 channels, add a batch dim, move to device
+    return img_tensor
 def remove_background_modnet(image: Image.Image) -> Image.Image:
     # Returns an RGBA version of `image` whose alpha channel is the matte predicted by MODNet.
     # NOTE(review): `device` is not assigned in the lines shown here - confirm it is
     # defined in the part of the function this view omits.
     modnet = MODNet()
     modnet.to(device)
+    # Skip loading weights (simple version)
     # NOTE(review): no checkpoint is loaded in the visible lines, so the matte below
     # comes from freshly initialised parameters - confirm this is intended.
     modnet.eval()
+    img_tensor = preprocess_image(image, device)
     with torch.no_grad():
+        pred_matte = modnet(img_tensor)
     matte = pred_matte[0][0].cpu().numpy()  # first batch item, first channel -> 2-D array
     matte = cv2.resize(matte, image.size)  # PIL size is (width, height), the order cv2.resize takes for dsize
     matte = np.uint8(matte * 255)  # scale to 0..255 for use as 8-bit alpha
+    image = image.convert("RGBA")
+    image_np = np.array(image)
     image_np[:, :, 3] = matte  # overwrite the alpha channel with the matte
+    return Image.fromarray(image_np)