File size: 5,656 Bytes

24870a9

"""
Simplified Human Parsing using a pretrained model.
This replaces the MediaPipe-based parsing with a proper semantic segmentation model.
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
import numpy as np
from PIL import Image
import os
import gdown

class SimpleHumanParser:
    """
    A simplified human parser that rasterizes pose keypoints into a label map.

    No pretrained network is loaded: every region is a geometric heuristic
    (boxes around the nose, a polygon for the torso, thick lines for limbs).
    For production use, consider integrating SCHP or Graphonomy.

    Labels written by ``parse_image``:
        0 background, 1 hair, 3 upper clothes (torso), 4 face,
        5 left arm, 6 right arm, 9 right leg, 12 left leg.
    """

    # Label ids written into the parse map.
    _HAIR = 1
    _UPPER_CLOTHES = 3
    _FACE = 4

    # Head/hair boxes are sized as a fraction of the image height.
    _HEAD_RADIUS_RATIO = 0.08

    # (start keypoint, end keypoint, label), in drawing order. Later segments
    # overwrite earlier ones where they overlap, so the order is significant.
    _LIMB_SEGMENTS = (
        (2, 3, 6), (3, 4, 6),          # right arm: shoulder -> elbow -> wrist
        (5, 6, 5), (6, 7, 5),          # left arm: shoulder -> elbow -> wrist
        (8, 9, 9), (9, 10, 9),         # right leg, starting at the right hip
        (11, 12, 12), (12, 13, 12),    # left leg, starting at the left hip
    )

    def __init__(self):
        # Device priority: CUDA, then Apple MPS, then CPU. The MPS backend is
        # looked up defensively so a torch build that does not expose
        # ``torch.backends.mps`` falls back instead of raising AttributeError.
        mps_backend = getattr(torch.backends, 'mps', None)
        if torch.cuda.is_available():
            device_name = 'cuda'
        elif mps_backend is not None and mps_backend.is_available():
            device_name = 'mps'
        else:
            device_name = 'cpu'
        self.device = torch.device(device_name)

        # For now, we use a heuristic-based approach.
        # In a production system, a pretrained model would be loaded here.
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    @staticmethod
    def _is_visible(point):
        """Return True when every coordinate of the keypoint is > 0."""
        return bool(np.all(np.asarray(point) > 0))

    def parse_image(self, image, pose_data):
        """
        Generate a semantic segmentation map for the person.

        Regions are painted in this order, later ones overwriting earlier
        ones: face, torso, arms, legs, hair.

        Args:
            image: PIL Image (anything ``np.array`` turns into an array whose
                first two dimensions are height and width).
            pose_data: pose keypoints as (x, y) rows, documented as shape
                (18, 2); rows 0-13 are read. May be a numpy array or a nested
                sequence. A keypoint is treated as missing unless both of its
                coordinates are > 0.

        Returns:
            numpy array of shape (H, W), dtype uint8, with class labels
        """
        img_np = np.array(image)
        h, w = img_np.shape[:2]

        # Accept nested lists/tuples as well as arrays.
        pose_data = np.asarray(pose_data)

        # Everything starts as background (0).
        parse_map = np.zeros((h, w), dtype=np.uint8)

        head_radius = int(h * self._HEAD_RADIUS_RATIO)

        # Require BOTH nose coordinates to be positive. Checking only x lets a
        # negative y through, which produces a negative slice end that wraps
        # around and paints most of the column strip as face.
        nose_visible = self._is_visible(pose_data[0])
        if nose_visible:
            nose_x, nose_y = pose_data[0].astype(int)[:2].tolist()
            left = max(0, nose_x - head_radius)
            right = min(w, nose_x + head_radius)

            # Face: a box reaching further above the nose than below it.
            face_top = max(0, nose_y - head_radius * 2)
            face_bottom = min(h, nose_y + head_radius // 2)
            parse_map[face_top:face_bottom, left:right] = self._FACE

        # Torso (upper clothes): quadrilateral through shoulders and hips.
        corners = [pose_data[idx].astype(int) for idx in (2, 5, 11, 8)]
        if all(self._is_visible(corner) for corner in corners):
            torso_pts = np.array(corners, dtype=np.int32)
            # Grow the polygon by 20% about its centroid.
            center = torso_pts.mean(axis=0)
            torso_pts = ((torso_pts - center) * 1.2 + center).astype(np.int32)
            self._fill_polygon(parse_map, torso_pts, self._UPPER_CLOTHES, w, h)

        # Arms and legs: draw each segment whose two endpoints are present.
        for start_idx, end_idx, label in self._LIMB_SEGMENTS:
            start, end = pose_data[start_idx], pose_data[end_idx]
            if self._is_visible(start) and self._is_visible(end):
                self._draw_limb(parse_map, start, end, label, w, h)

        # Hair: box above the nose, painted last so it overwrites the upper
        # part of the face box.
        if nose_visible:
            hair_top = max(0, nose_y - head_radius * 3)
            hair_bottom = nose_y - head_radius
            if hair_bottom > hair_top:
                parse_map[hair_top:hair_bottom, left:right] = self._HAIR

        return parse_map

    def _fill_polygon(self, parse_map, points, label, w, h):
        """Fill the polygon given by ``points`` with ``label`` on the parse map."""
        from PIL import ImageDraw
        mask_img = Image.new('L', (w, h), 0)
        draw = ImageDraw.Draw(mask_img)
        draw.polygon([(int(x), int(y)) for x, y in points], fill=label)
        parse_map[np.array(mask_img) == label] = label

    def _draw_limb(self, parse_map, pt1, pt2, label, w, h):
        """Draw a limb (line with thickness) on the parse map.

        The line runs from ``pt1`` to ``pt2`` (coordinates truncated to int)
        and is at least 10 px wide, or 3% of the image height if larger.
        ``parse_map`` is modified in place.
        """
        from PIL import ImageDraw
        start = tuple(np.asarray(pt1).astype(int).tolist())
        end = tuple(np.asarray(pt2).astype(int).tolist())
        thickness = max(10, int(h * 0.03))

        mask_img = Image.new('L', (w, h), 0)
        draw = ImageDraw.Draw(mask_img)
        draw.line([start, end], fill=label, width=thickness)
        parse_map[np.array(mask_img) == label] = label