""" Simplified Human Parsing using a pretrained model. This replaces the MediaPipe-based parsing with a proper semantic segmentation model. """ import torch import torch.nn as nn import torch.nn.functional as F from torchvision import transforms import numpy as np from PIL import Image import os import gdown class SimpleHumanParser: """ A simplified human parsing model using a lightweight segmentation approach. For production use, consider integrating SCHP or Graphonomy. """ def __init__(self): self.device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu') if torch.cuda.is_available(): self.device = torch.device('cuda') # For now, we'll use a heuristic-based approach # In a production system, you'd load a pretrained model here self.transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ]) def parse_image(self, image, pose_data): """ Generate a semantic segmentation map for the person. Args: image: PIL Image pose_data: numpy array of pose keypoints (18, 2) Returns: numpy array of shape (H, W) with class labels """ # Convert image to numpy img_np = np.array(image) h, w = img_np.shape[:2] # Initialize parse map parse_map = np.zeros((h, w), dtype=np.uint8) # Use pose to create better segmentation # This is still heuristic but more sophisticated than before # 1. Background (0) parse_map[:] = 0 # 2. Face/Head region (1, 4, 13 in original) # Use nose, eyes, ears if pose_data[0][0] > 0: # Nose exists nose = pose_data[0].astype(int) # Estimate head region head_radius = int(h * 0.08) # Approximate head size y1 = max(0, nose[1] - head_radius * 2) y2 = min(h, nose[1] + head_radius // 2) x1 = max(0, nose[0] - head_radius) x2 = min(w, nose[0] + head_radius) parse_map[y1:y2, x1:x2] = 4 # Face # 3. Upper body (torso) - label 3 (upper clothes) # Use shoulders and hips r_shoulder = pose_data[2].astype(int) l_shoulder = pose_data[5].astype(int) r_hip = pose_data[8].astype(int) l_hip = pose_data[11].astype(int) if all(r_shoulder > 0) and all(l_shoulder > 0) and all(r_hip > 0) and all(l_hip > 0): # Create torso polygon torso_pts = np.array([ r_shoulder, l_shoulder, l_hip, r_hip ], dtype=np.int32) # Expand the polygon slightly center = torso_pts.mean(axis=0) torso_pts = ((torso_pts - center) * 1.2 + center).astype(np.int32) # Fill torso from PIL import ImageDraw mask_img = Image.new('L', (w, h), 0) draw = ImageDraw.Draw(mask_img) draw.polygon([tuple(p) for p in torso_pts], fill=3) torso_mask = np.array(mask_img) parse_map[torso_mask == 3] = 3 # 4. Arms - labels 5 (left arm), 6 (right arm) # Right arm: shoulder(2) -> elbow(3) -> wrist(4) if all(pose_data[2] > 0) and all(pose_data[3] > 0): self._draw_limb(parse_map, pose_data[2], pose_data[3], 6, w, h) if all(pose_data[3] > 0) and all(pose_data[4] > 0): self._draw_limb(parse_map, pose_data[3], pose_data[4], 6, w, h) # Left arm: shoulder(5) -> elbow(6) -> wrist(7) if all(pose_data[5] > 0) and all(pose_data[6] > 0): self._draw_limb(parse_map, pose_data[5], pose_data[6], 5, w, h) if all(pose_data[6] > 0) and all(pose_data[7] > 0): self._draw_limb(parse_map, pose_data[6], pose_data[7], 5, w, h) # 5. Legs - labels 9, 12 (pants/bottom) # Right leg if all(pose_data[8] > 0) and all(pose_data[9] > 0): self._draw_limb(parse_map, pose_data[8], pose_data[9], 9, w, h) if all(pose_data[9] > 0) and all(pose_data[10] > 0): self._draw_limb(parse_map, pose_data[9], pose_data[10], 9, w, h) # Left leg if all(pose_data[11] > 0) and all(pose_data[12] > 0): self._draw_limb(parse_map, pose_data[11], pose_data[12], 12, w, h) if all(pose_data[12] > 0) and all(pose_data[13] > 0): self._draw_limb(parse_map, pose_data[12], pose_data[13], 12, w, h) # 6. Hair (1) - region above face if pose_data[0][0] > 0: nose = pose_data[0].astype(int) hair_radius = int(h * 0.08) y1 = max(0, nose[1] - hair_radius * 3) y2 = nose[1] - hair_radius x1 = max(0, nose[0] - hair_radius) x2 = min(w, nose[0] + hair_radius) if y2 > y1: parse_map[y1:y2, x1:x2] = 1 # Hair return parse_map def _draw_limb(self, parse_map, pt1, pt2, label, w, h): """Draw a limb (line with thickness) on the parse map.""" from PIL import ImageDraw mask_img = Image.new('L', (w, h), 0) draw = ImageDraw.Draw(mask_img) thickness = max(10, int(h * 0.03)) draw.line([tuple(pt1.astype(int)), tuple(pt2.astype(int))], fill=label, width=thickness) limb_mask = np.array(mask_img) parse_map[limb_mask == label] = label