# viton-hd / human_parser.py
# Uploaded by known57: "Add files using upload-large-folder tool" (commit 24870a9, verified)
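"""
Simplified human parsing driven by pose keypoints.

This replaces the MediaPipe-based parsing. The current implementation is a
heuristic that paints body-part regions from pose keypoints; it is a stand-in
until a pretrained semantic segmentation model (e.g. SCHP or Graphonomy) is
integrated.
"""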
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
import numpy as np
from PIL import Image
import os
import gdown
class SimpleHumanParser:
    """Heuristic human parser that paints body-part labels from pose keypoints.

    No network is loaded: regions are derived purely from the keypoint
    positions. For production use, consider integrating SCHP or Graphonomy.

    Labels written into the parse map:
        0  background
        1  hair
        3  upper clothes (torso)
        4  face
        5  left arm
        6  right arm
        9  right leg
        12 left leg
    """

    # (start keypoint, end keypoint, label) for every limb segment.
    # The order matters: later segments overwrite earlier ones where they overlap.
    _LIMB_SEGMENTS = (
        (2, 3, 6),     # right shoulder -> right elbow
        (3, 4, 6),     # right elbow -> right wrist
        (5, 6, 5),     # left shoulder -> left elbow
        (6, 7, 5),     # left elbow -> left wrist
        (8, 9, 9),     # right hip -> keypoint 9 (right leg)
        (9, 10, 9),    # keypoint 9 -> keypoint 10 (right leg)
        (11, 12, 12),  # left hip -> keypoint 12 (left leg)
        (12, 13, 12),  # keypoint 12 -> keypoint 13 (left leg)
    )

    def __init__(self):
        """Pick a torch device (cuda > mps > cpu) and build the input transform."""
        # torch.backends.mps is missing on some PyTorch builds, so look it up
        # defensively instead of assuming the attribute exists.
        mps_backend = getattr(torch.backends, 'mps', None)
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        elif mps_backend is not None and mps_backend.is_available():
            self.device = torch.device('mps')
        else:
            self.device = torch.device('cpu')

        # For now, we use a heuristic-based approach.
        # In a production system, a pretrained model would be loaded here.
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def parse_image(self, image, pose_data):
        """Generate a semantic segmentation map for the person.

        Args:
            image: PIL Image (anything ``np.array`` turns into an array whose
                first two dimensions are height and width).
            pose_data: array-like of pose keypoints, documented as (18, 2)
                in (x, y) order. Keypoints 0-13 are read. A keypoint is
                treated as missing when its coordinates are not positive.

        Returns:
            numpy array of shape (H, W), dtype uint8, with class labels.
        """
        img_np = np.array(image)
        h, w = img_np.shape[:2]

        # Accept plain Python sequences as well as ndarrays.
        pose = np.asarray(pose_data)

        # Everything starts out as background (0).
        parse_map = np.zeros((h, w), dtype=np.uint8)

        # The nose anchors both the face box and the hair box.
        nose_visible = pose[0][0] > 0
        nose = pose[0].astype(int)
        radius = int(h * 0.08)  # approximate head size relative to image height

        # Face: a box reaching two radii above and half a radius below the nose.
        if nose_visible:
            self._fill_box(
                parse_map,
                nose[1] - radius * 2, nose[1] + radius // 2,
                nose[0] - radius, nose[0] + radius,
                4,
            )

        # Torso (upper clothes): quadrilateral through shoulders and hips,
        # ordered r_shoulder, l_shoulder, l_hip, r_hip so the outline does not
        # cross itself.
        corners = pose[[2, 5, 11, 8]].astype(int)
        if np.all(corners > 0):
            # Expand the polygon by 20% around its centroid.
            center = corners.mean(axis=0)
            expanded = ((corners - center) * 1.2 + center).astype(np.int32)
            self._fill_polygon(parse_map, expanded, 3, w, h)

        # Arms and legs: thick lines between consecutive joints.
        for start, end, label in self._LIMB_SEGMENTS:
            if np.all(pose[start] > 0) and np.all(pose[end] > 0):
                self._draw_limb(parse_map, pose[start], pose[end], label, w, h)

        # Hair: box from three radii to one radius above the nose. It is painted
        # last, so it overrides the upper part of the face box.
        if nose_visible:
            self._fill_box(
                parse_map,
                nose[1] - radius * 3, nose[1] - radius,
                nose[0] - radius, nose[0] + radius,
                1,
            )

        return parse_map

    @staticmethod
    def _fill_box(parse_map, y1, y2, x1, x2, label):
        """Paint the half-open box [y1, y2) x [x1, x2), clipped to the map."""
        h, w = parse_map.shape[:2]
        y1, y2 = max(0, y1), min(h, y2)
        x1, x2 = max(0, x1), min(w, x2)
        # Skip empty boxes explicitly: a negative upper bound would otherwise
        # be read by numpy as "count from the end" and paint a huge region.
        if y2 > y1 and x2 > x1:
            parse_map[y1:y2, x1:x2] = label

    @staticmethod
    def _fill_polygon(parse_map, points, label, w, h):
        """Paint a filled polygon with the given vertices onto the parse map."""
        from PIL import Image, ImageDraw

        mask_img = Image.new('L', (w, h), 0)
        draw = ImageDraw.Draw(mask_img)
        draw.polygon([tuple(int(v) for v in p) for p in points], fill=1)
        parse_map[np.array(mask_img) > 0] = label

    def _draw_limb(self, parse_map, pt1, pt2, label, w, h):
        """Draw a limb (line with thickness) on the parse map.

        Args:
            parse_map: (H, W) label array, modified in place.
            pt1, pt2: (x, y) end points of the limb segment.
            label: class label to write along the segment.
            w, h: width and height of the parse map.
        """
        from PIL import Image, ImageDraw

        start = tuple(int(v) for v in np.asarray(pt1).astype(int))
        end = tuple(int(v) for v in np.asarray(pt2).astype(int))

        # Thickness scales with image height but never drops below 10 px.
        thickness = max(10, int(h * 0.03))

        # Rasterise into a binary mask so the result does not depend on the
        # label value, then stamp the label where the mask is set.
        mask_img = Image.new('L', (w, h), 0)
        draw = ImageDraw.Draw(mask_img)
        draw.line([start, end], fill=1, width=thickness)
        parse_map[np.array(mask_img) > 0] = label