viton-hd / human_parser.py

Add files using upload-large-folder tool

24870a9 verified 12 days ago

5.66 kB

	"""
	Simplified Human Parsing using a pretrained model.
	This replaces the MediaPipe-based parsing with a proper semantic segmentation model.
	"""
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from torchvision import transforms
	import numpy as np
	from PIL import Image
	import os
	import gdown

	class SimpleHumanParser:
	    """
	    Heuristic human parser that paints a label map from pose keypoints.

	    No network is loaded: body regions are approximated with rectangles
	    (face, hair), a polygon (torso) and thick lines (arms, legs) placed
	    from the pose keypoints. For production use, consider integrating
	    SCHP or Graphonomy.

	    Labels written to the map:
	        0 background, 1 hair, 3 upper clothes (torso), 4 face,
	        5 left arm, 6 right arm, 9 right leg, 12 left leg.

	    Keypoint indices used: 0 nose; 2/3/4 right shoulder/elbow/wrist;
	    5/6/7 left shoulder/elbow/wrist; 8/9/10 and 11/12/13 the right and
	    left leg chains starting at the hips.
	    """

	    # (start keypoint, end keypoint, label), listed in drawing order.
	    # Later segments overwrite earlier ones where they overlap.
	    _LIMB_SEGMENTS = (
	        (2, 3, 6), (3, 4, 6),        # right arm
	        (5, 6, 5), (6, 7, 5),        # left arm
	        (8, 9, 9), (9, 10, 9),       # right leg
	        (11, 12, 12), (12, 13, 12),  # left leg
	    )

	    def __init__(self):
	        """Pick a torch device (CUDA, then MPS, then CPU) and build the transform."""
	        if torch.cuda.is_available():
	            self.device = torch.device('cuda')
	        elif torch.backends.mps.is_available():
	            self.device = torch.device('mps')
	        else:
	            self.device = torch.device('cpu')

	        # Parsing is heuristic for now; a production system would load a
	        # pretrained model here. The transform is not used by the methods
	        # of this class; the mean/std values look like the common ImageNet
	        # statistics (TODO confirm against the model that will consume it).
	        self.transform = transforms.Compose([
	            transforms.ToTensor(),
	            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
	        ])

	    @staticmethod
	    def _is_visible(point):
	        """Return True when every coordinate of *point* is strictly positive."""
	        return bool(np.all(np.asarray(point) > 0))

	    def parse_image(self, image, pose_data):
	        """
	        Generate a semantic segmentation map for the person.

	        A keypoint is treated as detected only when both of its coordinates
	        are strictly positive; regions depending on an undetected keypoint
	        are skipped.

	        Args:
	            image: PIL Image (anything ``np.array`` turns into an array whose
	                first two dimensions are height and width).
	            pose_data: numpy array of pose keypoints (18, 2). Extra columns
	                (e.g. a confidence score) are ignored.

	        Returns:
	            numpy array of shape (H, W), dtype uint8, with class labels
	        """
	        img_np = np.array(image)
	        h, w = img_np.shape[:2]

	        # Keep only (x, y) so that arrays carrying extra columns also work.
	        keypoints = np.asarray(pose_data)[:, :2]

	        # 1. Background (0) everywhere to start with.
	        parse_map = np.zeros((h, w), dtype=np.uint8)

	        # Head size is approximated as a fixed fraction of the image height.
	        head_radius = int(h * 0.08)

	        # Require BOTH nose coordinates to be positive. Checking x alone let
	        # a negative y through, which produced a negative slice end that
	        # numpy wraps around, painting "face" over most of the image.
	        nose_visible = self._is_visible(keypoints[0])
	        nose_x = nose_y = 0
	        if nose_visible:
	            nose_x, nose_y = (int(v) for v in keypoints[0].astype(int))

	            # 2. Face (4): box reaching further above the nose than below it.
	            x1 = max(0, nose_x - head_radius)
	            x2 = min(w, nose_x + head_radius)
	            y1 = max(0, nose_y - head_radius * 2)
	            y2 = min(h, nose_y + head_radius // 2)
	            parse_map[y1:y2, x1:x2] = 4

	        # 3. Upper clothes (3): quadrilateral through shoulders and hips,
	        # ordered right shoulder, left shoulder, left hip, right hip.
	        torso_pts = np.array(
	            [keypoints[i].astype(int) for i in (2, 5, 11, 8)],
	            dtype=np.int32,
	        )
	        if all(self._is_visible(pt) for pt in torso_pts):
	            self._fill_torso(parse_map, torso_pts, w, h)

	        # 4./5. Arms and legs: one thick line per segment with both ends detected.
	        for start, end, label in self._LIMB_SEGMENTS:
	            if self._is_visible(keypoints[start]) and self._is_visible(keypoints[end]):
	                self._draw_limb(parse_map, keypoints[start], keypoints[end], label, w, h)

	        # 6. Hair (1): box above the face; drawn last, so it overwrites the
	        # upper part of the face box and anything else underneath.
	        if nose_visible:
	            x1 = max(0, nose_x - head_radius)
	            x2 = min(w, nose_x + head_radius)
	            y1 = max(0, nose_y - head_radius * 3)
	            y2 = nose_y - head_radius
	            if y2 > y1:
	                parse_map[y1:y2, x1:x2] = 1

	        return parse_map

	    def _fill_torso(self, parse_map, torso_pts, w, h):
	        """Fill the torso polygon, grown by 20% about its centroid, with label 3."""
	        from PIL import ImageDraw

	        center = torso_pts.mean(axis=0)
	        expanded = ((torso_pts - center) * 1.2 + center).astype(np.int32)

	        mask_img = Image.new('L', (w, h), 0)
	        draw = ImageDraw.Draw(mask_img)
	        draw.polygon([(int(x), int(y)) for x, y in expanded], fill=3)
	        parse_map[np.array(mask_img) == 3] = 3

	    def _draw_limb(self, parse_map, pt1, pt2, label, w, h):
	        """
	        Draw a limb (line with thickness) on the parse map, in place.

	        Args:
	            parse_map: (H, W) label array that is modified.
	            pt1, pt2: segment end points; only the first two values (x, y)
	                of each are used, truncated to integers.
	            label: value written along the line (must be non-zero to show
	                up, since the scratch mask starts at 0).
	            w, h: width and height of ``parse_map``.
	        """
	        from PIL import ImageDraw

	        start = tuple(int(v) for v in np.asarray(pt1)[:2].astype(int))
	        end = tuple(int(v) for v in np.asarray(pt2)[:2].astype(int))

	        # Line width scales with image height but never drops below 10 px.
	        thickness = max(10, int(h * 0.03))

	        mask_img = Image.new('L', (w, h), 0)
	        draw = ImageDraw.Draw(mask_img)
	        draw.line([start, end], fill=label, width=thickness)
	        parse_map[np.array(mask_img) == label] = label