Spaces:
Build error
Build error
Fix build: use local anime_face_detector module with OpenMMLab 2.x
Browse files- anime_face_detector/__init__.py +58 -0
- anime_face_detector/configs/mmdet/faster-rcnn.py +99 -0
- anime_face_detector/configs/mmdet/yolov3.py +69 -0
- anime_face_detector/configs/mmpose/hrnetv2.py +141 -0
- anime_face_detector/detector.py +195 -0
- app.py +0 -14
- requirements.txt +5 -2
anime_face_detector/__init__.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pathlib
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
|
| 5 |
+
from .detector import LandmarkDetector
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def get_config_path(model_name: str) -> pathlib.Path:
    """Return the bundled mmdet/mmpose config file path for *model_name*."""
    assert model_name in ['faster-rcnn', 'yolov3', 'hrnetv2']

    root = pathlib.Path(__file__).parent.resolve()
    # Detection configs live under configs/mmdet, the landmark config
    # under configs/mmpose.
    subdir = 'mmdet' if model_name in ['faster-rcnn', 'yolov3'] else 'mmpose'
    return root / 'configs' / subdir / f'{model_name}.py'
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def get_checkpoint_path(model_name: str) -> pathlib.Path:
    """Return the local checkpoint path for *model_name*, downloading it on first use.

    Weights are cached under ``torch.hub.get_dir()/checkpoints``; later
    calls reuse the cached file.
    """
    assert model_name in ['faster-rcnn', 'yolov3', 'hrnetv2']
    prefix = 'mmdet' if model_name in ['faster-rcnn', 'yolov3'] else 'mmpose'
    file_name = f'{prefix}_anime-face_{model_name}.pth'

    checkpoint_dir = pathlib.Path(torch.hub.get_dir()) / 'checkpoints'
    checkpoint_dir.mkdir(exist_ok=True, parents=True)
    checkpoint_path = checkpoint_dir / file_name
    if not checkpoint_path.exists():
        # Fetch the released weights once from the upstream GitHub release.
        url = f'https://github.com/hysts/anime-face-detector/releases/download/v0.0.1/{file_name}'
        torch.hub.download_url_to_file(url, checkpoint_path.as_posix())

    return checkpoint_path
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def create_detector(
    face_detector_name: str = 'yolov3',
    landmark_model_name='hrnetv2',
    device: str = 'cuda:0',
    flip_test: bool = True,
    box_scale_factor: float = 1.1,
) -> LandmarkDetector:
    """Build a :class:`LandmarkDetector` from bundled configs and released weights.

    Args:
        face_detector_name: 'yolov3' or 'faster-rcnn'.
        landmark_model_name: only 'hrnetv2' is supported.
        device: torch device string for both models.
        flip_test: enable flip test-time augmentation for landmarks.
        box_scale_factor: factor by which detected boxes are enlarged.
    """
    assert face_detector_name in ['yolov3', 'faster-rcnn']
    assert landmark_model_name in ['hrnetv2']
    face_config = get_config_path(face_detector_name)
    landmark_config = get_config_path(landmark_model_name)
    # Checkpoints are downloaded on first use (face detector first).
    face_checkpoint = get_checkpoint_path(face_detector_name)
    landmark_checkpoint = get_checkpoint_path(landmark_model_name)
    return LandmarkDetector(
        landmark_config,
        landmark_checkpoint,
        face_config,
        face_checkpoint,
        device=device,
        flip_test=flip_test,
        box_scale_factor=box_scale_factor,
    )
|
anime_face_detector/configs/mmdet/faster-rcnn.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Faster R-CNN config for single-class (anime face) detection,
# written for the OpenMMLab 2.x / mmdet 3.x config layout.

model = dict(
    type='FasterRCNN',
    data_preprocessor=dict(
        type='DetDataPreprocessor',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        bgr_to_rgb=True,
        pad_size_divisor=32),
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch'),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        num_outs=5),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[0.0, 0.0, 0.0, 0.0],
            target_stds=[1.0, 1.0, 1.0, 1.0])),
    roi_head=dict(
        type='StandardRoIHead',
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        bbox_head=dict(
            type='Shared2FCBBoxHead',
            in_channels=256,
            fc_out_channels=1024,
            roi_feat_size=7,
            num_classes=1,  # single 'face' class
            bbox_coder=dict(
                type='DeltaXYWHBBoxCoder',
                target_means=[0.0, 0.0, 0.0, 0.0],
                target_stds=[0.1, 0.1, 0.2, 0.2]),
            reg_class_agnostic=False)),
    test_cfg=dict(
        rpn=dict(
            nms_pre=1000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.5),
            max_per_img=100)))

# Inference pipeline: aspect-ratio-preserving resize, pad to a multiple of
# 32, then pack the meta keys mmdet 3.x expects.
test_pipeline = [
    dict(type='LoadImageFromFile', backend_args=None),
    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
    dict(type='Pad', size_divisor=32, pad_val=dict(img=(114, 114, 114))),
    dict(
        type='PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor')),
]

# Placeholder dataloader: mmdet 3.x ``init_detector`` requires one even
# when the model is used purely for inference.
test_dataloader = dict(
    batch_size=1,
    num_workers=0,
    persistent_workers=False,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type='CocoDataset',
        data_root='',
        ann_file='',
        data_prefix=dict(img=''),
        test_mode=True,
        pipeline=test_pipeline))
|
anime_face_detector/configs/mmdet/yolov3.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# YOLOv3 config for single-class (anime face) detection,
# written for the OpenMMLab 2.x / mmdet 3.x config layout.

model = dict(
    type='YOLOV3',
    data_preprocessor=dict(
        type='DetDataPreprocessor',
        mean=[0.0, 0.0, 0.0],
        std=[255.0, 255.0, 255.0],  # inputs normalized to [0, 1]
        bgr_to_rgb=True,
        pad_size_divisor=32),
    backbone=dict(type='Darknet', depth=53, out_indices=(3, 4, 5)),
    neck=dict(
        type='YOLOV3Neck',
        num_scales=3,
        in_channels=[1024, 512, 256],
        out_channels=[512, 256, 128]),
    bbox_head=dict(
        type='YOLOV3Head',
        num_classes=1,  # single 'face' class
        in_channels=[512, 256, 128],
        out_channels=[1024, 512, 256],
        anchor_generator=dict(
            type='YOLOAnchorGenerator',
            # Anchors listed coarse-to-fine to match strides [32, 16, 8].
            base_sizes=[
                [(116, 90), (156, 198), (373, 326)],
                [(30, 61), (62, 45), (59, 119)],
                [(10, 13), (16, 30), (33, 23)],
            ],
            strides=[32, 16, 8]),
        bbox_coder=dict(type='YOLOBBoxCoder'),
        featmap_strides=[32, 16, 8]),
    test_cfg=dict(
        nms_pre=1000,
        min_bbox_size=0,
        score_thr=0.05,
        conf_thr=0.005,
        nms=dict(type='nms', iou_threshold=0.45),
        max_per_img=100))

# Inference pipeline: resize to the 608x608 YOLO input, pad, pack meta keys.
test_pipeline = [
    dict(type='LoadImageFromFile', backend_args=None),
    dict(type='Resize', scale=(608, 608), keep_ratio=True),
    dict(type='Pad', size=(608, 608), pad_val=dict(img=(114, 114, 114))),
    dict(
        type='PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor')),
]

# Placeholder dataloader: mmdet 3.x ``init_detector`` requires one even
# when the model is used purely for inference.
test_dataloader = dict(
    batch_size=1,
    num_workers=0,
    persistent_workers=False,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type='CocoDataset',
        data_root='',
        ann_file='',
        data_prefix=dict(img=''),
        test_mode=True,
        pipeline=test_pipeline))
|
anime_face_detector/configs/mmpose/hrnetv2.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# HRNetV2 config for anime face landmark detection (28 keypoints),
# written for the OpenMMLab 2.x / mmpose 1.x config layout.

# Heatmap codec: 256x256 input decoded from 64x64 Gaussian heatmaps.
codec = dict(
    type='MSRAHeatmap',
    input_size=(256, 256),
    heatmap_size=(64, 64),
    sigma=2,
)

model = dict(
    type='TopdownPoseEstimator',
    data_preprocessor=dict(
        type='PoseDataPreprocessor',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        bgr_to_rgb=True,
    ),
    backbone=dict(
        type='HRNet',
        in_channels=3,
        extra=dict(
            stage1=dict(
                num_modules=1,
                num_branches=1,
                block='BOTTLENECK',
                num_blocks=(4,),
                num_channels=(64,),
            ),
            stage2=dict(
                num_modules=1,
                num_branches=2,
                block='BASIC',
                num_blocks=(4, 4),
                num_channels=(18, 36),
            ),
            stage3=dict(
                num_modules=4,
                num_branches=3,
                block='BASIC',
                num_blocks=(4, 4, 4),
                num_channels=(18, 36, 72),
            ),
            stage4=dict(
                num_modules=3,
                num_branches=4,
                block='BASIC',
                num_blocks=(4, 4, 4, 4),
                num_channels=(18, 36, 72, 144),
                # Emit every branch so the neck can concatenate them.
                multiscale_output=True,
            ),
        ),
    ),
    neck=dict(
        type='FeatureMapProcessor',
        concat=True,
    ),
    head=dict(
        type='HeatmapHead',
        in_channels=270,  # 18 + 36 + 72 + 144: concat of all HRNet branches
        out_channels=28,
        deconv_out_channels=None,
        conv_out_channels=(270,),
        conv_kernel_sizes=(1,),
        loss=dict(type='KeypointMSELoss', use_target_weight=True),
        decoder=codec,
    ),
    test_cfg=dict(
        flip_test=False,  # disabled: needs proper dataset metainfo
    ),
)

# Left/right mirror mapping used for flip augmentation.
flip_indices = [4, 3, 2, 1, 0, 10, 9, 8, 7, 6, 5, 19, 18, 17, 22, 21, 20, 13, 12, 11, 16, 15, 14, 23, 26, 25, 24, 27]

# Top-down inference pipeline.
test_pipeline = [
    dict(type='LoadImage'),
    dict(type='GetBBoxCenterScale'),
    dict(type='TopdownAffine', input_size=codec['input_size']),
    dict(type='PackPoseInputs'),
]

# Placeholder dataloader: ``inference_topdown`` requires one even for
# pure inference.
test_dataloader = dict(
    batch_size=1,
    num_workers=0,
    persistent_workers=False,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
    dataset=dict(
        type='CocoDataset',
        data_root='',
        data_mode='topdown',
        ann_file='',
        data_prefix=dict(img=''),
        test_mode=True,
        pipeline=test_pipeline,
    ),
)

# Dataset metainfo for the 28-keypoint anime face layout; ``swap`` entries
# mirror the ``flip_indices`` table above.
dataset_info = dict(
    dataset_name='anime_face',
    paper_info=dict(),
    keypoint_info={
        0: dict(name='kpt-0', id=0, color=[255, 255, 255], type='', swap='kpt-4'),
        1: dict(name='kpt-1', id=1, color=[255, 255, 255], type='', swap='kpt-3'),
        2: dict(name='kpt-2', id=2, color=[255, 255, 255], type='', swap=''),
        3: dict(name='kpt-3', id=3, color=[255, 255, 255], type='', swap='kpt-1'),
        4: dict(name='kpt-4', id=4, color=[255, 255, 255], type='', swap='kpt-0'),
        5: dict(name='kpt-5', id=5, color=[255, 255, 255], type='', swap='kpt-10'),
        6: dict(name='kpt-6', id=6, color=[255, 255, 255], type='', swap='kpt-9'),
        7: dict(name='kpt-7', id=7, color=[255, 255, 255], type='', swap='kpt-8'),
        8: dict(name='kpt-8', id=8, color=[255, 255, 255], type='', swap='kpt-7'),
        9: dict(name='kpt-9', id=9, color=[255, 255, 255], type='', swap='kpt-6'),
        10: dict(name='kpt-10', id=10, color=[255, 255, 255], type='', swap='kpt-5'),
        11: dict(name='kpt-11', id=11, color=[255, 255, 255], type='', swap='kpt-19'),
        12: dict(name='kpt-12', id=12, color=[255, 255, 255], type='', swap='kpt-18'),
        13: dict(name='kpt-13', id=13, color=[255, 255, 255], type='', swap='kpt-17'),
        14: dict(name='kpt-14', id=14, color=[255, 255, 255], type='', swap='kpt-22'),
        15: dict(name='kpt-15', id=15, color=[255, 255, 255], type='', swap='kpt-21'),
        16: dict(name='kpt-16', id=16, color=[255, 255, 255], type='', swap='kpt-20'),
        17: dict(name='kpt-17', id=17, color=[255, 255, 255], type='', swap='kpt-13'),
        18: dict(name='kpt-18', id=18, color=[255, 255, 255], type='', swap='kpt-12'),
        19: dict(name='kpt-19', id=19, color=[255, 255, 255], type='', swap='kpt-11'),
        20: dict(name='kpt-20', id=20, color=[255, 255, 255], type='', swap='kpt-16'),
        21: dict(name='kpt-21', id=21, color=[255, 255, 255], type='', swap='kpt-15'),
        22: dict(name='kpt-22', id=22, color=[255, 255, 255], type='', swap='kpt-14'),
        23: dict(name='kpt-23', id=23, color=[255, 255, 255], type='', swap=''),
        24: dict(name='kpt-24', id=24, color=[255, 255, 255], type='', swap='kpt-26'),
        25: dict(name='kpt-25', id=25, color=[255, 255, 255], type='', swap=''),
        26: dict(name='kpt-26', id=26, color=[255, 255, 255], type='', swap='kpt-24'),
        27: dict(name='kpt-27', id=27, color=[255, 255, 255], type='', swap=''),
    },
    skeleton_info={},
    joint_weights=[1.0] * 28,
    sigmas=[0.025] * 28,
    flip_indices=flip_indices,
)
|
anime_face_detector/detector.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import pathlib
|
| 4 |
+
import warnings
|
| 5 |
+
|
| 6 |
+
import cv2
|
| 7 |
+
import numpy as np
|
| 8 |
+
import torch.nn as nn
|
| 9 |
+
from mmdet.apis import inference_detector, init_detector
|
| 10 |
+
from mmengine.config import Config
|
| 11 |
+
from mmengine.registry import DefaultScope
|
| 12 |
+
from mmpose.apis import inference_topdown, init_model
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class LandmarkDetector:
|
| 16 |
+
def __init__(
|
| 17 |
+
self,
|
| 18 |
+
landmark_detector_config_or_path: Config | str | pathlib.Path,
|
| 19 |
+
landmark_detector_checkpoint_path: str | pathlib.Path,
|
| 20 |
+
face_detector_config_or_path: Config | str | pathlib.Path | None = None,
|
| 21 |
+
face_detector_checkpoint_path: str | pathlib.Path | None = None,
|
| 22 |
+
device: str = 'cuda:0',
|
| 23 |
+
flip_test: bool = True,
|
| 24 |
+
box_scale_factor: float = 1.1,
|
| 25 |
+
):
|
| 26 |
+
landmark_config = self._load_config(landmark_detector_config_or_path)
|
| 27 |
+
face_detector_config = self._load_config(face_detector_config_or_path)
|
| 28 |
+
|
| 29 |
+
self.landmark_detector = self._init_pose_model(
|
| 30 |
+
landmark_config, landmark_detector_checkpoint_path, device, flip_test
|
| 31 |
+
)
|
| 32 |
+
self.face_detector = self._init_face_detector(
|
| 33 |
+
face_detector_config, face_detector_checkpoint_path, device
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
self.box_scale_factor = box_scale_factor
|
| 37 |
+
|
| 38 |
+
@staticmethod
|
| 39 |
+
def _load_config(
|
| 40 |
+
config_or_path: Config | str | pathlib.Path | None,
|
| 41 |
+
) -> Config | None:
|
| 42 |
+
if config_or_path is None or isinstance(config_or_path, Config):
|
| 43 |
+
return config_or_path
|
| 44 |
+
return Config.fromfile(config_or_path)
|
| 45 |
+
|
| 46 |
+
@staticmethod
|
| 47 |
+
def _init_pose_model(
|
| 48 |
+
config: Config,
|
| 49 |
+
checkpoint_path: str | pathlib.Path,
|
| 50 |
+
device: str,
|
| 51 |
+
flip_test: bool,
|
| 52 |
+
) -> nn.Module:
|
| 53 |
+
if isinstance(checkpoint_path, pathlib.Path):
|
| 54 |
+
checkpoint_path = checkpoint_path.as_posix()
|
| 55 |
+
model = init_model(config, checkpoint_path, device=device)
|
| 56 |
+
|
| 57 |
+
# Set flip_test in model's test_cfg
|
| 58 |
+
if hasattr(model, 'test_cfg') and model.test_cfg is not None:
|
| 59 |
+
model.test_cfg['flip_test'] = flip_test
|
| 60 |
+
if hasattr(model.cfg, 'model') and hasattr(model.cfg.model, 'test_cfg'):
|
| 61 |
+
model.cfg.model.test_cfg['flip_test'] = flip_test
|
| 62 |
+
|
| 63 |
+
# Set dataset_meta with our custom keypoint info (28 keypoints for anime face)
|
| 64 |
+
if hasattr(config, 'dataset_info'):
|
| 65 |
+
dataset_meta = {
|
| 66 |
+
'dataset_name': config.dataset_info.get('dataset_name', 'anime_face'),
|
| 67 |
+
'num_keypoints': 28,
|
| 68 |
+
'keypoint_info': config.dataset_info.get('keypoint_info', {}),
|
| 69 |
+
'skeleton_info': config.dataset_info.get('skeleton_info', {}),
|
| 70 |
+
'joint_weights': config.dataset_info.get('joint_weights', [1.0] * 28),
|
| 71 |
+
'sigmas': config.dataset_info.get('sigmas', [0.025] * 28),
|
| 72 |
+
'flip_indices': config.dataset_info.get(
|
| 73 |
+
'flip_indices', config.flip_indices if hasattr(config, 'flip_indices') else []
|
| 74 |
+
),
|
| 75 |
+
}
|
| 76 |
+
model.dataset_meta = dataset_meta
|
| 77 |
+
|
| 78 |
+
# Copy all config attributes to model.cfg (required for inference_topdown)
|
| 79 |
+
for key in ['test_dataloader', 'test_pipeline', 'codec', 'flip_indices']:
|
| 80 |
+
if hasattr(config, key) and not hasattr(model.cfg, key):
|
| 81 |
+
setattr(model.cfg, key, getattr(config, key))
|
| 82 |
+
return model
|
| 83 |
+
|
| 84 |
+
@staticmethod
|
| 85 |
+
def _init_face_detector(
|
| 86 |
+
config: Config | None, checkpoint_path: str | pathlib.Path | None, device: str
|
| 87 |
+
) -> nn.Module | None:
|
| 88 |
+
if config is not None:
|
| 89 |
+
if isinstance(checkpoint_path, pathlib.Path):
|
| 90 |
+
checkpoint_path = checkpoint_path.as_posix()
|
| 91 |
+
model = init_detector(config, checkpoint_path, device=device)
|
| 92 |
+
else:
|
| 93 |
+
model = None
|
| 94 |
+
return model
|
| 95 |
+
|
| 96 |
+
def _detect_faces(self, image: np.ndarray) -> list[np.ndarray]:
|
| 97 |
+
# Set mmdet scope for face detection
|
| 98 |
+
with DefaultScope.overwrite_default_scope('mmdet'):
|
| 99 |
+
# mmdet 3.x returns DetDataSample
|
| 100 |
+
result = inference_detector(self.face_detector, image)
|
| 101 |
+
# Extract bboxes and scores from pred_instances
|
| 102 |
+
pred_instances = result.pred_instances
|
| 103 |
+
bboxes = pred_instances.bboxes.cpu().numpy()
|
| 104 |
+
scores = pred_instances.scores.cpu().numpy()
|
| 105 |
+
# Combine to [x0, y0, x1, y1, score] format
|
| 106 |
+
boxes = []
|
| 107 |
+
for bbox, score in zip(bboxes, scores):
|
| 108 |
+
box = np.append(bbox, score)
|
| 109 |
+
boxes.append(box)
|
| 110 |
+
# scale boxes by `self.box_scale_factor`
|
| 111 |
+
boxes = self._update_pred_box(boxes)
|
| 112 |
+
return boxes
|
| 113 |
+
|
| 114 |
+
def _update_pred_box(self, pred_boxes: np.ndarray) -> list[np.ndarray]:
|
| 115 |
+
boxes = []
|
| 116 |
+
for pred_box in pred_boxes:
|
| 117 |
+
box = pred_box[:4]
|
| 118 |
+
size = box[2:] - box[:2] + 1
|
| 119 |
+
new_size = size * self.box_scale_factor
|
| 120 |
+
center = (box[:2] + box[2:]) / 2
|
| 121 |
+
tl = center - new_size / 2
|
| 122 |
+
br = tl + new_size
|
| 123 |
+
pred_box[:4] = np.concatenate([tl, br])
|
| 124 |
+
boxes.append(pred_box)
|
| 125 |
+
return boxes
|
| 126 |
+
|
| 127 |
+
def _detect_landmarks(
|
| 128 |
+
self, image: np.ndarray, boxes: list[np.ndarray]
|
| 129 |
+
) -> list[dict[str, np.ndarray]]:
|
| 130 |
+
# mmpose 1.x uses inference_topdown with different interface
|
| 131 |
+
# Convert boxes to numpy array format expected by inference_topdown
|
| 132 |
+
bboxes = np.array(boxes) if boxes else np.empty((0, 5))
|
| 133 |
+
|
| 134 |
+
# Set mmpose scope for landmark detection
|
| 135 |
+
with DefaultScope.overwrite_default_scope('mmpose'):
|
| 136 |
+
# inference_topdown returns list of PoseDataSample
|
| 137 |
+
# Pass only first 4 columns (x0, y0, x1, y1) - mmpose 1.x expects (N, 4) format
|
| 138 |
+
results = inference_topdown(
|
| 139 |
+
self.landmark_detector, image, bboxes[:, :4], bbox_format='xyxy'
|
| 140 |
+
)
|
| 141 |
+
|
| 142 |
+
# Convert PoseDataSample to dict format for backward compatibility
|
| 143 |
+
preds = []
|
| 144 |
+
for i, result in enumerate(results):
|
| 145 |
+
pred_instances = result.pred_instances
|
| 146 |
+
keypoints = pred_instances.keypoints[0] # (K, 2)
|
| 147 |
+
keypoint_scores = pred_instances.keypoint_scores[0] # (K,)
|
| 148 |
+
# Combine keypoints and scores to [x, y, score] format
|
| 149 |
+
keypoints_with_scores = np.concatenate(
|
| 150 |
+
[keypoints, keypoint_scores[:, np.newaxis]], axis=1
|
| 151 |
+
)
|
| 152 |
+
preds.append({'bbox': boxes[i], 'keypoints': keypoints_with_scores})
|
| 153 |
+
return preds
|
| 154 |
+
|
| 155 |
+
@staticmethod
|
| 156 |
+
def _load_image(image_or_path: np.ndarray | str | pathlib.Path) -> np.ndarray:
|
| 157 |
+
if isinstance(image_or_path, np.ndarray):
|
| 158 |
+
image = image_or_path
|
| 159 |
+
elif isinstance(image_or_path, str):
|
| 160 |
+
image = cv2.imread(image_or_path)
|
| 161 |
+
elif isinstance(image_or_path, pathlib.Path):
|
| 162 |
+
image = cv2.imread(image_or_path.as_posix())
|
| 163 |
+
else:
|
| 164 |
+
raise ValueError
|
| 165 |
+
return image
|
| 166 |
+
|
| 167 |
+
def __call__(
|
| 168 |
+
self,
|
| 169 |
+
image_or_path: np.ndarray | str | pathlib.Path,
|
| 170 |
+
boxes: list[np.ndarray] | None = None,
|
| 171 |
+
) -> list[dict[str, np.ndarray]]:
|
| 172 |
+
"""Detect face landmarks.
|
| 173 |
+
|
| 174 |
+
Args:
|
| 175 |
+
image_or_path: An image with BGR channel order or an image path.
|
| 176 |
+
boxes: A list of bounding boxes for faces. Each bounding box
|
| 177 |
+
should be of the form [x0, y0, x1, y1, [score]].
|
| 178 |
+
|
| 179 |
+
Returns: A list of detection results. Each detection result has
|
| 180 |
+
bounding box of the form [x0, y0, x1, y1, [score]], and landmarks
|
| 181 |
+
of the form [x, y, score].
|
| 182 |
+
"""
|
| 183 |
+
image = self._load_image(image_or_path)
|
| 184 |
+
if boxes is None:
|
| 185 |
+
if self.face_detector is not None:
|
| 186 |
+
boxes = self._detect_faces(image)
|
| 187 |
+
else:
|
| 188 |
+
warnings.warn(
|
| 189 |
+
'Neither the face detector nor the bounding box is '
|
| 190 |
+
'specified. So the entire image is treated as the face '
|
| 191 |
+
'region.'
|
| 192 |
+
)
|
| 193 |
+
h, w = image.shape[:2]
|
| 194 |
+
boxes = [np.array([0, 0, w - 1, h - 1, 1])]
|
| 195 |
+
return self._detect_landmarks(image, boxes)
|
app.py
CHANGED
|
@@ -1,19 +1,5 @@
|
|
| 1 |
"""Anime Face Detector - Hugging Face Space with Zero GPU support"""
|
| 2 |
import os
|
| 3 |
-
import subprocess
|
| 4 |
-
import sys
|
| 5 |
-
|
| 6 |
-
# Install mmcv, mmdet, mmpose using mim before importing
|
| 7 |
-
def install_mmlab():
|
| 8 |
-
subprocess.run([sys.executable, '-m', 'pip', 'install', 'openmim'], check=True)
|
| 9 |
-
subprocess.run([sys.executable, '-m', 'mim', 'install', 'mmengine', 'mmcv', 'mmdet', 'mmpose'], check=True)
|
| 10 |
-
|
| 11 |
-
# Check if mmcv is installed
|
| 12 |
-
try:
|
| 13 |
-
import mmcv
|
| 14 |
-
except ImportError:
|
| 15 |
-
print('Installing OpenMMLab dependencies...')
|
| 16 |
-
install_mmlab()
|
| 17 |
|
| 18 |
import spaces
|
| 19 |
import gradio as gr
|
|
|
|
| 1 |
"""Anime Face Detector - Hugging Face Space with Zero GPU support"""
|
| 2 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
import spaces
|
| 5 |
import gradio as gr
|
requirements.txt
CHANGED
|
@@ -1,7 +1,10 @@
|
|
|
|
|
| 1 |
torch
|
| 2 |
torchvision
|
| 3 |
-
|
| 4 |
-
|
|
|
|
|
|
|
| 5 |
opencv-python-headless
|
| 6 |
gradio>=4.0.0
|
| 7 |
spaces
|
|
|
|
| 1 |
+
--extra-index-url https://download.pytorch.org/whl/cu121
|
| 2 |
torch
|
| 3 |
torchvision
|
| 4 |
+
mmengine
|
| 5 |
+
mmcv>=2.0.0
|
| 6 |
+
mmdet>=3.0.0
|
| 7 |
+
mmpose>=1.0.0
|
| 8 |
opencv-python-headless
|
| 9 |
gradio>=4.0.0
|
| 10 |
spaces
|