ayousanz commited on
Commit
70ade39
·
1 Parent(s): 0cf8493

Fix build: use local anime_face_detector module with OpenMMLab 2.x

Browse files
anime_face_detector/__init__.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pathlib
2
+
3
+ import torch
4
+
5
+ from .detector import LandmarkDetector
6
+
7
+
8
def get_config_path(model_name: str) -> pathlib.Path:
    """Return the bundled mm* config file path for *model_name*.

    Args:
        model_name: One of ``'faster-rcnn'`` / ``'yolov3'`` (mmdet face
            detectors) or ``'hrnetv2'`` (mmpose landmark model).

    Returns:
        Path to ``configs/mmdet/<model_name>.py`` or
        ``configs/mmpose/<model_name>.py`` inside this package.

    Raises:
        ValueError: If ``model_name`` is not a supported model.
    """
    # Raise a real exception instead of `assert`: asserts are stripped
    # when Python runs with -O, which would silently accept bad names.
    if model_name not in ['faster-rcnn', 'yolov3', 'hrnetv2']:
        raise ValueError(f'Unknown model name: {model_name!r}')

    package_path = pathlib.Path(__file__).parent.resolve()
    # Detector configs live under mmdet/, the landmark model under mmpose/.
    if model_name in ['faster-rcnn', 'yolov3']:
        config_dir = package_path / 'configs' / 'mmdet'
    else:
        config_dir = package_path / 'configs' / 'mmpose'
    return config_dir / f'{model_name}.py'
17
+
18
+
19
def get_checkpoint_path(model_name: str) -> pathlib.Path:
    """Return the local path of the pretrained checkpoint for *model_name*.

    The checkpoint is downloaded into torch hub's ``checkpoints`` cache
    directory on the first call and reused on subsequent calls.
    """
    assert model_name in ['faster-rcnn', 'yolov3', 'hrnetv2']
    # mmdet checkpoints for the two detectors, mmpose for the landmark model.
    if model_name in ['faster-rcnn', 'yolov3']:
        file_name = f'mmdet_anime-face_{model_name}.pth'
    else:
        file_name = f'mmpose_anime-face_{model_name}.pth'

    checkpoint_dir = pathlib.Path(torch.hub.get_dir()) / 'checkpoints'
    checkpoint_dir.mkdir(exist_ok=True, parents=True)
    checkpoint_path = checkpoint_dir / file_name
    if checkpoint_path.exists():
        return checkpoint_path

    # First use: fetch the weight file from the upstream release.
    url = f'https://github.com/hysts/anime-face-detector/releases/download/v0.0.1/{file_name}'
    torch.hub.download_url_to_file(url, checkpoint_path.as_posix())
    return checkpoint_path
34
+
35
+
36
def create_detector(
    face_detector_name: str = 'yolov3',
    landmark_model_name='hrnetv2',
    device: str = 'cuda:0',
    flip_test: bool = True,
    box_scale_factor: float = 1.1,
) -> LandmarkDetector:
    """Build a ready-to-use :class:`LandmarkDetector`.

    Resolves the bundled configs and (possibly downloading) checkpoints for
    the chosen face detector and landmark model, then wires them together.
    """
    assert face_detector_name in ['yolov3', 'faster-rcnn']
    assert landmark_model_name in ['hrnetv2']

    # Resolve configs first, then checkpoints (checkpoints may download).
    det_cfg = get_config_path(face_detector_name)
    lmk_cfg = get_config_path(landmark_model_name)
    det_ckpt = get_checkpoint_path(face_detector_name)
    lmk_ckpt = get_checkpoint_path(landmark_model_name)

    return LandmarkDetector(
        lmk_cfg,
        lmk_ckpt,
        det_cfg,
        det_ckpt,
        device=device,
        flip_test=flip_test,
        box_scale_factor=box_scale_factor,
    )
anime_face_detector/configs/mmdet/faster-rcnn.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# mmdet 3.x config for Faster R-CNN anime face detection
#
# Loaded via mmengine `Config.fromfile`; the dicts below are consumed by
# mmdet's `init_detector` / `inference_detector`.

model = dict(
    type='FasterRCNN',
    data_preprocessor=dict(
        type='DetDataPreprocessor',
        # ImageNet mean/std; input is converted BGR -> RGB before normalizing.
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        bgr_to_rgb=True,
        pad_size_divisor=32,
    ),
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch',
    ),
    neck=dict(
        type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5
    ),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64],
        ),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[0.0, 0.0, 0.0, 0.0],
            target_stds=[1.0, 1.0, 1.0, 1.0],
        ),
    ),
    roi_head=dict(
        type='StandardRoIHead',
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32],
        ),
        bbox_head=dict(
            type='Shared2FCBBoxHead',
            in_channels=256,
            fc_out_channels=1024,
            roi_feat_size=7,
            # Single foreground class: the anime face.
            num_classes=1,
            bbox_coder=dict(
                type='DeltaXYWHBBoxCoder',
                target_means=[0.0, 0.0, 0.0, 0.0],
                target_stds=[0.1, 0.1, 0.2, 0.2],
            ),
            reg_class_agnostic=False,
        ),
    ),
    # Inference-only settings (no train_cfg: this config is used for inference).
    test_cfg=dict(
        rpn=dict(
            nms_pre=1000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0,
        ),
        rcnn=dict(
            score_thr=0.05, nms=dict(type='nms', iou_threshold=0.5), max_per_img=100
        ),
    ),
)

# test pipeline for mmdet 3.x
test_pipeline = [
    dict(type='LoadImageFromFile', backend_args=None),
    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
    dict(type='Pad', size_divisor=32, pad_val=dict(img=(114, 114, 114))),
    dict(type='PackDetInputs', meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'scale_factor')),
]

# test dataloader (required for mmdet 3.x init_detector)
# Dataset fields are left empty: inference runs on in-memory images, the
# dataset entry only has to exist so the pipeline/metainfo can be resolved.
test_dataloader = dict(
    batch_size=1,
    num_workers=0,
    persistent_workers=False,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type='CocoDataset',
        data_root='',
        ann_file='',
        data_prefix=dict(img=''),
        test_mode=True,
        pipeline=test_pipeline,
    ),
)
anime_face_detector/configs/mmdet/yolov3.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# mmdet 3.x config for YOLOv3 anime face detection
#
# Loaded via mmengine `Config.fromfile`; consumed by mmdet's
# `init_detector` / `inference_detector`.

model = dict(
    type='YOLOV3',
    data_preprocessor=dict(
        type='DetDataPreprocessor',
        # Scale pixel values to [0, 1] (mean 0, std 255), BGR -> RGB.
        mean=[0.0, 0.0, 0.0],
        std=[255.0, 255.0, 255.0],
        bgr_to_rgb=True,
        pad_size_divisor=32,
    ),
    backbone=dict(type='Darknet', depth=53, out_indices=(3, 4, 5)),
    neck=dict(
        type='YOLOV3Neck',
        num_scales=3,
        in_channels=[1024, 512, 256],
        out_channels=[512, 256, 128],
    ),
    bbox_head=dict(
        type='YOLOV3Head',
        # Single foreground class: the anime face.
        num_classes=1,
        in_channels=[512, 256, 128],
        out_channels=[1024, 512, 256],
        anchor_generator=dict(
            type='YOLOAnchorGenerator',
            # One anchor group per scale, largest-stride level first.
            base_sizes=[
                [(116, 90), (156, 198), (373, 326)],
                [(30, 61), (62, 45), (59, 119)],
                [(10, 13), (16, 30), (33, 23)],
            ],
            strides=[32, 16, 8],
        ),
        bbox_coder=dict(type='YOLOBBoxCoder'),
        featmap_strides=[32, 16, 8],
    ),
    # Inference-only settings (no train_cfg: this config is used for inference).
    test_cfg=dict(
        nms_pre=1000,
        min_bbox_size=0,
        score_thr=0.05,
        conf_thr=0.005,
        nms=dict(type='nms', iou_threshold=0.45),
        max_per_img=100,
    ),
)

# test pipeline for mmdet 3.x
test_pipeline = [
    dict(type='LoadImageFromFile', backend_args=None),
    dict(type='Resize', scale=(608, 608), keep_ratio=True),
    dict(type='Pad', size=(608, 608), pad_val=dict(img=(114, 114, 114))),
    dict(type='PackDetInputs', meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'scale_factor')),
]

# test dataloader (required for mmdet 3.x init_detector)
# Dataset fields are left empty: inference runs on in-memory images, the
# dataset entry only has to exist so the pipeline/metainfo can be resolved.
test_dataloader = dict(
    batch_size=1,
    num_workers=0,
    persistent_workers=False,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type='CocoDataset',
        data_root='',
        ann_file='',
        data_prefix=dict(img=''),
        test_mode=True,
        pipeline=test_pipeline,
    ),
)
anime_face_detector/configs/mmpose/hrnetv2.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# mmpose 1.x config for HRNetV2 anime face landmark detection
#
# Loaded via mmengine `Config.fromfile`; consumed by mmpose's
# `init_model` / `inference_topdown`.

# codec configuration: encodes keypoints as MSRA-style Gaussian heatmaps
codec = dict(
    type='MSRAHeatmap',
    input_size=(256, 256),
    heatmap_size=(64, 64),
    sigma=2,
)

# model configuration
model = dict(
    type='TopdownPoseEstimator',
    data_preprocessor=dict(
        type='PoseDataPreprocessor',
        # ImageNet mean/std; input is converted BGR -> RGB before normalizing.
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        bgr_to_rgb=True,
    ),
    backbone=dict(
        type='HRNet',
        in_channels=3,
        extra=dict(
            stage1=dict(
                num_modules=1,
                num_branches=1,
                block='BOTTLENECK',
                num_blocks=(4,),
                num_channels=(64,),
            ),
            stage2=dict(
                num_modules=1,
                num_branches=2,
                block='BASIC',
                num_blocks=(4, 4),
                num_channels=(18, 36),
            ),
            stage3=dict(
                num_modules=4,
                num_branches=3,
                block='BASIC',
                num_blocks=(4, 4, 4),
                num_channels=(18, 36, 72),
            ),
            stage4=dict(
                num_modules=3,
                num_branches=4,
                block='BASIC',
                num_blocks=(4, 4, 4, 4),
                num_channels=(18, 36, 72, 144),
                multiscale_output=True, # Output all branches for concat
            ),
        ),
    ),
    # HRNetV2-style neck: concatenate the multi-resolution branch outputs.
    neck=dict(
        type='FeatureMapProcessor',
        concat=True,
    ),
    head=dict(
        type='HeatmapHead',
        in_channels=270, # 18+36+72+144 = 270 (concat of all HRNet outputs)
        # One heatmap per landmark; this model predicts 28 face keypoints.
        out_channels=28,
        deconv_out_channels=None,
        conv_out_channels=(270,),
        conv_kernel_sizes=(1,),
        loss=dict(type='KeypointMSELoss', use_target_weight=True),
        decoder=codec,
    ),
    test_cfg=dict(
        flip_test=False, # Disabled - requires proper dataset metainfo
    ),
)

# flip pairs for flip augmentation
# flip_indices[i] is the keypoint index that keypoint i maps to when the
# image is mirrored horizontally (matches the swap entries in keypoint_info).
flip_indices = [4, 3, 2, 1, 0, 10, 9, 8, 7, 6, 5, 19, 18, 17, 22, 21, 20, 13, 12, 11, 16, 15, 14, 23, 26, 25, 24, 27]

# test pipeline
test_pipeline = [
    dict(type='LoadImage'),
    dict(type='GetBBoxCenterScale'),
    dict(type='TopdownAffine', input_size=codec['input_size']),
    dict(type='PackPoseInputs'),
]

# test dataloader (required for inference_topdown)
# Dataset fields are left empty: inference runs on in-memory images, the
# dataset entry only has to exist so the pipeline can be resolved.
test_dataloader = dict(
    batch_size=1,
    num_workers=0,
    persistent_workers=False,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
    dataset=dict(
        type='CocoDataset',
        data_root='',
        data_mode='topdown',
        ann_file='',
        data_prefix=dict(img=''),
        test_mode=True,
        pipeline=test_pipeline,
    ),
)

# dataset meta information
# Read by LandmarkDetector._init_pose_model to build model.dataset_meta
# (28 anime-face keypoints; `swap` names mirror flip_indices above).
dataset_info = dict(
    dataset_name='anime_face',
    paper_info=dict(),
    keypoint_info={
        0: dict(name='kpt-0', id=0, color=[255, 255, 255], type='', swap='kpt-4'),
        1: dict(name='kpt-1', id=1, color=[255, 255, 255], type='', swap='kpt-3'),
        2: dict(name='kpt-2', id=2, color=[255, 255, 255], type='', swap=''),
        3: dict(name='kpt-3', id=3, color=[255, 255, 255], type='', swap='kpt-1'),
        4: dict(name='kpt-4', id=4, color=[255, 255, 255], type='', swap='kpt-0'),
        5: dict(name='kpt-5', id=5, color=[255, 255, 255], type='', swap='kpt-10'),
        6: dict(name='kpt-6', id=6, color=[255, 255, 255], type='', swap='kpt-9'),
        7: dict(name='kpt-7', id=7, color=[255, 255, 255], type='', swap='kpt-8'),
        8: dict(name='kpt-8', id=8, color=[255, 255, 255], type='', swap='kpt-7'),
        9: dict(name='kpt-9', id=9, color=[255, 255, 255], type='', swap='kpt-6'),
        10: dict(name='kpt-10', id=10, color=[255, 255, 255], type='', swap='kpt-5'),
        11: dict(name='kpt-11', id=11, color=[255, 255, 255], type='', swap='kpt-19'),
        12: dict(name='kpt-12', id=12, color=[255, 255, 255], type='', swap='kpt-18'),
        13: dict(name='kpt-13', id=13, color=[255, 255, 255], type='', swap='kpt-17'),
        14: dict(name='kpt-14', id=14, color=[255, 255, 255], type='', swap='kpt-22'),
        15: dict(name='kpt-15', id=15, color=[255, 255, 255], type='', swap='kpt-21'),
        16: dict(name='kpt-16', id=16, color=[255, 255, 255], type='', swap='kpt-20'),
        17: dict(name='kpt-17', id=17, color=[255, 255, 255], type='', swap='kpt-13'),
        18: dict(name='kpt-18', id=18, color=[255, 255, 255], type='', swap='kpt-12'),
        19: dict(name='kpt-19', id=19, color=[255, 255, 255], type='', swap='kpt-11'),
        20: dict(name='kpt-20', id=20, color=[255, 255, 255], type='', swap='kpt-16'),
        21: dict(name='kpt-21', id=21, color=[255, 255, 255], type='', swap='kpt-15'),
        22: dict(name='kpt-22', id=22, color=[255, 255, 255], type='', swap='kpt-14'),
        23: dict(name='kpt-23', id=23, color=[255, 255, 255], type='', swap=''),
        24: dict(name='kpt-24', id=24, color=[255, 255, 255], type='', swap='kpt-26'),
        25: dict(name='kpt-25', id=25, color=[255, 255, 255], type='', swap=''),
        26: dict(name='kpt-26', id=26, color=[255, 255, 255], type='', swap='kpt-24'),
        27: dict(name='kpt-27', id=27, color=[255, 255, 255], type='', swap=''),
    },
    skeleton_info={},
    joint_weights=[1.0] * 28,
    sigmas=[0.025] * 28,
    flip_indices=flip_indices,
)
anime_face_detector/detector.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import pathlib
4
+ import warnings
5
+
6
+ import cv2
7
+ import numpy as np
8
+ import torch.nn as nn
9
+ from mmdet.apis import inference_detector, init_detector
10
+ from mmengine.config import Config
11
+ from mmengine.registry import DefaultScope
12
+ from mmpose.apis import inference_topdown, init_model
13
+
14
+
15
class LandmarkDetector:
    """Anime face landmark detector built on mmdet 3.x + mmpose 1.x.

    Combines an optional mmdet face detector with an mmpose top-down
    landmark model. Call the instance with an image (or image path) to get
    per-face bounding boxes and 28-point landmarks.
    """

    def __init__(
        self,
        landmark_detector_config_or_path: Config | str | pathlib.Path,
        landmark_detector_checkpoint_path: str | pathlib.Path,
        face_detector_config_or_path: Config | str | pathlib.Path | None = None,
        face_detector_checkpoint_path: str | pathlib.Path | None = None,
        device: str = 'cuda:0',
        flip_test: bool = True,
        box_scale_factor: float = 1.1,
    ):
        """Load the landmark model and, if a config is given, the face detector.

        Args:
            landmark_detector_config_or_path: mmpose config (object or file path).
            landmark_detector_checkpoint_path: mmpose checkpoint file.
            face_detector_config_or_path: mmdet config; ``None`` disables the
                built-in face detector (callers must then pass boxes).
            face_detector_checkpoint_path: mmdet checkpoint file.
            device: torch device string for both models.
            flip_test: whether to enable flip-test in the pose model's test_cfg.
            box_scale_factor: factor by which detected boxes are enlarged
                before landmark estimation.
        """
        landmark_config = self._load_config(landmark_detector_config_or_path)
        face_detector_config = self._load_config(face_detector_config_or_path)

        self.landmark_detector = self._init_pose_model(
            landmark_config, landmark_detector_checkpoint_path, device, flip_test
        )
        self.face_detector = self._init_face_detector(
            face_detector_config, face_detector_checkpoint_path, device
        )

        self.box_scale_factor = box_scale_factor

    @staticmethod
    def _load_config(
        config_or_path: Config | str | pathlib.Path | None,
    ) -> Config | None:
        """Return the argument as an mmengine Config (or pass through None/Config)."""
        if config_or_path is None or isinstance(config_or_path, Config):
            return config_or_path
        return Config.fromfile(config_or_path)

    @staticmethod
    def _init_pose_model(
        config: Config,
        checkpoint_path: str | pathlib.Path,
        device: str,
        flip_test: bool,
    ) -> nn.Module:
        """Build the mmpose model and patch its cfg/metadata for inference.

        Besides calling ``init_model``, this propagates ``flip_test`` into the
        model's test_cfg, installs a 28-keypoint ``dataset_meta`` taken from the
        config's ``dataset_info``, and copies config keys that
        ``inference_topdown`` expects to find on ``model.cfg``.
        """
        if isinstance(checkpoint_path, pathlib.Path):
            checkpoint_path = checkpoint_path.as_posix()
        model = init_model(config, checkpoint_path, device=device)

        # Set flip_test in model's test_cfg
        if hasattr(model, 'test_cfg') and model.test_cfg is not None:
            model.test_cfg['flip_test'] = flip_test
        if hasattr(model.cfg, 'model') and hasattr(model.cfg.model, 'test_cfg'):
            model.cfg.model.test_cfg['flip_test'] = flip_test

        # Set dataset_meta with our custom keypoint info (28 keypoints for anime face)
        if hasattr(config, 'dataset_info'):
            dataset_meta = {
                'dataset_name': config.dataset_info.get('dataset_name', 'anime_face'),
                'num_keypoints': 28,
                'keypoint_info': config.dataset_info.get('keypoint_info', {}),
                'skeleton_info': config.dataset_info.get('skeleton_info', {}),
                'joint_weights': config.dataset_info.get('joint_weights', [1.0] * 28),
                'sigmas': config.dataset_info.get('sigmas', [0.025] * 28),
                'flip_indices': config.dataset_info.get(
                    'flip_indices', config.flip_indices if hasattr(config, 'flip_indices') else []
                ),
            }
            model.dataset_meta = dataset_meta

        # Copy all config attributes to model.cfg (required for inference_topdown)
        for key in ['test_dataloader', 'test_pipeline', 'codec', 'flip_indices']:
            if hasattr(config, key) and not hasattr(model.cfg, key):
                setattr(model.cfg, key, getattr(config, key))
        return model

    @staticmethod
    def _init_face_detector(
        config: Config | None, checkpoint_path: str | pathlib.Path | None, device: str
    ) -> nn.Module | None:
        """Build the mmdet face detector, or return None when no config is given."""
        if config is not None:
            if isinstance(checkpoint_path, pathlib.Path):
                checkpoint_path = checkpoint_path.as_posix()
            model = init_detector(config, checkpoint_path, device=device)
        else:
            model = None
        return model

    def _detect_faces(self, image: np.ndarray) -> list[np.ndarray]:
        """Run the face detector and return scaled [x0, y0, x1, y1, score] boxes."""
        # Set mmdet scope for face detection
        with DefaultScope.overwrite_default_scope('mmdet'):
            # mmdet 3.x returns DetDataSample
            result = inference_detector(self.face_detector, image)
        # Extract bboxes and scores from pred_instances
        pred_instances = result.pred_instances
        bboxes = pred_instances.bboxes.cpu().numpy()
        scores = pred_instances.scores.cpu().numpy()
        # Combine to [x0, y0, x1, y1, score] format
        boxes = []
        for bbox, score in zip(bboxes, scores):
            box = np.append(bbox, score)
            boxes.append(box)
        # scale boxes by `self.box_scale_factor`
        boxes = self._update_pred_box(boxes)
        return boxes

    def _update_pred_box(self, pred_boxes: np.ndarray) -> list[np.ndarray]:
        """Grow each box around its center by ``self.box_scale_factor``.

        Note: mutates the coordinate slice of each input box in place and
        returns the same arrays collected in a new list.
        """
        boxes = []
        for pred_box in pred_boxes:
            box = pred_box[:4]
            size = box[2:] - box[:2] + 1
            new_size = size * self.box_scale_factor
            center = (box[:2] + box[2:]) / 2
            tl = center - new_size / 2
            br = tl + new_size
            pred_box[:4] = np.concatenate([tl, br])
            boxes.append(pred_box)
        return boxes

    def _detect_landmarks(
        self, image: np.ndarray, boxes: list[np.ndarray]
    ) -> list[dict[str, np.ndarray]]:
        """Estimate landmarks for each box; return [{'bbox', 'keypoints'}, ...]."""
        # mmpose 1.x uses inference_topdown with different interface
        # Convert boxes to numpy array format expected by inference_topdown
        bboxes = np.array(boxes) if boxes else np.empty((0, 5))

        # Set mmpose scope for landmark detection
        with DefaultScope.overwrite_default_scope('mmpose'):
            # inference_topdown returns list of PoseDataSample
            # Pass only first 4 columns (x0, y0, x1, y1) - mmpose 1.x expects (N, 4) format
            results = inference_topdown(
                self.landmark_detector, image, bboxes[:, :4], bbox_format='xyxy'
            )

        # Convert PoseDataSample to dict format for backward compatibility
        preds = []
        for i, result in enumerate(results):
            pred_instances = result.pred_instances
            keypoints = pred_instances.keypoints[0]  # (K, 2)
            keypoint_scores = pred_instances.keypoint_scores[0]  # (K,)
            # Combine keypoints and scores to [x, y, score] format
            keypoints_with_scores = np.concatenate(
                [keypoints, keypoint_scores[:, np.newaxis]], axis=1
            )
            preds.append({'bbox': boxes[i], 'keypoints': keypoints_with_scores})
        return preds

    @staticmethod
    def _load_image(image_or_path: np.ndarray | str | pathlib.Path) -> np.ndarray:
        """Return the image array, reading from disk when a path is given.

        NOTE(review): cv2.imread returns None for unreadable paths; that case
        is not checked here — confirm callers pass valid paths.
        """
        if isinstance(image_or_path, np.ndarray):
            image = image_or_path
        elif isinstance(image_or_path, str):
            image = cv2.imread(image_or_path)
        elif isinstance(image_or_path, pathlib.Path):
            image = cv2.imread(image_or_path.as_posix())
        else:
            raise ValueError
        return image

    def __call__(
        self,
        image_or_path: np.ndarray | str | pathlib.Path,
        boxes: list[np.ndarray] | None = None,
    ) -> list[dict[str, np.ndarray]]:
        """Detect face landmarks.

        Args:
            image_or_path: An image with BGR channel order or an image path.
            boxes: A list of bounding boxes for faces. Each bounding box
                should be of the form [x0, y0, x1, y1, [score]].

        Returns: A list of detection results. Each detection result has
            bounding box of the form [x0, y0, x1, y1, [score]], and landmarks
            of the form [x, y, score].
        """
        image = self._load_image(image_or_path)
        if boxes is None:
            if self.face_detector is not None:
                boxes = self._detect_faces(image)
            else:
                # Best-effort fallback: without detector or boxes, treat the
                # whole image as one face with confidence 1.
                warnings.warn(
                    'Neither the face detector nor the bounding box is '
                    'specified. So the entire image is treated as the face '
                    'region.'
                )
                h, w = image.shape[:2]
                boxes = [np.array([0, 0, w - 1, h - 1, 1])]
        return self._detect_landmarks(image, boxes)
app.py CHANGED
@@ -1,19 +1,5 @@
1
  """Anime Face Detector - Hugging Face Space with Zero GPU support"""
2
  import os
3
- import subprocess
4
- import sys
5
-
6
- # Install mmcv, mmdet, mmpose using mim before importing
7
- def install_mmlab():
8
- subprocess.run([sys.executable, '-m', 'pip', 'install', 'openmim'], check=True)
9
- subprocess.run([sys.executable, '-m', 'mim', 'install', 'mmengine', 'mmcv', 'mmdet', 'mmpose'], check=True)
10
-
11
- # Check if mmcv is installed
12
- try:
13
- import mmcv
14
- except ImportError:
15
- print('Installing OpenMMLab dependencies...')
16
- install_mmlab()
17
 
18
  import spaces
19
  import gradio as gr
 
1
  """Anime Face Detector - Hugging Face Space with Zero GPU support"""
2
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  import spaces
5
  import gradio as gr
requirements.txt CHANGED
@@ -1,7 +1,10 @@
 
1
  torch
2
  torchvision
3
- openmim
4
- anime-face-detector
 
 
5
  opencv-python-headless
6
  gradio>=4.0.0
7
  spaces
 
1
+ --extra-index-url https://download.pytorch.org/whl/cu121
2
  torch
3
  torchvision
4
+ mmengine
5
+ mmcv>=2.0.0
6
+ mmdet>=3.0.0
7
+ mmpose>=1.0.0
8
  opencv-python-headless
9
  gradio>=4.0.0
10
  spaces