Spaces:
Build error
Build error
Fix build: use local anime_face_detector module with OpenMMLab 2.x
Browse files- anime_face_detector/__init__.py +58 -0
- anime_face_detector/configs/mmdet/faster-rcnn.py +99 -0
- anime_face_detector/configs/mmdet/yolov3.py +69 -0
- anime_face_detector/configs/mmpose/hrnetv2.py +141 -0
- anime_face_detector/detector.py +195 -0
- app.py +0 -14
- requirements.txt +5 -2
anime_face_detector/__init__.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pathlib
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
|
| 5 |
+
from .detector import LandmarkDetector
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def get_config_path(model_name: str) -> pathlib.Path:
    """Return the bundled mmdet/mmpose config file path for *model_name*."""
    assert model_name in ['faster-rcnn', 'yolov3', 'hrnetv2']

    root = pathlib.Path(__file__).parent.resolve()
    # Detection configs live under configs/mmdet, the landmark config
    # under configs/mmpose.
    subdir = 'mmdet' if model_name in ['faster-rcnn', 'yolov3'] else 'mmpose'
    return root / 'configs' / subdir / f'{model_name}.py'
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def get_checkpoint_path(model_name: str) -> pathlib.Path:
    """Return the local checkpoint path for *model_name*, downloading it on first use.

    Weights are cached under ``torch.hub.get_dir()/checkpoints``; later
    calls reuse the cached file.
    """
    assert model_name in ['faster-rcnn', 'yolov3', 'hrnetv2']
    prefix = 'mmdet' if model_name in ['faster-rcnn', 'yolov3'] else 'mmpose'
    file_name = f'{prefix}_anime-face_{model_name}.pth'

    checkpoint_dir = pathlib.Path(torch.hub.get_dir()) / 'checkpoints'
    checkpoint_dir.mkdir(exist_ok=True, parents=True)
    checkpoint_path = checkpoint_dir / file_name
    if not checkpoint_path.exists():
        # Fetch the released weights once from the upstream GitHub release.
        url = f'https://github.com/hysts/anime-face-detector/releases/download/v0.0.1/{file_name}'
        torch.hub.download_url_to_file(url, checkpoint_path.as_posix())

    return checkpoint_path
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def create_detector(
    face_detector_name: str = 'yolov3',
    landmark_model_name='hrnetv2',
    device: str = 'cuda:0',
    flip_test: bool = True,
    box_scale_factor: float = 1.1,
) -> LandmarkDetector:
    """Build a :class:`LandmarkDetector` from bundled configs and released weights.

    Args:
        face_detector_name: 'yolov3' or 'faster-rcnn'.
        landmark_model_name: only 'hrnetv2' is supported.
        device: torch device string for both models.
        flip_test: enable flip test-time augmentation for landmarks.
        box_scale_factor: factor by which detected boxes are enlarged.
    """
    assert face_detector_name in ['yolov3', 'faster-rcnn']
    assert landmark_model_name in ['hrnetv2']
    face_config = get_config_path(face_detector_name)
    landmark_config = get_config_path(landmark_model_name)
    # Checkpoints are downloaded on first use (face detector first).
    face_checkpoint = get_checkpoint_path(face_detector_name)
    landmark_checkpoint = get_checkpoint_path(landmark_model_name)
    return LandmarkDetector(
        landmark_config,
        landmark_checkpoint,
        face_config,
        face_checkpoint,
        device=device,
        flip_test=flip_test,
        box_scale_factor=box_scale_factor,
    )
|
anime_face_detector/configs/mmdet/faster-rcnn.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Faster R-CNN config for single-class (anime face) detection,
# written for the OpenMMLab 2.x / mmdet 3.x config layout.

model = dict(
    type='FasterRCNN',
    data_preprocessor=dict(
        type='DetDataPreprocessor',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        bgr_to_rgb=True,
        pad_size_divisor=32),
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch'),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        num_outs=5),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[0.0, 0.0, 0.0, 0.0],
            target_stds=[1.0, 1.0, 1.0, 1.0])),
    roi_head=dict(
        type='StandardRoIHead',
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        bbox_head=dict(
            type='Shared2FCBBoxHead',
            in_channels=256,
            fc_out_channels=1024,
            roi_feat_size=7,
            num_classes=1,  # single 'face' class
            bbox_coder=dict(
                type='DeltaXYWHBBoxCoder',
                target_means=[0.0, 0.0, 0.0, 0.0],
                target_stds=[0.1, 0.1, 0.2, 0.2]),
            reg_class_agnostic=False)),
    test_cfg=dict(
        rpn=dict(
            nms_pre=1000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.5),
            max_per_img=100)))

# Inference pipeline: aspect-ratio-preserving resize, pad to a multiple of
# 32, then pack the meta keys mmdet 3.x expects.
test_pipeline = [
    dict(type='LoadImageFromFile', backend_args=None),
    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
    dict(type='Pad', size_divisor=32, pad_val=dict(img=(114, 114, 114))),
    dict(
        type='PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor')),
]

# Placeholder dataloader: mmdet 3.x ``init_detector`` requires one even
# when the model is used purely for inference.
test_dataloader = dict(
    batch_size=1,
    num_workers=0,
    persistent_workers=False,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type='CocoDataset',
        data_root='',
        ann_file='',
        data_prefix=dict(img=''),
        test_mode=True,
        pipeline=test_pipeline))
|
anime_face_detector/configs/mmdet/yolov3.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# YOLOv3 config for single-class (anime face) detection,
# written for the OpenMMLab 2.x / mmdet 3.x config layout.

model = dict(
    type='YOLOV3',
    data_preprocessor=dict(
        type='DetDataPreprocessor',
        mean=[0.0, 0.0, 0.0],
        std=[255.0, 255.0, 255.0],  # inputs normalized to [0, 1]
        bgr_to_rgb=True,
        pad_size_divisor=32),
    backbone=dict(type='Darknet', depth=53, out_indices=(3, 4, 5)),
    neck=dict(
        type='YOLOV3Neck',
        num_scales=3,
        in_channels=[1024, 512, 256],
        out_channels=[512, 256, 128]),
    bbox_head=dict(
        type='YOLOV3Head',
        num_classes=1,  # single 'face' class
        in_channels=[512, 256, 128],
        out_channels=[1024, 512, 256],
        anchor_generator=dict(
            type='YOLOAnchorGenerator',
            # Anchors listed coarse-to-fine to match strides [32, 16, 8].
            base_sizes=[
                [(116, 90), (156, 198), (373, 326)],
                [(30, 61), (62, 45), (59, 119)],
                [(10, 13), (16, 30), (33, 23)],
            ],
            strides=[32, 16, 8]),
        bbox_coder=dict(type='YOLOBBoxCoder'),
        featmap_strides=[32, 16, 8]),
    test_cfg=dict(
        nms_pre=1000,
        min_bbox_size=0,
        score_thr=0.05,
        conf_thr=0.005,
        nms=dict(type='nms', iou_threshold=0.45),
        max_per_img=100))

# Inference pipeline: resize to the 608x608 YOLO input, pad, pack meta keys.
test_pipeline = [
    dict(type='LoadImageFromFile', backend_args=None),
    dict(type='Resize', scale=(608, 608), keep_ratio=True),
    dict(type='Pad', size=(608, 608), pad_val=dict(img=(114, 114, 114))),
    dict(
        type='PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor')),
]

# Placeholder dataloader: mmdet 3.x ``init_detector`` requires one even
# when the model is used purely for inference.
test_dataloader = dict(
    batch_size=1,
    num_workers=0,
    persistent_workers=False,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type='CocoDataset',
        data_root='',
        ann_file='',
        data_prefix=dict(img=''),
        test_mode=True,
        pipeline=test_pipeline))
|
anime_face_detector/configs/mmpose/hrnetv2.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# HRNetV2 config for anime face landmark detection (28 keypoints),
# written for the OpenMMLab 2.x / mmpose 1.x config layout.

# Heatmap codec: 256x256 input decoded from 64x64 Gaussian heatmaps.
codec = dict(
    type='MSRAHeatmap',
    input_size=(256, 256),
    heatmap_size=(64, 64),
    sigma=2,
)

model = dict(
    type='TopdownPoseEstimator',
    data_preprocessor=dict(
        type='PoseDataPreprocessor',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        bgr_to_rgb=True,
    ),
    backbone=dict(
        type='HRNet',
        in_channels=3,
        extra=dict(
            stage1=dict(
                num_modules=1,
                num_branches=1,
                block='BOTTLENECK',
                num_blocks=(4,),
                num_channels=(64,),
            ),
            stage2=dict(
                num_modules=1,
                num_branches=2,
                block='BASIC',
                num_blocks=(4, 4),
                num_channels=(18, 36),
            ),
            stage3=dict(
                num_modules=4,
                num_branches=3,
                block='BASIC',
                num_blocks=(4, 4, 4),
                num_channels=(18, 36, 72),
            ),
            stage4=dict(
                num_modules=3,
                num_branches=4,
                block='BASIC',
                num_blocks=(4, 4, 4, 4),
                num_channels=(18, 36, 72, 144),
                # Emit every branch so the neck can concatenate them.
                multiscale_output=True,
            ),
        ),
    ),
    neck=dict(
        type='FeatureMapProcessor',
        concat=True,
    ),
    head=dict(
        type='HeatmapHead',
        in_channels=270,  # 18 + 36 + 72 + 144: concat of all HRNet branches
        out_channels=28,
        deconv_out_channels=None,
        conv_out_channels=(270,),
        conv_kernel_sizes=(1,),
        loss=dict(type='KeypointMSELoss', use_target_weight=True),
        decoder=codec,
    ),
    test_cfg=dict(
        flip_test=False,  # disabled: needs proper dataset metainfo
    ),
)

# Left/right mirror mapping used for flip augmentation.
flip_indices = [4, 3, 2, 1, 0, 10, 9, 8, 7, 6, 5, 19, 18, 17, 22, 21, 20, 13, 12, 11, 16, 15, 14, 23, 26, 25, 24, 27]

# Top-down inference pipeline.
test_pipeline = [
    dict(type='LoadImage'),
    dict(type='GetBBoxCenterScale'),
    dict(type='TopdownAffine', input_size=codec['input_size']),
    dict(type='PackPoseInputs'),
]

# Placeholder dataloader: ``inference_topdown`` requires one even for
# pure inference.
test_dataloader = dict(
    batch_size=1,
    num_workers=0,
    persistent_workers=False,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
    dataset=dict(
        type='CocoDataset',
        data_root='',
        data_mode='topdown',
        ann_file='',
        data_prefix=dict(img=''),
        test_mode=True,
        pipeline=test_pipeline,
    ),
)

# Dataset metainfo for the 28-keypoint anime face layout; ``swap`` entries
# mirror the ``flip_indices`` table above.
dataset_info = dict(
    dataset_name='anime_face',
    paper_info=dict(),
    keypoint_info={
        0: dict(name='kpt-0', id=0, color=[255, 255, 255], type='', swap='kpt-4'),
        1: dict(name='kpt-1', id=1, color=[255, 255, 255], type='', swap='kpt-3'),
        2: dict(name='kpt-2', id=2, color=[255, 255, 255], type='', swap=''),
        3: dict(name='kpt-3', id=3, color=[255, 255, 255], type='', swap='kpt-1'),
        4: dict(name='kpt-4', id=4, color=[255, 255, 255], type='', swap='kpt-0'),
        5: dict(name='kpt-5', id=5, color=[255, 255, 255], type='', swap='kpt-10'),
        6: dict(name='kpt-6', id=6, color=[255, 255, 255], type='', swap='kpt-9'),
        7: dict(name='kpt-7', id=7, color=[255, 255, 255], type='', swap='kpt-8'),
        8: dict(name='kpt-8', id=8, color=[255, 255, 255], type='', swap='kpt-7'),
        9: dict(name='kpt-9', id=9, color=[255, 255, 255], type='', swap='kpt-6'),
        10: dict(name='kpt-10', id=10, color=[255, 255, 255], type='', swap='kpt-5'),
        11: dict(name='kpt-11', id=11, color=[255, 255, 255], type='', swap='kpt-19'),
        12: dict(name='kpt-12', id=12, color=[255, 255, 255], type='', swap='kpt-18'),
        13: dict(name='kpt-13', id=13, color=[255, 255, 255], type='', swap='kpt-17'),
        14: dict(name='kpt-14', id=14, color=[255, 255, 255], type='', swap='kpt-22'),
        15: dict(name='kpt-15', id=15, color=[255, 255, 255], type='', swap='kpt-21'),
        16: dict(name='kpt-16', id=16, color=[255, 255, 255], type='', swap='kpt-20'),
        17: dict(name='kpt-17', id=17, color=[255, 255, 255], type='', swap='kpt-13'),
        18: dict(name='kpt-18', id=18, color=[255, 255, 255], type='', swap='kpt-12'),
        19: dict(name='kpt-19', id=19, color=[255, 255, 255], type='', swap='kpt-11'),
        20: dict(name='kpt-20', id=20, color=[255, 255, 255], type='', swap='kpt-16'),
        21: dict(name='kpt-21', id=21, color=[255, 255, 255], type='', swap='kpt-15'),
        22: dict(name='kpt-22', id=22, color=[255, 255, 255], type='', swap='kpt-14'),
        23: dict(name='kpt-23', id=23, color=[255, 255, 255], type='', swap=''),
        24: dict(name='kpt-24', id=24, color=[255, 255, 255], type='', swap='kpt-26'),
        25: dict(name='kpt-25', id=25, color=[255, 255, 255], type='', swap=''),
        26: dict(name='kpt-26', id=26, color=[255, 255, 255], type='', swap='kpt-24'),
        27: dict(name='kpt-27', id=27, color=[255, 255, 255], type='', swap=''),
    },
    skeleton_info={},
    joint_weights=[1.0] * 28,
    sigmas=[0.025] * 28,
    flip_indices=flip_indices,
)
|
anime_face_detector/detector.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import pathlib
|
| 4 |
+
import warnings
|
| 5 |
+
|
| 6 |
+
import cv2
|
| 7 |
+
import numpy as np
|
| 8 |
+
import torch.nn as nn
|
| 9 |
+
from mmdet.apis import inference_detector, init_detector
|
| 10 |
+
from mmengine.config import Config
|
| 11 |
+
from mmengine.registry import DefaultScope
|
| 12 |
+
from mmpose.apis import inference_topdown, init_model
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class LandmarkDetector:
|
| 16 |
+
def __init__(
|
| 17 |
+
self,
|
| 18 |
+
landmark_detector_config_or_path: Config | str | pathlib.Path,
|
| 19 |
+
landmark_detector_checkpoint_path: str | pathlib.Path,
|
| 20 |
+
face_detector_config_or_path: Config | str | pathlib.Path | None = None,
|
| 21 |
+
face_detector_checkpoint_path: str | pathlib.Path | None = None,
|
| 22 |
+
device: str = 'cuda:0',
|
| 23 |
+
flip_test: bool = True,
|
| 24 |
+
box_scale_factor: float = 1.1,
|
| 25 |
+
):
|
| 26 |
+
landmark_config = self._load_config(landmark_detector_config_or_path)
|
| 27 |
+
face_detector_config = self._load_config(face_detector_config_or_path)
|
| 28 |
+
|
| 29 |
+
self.landmark_detector = self._init_pose_model(
|
| 30 |
+
landmark_config, landmark_detector_checkpoint_path, device, flip_test
|
| 31 |
+
)
|
| 32 |
+
self.face_detector = self._init_face_detector(
|
| 33 |
+
face_detector_config, face_detector_checkpoint_path, device
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
self.box_scale_factor = box_scale_factor
|
| 37 |
+
|
| 38 |
+
@staticmethod
|
| 39 |
+
def _load_config(
|
| 40 |
+
config_or_path: Config | str | pathlib.Path | None,
|
| 41 |
+
) -> Config | None:
|
| 42 |
+
if config_or_path is None or isinstance(config_or_path, Config):
|
| 43 |
+
return config_or_path
|
| 44 |
+
return Config.fromfile(config_or_path)
|
| 45 |
+
|
| 46 |
+
@staticmethod
|
| 47 |
+
def _init_pose_model(
|
| 48 |
+
config: Config,
|
| 49 |
+
checkpoint_path: str | pathlib.Path,
|
| 50 |
+
device: str,
|
| 51 |
+
flip_test: bool,
|
| 52 |
+
) -> nn.Module:
|
| 53 |
+
if isinstance(checkpoint_path, pathlib.Path):
|
| 54 |
+
checkpoint_path = checkpoint_path.as_posix()
|
| 55 |
+
model = init_model(config, checkpoint_path, device=device)
|
| 56 |
+
|
| 57 |
+
# Set flip_test in model's test_cfg
|
| 58 |
+
if hasattr(model, 'test_cfg') and model.test_cfg is not None:
|
| 59 |
+
model.test_cfg['flip_test'] = flip_test
|
| 60 |
+
if hasattr(model.cfg, 'model') and hasattr(model.cfg.model, 'test_cfg'):
|
| 61 |
+
model.cfg.model.test_cfg['flip_test'] = flip_test
|
| 62 |
+
|
| 63 |
+
# Set dataset_meta with our custom keypoint info (28 keypoints for anime face)
|
| 64 |
+
if hasattr(config, 'dataset_info'):
|
| 65 |
+
dataset_meta = {
|
| 66 |
+
'dataset_name': config.dataset_info.get('dataset_name', 'anime_face'),
|
| 67 |
+
'num_keypoints': 28,
|
| 68 |
+
'keypoint_info': config.dataset_info.get('keypoint_info', {}),
|
| 69 |
+
'skeleton_info': config.dataset_info.get('skeleton_info', {}),
|
| 70 |
+
'joint_weights': config.dataset_info.get('joint_weights', [1.0] * 28),
|
| 71 |
+
'sigmas': config.dataset_info.get('sigmas', [0.025] * 28),
|
| 72 |
+
'flip_indices': config.dataset_info.get(
|
| 73 |
+
'flip_indices', config.flip_indices if hasattr(config, 'flip_indices') else []
|
| 74 |
+
),
|
| 75 |
+
}
|
| 76 |
+
model.dataset_meta = dataset_meta
|
| 77 |
+
|
| 78 |
+
# Copy all config attributes to model.cfg (required for inference_topdown)
|
| 79 |
+
for key in ['test_dataloader', 'test_pipeline', 'codec', 'flip_indices']:
|
| 80 |
+
if hasattr(config, key) and not hasattr(model.cfg, key):
|
| 81 |
+
setattr(model.cfg, key, getattr(config, key))
|
| 82 |
+
return model
|
| 83 |
+
|
| 84 |
+
@staticmethod
|
| 85 |
+
def _init_face_detector(
|
| 86 |
+
config: Config | None, checkpoint_path: str | pathlib.Path | None, device: str
|
| 87 |
+
) -> nn.Module | None:
|
| 88 |
+
if config is not None:
|
| 89 |
+
if isinstance(checkpoint_path, pathlib.Path):
|
| 90 |
+
checkpoint_path = checkpoint_path.as_posix()
|
| 91 |
+
model = init_detector(config, checkpoint_path, device=device)
|
| 92 |
+
else:
|
| 93 |
+
model = None
|
| 94 |
+
return model
|
| 95 |
+
|
| 96 |
+
def _detect_faces(self, image: np.ndarray) -> list[np.ndarray]:
|
| 97 |
+
# Set mmdet scope for face detection
|
| 98 |
+
with DefaultScope.overwrite_default_scope('mmdet'):
|
| 99 |
+
# mmdet 3.x returns DetDataSample
|
| 100 |
+
result = inference_detector(self.face_detector, image)
|
| 101 |
+
# Extract bboxes and scores from pred_instances
|
| 102 |
+
pred_instances = result.pred_instances
|
| 103 |
+
bboxes = pred_instances.bboxes.cpu().numpy()
|
| 104 |
+
scores = pred_instances.scores.cpu().numpy()
|
| 105 |
+
# Combine to [x0, y0, x1, y1, score] format
|
| 106 |
+
boxes = []
|
| 107 |
+
for bbox, score in zip(bboxes, scores):
|
| 108 |
+
box = np.append(bbox, score)
|
| 109 |
+
boxes.append(box)
|
| 110 |
+
# scale boxes by `self.box_scale_factor`
|
| 111 |
+
boxes = self._update_pred_box(boxes)
|
| 112 |
+
return boxes
|
| 113 |
+
|
| 114 |
+
def _update_pred_box(self, pred_boxes: np.ndarray) -> list[np.ndarray]:
|
| 115 |
+
boxes = []
|
| 116 |
+
for pred_box in pred_boxes:
|
| 117 |
+
box = pred_box[:4]
|
| 118 |
+
size = box[2:] - box[:2] + 1
|
| 119 |
+
new_size = size * self.box_scale_factor
|
| 120 |
+
center = (box[:2] + box[2:]) / 2
|
| 121 |
+
tl = center - new_size / 2
|
| 122 |
+
br = tl + new_size
|
| 123 |
+
pred_box[:4] = np.concatenate([tl, br])
|
| 124 |
+
boxes.append(pred_box)
|
| 125 |
+
return boxes
|
| 126 |
+
|
| 127 |
+
def _detect_landmarks(
|
| 128 |
+
self, image: np.ndarray, boxes: list[np.ndarray]
|
| 129 |
+
) -> list[dict[str, np.ndarray]]:
|
| 130 |
+
# mmpose 1.x uses inference_topdown with different interface
|
| 131 |
+
# Convert boxes to numpy array format expected by inference_topdown
|
| 132 |
+
bboxes = np.array(boxes) if boxes else np.empty((0, 5))
|
| 133 |
+
|
| 134 |
+
# Set mmpose scope for landmark detection
|
| 135 |
+
with DefaultScope.overwrite_default_scope('mmpose'):
|
| 136 |
+
# inference_topdown returns list of PoseDataSample
|
| 137 |
+
# Pass only first 4 columns (x0, y0, x1, y1) - mmpose 1.x expects (N, 4) format
|
| 138 |
+
results = inference_topdown(
|
| 139 |
+
self.landmark_detector, image, bboxes[:, :4], bbox_format='xyxy'
|
| 140 |
+
)
|
| 141 |
+
|
| 142 |
+
# Convert PoseDataSample to dict format for backward compatibility
|
| 143 |
+
preds = []
|
| 144 |
+
for i, result in enumerate(results):
|
| 145 |
+
pred_instances = result.pred_instances
|
| 146 |
+
keypoints = pred_instances.keypoints[0] # (K, 2)
|
| 147 |
+
keypoint_scores = pred_instances.keypoint_scores[0] # (K,)
|
| 148 |
+
# Combine keypoints and scores to [x, y, score] format
|
| 149 |
+
keypoints_with_scores = np.concatenate(
|
| 150 |
+
[keypoints, keypoint_scores[:, np.newaxis]], axis=1
|
| 151 |
+
)
|
| 152 |
+
preds.append({'bbox': boxes[i], 'keypoints': keypoints_with_scores})
|
| 153 |
+
return preds
|
| 154 |
+
|
| 155 |
+
@staticmethod
|
| 156 |
+
def _load_image(image_or_path: np.ndarray | str | pathlib.Path) -> np.ndarray:
|
| 157 |
+
if isinstance(image_or_path, np.ndarray):
|
| 158 |
+
image = image_or_path
|
| 159 |
+
elif isinstance(image_or_path, str):
|
| 160 |
+
image = cv2.imread(image_or_path)
|
| 161 |
+
elif isinstance(image_or_path, pathlib.Path):
|
| 162 |
+
image = cv2.imread(image_or_path.as_posix())
|
| 163 |
+
else:
|
| 164 |
+
raise ValueError
|
| 165 |
+
return image
|
| 166 |
+
|
| 167 |
+
def __call__(
|
| 168 |
+
self,
|
| 169 |
+
image_or_path: np.ndarray | str | pathlib.Path,
|
| 170 |
+
boxes: list[np.ndarray] | None = None,
|
| 171 |
+
) -> list[dict[str, np.ndarray]]:
|
| 172 |
+
"""Detect face landmarks.
|
| 173 |
+
|
| 174 |
+
Args:
|
| 175 |
+
image_or_path: An image with BGR channel order or an image path.
|
| 176 |
+
boxes: A list of bounding boxes for faces. Each bounding box
|
| 177 |
+
should be of the form [x0, y0, x1, y1, [score]].
|
| 178 |
+
|
| 179 |
+
Returns: A list of detection results. Each detection result has
|
| 180 |
+
bounding box of the form [x0, y0, x1, y1, [score]], and landmarks
|
| 181 |
+
of the form [x, y, score].
|
| 182 |
+
"""
|
| 183 |
+
image = self._load_image(image_or_path)
|
| 184 |
+
if boxes is None:
|
| 185 |
+
if self.face_detector is not None:
|
| 186 |
+
boxes = self._detect_faces(image)
|
| 187 |
+
else:
|
| 188 |
+
warnings.warn(
|
| 189 |
+
'Neither the face detector nor the bounding box is '
|
| 190 |
+
'specified. So the entire image is treated as the face '
|
| 191 |
+
'region.'
|
| 192 |
+
)
|
| 193 |
+
h, w = image.shape[:2]
|
| 194 |
+
boxes = [np.array([0, 0, w - 1, h - 1, 1])]
|
| 195 |
+
return self._detect_landmarks(image, boxes)
|
app.py
CHANGED
|
@@ -1,19 +1,5 @@
|
|
| 1 |
"""Anime Face Detector - Hugging Face Space with Zero GPU support"""
|
| 2 |
import os
|
| 3 |
-
import subprocess
|
| 4 |
-
import sys
|
| 5 |
-
|
| 6 |
-
# Install mmcv, mmdet, mmpose using mim before importing
|
| 7 |
-
def install_mmlab():
|
| 8 |
-
subprocess.run([sys.executable, '-m', 'pip', 'install', 'openmim'], check=True)
|
| 9 |
-
subprocess.run([sys.executable, '-m', 'mim', 'install', 'mmengine', 'mmcv', 'mmdet', 'mmpose'], check=True)
|
| 10 |
-
|
| 11 |
-
# Check if mmcv is installed
|
| 12 |
-
try:
|
| 13 |
-
import mmcv
|
| 14 |
-
except ImportError:
|
| 15 |
-
print('Installing OpenMMLab dependencies...')
|
| 16 |
-
install_mmlab()
|
| 17 |
|
| 18 |
import spaces
|
| 19 |
import gradio as gr
|
|
|
|
| 1 |
"""Anime Face Detector - Hugging Face Space with Zero GPU support"""
|
| 2 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
import spaces
|
| 5 |
import gradio as gr
|
requirements.txt
CHANGED
|
@@ -1,7 +1,10 @@
|
|
|
|
|
| 1 |
torch
|
| 2 |
torchvision
|
| 3 |
-
|
| 4 |
-
|
|
|
|
|
|
|
| 5 |
opencv-python-headless
|
| 6 |
gradio>=4.0.0
|
| 7 |
spaces
|
|
|
|
| 1 |
+
--extra-index-url https://download.pytorch.org/whl/cu121
|
| 2 |
torch
|
| 3 |
torchvision
|
| 4 |
+
mmengine
|
| 5 |
+
mmcv>=2.0.0
|
| 6 |
+
mmdet>=3.0.0
|
| 7 |
+
mmpose>=1.0.0
|
| 8 |
opencv-python-headless
|
| 9 |
gradio>=4.0.0
|
| 10 |
spaces
|