#!/usr/bin/env python3 """ 遍历 /opt/data/chinese_celeb_dataset 下的图片,使用 YOLO 人脸检测并删除没有检测到人脸的图片。 用法示例: python test/remove_faceless_images.py --dry-run """ from __future__ import annotations import argparse import sys from pathlib import Path from typing import Iterable, List, Optional import config try: from ultralytics import YOLO except ImportError as exc: # pragma: no cover - 运行期缺依赖提示 raise SystemExit("缺少 ultralytics,请先执行 pip install ultralytics") from exc # 默认数据集与模型配置 DEFAULT_DATASET_DIR = Path("/opt/data/chinese_celeb_dataset") MODEL_DIR = Path(config.MODELS_PATH) YOLO_MODEL_NAME = config.YOLO_MODEL def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description="使用 YOLO 检测 /opt/data/chinese_celeb_dataset 中的图片并删除无脸图片" ) parser.add_argument( "--dataset-dir", type=Path, default=DEFAULT_DATASET_DIR, help="需要检查的根目录(默认:/opt/data/chinese_celeb_dataset)", ) parser.add_argument( "--extensions", type=str, default=".jpg,.jpeg,.png,.webp,.bmp", help="需要检查的图片扩展名,逗号分隔", ) parser.add_argument( "--confidence", type=float, default=config.FACE_CONFIDENCE, help="YOLO 检测的人脸置信度阈值", ) parser.add_argument( "--dry-run", action="store_true", help="仅输出将被删除的文件,不真正删除,便于先预览结果", ) parser.add_argument( "--verbose", action="store_true", help="输出更多调试信息", ) return parser.parse_args() def load_yolo_model() -> YOLO: """ 优先加载本地 models 目录下配置好的模型,如果不存在则回退为模型名称(会触发自动下载)。 """ candidates: List[str] = [] local_path = MODEL_DIR / YOLO_MODEL_NAME if local_path.exists(): candidates.append(str(local_path)) candidates.append(YOLO_MODEL_NAME) last_error: Optional[Exception] = None for candidate in candidates: try: config.logger.info("尝试加载 YOLO 模型:%s", candidate) return YOLO(candidate) except Exception as exc: # pragma: no cover last_error = exc config.logger.warning("加载 YOLO 模型失败:%s -> %s", candidate, exc) raise RuntimeError(f"无法加载 YOLO 模型:{YOLO_MODEL_NAME}") from last_error def iter_image_files(root: Path, extensions: Iterable[str]) -> Iterable[Path]: lower_exts = tuple(ext.strip().lower() for ext in extensions if ext.strip()) for path in root.rglob("*"): if not path.is_file(): continue if path.suffix.lower() in lower_exts: yield path def has_face(model: YOLO, image_path: Path, confidence: float, verbose: bool = False) -> bool: """ 使用 YOLO 检测图片中是否存在人脸。检测到任意一个框即可视为有人脸。 """ try: results = model(image_path, conf=confidence, verbose=False) except Exception as exc: # pragma: no cover config.logger.error("检测失败,跳过 %s:%s", image_path, exc) return False for result in results: boxes = getattr(result, "boxes", None) if boxes is None: continue if len(boxes) > 0: if verbose: faces = [] for box in boxes: cls_id = int(box.cls[0]) if getattr(box, "cls", None) is not None else -1 score = float(box.conf[0]) if getattr(box, "conf", None) is not None else 0.0 faces.append({"cls": cls_id, "conf": score}) config.logger.info("检测到人脸:%s -> %s", image_path, faces) return True return False def main() -> None: args = parse_args() dataset_dir: Path = args.dataset_dir.expanduser().resolve() if not dataset_dir.exists(): raise SystemExit(f"目录不存在:{dataset_dir}") model = load_yolo_model() image_paths = list(iter_image_files(dataset_dir, args.extensions.split(","))) total = len(image_paths) if total == 0: print(f"目录 {dataset_dir} 下没有匹配到图片文件") return removed = 0 errored = 0 for idx, image_path in enumerate(image_paths, start=1): if idx % 100 == 0 or args.verbose: print(f"[{idx}/{total}] 正在处理 {image_path}") try: if has_face(model, image_path, args.confidence, args.verbose): continue except Exception as exc: # pragma: no cover errored += 1 config.logger.error("检测过程中发生异常,跳过 %s:%s", image_path, exc) continue if args.dry_run: print(f"[DRY-RUN] 将删除:{image_path}") else: try: image_path.unlink() print(f"已删除:{image_path}") except Exception as exc: # pragma: no cover errored += 1 config.logger.error("删除失败 %s:%s", image_path, exc) continue removed += 1 print( f"扫描完成,检测图片 {total} 张,删除 {removed} 张无脸图片,异常 {errored} 张,数据保存在:{dataset_dir}" ) if __name__ == "__main__": try: main() except KeyboardInterrupt: # pragma: no cover sys.exit("用户中断")