chenchaoyun committed on
Commit
586a20d
·
1 Parent(s): 70d9a8b
Files changed (1) hide show
  1. test/remove_faceless_images.py +169 -0
test/remove_faceless_images.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ 遍历 /opt/data/chinese_celeb_dataset 下的图片,使用 YOLO 人脸检测并删除没有检测到人脸的图片。
4
+
5
+ 用法示例:
6
+ python test/remove_faceless_images.py --dry-run
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ import sys
13
+ from pathlib import Path
14
+ from typing import Iterable, List, Optional
15
+
16
+ import config
17
+
18
+ try:
19
+ from ultralytics import YOLO
20
+ except ImportError as exc: # pragma: no cover - 运行期缺依赖提示
21
+ raise SystemExit("缺少 ultralytics,请先执行 pip install ultralytics") from exc
22
+
23
# Default dataset location, plus the model directory and model file name
# taken from the shared project config.
DEFAULT_DATASET_DIR = Path("/opt/data/chinese_celeb_dataset")
MODEL_DIR = Path(config.MODELS_PATH)
YOLO_MODEL_NAME = config.YOLO_MODEL
27
+
28
+
29
def parse_args() -> argparse.Namespace:
    """Define the command-line options and parse the process arguments.

    Options:
        --dataset-dir: root directory to scan (default ``DEFAULT_DATASET_DIR``).
        --extensions: comma-separated list of image suffixes to check.
        --confidence: detection confidence threshold
            (default ``config.FACE_CONFIDENCE``).
        --dry-run: flag; only report the files that would be deleted.
        --verbose: flag; print additional debugging output.

    Returns:
        The parsed options as an ``argparse.Namespace``.
    """
    cli = argparse.ArgumentParser(
        description="使用 YOLO 检测 /opt/data/chinese_celeb_dataset 中的图片并删除无脸图片",
    )

    # (flag, add_argument keyword arguments), registered in this order so the
    # generated --help output keeps the same layout.
    option_table = (
        (
            "--dataset-dir",
            {
                "type": Path,
                "default": DEFAULT_DATASET_DIR,
                "help": "需要检查的根目录(默认:/opt/data/chinese_celeb_dataset)",
            },
        ),
        (
            "--extensions",
            {
                "type": str,
                "default": ".jpg,.jpeg,.png,.webp,.bmp",
                "help": "需要检查的图片扩展名,逗号分隔",
            },
        ),
        (
            "--confidence",
            {
                "type": float,
                "default": config.FACE_CONFIDENCE,
                "help": "YOLO 检测的人脸置信度阈值",
            },
        ),
        (
            "--dry-run",
            {
                "action": "store_true",
                "help": "仅输出将被删除的文件,不真正删除,便于先预览结果",
            },
        ),
        (
            "--verbose",
            {
                "action": "store_true",
                "help": "输出更多调试信息",
            },
        ),
    )
    for flag, spec in option_table:
        cli.add_argument(flag, **spec)

    return cli.parse_args()
62
+
63
+
64
def load_yolo_model() -> YOLO:
    """Load the configured YOLO model, preferring a local copy.

    If ``MODEL_DIR / YOLO_MODEL_NAME`` exists on disk it is tried first; the
    bare model name is always tried afterwards as a fallback (the original
    author notes that loading by name triggers an automatic download).

    Returns:
        The first ``YOLO`` instance that loads successfully.

    Raises:
        RuntimeError: if every candidate fails; chained from the last
            underlying exception.
    """
    bundled = MODEL_DIR / YOLO_MODEL_NAME
    if bundled.exists():
        sources: List[str] = [str(bundled), YOLO_MODEL_NAME]
    else:
        sources = [YOLO_MODEL_NAME]

    failure: Optional[Exception] = None
    for source in sources:
        try:
            config.logger.info("尝试加载 YOLO 模型:%s", source)
            loaded = YOLO(source)
        except Exception as exc:  # pragma: no cover
            # Remember the error and move on to the next candidate.
            failure = exc
            config.logger.warning("加载 YOLO 模型失败:%s -> %s", source, exc)
        else:
            return loaded

    raise RuntimeError(f"无法加载 YOLO 模型:{YOLO_MODEL_NAME}") from failure
84
+
85
+
86
def iter_image_files(root: Path, extensions: Iterable[str]) -> Iterable[Path]:
    """Yield every regular file below *root* whose suffix is in *extensions*.

    Matching is case-insensitive. Each extension is stripped of surrounding
    whitespace, blank entries are ignored, and a missing leading dot is added
    so that ``"jpg"`` and ``".jpg"`` are equivalent (``Path.suffix`` always
    includes the dot, so an undotted extension could otherwise never match).

    Args:
        root: Directory to search recursively.
        extensions: File suffixes to accept, with or without a leading dot.

    Yields:
        Paths of matching files, in the order ``Path.rglob`` produces them.
    """
    wanted = set()
    for raw in extensions:
        ext = raw.strip().lower()
        if not ext:
            continue
        wanted.add(ext if ext.startswith(".") else f".{ext}")

    # Nothing can match, so skip walking the tree altogether.
    if not wanted:
        return

    for path in root.rglob("*"):
        # Compare the suffix first: it is a pure string operation, whereas
        # is_file() has to touch the filesystem.
        if path.suffix.lower() in wanted and path.is_file():
            yield path
93
+
94
+
95
def has_face(model: YOLO, image_path: Path, confidence: float, verbose: bool = False) -> bool:
    """Report whether the detector finds at least one box in *image_path*.

    Any detected box counts as a face; the class id is only reported in the
    verbose log and is not used for filtering.

    Args:
        model: Detector, invoked as
            ``model(image_path, conf=confidence, verbose=False)``.
        image_path: Image to run detection on.
        confidence: Confidence threshold forwarded to the model as ``conf``.
        verbose: When true, log the class id and score of each detected box.

    Returns:
        ``True`` as soon as one result carries a non-empty ``boxes``
        collection, otherwise ``False``.

    Raises:
        Exception: whatever the model call raises. Detection failures are
            propagated on purpose: returning ``False`` here would make a
            failed detection indistinguishable from "no face", and the
            caller deletes images for which this function returns ``False``.
    """
    # No try/except here: the caller handles failures and skips the image.
    results = model(image_path, conf=confidence, verbose=False)

    for result in results:
        boxes = getattr(result, "boxes", None)
        if boxes is None or len(boxes) == 0:
            continue
        if verbose:
            faces = [
                {
                    "cls": int(box.cls[0]) if getattr(box, "cls", None) is not None else -1,
                    "conf": float(box.conf[0]) if getattr(box, "conf", None) is not None else 0.0,
                }
                for box in boxes
            ]
            config.logger.info("检测到人脸:%s -> %s", image_path, faces)
        return True
    return False
119
+
120
+
121
def main() -> None:
    """Scan the dataset directory and delete images in which no face is found.

    Steps:
        1. Parse the command line and resolve the dataset directory.
        2. Load the YOLO model.
        3. Collect all matching image files.
        4. Run detection on each image; images without a detected face are
           deleted, or only reported when ``--dry-run`` is set.
        5. Print a summary with the scanned, removed and errored counts.

    Images whose detection or deletion raises are counted as errors and left
    in place. In dry-run mode the "removed" count is the number of files
    that would have been deleted.

    Raises:
        SystemExit: if the dataset path is not an existing directory.
    """
    args = parse_args()
    dataset_dir: Path = args.dataset_dir.expanduser().resolve()
    # is_dir() rather than exists(): a path to a regular file would pass
    # exists(), load the model, and then report that no images were found.
    if not dataset_dir.is_dir():
        raise SystemExit(f"目录不存在:{dataset_dir}")

    model = load_yolo_model()
    # Materialise the listing so the total is known for progress output.
    image_paths = list(iter_image_files(dataset_dir, args.extensions.split(",")))
    total = len(image_paths)
    if total == 0:
        print(f"目录 {dataset_dir} 下没有匹配到图片文件")
        return

    removed = 0
    errored = 0
    for idx, image_path in enumerate(image_paths, start=1):
        # Progress line every 100 images, or for every image when verbose.
        if idx % 100 == 0 or args.verbose:
            print(f"[{idx}/{total}] 正在处理 {image_path}")

        try:
            if has_face(model, image_path, args.confidence, args.verbose):
                continue
        except Exception as exc:  # pragma: no cover
            # A failed detection is not evidence of "no face": keep the file.
            errored += 1
            config.logger.error("检测过程中发生异常,跳过 %s:%s", image_path, exc)
            continue

        if args.dry_run:
            print(f"[DRY-RUN] 将删除:{image_path}")
        else:
            try:
                image_path.unlink()
                print(f"已删除:{image_path}")
            except Exception as exc:  # pragma: no cover
                errored += 1
                config.logger.error("删除失败 %s:%s", image_path, exc)
                continue
        removed += 1

    print(
        f"扫描完成,检测图片 {total} 张,删除 {removed} 张无脸图片,异常 {errored} 张,数据保存在:{dataset_dir}"
    )
163
+
164
+
165
if __name__ == "__main__":
    # Script entry point: turn Ctrl+C into a short exit message instead of a
    # KeyboardInterrupt traceback.
    try:
        main()
    except KeyboardInterrupt:  # pragma: no cover
        raise SystemExit("用户中断")