rapid-anima / scripts /clean_captions.py

Initial commit: rapid-anima distillation codebase

77cc641 verified about 2 months ago

4.75 kB

	#!/usr/bin/env python3
	"""
	Caption cleaning script
	=======================

	Removes the following tags, which the official Anima guidance recommends, from ".txt" captions:
	- Quality (Human): masterpiece, best quality, good quality, normal quality, low quality, worst quality
	- Quality (PonyV7 score): score_9, score_8, ..., score_1
	- Time period: year 2025, year 2024, ..., newest, recent, mid, early, old
	- Meta: highres, absurdres, anime screenshot, jpeg artifacts, official art,
	lowres, blurry, watermark, signature, web address, artist name,
	censored, uncensored, etc.

	With these stripped from the data, the trained model learns "no tags = high quality".

	Usage:
	python clean_captions.py --input ./raw --output ./cleaned \
	[--drop-artist-prob 0.0] [--drop-all-artists]
	"""
	import argparse
	import re
	import shutil
	import random
	from pathlib import Path


	# Tags to remove. Matching is case-insensitive and tolerates surrounding
	# whitespace because callers normalize with strip().lower() before lookup.
	QUALITY_TAGS_HUMAN = {
	    "masterpiece", "best quality", "good quality",
	    "normal quality", "low quality", "worst quality",
	}

	# score_1 .. score_9
	QUALITY_TAGS_SCORE = {f"score_{i}" for i in range(1, 10)}

	# Time-period tags (taken from the official README).
	PERIOD_TAGS = {"newest", "recent", "mid", "early", "old"}

	# "year NNNN" tags are matched by pattern instead of being enumerated.
	# The alternation must be a bare "|"; an escaped "\|" would match a literal
	# pipe character and no real year tag would ever be recognized.
	YEAR_RE = re.compile(r"^year\s+(19|20)\d{2}$", re.IGNORECASE)

	META_TAGS = {
	    "highres", "absurdres", "lowres",
	    "anime screenshot", "official art",
	    "jpeg artifacts", "blurry", "bad quality",
	    "watermark", "signature", "web address", "twitter username",
	    "artist name", "logo",
	    "censored", "uncensored", "mosaic censoring", "bar censor",
	    # Safety tags are removed as well (move behind a separate flag if needed).
	    "safe", "sensitive", "nsfw", "explicit",
	}

	# Union of every fixed (non-pattern) tag that is always dropped, lowercase.
	FIXED_DROP = (
	    {t.lower() for t in QUALITY_TAGS_HUMAN}
	    | QUALITY_TAGS_SCORE
	    | PERIOD_TAGS
	    | META_TAGS
	)


	def should_drop(tag: str) -> bool:
	    """Return True when *tag* should be removed from a caption.

	    The tag is stripped and lowercased first. It is dropped when the result
	    is empty, is a member of FIXED_DROP, or matches YEAR_RE.
	    """
	    normalized = tag.strip().lower()
	    return (
	        not normalized
	        or normalized in FIXED_DROP
	        or bool(YEAR_RE.match(normalized))
	    )


	def clean_caption(
	    text: str,
	    drop_artist_prob: float = 0.0,
	    drop_all_artists: bool = False,
	    rng: random.Random | None = None,
	) -> str:
	    """Return *text* with unwanted tags removed.

	    Args:
	        text: Caption made of comma-separated tags.
	        drop_artist_prob: Probability with which each artist tag ("@xxx")
	            is dropped; values <= 0 disable random dropping.
	        drop_all_artists: When True, every artist tag is dropped.
	        rng: Random source for the probabilistic drop. A fresh
	            ``random.Random()`` is created when omitted.

	    Returns:
	        The surviving tags, each stripped, joined with ", ".
	    """
	    if rng is None:
	        rng = random.Random()

	    # Tags are split on ","; Danbooru-style "_" separators are assumed to
	    # have been normalized to spaces already.
	    kept = []
	    for raw in text.split(","):
	        tag = raw.strip()
	        if should_drop(tag):
	            continue
	        # Artist tags use the "@xxx" form (Anima convention). The RNG is only
	        # consulted for artist tags that are not already force-dropped.
	        if tag.startswith("@") and (
	            drop_all_artists
	            or (drop_artist_prob > 0 and rng.random() < drop_artist_prob)
	        ):
	            continue
	        kept.append(tag)
	    return ", ".join(kept)


	def main():
	    """Command-line entry point: copy images and write cleaned captions.

	    Walks ``--input`` recursively. Every file whose suffix is listed in
	    ``--exts`` is copied to the same relative path under ``--output`` (unless
	    a file already exists there). The sibling ``.txt`` caption is cleaned
	    with ``clean_caption`` and written next to the copied image; when the
	    caption is missing, an empty ``.txt`` is written instead. A summary is
	    printed at the end.
	    """
	    ap = argparse.ArgumentParser()
	    ap.add_argument("--input", required=True, type=Path)
	    ap.add_argument("--output", required=True, type=Path)
	    ap.add_argument(
	        "--drop-artist-prob",
	        type=float,
	        default=0.0,
	        help="artist タグ(@xxx)をランダムに drop する確率 (0.0〜1.0)",
	    )
	    ap.add_argument(
	        "--drop-all-artists",
	        action="store_true",
	        help="artist タグを全部除去 (Anima 平均スタイル化)",
	    )
	    ap.add_argument(
	        "--exts",
	        default="png,jpg,jpeg,webp",
	        help="画像拡張子(カンマ区切り)",
	    )
	    args = ap.parse_args()

	    # Fail loudly instead of silently reporting "images=0" for a bad path.
	    if not args.input.is_dir():
	        ap.error(f"--input is not a directory: {args.input}")

	    args.output.mkdir(parents=True, exist_ok=True)

	    # Accept "png, jpg" and ".png" spellings; skip empty entries.
	    exts = tuple(
	        "." + name
	        for name in (e.strip().lstrip(".").lower() for e in args.exts.split(","))
	        if name
	    )
	    rng = random.Random(42)

	    n_img, n_cap, n_missing = 0, 0, 0
	    # rglob order is filesystem-dependent. Sorting keeps the seeded RNG's
	    # artist drops reproducible across runs and machines.
	    for img in sorted(args.input.rglob("*")):
	        if not img.is_file() or img.suffix.lower() not in exts:
	            continue
	        n_img += 1
	        rel = img.relative_to(args.input)
	        out_img = args.output / rel
	        out_img.parent.mkdir(parents=True, exist_ok=True)
	        if not out_img.exists():
	            shutil.copy(img, out_img)

	        cap_in = img.with_suffix(".txt")
	        cap_out = out_img.with_suffix(".txt")
	        if not cap_in.exists():
	            # No caption for this image: emit an empty one so every image
	            # in the output has a matching .txt file.
	            n_missing += 1
	            cap_out.write_text("", encoding="utf-8")
	            continue

	        # Undecodable bytes are dropped rather than aborting the whole run.
	        text = cap_in.read_text(encoding="utf-8", errors="ignore")
	        cleaned = clean_caption(
	            text,
	            drop_artist_prob=args.drop_artist_prob,
	            drop_all_artists=args.drop_all_artists,
	            rng=rng,
	        )
	        cap_out.write_text(cleaned, encoding="utf-8")
	        n_cap += 1

	    print(f"[clean_captions] images={n_img} captions={n_cap} missing_txt={n_missing}")
	    print(f"[clean_captions] output -> {args.output}")


	# Run the CLI only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()