#!/usr/bin/env python3 """ キャプション掃除スクリプト ========================== Anima 公式が推奨する以下のタグを ".txt" キャプションから除去: - Quality (Human): masterpiece, best quality, good quality, normal quality, low quality, worst quality - Quality (PonyV7 score): score_9, score_8, ..., score_1 - Time period: year 2025, year 2024, ..., newest, recent, mid, early, old - Meta: highres, absurdres, anime screenshot, jpeg artifacts, official art, lowres, blurry, watermark, signature, web address, artist name, censored, uncensored 等 これらをデータから抜くことで、学習後のモデルは「タグなし = 高品質」を学ぶ。 使い方: python clean_captions.py --input ./raw --output ./cleaned \ [--drop-artist-prob 0.0] [--drop-all-artists] """ import argparse import re import shutil import random from pathlib import Path # 削除対象タグ (大文字小文字無視、前後空白許容) QUALITY_TAGS_HUMAN = { "masterpiece", "best quality", "good quality", "normal quality", "low quality", "worst quality", } QUALITY_TAGS_SCORE = {f"score_{i}" for i in range(1, 10)} # 公式 README より: 期間タグ PERIOD_TAGS = {"newest", "recent", "mid", "early", "old"} # Year タグは正規表現で吸収 YEAR_RE = re.compile(r"^year\s+(19|20)\d{2}$", re.IGNORECASE) META_TAGS = { "highres", "absurdres", "lowres", "anime screenshot", "official art", "jpeg artifacts", "blurry", "bad quality", "watermark", "signature", "web address", "twitter username", "artist name", "logo", "censored", "uncensored", "mosaic censoring", "bar censor", "safe", "sensitive", "nsfw", "explicit", # safety タグも除去 (要なら別フラグに) } FIXED_DROP = ( {t.lower() for t in QUALITY_TAGS_HUMAN} | QUALITY_TAGS_SCORE | PERIOD_TAGS | META_TAGS ) def should_drop(tag: str) -> bool: t = tag.strip().lower() if not t: return True if t in FIXED_DROP: return True if YEAR_RE.match(t): return True return False def clean_caption( text: str, drop_artist_prob: float = 0.0, drop_all_artists: bool = False, rng: random.Random | None = None, ) -> str: rng = rng or random.Random() # ", " 区切りを基本に、Danbooru の "_" 区切りも空白へ統一されている前提 parts = [p.strip() for p in text.split(",")] out = [] for p in parts: if should_drop(p): continue # artist タグは "@xxx" 形式 (Anima 仕様) if p.startswith("@"): if drop_all_artists: continue if drop_artist_prob > 0 and rng.random() < drop_artist_prob: continue out.append(p) return ", ".join(out) def main(): ap = argparse.ArgumentParser() ap.add_argument("--input", required=True, type=Path) ap.add_argument("--output", required=True, type=Path) ap.add_argument( "--drop-artist-prob", type=float, default=0.0, help="artist タグ(@xxx)をランダムに drop する確率 (0.0〜1.0)", ) ap.add_argument( "--drop-all-artists", action="store_true", help="artist タグを全部除去 (Anima 平均スタイル化)", ) ap.add_argument( "--exts", default="png,jpg,jpeg,webp", help="画像拡張子(カンマ区切り)", ) args = ap.parse_args() args.output.mkdir(parents=True, exist_ok=True) exts = tuple("." + e.lower() for e in args.exts.split(",")) rng = random.Random(42) n_img, n_cap, n_missing = 0, 0, 0 for img in args.input.rglob("*"): if not img.is_file() or not img.suffix.lower() in exts: continue n_img += 1 rel = img.relative_to(args.input) out_img = args.output / rel out_img.parent.mkdir(parents=True, exist_ok=True) if not out_img.exists(): shutil.copy(img, out_img) cap_in = img.with_suffix(".txt") cap_out = out_img.with_suffix(".txt") if not cap_in.exists(): n_missing += 1 cap_out.write_text("", encoding="utf-8") continue text = cap_in.read_text(encoding="utf-8", errors="ignore") cleaned = clean_caption( text, drop_artist_prob=args.drop_artist_prob, drop_all_artists=args.drop_all_artists, rng=rng, ) cap_out.write_text(cleaned, encoding="utf-8") n_cap += 1 print(f"[clean_captions] images={n_img} captions={n_cap} missing_txt={n_missing}") print(f"[clean_captions] output -> {args.output}") if __name__ == "__main__": main()