| |
| """ |
Caption cleaning script
=======================

Removes the following tags, which Anima officially recommends, from ".txt" captions:
- Quality (Human): masterpiece, best quality, good quality, normal quality, low quality, worst quality
- Quality (PonyV7 score): score_9, score_8, ..., score_1
- Time period: year 2025, year 2024, ..., newest, recent, mid, early, old
- Meta: highres, absurdres, anime screenshot, jpeg artifacts, official art,
  lowres, blurry, watermark, signature, web address, artist name,
  censored, uncensored, etc.

With these stripped from the data, the trained model learns "no tags = high quality".

Usage:
    python clean_captions.py --input ./raw --output ./cleaned [--drop-artist-prob 0.0] [--drop-all-artists]
| """ |
| import argparse |
| import re |
| import shutil |
| import random |
| from pathlib import Path |
|
|
|
|
| |
# Human-assigned quality labels.
QUALITY_TAGS_HUMAN = {
    "masterpiece",
    "best quality",
    "good quality",
    "normal quality",
    "low quality",
    "worst quality",
}

# Score-style quality tags: score_1 through score_9.
QUALITY_TAGS_SCORE = set(map("score_{}".format, range(1, 10)))

# Relative time-period labels.
PERIOD_TAGS = {"newest", "recent", "mid", "early", "old"}

# Explicit year tags such as "year 2024"; only years 1900-2099 match.
YEAR_RE = re.compile(r"^year\s+(19|20)\d{2}$", re.IGNORECASE)

# Meta tags (resolution, artifacts, overlaid text/marks, censoring, ratings).
META_TAGS = {
    "highres",
    "absurdres",
    "lowres",
    "anime screenshot",
    "official art",
    "jpeg artifacts",
    "blurry",
    "bad quality",
    "watermark",
    "signature",
    "web address",
    "twitter username",
    "artist name",
    "logo",
    "censored",
    "uncensored",
    "mosaic censoring",
    "bar censor",
    "safe",
    "sensitive",
    "nsfw",
    "explicit",
}

# Every tag that is removed by exact (lower-cased) match.
FIXED_DROP = set().union(
    map(str.lower, QUALITY_TAGS_HUMAN),
    QUALITY_TAGS_SCORE,
    PERIOD_TAGS,
    META_TAGS,
)
|
|
|
|
def should_drop(tag: str) -> bool:
    """Return True when *tag* must be removed from a caption.

    The tag is compared after trimming surrounding whitespace and
    lower-casing. It is dropped when it is empty, when it is listed in
    ``FIXED_DROP``, or when it matches ``YEAR_RE``.
    """
    normalized = tag.strip().lower()
    return (
        not normalized
        or normalized in FIXED_DROP
        or YEAR_RE.match(normalized) is not None
    )
|
|
|
|
def clean_caption(
    text: str,
    drop_artist_prob: float = 0.0,
    drop_all_artists: bool = False,
    rng: random.Random | None = None,
) -> str:
    """Return *text* with unwanted tags removed.

    The caption is split on commas and each tag is stripped of surrounding
    whitespace. Tags for which ``should_drop`` is true are discarded. Tags
    starting with ``@`` (artist tags) are additionally discarded when
    *drop_all_artists* is set, or with probability *drop_artist_prob*
    using *rng* (a fresh ``random.Random`` when none is given). The
    surviving tags are re-joined with ``", "``.
    """
    generator = rng or random.Random()

    kept = []
    for raw_tag in text.split(","):
        tag = raw_tag.strip()
        if should_drop(tag):
            continue
        # The random draw happens only for artist tags, and only when a
        # positive probability is set and artists are not dropped outright.
        if tag.startswith("@") and (
            drop_all_artists
            or (drop_artist_prob > 0 and generator.random() < drop_artist_prob)
        ):
            continue
        kept.append(tag)
    return ", ".join(kept)
|
|
|
|
def main():
    """Command-line entry point: copy images and write cleaned captions.

    Walks ``--input`` recursively and, for every file whose suffix is listed
    in ``--exts``, copies it to the same relative path under ``--output``
    (an already existing copy is left untouched) and writes a sibling
    ``.txt`` caption processed by ``clean_caption``. An image without a
    caption file gets an empty caption and is counted as missing. Artist
    dropping uses a ``random.Random`` seeded with 42, and images are
    processed in sorted path order so repeated runs make the same choices.
    A summary is printed at the end.

    Exits through ``ArgumentParser.error`` when ``--input`` is not a
    directory or ``--drop-artist-prob`` is outside 0.0-1.0.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--input", required=True, type=Path)
    ap.add_argument("--output", required=True, type=Path)
    ap.add_argument(
        "--drop-artist-prob",
        type=float,
        default=0.0,
        help="artist タグ(@xxx)をランダムに drop する確率 (0.0〜1.0)",
    )
    ap.add_argument(
        "--drop-all-artists",
        action="store_true",
        help="artist タグを全部除去 (Anima 平均スタイル化)",
    )
    ap.add_argument(
        "--exts",
        default="png,jpg,jpeg,webp",
        help="画像拡張子(カンマ区切り)",
    )
    args = ap.parse_args()

    # Fail loudly instead of reporting "images=0" for a mistyped input path.
    if not args.input.is_dir():
        ap.error(f"--input is not a directory: {args.input}")
    # The chained comparison also rejects NaN.
    if not 0.0 <= args.drop_artist_prob <= 1.0:
        ap.error("--drop-artist-prob must be between 0.0 and 1.0")

    args.output.mkdir(parents=True, exist_ok=True)

    # Accept "png", ".png" and " png " alike; ignore empty entries such as
    # the one produced by a trailing comma.
    exts = {
        "." + name
        for name in (e.strip().lstrip(".").lower() for e in args.exts.split(","))
        if name
    }
    rng = random.Random(42)

    # When the output directory lives strictly inside the input directory,
    # files written by this (or an earlier) run must not be treated as
    # input. In-place runs (input == output) are deliberately not affected,
    # because ``parents`` never contains the path itself.
    in_root = args.input.resolve()
    out_root = args.output.resolve()
    output_nested = in_root in out_root.parents

    # Snapshot the tree before writing anything so the walk cannot pick up
    # files created during processing; sort for a reproducible order.
    images = sorted(
        p
        for p in args.input.rglob("*")
        if p.is_file()
        and p.suffix.lower() in exts
        and not (output_nested and out_root in p.resolve().parents)
    )

    n_img, n_cap, n_missing = 0, 0, 0
    for img in images:
        n_img += 1
        rel = img.relative_to(args.input)
        out_img = args.output / rel
        out_img.parent.mkdir(parents=True, exist_ok=True)
        if not out_img.exists():
            shutil.copy(img, out_img)

        cap_in = img.with_suffix(".txt")
        cap_out = out_img.with_suffix(".txt")
        if not cap_in.exists():
            n_missing += 1
            cap_out.write_text("", encoding="utf-8")
            continue

        # "utf-8-sig" strips a leading BOM, which would otherwise stick to
        # the first tag and keep it from matching the drop list. Undecodable
        # bytes are still skipped, as before.
        text = cap_in.read_text(encoding="utf-8-sig", errors="ignore")
        cleaned = clean_caption(
            text,
            drop_artist_prob=args.drop_artist_prob,
            drop_all_artists=args.drop_all_artists,
            rng=rng,
        )
        cap_out.write_text(cleaned, encoding="utf-8")
        n_cap += 1

    print(f"[clean_captions] images={n_img} captions={n_cap} missing_txt={n_missing}")
    print(f"[clean_captions] output -> {args.output}")
|
|
|
|
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|