rapid-anima / scripts /clean_captions.py
darask0's picture
Initial commit: rapid-anima distillation codebase
77cc641 verified
Raw
History Blame Contribute Delete
4.75 kB
#!/usr/bin/env python3
"""
キャプション掃除スクリプト
==========================
Anima 公式が推奨する以下のタグを ".txt" キャプションから除去:
- Quality (Human): masterpiece, best quality, good quality, normal quality, low quality, worst quality
- Quality (PonyV7 score): score_9, score_8, ..., score_1
- Time period: year 2025, year 2024, ..., newest, recent, mid, early, old
- Meta: highres, absurdres, anime screenshot, jpeg artifacts, official art,
lowres, blurry, watermark, signature, web address, artist name,
censored, uncensored 等
これらをデータから抜くことで、学習後のモデルは「タグなし = 高品質」を学ぶ。
使い方:
python clean_captions.py --input ./raw --output ./cleaned \
[--drop-artist-prob 0.0] [--drop-all-artists]
"""
import argparse
import re
import shutil
import random
from pathlib import Path
# Tags to remove. Matching is case-insensitive, ignores surrounding
# whitespace, and treats "_" and runs of whitespace as one space, so both the
# Danbooru spelling ("best_quality") and the spaced spelling ("best quality")
# are recognised.
QUALITY_TAGS_HUMAN = {
    "masterpiece", "best quality", "good quality",
    "normal quality", "low quality", "worst quality",
}
QUALITY_TAGS_SCORE = {f"score_{i}" for i in range(1, 10)}

# Time-period tags (taken from the official README).
PERIOD_TAGS = {"newest", "recent", "mid", "early", "old"}

# "year NNNN" tags are matched by pattern instead of being enumerated.
YEAR_RE = re.compile(r"^year\s+(19|20)\d{2}$", re.IGNORECASE)

META_TAGS = {
    "highres", "absurdres", "lowres",
    "anime screenshot", "official art",
    "jpeg artifacts", "blurry", "bad quality",
    "watermark", "signature", "web address", "twitter username",
    "artist name", "logo",
    "censored", "uncensored", "mosaic censoring", "bar censor",
    # Safety tags are removed too (move behind a separate flag if needed).
    "safe", "sensitive", "nsfw", "explicit",
}

# Union of every literal tag that is always dropped (kept in its original
# spelling for anyone importing it).
FIXED_DROP = (
    {t.lower() for t in QUALITY_TAGS_HUMAN}
    | QUALITY_TAGS_SCORE
    | PERIOD_TAGS
    | META_TAGS
)

# Underscores and whitespace runs are interchangeable word separators.
_SEPARATOR_RE = re.compile(r"[\s_]+")


def _normalize_tag(tag: str) -> str:
    """Return *tag* lower-cased, trimmed, with "_"/whitespace runs as one space."""
    return _SEPARATOR_RE.sub(" ", tag.strip().lower())


# FIXED_DROP in normalised form, built once so lookups stay O(1).
_FIXED_DROP_NORMALIZED = frozenset(_normalize_tag(t) for t in FIXED_DROP)


def should_drop(tag: str) -> bool:
    """Return True when *tag* must be removed from a caption.

    A tag is dropped when it is empty or whitespace-only, when it is one of
    the FIXED_DROP tags, or when it is a "year NNNN" tag (1900-2099).
    Comparison is case-insensitive and treats "_" and whitespace as the same
    separator, so "Best_Quality", "score 9" and "year_2024" are all dropped.

    Args:
        tag: A single caption tag, possibly with surrounding whitespace.

    Returns:
        True if the tag should be removed, False if it should be kept.
    """
    # Emptiness is judged on the raw tag so that a tag made only of
    # underscores is not mistaken for an empty one.
    if not tag.strip():
        return True
    normalized = _normalize_tag(tag)
    if normalized in _FIXED_DROP_NORMALIZED:
        return True
    return YEAR_RE.match(normalized) is not None
def clean_caption(
    text: str,
    drop_artist_prob: float = 0.0,
    drop_all_artists: bool = False,
    rng: random.Random | None = None,
) -> str:
    """Return *text* with unwanted tags removed.

    The caption is split on ",", each tag is trimmed, and the surviving tags
    are re-joined with ", ". Tags rejected by ``should_drop`` are always
    removed. Artist tags (those starting with "@", the Anima convention) are
    additionally removed when *drop_all_artists* is set, or at random with
    probability *drop_artist_prob*.

    Args:
        text: Comma-separated caption.
        drop_artist_prob: Chance of dropping each artist tag; values <= 0
            disable random dropping.
        drop_all_artists: Remove every artist tag unconditionally.
        rng: Random source for the probabilistic drop; a fresh
            ``random.Random()`` is created when none is supplied.

    Returns:
        The cleaned caption.
    """
    if not rng:
        rng = random.Random()

    def keep(tag: str) -> bool:
        # Fixed drop list first, so no random number is spent on such tags.
        if should_drop(tag):
            return False
        if not tag.startswith("@"):
            return True
        if drop_all_artists:
            return False
        # Only draw from the RNG when random dropping is actually enabled.
        randomly_dropped = drop_artist_prob > 0 and rng.random() < drop_artist_prob
        return not randomly_dropped

    trimmed_tags = map(str.strip, text.split(","))
    return ", ".join(tag for tag in trimmed_tags if keep(tag))
def main(argv: list[str] | None = None) -> None:
    """Copy an image dataset to a new directory with cleaned captions.

    Every image under ``--input`` (searched recursively, filtered by
    ``--exts``) is copied to the same relative path under ``--output`` unless
    a file already exists there. The sibling ``.txt`` caption is passed
    through ``clean_caption`` and written next to the copy; an image without
    a caption gets an empty caption file. A summary is printed at the end.

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default) makes argparse read ``sys.argv``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--input", required=True, type=Path)
    ap.add_argument("--output", required=True, type=Path)
    ap.add_argument(
        "--drop-artist-prob",
        type=float,
        default=0.0,
        help="artist タグ(@xxx)をランダムに drop する確率 (0.0〜1.0)",
    )
    ap.add_argument(
        "--drop-all-artists",
        action="store_true",
        help="artist タグを全部除去 (Anima 平均スタイル化)",
    )
    ap.add_argument(
        "--exts",
        default="png,jpg,jpeg,webp",
        help="画像拡張子(カンマ区切り)",
    )
    args = ap.parse_args(argv)

    # Fail loudly on a wrong path instead of silently reporting zero images.
    if not args.input.is_dir():
        ap.error(f"--input is not a directory: {args.input}")
    args.output.mkdir(parents=True, exist_ok=True)

    # Accept "png", ".png" and " png " alike; ignore empty entries.
    exts = frozenset(
        "." + ext
        for raw in args.exts.split(",")
        if (ext := raw.strip().lower().lstrip("."))
    )

    # If the output lives inside the input tree, its files must not be picked
    # up as new inputs (that would re-copy the script's own results).
    in_root = args.input.resolve()
    out_root = args.output.resolve()
    nested_output = out_root != in_root and out_root.is_relative_to(in_root)

    # Snapshot the file list before writing anything, and sort it so the
    # seeded RNG below makes the same artist-drop decisions on every
    # machine regardless of directory enumeration order.
    images = sorted(
        p
        for p in args.input.rglob("*")
        if p.is_file()
        and p.suffix.lower() in exts
        and not (nested_output and p.resolve().is_relative_to(out_root))
    )

    rng = random.Random(42)
    n_img, n_cap, n_missing = 0, 0, 0
    for img in images:
        n_img += 1
        out_img = args.output / img.relative_to(args.input)
        out_img.parent.mkdir(parents=True, exist_ok=True)
        if not out_img.exists():
            shutil.copy(img, out_img)

        cap_in = img.with_suffix(".txt")
        cap_out = out_img.with_suffix(".txt")
        if not cap_in.exists():
            # No caption: emit an empty one so every image has a .txt file.
            n_missing += 1
            cap_out.write_text("", encoding="utf-8")
            continue

        # Undecodable bytes are skipped on purpose (best-effort read).
        text = cap_in.read_text(encoding="utf-8", errors="ignore")
        cleaned = clean_caption(
            text,
            drop_artist_prob=args.drop_artist_prob,
            drop_all_artists=args.drop_all_artists,
            rng=rng,
        )
        cap_out.write_text(cleaned, encoding="utf-8")
        n_cap += 1

    print(f"[clean_captions] images={n_img} captions={n_cap} missing_txt={n_missing}")
    print(f"[clean_captions] output -> {args.output}")
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()