#!/usr/bin/env python3
"""
キャプション掃除スクリプト
==========================

Anima 公式が推奨する以下のタグを ".txt" キャプションから除去:
  - Quality (Human): masterpiece, best quality, good quality, normal quality, low quality, worst quality
  - Quality (PonyV7 score): score_9, score_8, ..., score_1
  - Time period: year 2025, year 2024, ..., newest, recent, mid, early, old
  - Meta: highres, absurdres, anime screenshot, jpeg artifacts, official art,
          lowres, blurry, watermark, signature, web address, artist name,
          censored, uncensored 等

これらをデータから抜くことで、学習後のモデルは「タグなし = 高品質」を学ぶ。

使い方:
  python clean_captions.py --input ./raw --output ./cleaned \
      [--drop-artist-prob 0.0] [--drop-all-artists]
"""
import argparse
import re
import shutil
import random
from pathlib import Path


# Tags to remove. Lookups are done on a stripped, lower-cased tag, so every
# entry that ends up in FIXED_DROP is stored lower-case.
QUALITY_TAGS_HUMAN = {
    "masterpiece", "best quality", "good quality",
    "normal quality", "low quality", "worst quality",
}

QUALITY_TAGS_SCORE = {f"score_{i}" for i in range(1, 10)}

# Time-period tags, per the official README.
PERIOD_TAGS = {"newest", "recent", "mid", "early", "old"}

# Year tags are open-ended, so they are matched by pattern. The separator may
# be whitespace or an underscore ("year 2024" / "year_2024").
YEAR_RE = re.compile(r"^year[\s_]+(19|20)\d{2}$", re.IGNORECASE)

META_TAGS = {
    "highres", "absurdres", "lowres",
    "anime screenshot", "official art",
    "jpeg artifacts", "blurry", "bad quality",
    "watermark", "signature", "web address", "twitter username",
    "artist name", "logo",
    "censored", "uncensored", "mosaic censoring", "bar censor",
    # Safety tags are removed too (move behind a separate flag if needed).
    "safe", "sensitive", "nsfw", "explicit",
}

# Lower-case every source set, not just one of them, so an entry added with
# capitals later still matches the lower-cased lookup.
_SPACED_DROP = {
    t.lower()
    for t in QUALITY_TAGS_HUMAN | QUALITY_TAGS_SCORE | PERIOD_TAGS | META_TAGS
}

# Also accept the raw Danbooru spelling ("best_quality") in case a caption was
# not converted to space-separated words beforehand.
FIXED_DROP = _SPACED_DROP | {t.replace(" ", "_") for t in _SPACED_DROP}


def should_drop(tag: str) -> bool:
    """Return True when *tag* must be removed from a caption.

    The tag is stripped and lower-cased before it is checked. It is dropped
    when it is empty, listed in ``FIXED_DROP``, or matched by ``YEAR_RE``.
    """
    normalized = tag.strip().lower()
    # An empty fragment (e.g. from ",," or a trailing comma) counts as droppable.
    return (
        not normalized
        or normalized in FIXED_DROP
        or bool(YEAR_RE.match(normalized))
    )


def clean_caption(
    text: str,
    drop_artist_prob: float = 0.0,
    drop_all_artists: bool = False,
    rng: random.Random | None = None,
) -> str:
    """Return *text* with unwanted tags removed, re-joined with ", ".

    Args:
        text: Comma-separated caption. Danbooru-style "_" word separators are
            assumed to have been converted to spaces already.
        drop_artist_prob: Chance of discarding each artist tag ("@xxx");
            no random draw happens unless this is greater than 0.
        drop_all_artists: When True, every artist tag is discarded.
        rng: Random source for the probabilistic drop; a fresh
            ``random.Random()`` is used when omitted.

    Returns:
        The surviving tags, each stripped, in their original order.
    """
    generator = rng or random.Random()
    kept = []
    for fragment in text.split(","):
        tag = fragment.strip()
        if should_drop(tag):
            continue
        # Artist tags use the "@xxx" form (Anima convention).
        is_artist = tag.startswith("@")
        # Short-circuiting keeps the random draw as the very last check, so
        # the generator is only consumed when a draw is actually needed.
        if is_artist and (
            drop_all_artists
            or (drop_artist_prob > 0 and generator.random() < drop_artist_prob)
        ):
            continue
        kept.append(tag)
    return ", ".join(kept)


def main():
    """Command-line entry point: mirror an image tree with cleaned captions.

    Every file under ``--input`` whose suffix is listed in ``--exts`` is copied
    to the same relative path under ``--output`` (an existing copy is left
    untouched). The sibling ``.txt`` caption is passed through
    ``clean_caption`` and written next to the copy; an image without a caption
    gets an empty caption file. A summary of the counts is printed at the end.

    Exits through ``argparse`` error handling when ``--input`` is not a
    directory or ``--drop-artist-prob`` is outside [0.0, 1.0].
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--input", required=True, type=Path)
    ap.add_argument("--output", required=True, type=Path)
    ap.add_argument(
        "--drop-artist-prob",
        type=float,
        default=0.0,
        help="artist タグ(@xxx)をランダムに drop する確率 (0.0〜1.0)",
    )
    ap.add_argument(
        "--drop-all-artists",
        action="store_true",
        help="artist タグを全部除去 (Anima 平均スタイル化)",
    )
    ap.add_argument(
        "--exts",
        default="png,jpg,jpeg,webp",
        help="画像拡張子(カンマ区切り)",
    )
    args = ap.parse_args()

    # Fail fast on bad arguments instead of silently producing an empty run.
    if not args.input.is_dir():
        ap.error(f"--input is not a directory: {args.input}")
    if not 0.0 <= args.drop_artist_prob <= 1.0:
        ap.error("--drop-artist-prob must be between 0.0 and 1.0")

    args.output.mkdir(parents=True, exist_ok=True)

    # Tolerate input such as "png, .JPG,": trim blanks, drop leading dots,
    # and skip empty entries.
    exts = {
        "." + name
        for name in (e.strip().lstrip(".").lower() for e in args.exts.split(","))
        if name
    }
    rng = random.Random(42)

    in_root = args.input.resolve()
    out_root = args.output.resolve()
    # When the output directory sits inside the input tree, its files must not
    # be picked up again as fresh inputs.
    output_nested = in_root in out_root.parents

    n_img, n_cap, n_missing = 0, 0, 0
    # Walk a sorted snapshot: the directory is not mutated while it is being
    # iterated, and the seeded RNG is consumed in a reproducible order
    # regardless of the filesystem's listing order.
    for img in sorted(args.input.rglob("*")):
        if img.suffix.lower() not in exts or not img.is_file():
            continue
        if output_nested and img.resolve().is_relative_to(out_root):
            continue
        n_img += 1
        rel = img.relative_to(args.input)
        out_img = args.output / rel
        out_img.parent.mkdir(parents=True, exist_ok=True)
        if not out_img.exists():
            shutil.copy(img, out_img)

        cap_in = img.with_suffix(".txt")
        cap_out = out_img.with_suffix(".txt")
        if not cap_in.exists():
            n_missing += 1
            cap_out.write_text("", encoding="utf-8")
            continue

        # "utf-8-sig" strips a leading BOM; otherwise the BOM stays glued to
        # the first tag and that tag never matches the drop list.
        text = cap_in.read_text(encoding="utf-8-sig", errors="ignore")
        cleaned = clean_caption(
            text,
            drop_artist_prob=args.drop_artist_prob,
            drop_all_artists=args.drop_all_artists,
            rng=rng,
        )
        cap_out.write_text(cleaned, encoding="utf-8")
        n_cap += 1

    print(f"[clean_captions] images={n_img}  captions={n_cap}  missing_txt={n_missing}")
    print(f"[clean_captions] output -> {args.output}")


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()