Update app.py
app.py CHANGED
@@ -1,28 +1,62 @@
-
 from pathlib import Path
-from typing import List, Dict, Tuple
-
 
 import gradio as gr
 import numpy as np
 from PIL import Image
 import cv2
 
-
 import torch
 from transformers import BlipProcessor, BlipForConditionalGeneration
 
 
-# -------- Settings --------
-DEFAULT_MODEL = "Salesforce/blip-image-captioning-base"  # CPU friendly
-BIG_MODEL = "Salesforce/blip-image-captioning-large"  # better on T4/A10G
-
-
-# Blur threshold: lower => more tolerant (keep blurrier images)
 BLUR_VAR_THRESHOLD = 100.0
 
-
-# Work dirs inside the Space container
 ROOT = Path("workspace")
 IMAGES_DIR = ROOT / "images"
 EXPORT_DIR = ROOT / "export"
@@ -30,60 +64,375 @@ ROOT.mkdir(parents=True, exist_ok=True)
 IMAGES_DIR.mkdir(parents=True, exist_ok=True)
 EXPORT_DIR.mkdir(parents=True, exist_ok=True)
 
-
-# -------- Utilities --------
-
-
 def clear_workspace():
-
-
-
-
-
-
-
-
-
 
 def is_image(fname: str) -> bool:
-    ext = str(fname).lower().split(".")[-1]
-    return ext in ["jpg", "jpeg", "png", "bmp", "webp"]
-
-
-
 
 def laplacian_var_blur(pil_img: Image.Image) -> float:
-    arr = np.array(pil_img.convert("L"))
-
-    return
-
-
-
 
 def dhash(pil_img: Image.Image, hash_size: int = 8) -> str:
-
-
-
-    return ''.join('1' if v else '0' for v in diff.flatten())
-
-
-
 
 def save_uploaded_files(files: List[gr.File]) -> List[str]:
-    saved = []
-    for f in files:
-        if f is None:
-            continue
-
-
-
-
-
-
-
-
 
 def unzip_to_images(zbytes: bytes) -> List[str]:
-    saved = []
-    with zipfile.ZipFile(io.BytesIO(zbytes)) as zf:
-
+# app.py
+"""
+Semi-Auto Image Captioning - Full version for HF Spaces (Gradio)
+Features:
+- ingest images or ZIP
+- preprocess: Laplacian blur (OpenCV), dHash de-dupe
+- optional InsightFace filtering (if insightface installed)
+- auto caption: BLIP (base / large)
+- optional taggers: WD14 / CLIP Interrogator (if installed)
+- human edit via Gradio Dataframe & export CSV/JSONL/ZIP
+"""
+
+import os
+import io
+import shutil
+import zipfile
+import hashlib
+import json
 from pathlib import Path
+from typing import List, Dict, Tuple, Optional
 
 import gradio as gr
 import numpy as np
 from PIL import Image
 import cv2
 
 import torch
 from transformers import BlipProcessor, BlipForConditionalGeneration
 
+# Optional: try to import InsightFace and CLIP-interrogator-style modules
+try:
+    import insightface
+    from insightface.app import FaceAnalysis
+    _HAS_INSIGHTFACE = True
+except Exception:
+    _HAS_INSIGHTFACE = False
+
+# Optional taggers (WD14 or CLIP Interrogator).
+# We do a soft import so the Space works even if these are not available.
+try:
+    from clip_interrogator import ClipInterrogator, Config as CIConfig  # hypothetical
+    _HAS_CI = True
+except Exception:
+    _HAS_CI = False
+
+try:
+    # placeholder for a WD14 tagger library import
+    import wd14_tagger  # hypothetical package name
+    _HAS_WD14 = True
+except Exception:
+    _HAS_WD14 = False
+
+# ---------------- Settings ----------------
+DEFAULT_MODEL = "Salesforce/blip-image-captioning-base"  # CPU friendly
+BIG_MODEL = "Salesforce/blip-image-captioning-large"  # GPU recommended
 
 BLUR_VAR_THRESHOLD = 100.0
 
+# Work directories inside the Space container
 ROOT = Path("workspace")
 IMAGES_DIR = ROOT / "images"
 EXPORT_DIR = ROOT / "export"
 ROOT.mkdir(parents=True, exist_ok=True)
 IMAGES_DIR.mkdir(parents=True, exist_ok=True)
 EXPORT_DIR.mkdir(parents=True, exist_ok=True)
 
+# ---------------- Utilities ----------------
 def clear_workspace():
+    """Remove workspace images/export and recreate the directories."""
+    if IMAGES_DIR.exists():
+        shutil.rmtree(IMAGES_DIR)
+    if EXPORT_DIR.exists():
+        shutil.rmtree(EXPORT_DIR)
+    IMAGES_DIR.mkdir(parents=True, exist_ok=True)
+    EXPORT_DIR.mkdir(parents=True, exist_ok=True)
 
 def is_image(fname: str) -> bool:
+    ext = str(fname).lower().split(".")[-1]
+    return ext in ["jpg", "jpeg", "png", "bmp", "webp"]
 
 def laplacian_var_blur(pil_img: Image.Image) -> float:
+    arr = np.array(pil_img.convert("L"))
+    if arr.size == 0:
+        return 0.0
+    fm = cv2.Laplacian(arr, cv2.CV_64F).var()
+    return float(fm)
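The Laplacian-variance focus measure above rewards strong edges: sharp images yield a large variance, defocused ones a small one. A minimal sketch of how the default threshold plays out (the file name and score are hypothetical):

    from PIL import Image

    img = Image.open("sample.jpg")  # hypothetical test image
    score = laplacian_var_blur(img)
    # below BLUR_VAR_THRESHOLD (100.0) the preprocessing step filters the image
    print(f"focus measure {score:.1f} -> {'blurry' if score < 100.0 else 'sharp'}")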
 def dhash(pil_img: Image.Image, hash_size: int = 8) -> str:
+    img = pil_img.convert("L").resize((hash_size + 1, hash_size), Image.LANCZOS)
+    diff = np.array(img)[:, 1:] > np.array(img)[:, :-1]
+    return ''.join('1' if v else '0' for v in diff.flatten())
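With the default hash_size of 8, dhash reduces each image to 64 bits by comparing horizontally adjacent pixels of a 9x8 grayscale thumbnail; the de-dupe step below uses it as an exact-match key. A hedged sketch of loosening that into near-duplicate detection via a Hamming-distance threshold (hamming_near_dup and max_bits are illustrative, not part of the app):

    def hamming_near_dup(h1: str, h2: str, max_bits: int = 5) -> bool:
        # dHash strings differing in only a few bits usually depict
        # near-identical images (resized, re-encoded, lightly cropped)
        return sum(c1 != c2 for c1, c2 in zip(h1, h2)) <= max_bits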
 def save_uploaded_files(files: List[gr.File]) -> List[str]:
+    saved = []
+    for f in files:
+        if f is None:
+            continue
+        # gradio file object: f.name is the temporary path on server
+        name = os.path.basename(f.name)
+        dst = IMAGES_DIR / name
+        shutil.copy(f.name, dst)
+        saved.append(str(dst))
+    return saved
 
 def unzip_to_images(zbytes: bytes) -> List[str]:
+    saved = []
+    with zipfile.ZipFile(io.BytesIO(zbytes)) as zf:
+        for info in zf.infolist():
+            if info.is_dir():
+                continue
+            if not is_image(info.filename):
+                continue
+            with zf.open(info) as src:
+                data = src.read()
+            fname = os.path.basename(info.filename)
+            dst = IMAGES_DIR / fname
+            with open(dst, 'wb') as out:
+                out.write(data)
+            saved.append(str(dst))
+    return saved
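Note that unzip_to_images flattens archive paths with os.path.basename, which incidentally neutralizes zip-slip entries such as ../../evil.png, but also means two archive members with the same basename overwrite each other. A sketch of a collision-avoiding destination helper (unique_dst is hypothetical, not in the app):

    def unique_dst(name: str) -> Path:
        # suffix duplicate basenames (_1, _2, ...) instead of overwriting
        stem, ext = os.path.splitext(name)
        dst, i = IMAGES_DIR / name, 1
        while dst.exists():
            dst = IMAGES_DIR / f"{stem}_{i}{ext}"
            i += 1
        return dst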
+
+# ---------------- Optional InsightFace wrapper ----------------
+_insightface_app = None
+if _HAS_INSIGHTFACE:
+    try:
+        _insightface_app = FaceAnalysis(providers=['CPUExecutionProvider'])  # or CUDA if available
+        _insightface_app.prepare(ctx_id=0 if torch.cuda.is_available() else -1, det_size=(640, 640))
+    except Exception:
+        _insightface_app = None
+        _HAS_INSIGHTFACE = False
+
+def insightface_quality_score(pil_img: Image.Image) -> Optional[float]:
+    """Return a simple face-quality score if InsightFace is available, else None.
+    We compute the average detection 'bbox score' as a proxy (if provided by the model).
+    """
+    if not _HAS_INSIGHTFACE or _insightface_app is None:
+        return None
+    try:
+        arr = np.array(pil_img.convert("RGB"))
+        res = _insightface_app.get(arr)
+        if not res:
+            return 0.0
+        # some insightface versions return dict-like objects with bbox/score
+        scores = []
+        for r in res:
+            # support different result structures
+            s = getattr(r, 'det_score', None) or getattr(r, 'score', None) or None
+            if s is not None:
+                scores.append(float(s))
+        if not scores:
+            return 0.0
+        return float(np.mean(scores))
+    except Exception:
+        return None
+
+# ---------------- Captioner ----------------
+class BlipCaptioner:
+    def __init__(self, model_name: str = DEFAULT_MODEL, device: Optional[str] = None):
+        self.model_name = model_name
+        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+        # load processor & model
+        self.processor = BlipProcessor.from_pretrained(model_name)
+        self.model = BlipForConditionalGeneration.from_pretrained(model_name)
+        if self.device == "cuda":
+            try:
+                self.model = self.model.half().to(self.device)
+            except Exception:
+                self.model = self.model.to(self.device)
+        else:
+            self.model = self.model.to(self.device)
+
+    @torch.inference_mode()
+    def caption(self, pil_img: Image.Image, max_new_tokens: int = 40) -> str:
+        inputs = self.processor(images=pil_img, return_tensors="pt").to(self.device)
+        if next(self.model.parameters()).dtype == torch.float16:
+            # the model may have been cast to fp16 above; match the input dtype
+            inputs = {k: (v.half() if torch.is_floating_point(v) else v) for k, v in inputs.items()}
+        out = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
+        text = self.processor.decode(out[0], skip_special_tokens=True)
+        return text.strip()
+
+_captioner_cache: Dict[str, BlipCaptioner] = {}
+
+def get_captioner(model_name: str) -> BlipCaptioner:
+    key = model_name
+    if key not in _captioner_cache:
+        _captioner_cache[key] = BlipCaptioner(model_name=model_name)
+    return _captioner_cache[key]
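get_captioner caches one BlipCaptioner per model name, so switching between the base and large checkpoints in the UI loads each at most once. The class also works headless; a minimal sketch, assuming the checkpoint download succeeds and the input file exists (the path is hypothetical):

    cap = get_captioner(DEFAULT_MODEL)            # first call loads the model
    img = Image.open("workspace/images/cat.jpg")  # hypothetical input
    print(cap.caption(img.convert("RGB"), max_new_tokens=40))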
+
+# ---------------- Optional Taggers ----------------
+# These are placeholders: if the real libs are installed, replace with real calls.
+_ci = None
+if _HAS_CI:
+    try:
+        ci_cfg = CIConfig()
+        _ci = ClipInterrogator(ci_cfg)
+    except Exception:
+        _ci = None
+        _HAS_CI = False
+
+def clip_interrogate_caption(pil_img: Image.Image) -> Optional[str]:
+    if not _HAS_CI or _ci is None:
+        return None
+    try:
+        return _ci.interrogate(pil_img)
+    except Exception:
+        return None
+
+def wd14_tags(pil_img: Image.Image) -> Optional[List[str]]:
+    if not _HAS_WD14:
+        return None
+    try:
+        # hypothetical API, replace if you install a real wd14 tagger
+        tags = wd14_tagger.infer_tags(pil_img)
+        return tags
+    except Exception:
+        return None
+
+# ---------------- Pipeline steps ----------------
+def step_ingest(files, zip_file):
+    """
+    Clear the workspace, then ingest uploaded files and/or a ZIP and save the incoming images.
+    Returns: gallery, table
+      gallery: list of (path, filename)
+      table: rows [name, path, status, caption, blur_var, hash]
+    """
+    clear_workspace()
+    saved = []
+    if files:
+        saved += save_uploaded_files(files)
+    if zip_file is not None:
+        try:
+            with open(zip_file.name, "rb") as f:
+                zbytes = f.read()
+            saved += unzip_to_images(zbytes)
+        except Exception:
+            # gradio may provide the zip file as bytes in memory
+            try:
+                zbytes = zip_file.read()
+                saved += unzip_to_images(zbytes)
+            except Exception:
+                pass
+
+    gallery = [(p, os.path.basename(p)) for p in saved if is_image(p)]
+    table = [[os.path.basename(p), p, "", "", 0.0, ""] for p in saved if is_image(p)]
+    return gallery, table
+
+def step_preprocess(table, rm_blurry=True, rm_dupes=True, blur_thr=BLUR_VAR_THRESHOLD, use_insightface=False, face_score_thr=0.1):
+    """
+    table: list of rows [name, path, status, caption, blur_var, hash]
+    Returns a new table with each status set to "kept" or "filtered:<reason>".
+    """
+    seen_hashes = set()
+    new_table = []
+    for row in table:
+        try:
+            name, path, status, caption, blur_var, dh = row
+        except Exception:
+            # malformed row, skip
+            continue
+        try:
+            pil = Image.open(path).convert("RGB")
+        except Exception:
+            row[2] = "read_error"
+            new_table.append(row)
+            continue
+
+        blur = laplacian_var_blur(pil)
+        ph = dhash(pil)
+        keep = True
+        reason = []
+
+        if rm_blurry and blur < blur_thr:
+            keep = False
+            reason.append(f"blur<{blur_thr:.0f}")
+
+        if rm_dupes and ph in seen_hashes:
+            keep = False
+            reason.append("duplicate")
+
+        if use_insightface and _HAS_INSIGHTFACE:
+            score = insightface_quality_score(pil)
+            if score is not None:
+                # treat a very low score as a filter reason
+                if score < face_score_thr:
+                    keep = False
+                    reason.append("low_face_score")
+
+        if keep:
+            seen_hashes.add(ph)
+            new_table.append([name, path, "kept", caption, blur, ph])
+        else:
+            new_table.append([name, path, "filtered:" + ",".join(reason), caption, blur, ph])
+
+    return new_table
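For example, with the default threshold, a row whose image scores a Laplacian variance of 42 and whose dHash was already seen earlier in the table would come back as (values hypothetical):

    ["img01.jpg", "workspace/images/img01.jpg", "filtered:blur<100,duplicate", "", 42.0, "01101..."]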
+
+def step_autocaption(table, model_choice: str, max_tokens: int, use_ci=False, use_wd14=False):
+    """
+    For each kept row, generate a caption (BLIP) and optionally append tags from other taggers.
+    """
+    cap = get_captioner(model_choice)
+    new_table = []
+    for row in table:
+        name, path, status, caption, blur_var, dh = row
+        if not os.path.exists(path):
+            row[2] = "missing"
+            new_table.append(row)
+            continue
+
+        # only process kept items (or rows with an empty status)
+        if not status.startswith("kept") and status != "":
+            new_table.append(row)
+            continue
+
+        try:
+            pil = Image.open(path).convert("RGB")
+            auto_cap = cap.caption(pil, max_new_tokens=max_tokens)
+        except Exception as e:
+            auto_cap = f"<error: {e}>"
+
+        # optional additional interrogator / tagger info
+        extras = []
+        if use_ci:
+            try:
+                ci_cap = clip_interrogate_caption(pil)
+                if ci_cap:
+                    extras.append(ci_cap)
+            except Exception:
+                pass
+        if use_wd14:
+            try:
+                tags = wd14_tags(pil)
+                if tags:
+                    extras.append(", ".join(tags))
+            except Exception:
+                pass
+
+        # a human-edited caption takes precedence over the auto draft
+        final_caption = caption if caption else auto_cap
+        if extras:
+            # keep extras brief and join them
+            final_caption = final_caption + " | " + " | ".join(extras)
+
+        new_table.append([name, path, "kept", final_caption, blur_var, dh])
+
+    return new_table
+
+def step_export(table, file_prefix: str = "dataset") -> Tuple[str, str, str]:
+    """
+    Build CSV, JSONL, and ZIP exports. Returns (csv_path, jsonl_path, zip_path).
+    """
+    rows = []
+    for name, path, status, caption, blur_var, dh in table:
+        if status.startswith("kept") and caption and len(caption.strip()) > 0:
+            rows.append({"image": path, "caption": caption})
+
+    csv_path = EXPORT_DIR / f"{file_prefix}.csv"
+    jsonl_path = EXPORT_DIR / f"{file_prefix}.jsonl"
+    EXPORT_DIR.mkdir(parents=True, exist_ok=True)
+
+    # write CSV
+    import csv
+    with open(csv_path, 'w', newline='', encoding='utf-8') as f:
+        w = csv.writer(f)
+        w.writerow(["image", "caption"])
+        for r in rows:
+            w.writerow([r["image"], r["caption"]])
+
+    # write JSONL
+    with open(jsonl_path, 'w', encoding='utf-8') as f:
+        for r in rows:
+            f.write(json.dumps(r, ensure_ascii=False) + "\n")
+
+    # ZIP package (images + csv/jsonl)
+    zip_path = EXPORT_DIR / f"{file_prefix}.zip"
+    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as z:
+        z.write(csv_path, arcname=csv_path.name)
+        z.write(jsonl_path, arcname=jsonl_path.name)
+        for r in rows:
+            src = Path(r["image"])
+            if src.exists():
+                z.write(src, arcname=f"images/{src.name}")
+
+    return str(csv_path), str(jsonl_path), str(zip_path)
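Each JSONL record pairs one kept image path with its final caption, one JSON object per line, e.g. (hypothetical row):

    {"image": "workspace/images/cat.jpg", "caption": "a cat sitting on a wooden table"}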
+
+# ---------------- Gradio UI ----------------
+title_md = """
+# Semi-automatic image captioning
+**Steps**: upload images or a ZIP → preprocess/filter → auto-draft captions → human revision (table) → export CSV/JSONL/ZIP.
+"""
+
+with gr.Blocks(title="Semi-Auto Image Captioning") as demo:
+    gr.Markdown(title_md)
+
+    with gr.Row():
+        with gr.Column():
+            files = gr.File(file_count="multiple", file_types=["image"], label="Upload images (multiple allowed)")
+            zip_up = gr.File(file_count="single", file_types=[".zip"], label="Or upload a ZIP of images")
+            btn_ingest = gr.Button("1) Ingest")
+
+        with gr.Column():
+            gallery = gr.Gallery(label="Preview", show_label=True, columns=6, height=260)
+            table = gr.Dataframe(
+                headers=["name", "path", "status", "caption", "blur_var", "hash"],
+                datatype=["str", "str", "str", "str", "number", "str"],
+                row_count=(0, "dynamic"),
+                col_count=(6, "fixed"),
+                wrap=True,
+                interactive=True,
+                label="Data table (captions can be edited in place)"
+            )
+
+    with gr.Row():
+        rm_blur = gr.Checkbox(value=True, label="Filter blurry images")
+        rm_dup = gr.Checkbox(value=True, label="De-duplicate")
+        blur_thr = gr.Slider(10, 500, value=BLUR_VAR_THRESHOLD, step=10, label="Blur threshold (Laplacian variance)")
+        use_insight = gr.Checkbox(value=False, label="Use InsightFace face-quality check (optional)")
+        face_thr = gr.Slider(0.0, 1.0, value=0.1, step=0.01, label="InsightFace face-quality threshold (higher = stricter)")
+        btn_pre = gr.Button("2) Preprocess / filter")
+
+    with gr.Row():
+        model_choice = gr.Dropdown(choices=[DEFAULT_MODEL, BIG_MODEL], value=DEFAULT_MODEL, label="BLIP model")
+        max_toks = gr.Slider(16, 80, value=40, step=4, label="Max new tokens")
+        use_ci = gr.Checkbox(value=False, label="Use CLIP Interrogator (optional)")
+        use_wd14 = gr.Checkbox(value=False, label="Use WD14 tagger (optional)")
+        btn_caption = gr.Button("3) Auto-draft captions")
+
+    with gr.Row():
+        prefix = gr.Textbox(value="dataset", label="Export file prefix")
+        btn_export = gr.Button("4) Export CSV / JSONL / ZIP")
+        csv_out = gr.File(label="CSV")
+        jsonl_out = gr.File(label="JSONL")
+        zip_out = gr.File(label="Packaged ZIP")
+
+    # wiring
+    btn_ingest.click(fn=step_ingest, inputs=[files, zip_up], outputs=[gallery, table])
+    btn_pre.click(fn=step_preprocess, inputs=[table, rm_blur, rm_dup, blur_thr, use_insight, face_thr], outputs=table)
+    btn_caption.click(fn=step_autocaption, inputs=[table, model_choice, max_toks, use_ci, use_wd14], outputs=table)
+    btn_export.click(fn=step_export, inputs=[table, prefix], outputs=[csv_out, jsonl_out, zip_out])
+
+if __name__ == "__main__":
+    demo.launch()
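For the Space to boot, the unconditional imports above imply at least the following packages; a hedged sketch of a matching requirements.txt (version pins omitted; the soft-imported extras such as insightface and clip_interrogator stay optional by construction):

    gradio
    numpy
    pillow
    opencv-python-headless
    torch
    transformers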