Upload 2 files
Browse files- llm_captions.py +85 -0
- maskgen.py +159 -0
llm_captions.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from PIL import Image
|
| 4 |
+
from io import BytesIO
|
| 5 |
+
import base64
|
| 6 |
+
from tqdm import tqdm
|
| 7 |
+
import torch
|
| 8 |
+
|
| 9 |
+
OLLAMA_URL = "http://127.0.0.1:11434/api/generate"
|
| 10 |
+
OLLAMA_MODEL = "gemma3"
|
| 11 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 12 |
+
|
| 13 |
+
def image_to_base64(image_path):
    """Encode the image at *image_path* as a base64 JPEG string.

    The image is converted to RGB first so that palette or alpha images
    can still be saved as JPEG.
    """
    # Close the underlying file handle promptly: ``convert`` returns a
    # detached copy, so only managing the converted image (as the
    # original code did) leaked the opened file until GC.
    with Image.open(image_path) as src:
        img = src.convert("RGB")
    buffered = BytesIO()
    img.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")
|
| 18 |
+
|
| 19 |
+
def generate_caption_vlm(image_path, base_tags=None):
    """Request a short, factual training caption for one image from the
    local Ollama vision-language model.

    Parameters:
        image_path: path to the image file to caption.
        base_tags: optional list of tag strings the model is asked to
            append to the caption.

    Returns:
        The caption text, or the fallback string "Auto-tagged" when the
        request fails.
    """
    prompt = (
        "Write one concise and factual caption for this image for machine learning training; write only the caption without extra text. "
        " Describe ONLY the image contents. Describe the characters, background, and color, style, coverage, and relevant characteristics of the clothes. "
        " NO style words or lighting descriptions. NO mention of camera, lens, quality or mood. One simple descriptive paragraph."
        "Avoid creative or poetic language. Avoid context setting language. Use 50 words or less. "
        f"Include these tags at the end: {', '.join(base_tags) if base_tags else ''}."
    )
    image_base64 = image_to_base64(image_path)
    payload = {
        "model": OLLAMA_MODEL,
        "prompt": prompt,
        "images": [image_base64],
        "options": {"temperature": 0.2},  # low temperature for factual, stable captions
        "stream": False
    }
    try:
        # A timeout prevents one stuck request from hanging the whole
        # tagging batch forever (the original call had none).
        response = requests.post(OLLAMA_URL, json=payload, timeout=300)
        response.raise_for_status()
        result = response.json()
        return result.get("response", "").strip()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection/timeout/HTTP errors;
        # ValueError covers a non-JSON response body.
        print(f"⚠️ Ollama VLM request failed: {e}")
        return "Auto-tagged"
|
| 43 |
+
|
| 44 |
+
def create_tags_for_images(image_dir, base_tags):
    """Write a sidecar ``.txt`` caption file next to every image in *image_dir*.

    Parameters:
        image_dir: directory scanned (non-recursively) for images.
        base_tags: tag list forwarded to :func:`generate_caption_vlm`.
    """
    image_exts = {".jpg", ".jpeg", ".png", ".webp"}
    # Match extensions case-insensitively (Path.glob("*.jpg") would miss
    # ".JPG") and sort for a deterministic processing order.
    image_files = sorted(
        p for p in Path(image_dir).iterdir()
        if p.is_file() and p.suffix.lower() in image_exts
    )

    for image_path in tqdm(image_files, desc=f"Tagging images in {image_dir}"):
        caption = generate_caption_vlm(image_path, base_tags)
        tag_file = image_path.with_suffix(".txt")
        tag_file.write_text(caption, encoding="utf-8")
        print(f"[✓] Tagged {image_path.name}")
|
| 56 |
+
|
| 57 |
+
if __name__ == "__main__":
    # === CONFIGURATION OPTIONS ===
    # Option 1: Single folder (set to None to skip)
    #single_folder = None
    #OR
    single_folder = r"G:\My Drive\AI\training_data\kawaii_goth"

    # Option 2: Process all subfolders in parent_dir except excluded ones
    parent_dir = r"G:\My Drive\AI\images\tbd\N1na"
    exclude_folders = {"video"}

    if single_folder is not None:
        # NOTE(review): folder_tag is computed here but never used in this
        # branch — base_tags below is hard-coded. Presumably it was meant
        # to feed base_tags like the multi-folder branch does; confirm.
        folder_tag = Path(single_folder).name.replace("_", " ").lower()
        base_tags = ["{kawaii goth}"]
        print(f"\n🏷️ Processing single folder: {single_folder} | Base tags: {base_tags}")
        create_tags_for_images(single_folder, base_tags)
    else:
        # Collect every immediate subfolder of parent_dir, skipping the
        # names listed in exclude_folders.
        folders_to_process = [
            str(folder)
            for folder in Path(parent_dir).iterdir()
            if folder.is_dir() and folder.name not in exclude_folders
        ]
        for folder in folders_to_process:
            # The folder name (underscores -> spaces, lowercased) becomes
            # the leading tag for every caption in that folder.
            folder_tag = Path(folder).name.replace("_", " ").lower()
            base_tags = [folder_tag, "Mature woman, realistic, detailed"]
            print(f"\n🏷️ Processing folder: {folder} | Base tags: {base_tags}")
            create_tags_for_images(folder, base_tags)

    print("✅ All images tagged via VLM.")
|
maskgen.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import cv2
|
| 2 |
+
import numpy as np
|
| 3 |
+
import os
|
| 4 |
+
import argparse
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
import urllib.request
|
| 7 |
+
|
| 8 |
+
import cv2
|
| 9 |
+
import numpy as np
|
| 10 |
+
import os
|
| 11 |
+
import argparse
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
import urllib.request
|
| 14 |
+
import mediapipe
|
| 15 |
+
import cv2
|
| 16 |
+
import mediapipe as mp
|
| 17 |
+
|
| 18 |
+
def download_face_model():
    """Ensure the Caffe SSD face-detector files exist locally.

    Downloads each missing file from Hugging Face on first use and
    returns ``(prototxt_path, caffemodel_path)``.
    """
    prototxt = "deploy.prototxt"
    caffemodel = "res10_300x300_ssd_iter_140000_fp16.caffemodel"
    # (local filename, source URL, progress message) — fetched in order.
    downloads = (
        (prototxt,
         "https://huggingface.co/Durraiya/deploy.prototxt/resolve/main/deploy.prototxt",
         "Downloading prototxt..."),
        (caffemodel,
         "https://huggingface.co/Durraiya/res10_300x300_ssd_iter_140000_fp16.caffemodel/resolve/main/res10_300x300_ssd_iter_140000_fp16.caffemodel",
         "Downloading caffemodel..."),
    )
    for filename, url, message in downloads:
        if not os.path.exists(filename):
            print(message)
            urllib.request.urlretrieve(url, filename)
    return prototxt, caffemodel
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def download_body_model():
    """Ensure the YOLOv8 person-segmentation weights exist locally.

    Returns the local filename, downloading it from Hugging Face first
    if it is not already present.
    """
    yolov8_model = "person_yolov8s-seg.pt"
    # Early return when the weights are already on disk.
    if os.path.exists(yolov8_model):
        return yolov8_model
    print("Downloading YOLOv8 segmentation model...")
    urllib.request.urlretrieve(
        "https://huggingface.co/Bingsu/adetailer/resolve/main/person_yolov8s-seg.pt",
        yolov8_model,
    )
    return yolov8_model
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
from ultralytics import YOLO
|
| 45 |
+
|
| 46 |
+
def segment_person_mask_yolov8(image, action="ignore"):
    """Build a uint8 (0/255) person mask for *image* via YOLOv8 segmentation.

    Parameters:
        image: BGR numpy image (H, W, 3).
        action: "ignore" ORs in the *complement* of each person mask
            (white = non-person); any other value ORs in the person
            masks directly (white = person).

    Returns:
        A (H, W) uint8 mask.
    """
    # Load the model once and reuse it across calls — the original code
    # re-read the weights from disk for every image.
    model = getattr(segment_person_mask_yolov8, "_model", None)
    if model is None:
        model = YOLO("person_yolov8s-seg.pt")
        segment_person_mask_yolov8._model = model
    results = model(image)
    mask = np.zeros(image.shape[:2], dtype=np.uint8)
    for r in results:
        # r.masks is None when no person was detected; the original code
        # raised AttributeError here on person-free images.
        if r.masks is None:
            continue
        for m in r.masks.data:
            m = m.cpu().numpy().astype(np.uint8) * 255
            # YOLO masks come back at model resolution; resize to image size.
            m = cv2.resize(m, (image.shape[1], image.shape[0]), interpolation=cv2.INTER_NEAREST)
            if action == "ignore":
                mask = cv2.bitwise_or(mask, 255 - m)
            else:
                mask = cv2.bitwise_or(mask, m)
    return mask
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def detect_face_bbox(image, net):
    """Detect faces with an OpenCV DNN SSD network.

    Parameters:
        image: BGR numpy image, or None (returns [] with a warning).
        net: a loaded ``cv2.dnn`` face-detection network.

    Returns:
        A list of ``(x1, y1, x2, y2)`` integer boxes, clamped to the
        image bounds so callers can slice arrays with them safely.
    """
    if image is None:
        print("Warning: Image is None in detect_face_bbox")
        return []
    h, w = image.shape[:2]
    # SSD expects a 300x300 input with the standard BGR mean subtraction.
    blob = cv2.dnn.blobFromImage(cv2.resize(image, (300, 300)), 1.0,
                                 (300, 300), (104.0, 177.0, 123.0))
    net.setInput(blob)
    detections = net.forward()
    bboxes = []
    for i in range(detections.shape[2]):
        confidence = detections[0, 0, i, 2]
        if confidence > 0.6:
            box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
            (startX, startY, endX, endY) = box.astype("int")
            # The SSD can emit coordinates slightly outside the frame;
            # clamp them so negative values never wrap around when used
            # as numpy slice indices downstream.
            startX = max(0, min(int(startX), w))
            startY = max(0, min(int(startY), h))
            endX = max(0, min(int(endX), w))
            endY = max(0, min(int(endY), h))
            bboxes.append((startX, startY, endX, endY))
    return bboxes
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def create_face_bbox_mask(image, bboxes, action="ignore"):
    """Rasterize face bounding boxes into a uint8 (0/255) mask.

    Parameters:
        image: numpy image whose first two dims give the mask shape.
        bboxes: iterable of ``(x1, y1, x2, y2)`` boxes.
        action: "ignore" starts all-white and zeroes face regions;
            any other value ("keep") starts all-black and whitens them.

    Returns:
        A (H, W) uint8 mask.
    """
    h, w = image.shape[:2]
    if action == "ignore":
        mask = np.full((h, w), 255, dtype=np.uint8)
        fill = 0
    else:  # "keep"
        mask = np.zeros((h, w), dtype=np.uint8)
        fill = 255
    for (x1, y1, x2, y2) in bboxes:
        # Clamp so boxes spilling past the top/left edge still paint the
        # visible part; the original slice with negative coordinates
        # wrapped around and silently painted nothing.
        x1c, y1c = max(0, x1), max(0, y1)
        x2c, y2c = min(w, x2), min(h, y2)
        if x1c < x2c and y1c < y2c:
            mask[y1c:y2c, x1c:x2c] = fill
    return mask
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
# Python
|
| 92 |
+
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Generate face/body region masks using detectors/segmentation.")
    parser.add_argument("input_dir", type=str, help="Path to the input image directory")
    parser.add_argument("--face_action", choices=["ignore", "keep"], default="ignore",
                        help="Masking action for face: 'ignore' to mask face region, 'keep' to keep face region")
    parser.add_argument("--body_action", choices=["ignore", "keep", "none"], default="none",
                        help="Masking action for body: 'ignore' to mask body region, 'keep' to keep body region, 'none' to skip body masking")
    parser.add_argument("--include_face_in_body_keep", action="store_true",
                        help="If set and body_action=keep, include face region in the mask")
    args = parser.parse_args()

    # Masks are written to a sibling directory named "mask_<input_dir>".
    INPUT_DIR = Path(args.input_dir)
    MASK_DIR = INPUT_DIR.parent / f"mask_{INPUT_DIR.name}"
    MASK_DIR.mkdir(parents=True, exist_ok=True)

    # Face model
    face_prototxt, face_caffemodel = download_face_model()
    face_net = cv2.dnn.readNetFromCaffe(face_prototxt, face_caffemodel)

    # Body segmentation model (YOLOv8)
    if args.body_action != "none":
        yolov8_model = download_body_model()
    else:
        yolov8_model = None

    for image_file in INPUT_DIR.glob("*.*"):
        img = cv2.imread(str(image_file), cv2.IMREAD_UNCHANGED)
        if img is None:
            print(f"Warning: Failed to read image {image_file}, skipping.")
            continue
        # Normalize to 3-channel BGR: grayscale gets expanded, BGRA
        # drops its alpha channel.
        if len(img.shape) == 2:
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
        elif img.shape[-1] == 4:
            img = cv2.cvtColor(img, cv2.COLOR_BGRA2BGR)

        # Only body mask if face+body, no background
        if yolov8_model is not None and args.body_action == "keep" and args.include_face_in_body_keep:
            mask = segment_person_mask_yolov8(img, action="keep")
        else:
            # Face mask
            face_bboxes = detect_face_bbox(img, face_net)
            mask = create_face_bbox_mask(img, face_bboxes, action=args.face_action)
            # Body mask (combine with face mask)
            if yolov8_model is not None:
                body_mask = segment_person_mask_yolov8(img, action=args.body_action)
                if args.body_action == "keep" and args.include_face_in_body_keep:
                    # Union: keep pixels selected by either mask.
                    mask = cv2.bitwise_or(mask, body_mask)
                else:
                    # Intersection: keep pixels selected by both masks.
                    mask = cv2.bitwise_and(mask, body_mask)

        # Make mask same number of channels as input image
        if img.ndim == 3:
            # NOTE(review): the 4-channel branch looks unreachable — img
            # was already converted from BGRA to BGR above; confirm.
            if img.shape[2] == 4:
                mask_out = cv2.merge([mask, mask, mask, mask])
            elif img.shape[2] == 3:
                mask_out = cv2.merge([mask, mask, mask])
            else:
                mask_out = mask
        else:
            mask_out = mask
        if mask_out.shape[:2] != img.shape[:2]:
            print(f"Error: Mask shape {mask_out.shape[:2]} does not match image shape {img.shape[:2]} for {image_file}")
            continue
        # Masks are always saved as PNG regardless of the source format.
        out_path = MASK_DIR / (image_file.stem + ".png")
        cv2.imwrite(str(out_path), mask_out)

    print(
        f"Masks generated in {MASK_DIR.resolve()} using face/body detection with actions face='{args.face_action}', body='{args.body_action}'")
|