Upload 2 files
Browse files- llm_captions.py +85 -0
- maskgen.py +159 -0
llm_captions.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from PIL import Image
|
| 4 |
+
from io import BytesIO
|
| 5 |
+
import base64
|
| 6 |
+
from tqdm import tqdm
|
| 7 |
+
import torch
|
| 8 |
+
|
| 9 |
+
OLLAMA_URL = "http://127.0.0.1:11434/api/generate"
|
| 10 |
+
OLLAMA_MODEL = "gemma3"
|
| 11 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 12 |
+
|
| 13 |
+
def image_to_base64(image_path):
    """Encode the image at *image_path* as a base64 JPEG string.

    The image is converted to RGB first so that palette or alpha images
    can still be saved as JPEG.
    """
    # Close the underlying file handle promptly: ``convert`` returns a
    # detached copy, so only managing the converted image (as the
    # original code did) leaked the opened file until GC.
    with Image.open(image_path) as src:
        img = src.convert("RGB")
    buffered = BytesIO()
    img.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")
|
| 18 |
+
|
| 19 |
+
def generate_caption_vlm(image_path, base_tags=None):
    """Request a short, factual training caption for one image from the
    local Ollama vision-language model.

    Parameters:
        image_path: path to the image file to caption.
        base_tags: optional list of tag strings the model is asked to
            append to the caption.

    Returns:
        The caption text, or the fallback string "Auto-tagged" when the
        request fails.
    """
    prompt = (
        "Write one concise and factual caption for this image for machine learning training; write only the caption without extra text. "
        " Describe ONLY the image contents. Describe the characters, background, and color, style, coverage, and relevant characteristics of the clothes. "
        " NO style words or lighting descriptions. NO mention of camera, lens, quality or mood. One simple descriptive paragraph."
        "Avoid creative or poetic language. Avoid context setting language. Use 50 words or less. "
        f"Include these tags at the end: {', '.join(base_tags) if base_tags else ''}."
    )
    image_base64 = image_to_base64(image_path)
    payload = {
        "model": OLLAMA_MODEL,
        "prompt": prompt,
        "images": [image_base64],
        "options": {"temperature": 0.2},  # low temperature for factual, stable captions
        "stream": False
    }
    try:
        # A timeout prevents one stuck request from hanging the whole
        # tagging batch forever (the original call had none).
        response = requests.post(OLLAMA_URL, json=payload, timeout=300)
        response.raise_for_status()
        result = response.json()
        return result.get("response", "").strip()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection/timeout/HTTP errors;
        # ValueError covers a non-JSON response body.
        print(f"⚠️ Ollama VLM request failed: {e}")
        return "Auto-tagged"
|
| 43 |
+
|
| 44 |
+
def create_tags_for_images(image_dir, base_tags):
    """Write a sidecar ``.txt`` caption file next to every image in *image_dir*.

    Parameters:
        image_dir: directory scanned (non-recursively) for images.
        base_tags: tag list forwarded to :func:`generate_caption_vlm`.
    """
    image_exts = {".jpg", ".jpeg", ".png", ".webp"}
    # Match extensions case-insensitively (Path.glob("*.jpg") would miss
    # ".JPG") and sort for a deterministic processing order.
    image_files = sorted(
        p for p in Path(image_dir).iterdir()
        if p.is_file() and p.suffix.lower() in image_exts
    )

    for image_path in tqdm(image_files, desc=f"Tagging images in {image_dir}"):
        caption = generate_caption_vlm(image_path, base_tags)
        tag_file = image_path.with_suffix(".txt")
        tag_file.write_text(caption, encoding="utf-8")
        print(f"[✓] Tagged {image_path.name}")
|
| 56 |
+
|
| 57 |
+
if __name__ == "__main__":
    # === CONFIGURATION OPTIONS ===
    # Option 1: Single folder (set to None to skip)
    #single_folder = None
    #OR
    single_folder = r"G:\My Drive\AI\training_data\kawaii_goth"

    # Option 2: Process all subfolders in parent_dir except excluded ones
    parent_dir = r"G:\My Drive\AI\images\tbd\N1na"
    exclude_folders = {"video"}

    if single_folder is not None:
        # NOTE(review): folder_tag is computed here but never used in this
        # branch — base_tags below is hard-coded. Presumably it was meant
        # to feed base_tags like the multi-folder branch does; confirm.
        folder_tag = Path(single_folder).name.replace("_", " ").lower()
        base_tags = ["{kawaii goth}"]
        print(f"\n🏷️ Processing single folder: {single_folder} | Base tags: {base_tags}")
        create_tags_for_images(single_folder, base_tags)
    else:
        # Collect every immediate subfolder of parent_dir, skipping the
        # names listed in exclude_folders.
        folders_to_process = [
            str(folder)
            for folder in Path(parent_dir).iterdir()
            if folder.is_dir() and folder.name not in exclude_folders
        ]
        for folder in folders_to_process:
            # The folder name (underscores -> spaces, lowercased) becomes
            # the leading tag for every caption in that folder.
            folder_tag = Path(folder).name.replace("_", " ").lower()
            base_tags = [folder_tag, "Mature woman, realistic, detailed"]
            print(f"\n🏷️ Processing folder: {folder} | Base tags: {base_tags}")
            create_tags_for_images(folder, base_tags)

    print("✅ All images tagged via VLM.")
|
maskgen.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import cv2
|
| 2 |
+
import numpy as np
|
| 3 |
+
import os
|
| 4 |
+
import argparse
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
import urllib.request
|
| 7 |
+
|
| 8 |
+
import cv2
|
| 9 |
+
import numpy as np
|
| 10 |
+
import os
|
| 11 |
+
import argparse
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
import urllib.request
|
| 14 |
+
import mediapipe
|
| 15 |
+
import cv2
|
| 16 |
+
import mediapipe as mp
|
| 17 |
+
|
| 18 |
+
def download_face_model():
    """Ensure the Caffe SSD face-detector files exist locally.

    Downloads each missing file from Hugging Face on first use and
    returns ``(prototxt_path, caffemodel_path)``.
    """
    prototxt = "deploy.prototxt"
    caffemodel = "res10_300x300_ssd_iter_140000_fp16.caffemodel"
    # (local filename, source URL, progress message) — fetched in order.
    downloads = (
        (prototxt,
         "https://huggingface.co/Durraiya/deploy.prototxt/resolve/main/deploy.prototxt",
         "Downloading prototxt..."),
        (caffemodel,
         "https://huggingface.co/Durraiya/res10_300x300_ssd_iter_140000_fp16.caffemodel/resolve/main/res10_300x300_ssd_iter_140000_fp16.caffemodel",
         "Downloading caffemodel..."),
    )
    for filename, url, message in downloads:
        if not os.path.exists(filename):
            print(message)
            urllib.request.urlretrieve(url, filename)
    return prototxt, caffemodel
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def download_body_model():
    """Ensure the YOLOv8 person-segmentation weights exist locally.

    Returns the local filename, downloading it from Hugging Face first
    if it is not already present.
    """
    yolov8_model = "person_yolov8s-seg.pt"
    # Early return when the weights are already on disk.
    if os.path.exists(yolov8_model):
        return yolov8_model
    print("Downloading YOLOv8 segmentation model...")
    urllib.request.urlretrieve(
        "https://huggingface.co/Bingsu/adetailer/resolve/main/person_yolov8s-seg.pt",
        yolov8_model,
    )
    return yolov8_model
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
from ultralytics import YOLO
|
| 45 |
+
|
| 46 |
+
def segment_person_mask_yolov8(image, action="ignore"):
    """Build a uint8 (0/255) person mask for *image* via YOLOv8 segmentation.

    Parameters:
        image: BGR numpy image (H, W, 3).
        action: "ignore" ORs in the *complement* of each person mask
            (white = non-person); any other value ORs in the person
            masks directly (white = person).

    Returns:
        A (H, W) uint8 mask.
    """
    # Load the model once and reuse it across calls — the original code
    # re-read the weights from disk for every image.
    model = getattr(segment_person_mask_yolov8, "_model", None)
    if model is None:
        model = YOLO("person_yolov8s-seg.pt")
        segment_person_mask_yolov8._model = model
    results = model(image)
    mask = np.zeros(image.shape[:2], dtype=np.uint8)
    for r in results:
        # r.masks is None when no person was detected; the original code
        # raised AttributeError here on person-free images.
        if r.masks is None:
            continue
        for m in r.masks.data:
            m = m.cpu().numpy().astype(np.uint8) * 255
            # YOLO masks come back at model resolution; resize to image size.
            m = cv2.resize(m, (image.shape[1], image.shape[0]), interpolation=cv2.INTER_NEAREST)
            if action == "ignore":
                mask = cv2.bitwise_or(mask, 255 - m)
            else:
                mask = cv2.bitwise_or(mask, m)
    return mask
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def detect_face_bbox(image, net):
    """Detect faces with an OpenCV DNN SSD network.

    Parameters:
        image: BGR numpy image, or None (returns [] with a warning).
        net: a loaded ``cv2.dnn`` face-detection network.

    Returns:
        A list of ``(x1, y1, x2, y2)`` integer boxes, clamped to the
        image bounds so callers can slice arrays with them safely.
    """
    if image is None:
        print("Warning: Image is None in detect_face_bbox")
        return []
    h, w = image.shape[:2]
    # SSD expects a 300x300 input with the standard BGR mean subtraction.
    blob = cv2.dnn.blobFromImage(cv2.resize(image, (300, 300)), 1.0,
                                 (300, 300), (104.0, 177.0, 123.0))
    net.setInput(blob)
    detections = net.forward()
    bboxes = []
    for i in range(detections.shape[2]):
        confidence = detections[0, 0, i, 2]
        if confidence > 0.6:
            box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
            (startX, startY, endX, endY) = box.astype("int")
            # The SSD can emit coordinates slightly outside the frame;
            # clamp them so negative values never wrap around when used
            # as numpy slice indices downstream.
            startX = max(0, min(int(startX), w))
            startY = max(0, min(int(startY), h))
            endX = max(0, min(int(endX), w))
            endY = max(0, min(int(endY), h))
            bboxes.append((startX, startY, endX, endY))
    return bboxes
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def create_face_bbox_mask(image, bboxes, action="ignore"):
    """Rasterize face bounding boxes into a uint8 (0/255) mask.

    Parameters:
        image: numpy image whose first two dims give the mask shape.
        bboxes: iterable of ``(x1, y1, x2, y2)`` boxes.
        action: "ignore" starts all-white and zeroes face regions;
            any other value ("keep") starts all-black and whitens them.

    Returns:
        A (H, W) uint8 mask.
    """
    h, w = image.shape[:2]
    if action == "ignore":
        mask = np.full((h, w), 255, dtype=np.uint8)
        fill = 0
    else:  # "keep"
        mask = np.zeros((h, w), dtype=np.uint8)
        fill = 255
    for (x1, y1, x2, y2) in bboxes:
        # Clamp so boxes spilling past the top/left edge still paint the
        # visible part; the original slice with negative coordinates
        # wrapped around and silently painted nothing.
        x1c, y1c = max(0, x1), max(0, y1)
        x2c, y2c = min(w, x2), min(h, y2)
        if x1c < x2c and y1c < y2c:
            mask[y1c:y2c, x1c:x2c] = fill
    return mask
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
# Python
|
| 92 |
+
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Generate face/body region masks using detectors/segmentation.")
    parser.add_argument("input_dir", type=str, help="Path to the input image directory")
    parser.add_argument("--face_action", choices=["ignore", "keep"], default="ignore",
                        help="Masking action for face: 'ignore' to mask face region, 'keep' to keep face region")
    parser.add_argument("--body_action", choices=["ignore", "keep", "none"], default="none",
                        help="Masking action for body: 'ignore' to mask body region, 'keep' to keep body region, 'none' to skip body masking")
    parser.add_argument("--include_face_in_body_keep", action="store_true",
                        help="If set and body_action=keep, include face region in the mask")
    args = parser.parse_args()

    # Masks are written to a sibling directory named "mask_<input_dir>".
    INPUT_DIR = Path(args.input_dir)
    MASK_DIR = INPUT_DIR.parent / f"mask_{INPUT_DIR.name}"
    MASK_DIR.mkdir(parents=True, exist_ok=True)

    # Face model
    face_prototxt, face_caffemodel = download_face_model()
    face_net = cv2.dnn.readNetFromCaffe(face_prototxt, face_caffemodel)

    # Body segmentation model (YOLOv8)
    if args.body_action != "none":
        yolov8_model = download_body_model()
    else:
        yolov8_model = None

    for image_file in INPUT_DIR.glob("*.*"):
        img = cv2.imread(str(image_file), cv2.IMREAD_UNCHANGED)
        if img is None:
            print(f"Warning: Failed to read image {image_file}, skipping.")
            continue
        # Normalize to 3-channel BGR: grayscale gets expanded, BGRA
        # drops its alpha channel.
        if len(img.shape) == 2:
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
        elif img.shape[-1] == 4:
            img = cv2.cvtColor(img, cv2.COLOR_BGRA2BGR)

        # Only body mask if face+body, no background
        if yolov8_model is not None and args.body_action == "keep" and args.include_face_in_body_keep:
            mask = segment_person_mask_yolov8(img, action="keep")
        else:
            # Face mask
            face_bboxes = detect_face_bbox(img, face_net)
            mask = create_face_bbox_mask(img, face_bboxes, action=args.face_action)
            # Body mask (combine with face mask)
            if yolov8_model is not None:
                body_mask = segment_person_mask_yolov8(img, action=args.body_action)
                if args.body_action == "keep" and args.include_face_in_body_keep:
                    # Union: keep pixels selected by either mask.
                    mask = cv2.bitwise_or(mask, body_mask)
                else:
                    # Intersection: keep pixels selected by both masks.
                    mask = cv2.bitwise_and(mask, body_mask)

        # Make mask same number of channels as input image
        if img.ndim == 3:
            # NOTE(review): the 4-channel branch looks unreachable — img
            # was already converted from BGRA to BGR above; confirm.
            if img.shape[2] == 4:
                mask_out = cv2.merge([mask, mask, mask, mask])
            elif img.shape[2] == 3:
                mask_out = cv2.merge([mask, mask, mask])
            else:
                mask_out = mask
        else:
            mask_out = mask
        if mask_out.shape[:2] != img.shape[:2]:
            print(f"Error: Mask shape {mask_out.shape[:2]} does not match image shape {img.shape[:2]} for {image_file}")
            continue
        # Masks are always saved as PNG regardless of the source format.
        out_path = MASK_DIR / (image_file.stem + ".png")
        cv2.imwrite(str(out_path), mask_out)

    print(
        f"Masks generated in {MASK_DIR.resolve()} using face/body detection with actions face='{args.face_action}', body='{args.body_action}'")
|