numzoo / scripts /generate_dataset.py
goumsss's picture
Prep captions for LoRA training: trigger word + content-only
47fb1ee
Raw
History Blame Contribute Delete
15.1 kB
"""
NumZoo training dataset generator.
Generates LoRA training images matching the NumZoo aesthetic using Qwen-Image
via the HuggingFace Inference API (fal-ai provider, billed to your HF Pro credits).
(Qwen-Image beat FLUX.1-dev on multi-animal accuracy; FLUX.2-dev is edit-only.)
Alignment with the live app is guaranteed by construction:
- The "A cute {animals} {places}" prefix is built with image_generator.build_subject
(the SAME function the app uses), from the app's exact ANIMAL_MAP / PLACE_MAP.
- The NUMZOO_STYLE suffix is imported from image_generator.
- Scenes deliberately mix 1–3 animals and 1–3 places, exactly like the app does
when the player selects multiple emojis.
So every caption looks like a real app prompt, plus a rich scene detail clause.
Output: training/image_001.jpg + training/image_001.txt (caption)
Requirements:
~/miniforge3/bin/pip install huggingface_hub pillow python-dotenv
Setup (HF Pro — just your existing token):
1. Get an HF token (fine-grained, "Make calls to Inference Providers" permission)
https://huggingface.co/settings/tokens/new?ownUserPermissions=inference.serverless.write&tokenType=fineGrained
2. Add to .env: HF_TOKEN=hf_...
Usage:
~/miniforge3/bin/python3 scripts/generate_dataset.py # all scenes
~/miniforge3/bin/python3 scripts/generate_dataset.py --count 5 # first 5 (test run)
~/miniforge3/bin/python3 scripts/generate_dataset.py --start 20 # resume from #20
~/miniforge3/bin/python3 scripts/generate_dataset.py --dry-run # preview prompts
"""
import os
import sys
import argparse
import time
from pathlib import Path
# Load .env from the project root (HF_TOKEN etc.) so os.environ sees it below.
try:
    from dotenv import load_dotenv
    load_dotenv(Path(__file__).parent.parent / ".env")
except ImportError:
    pass  # dotenv is optional — HF_TOKEN can also be exported manually
# Reuse the app's exact prompt builder + vocabulary so training captions and live
# prompts share the same structure ("A cute {animals} {places}") and emoji mappings.
sys.path.insert(0, str(Path(__file__).parent.parent))
from image_generator import ( # noqa: E402
build_subject,
NUMZOO_STYLE as STYLE,
ANIMAL_MAP,
PLACE_MAP,
)
# ---------------------------------------------------------------------------
# Scenes: (animal emojis, place emojis, scene-detail clause)
# Animals/places use the app's exact emoji keys. The "A cute {animals} {places}"
# prefix is built by image_generator.build_subject; the detail adds rich props,
# activity and lighting for the cozy aesthetic. Deliberately mixes 1–3 animals
# and 1–3 places to mirror multi-emoji selections in the app.
# ---------------------------------------------------------------------------
# Each entry is (animal emoji keys, place emoji keys, scene-detail clause).
# The emoji must be the real Unicode characters: they are used as lookup keys
# into the app's ANIMAL_MAP / PLACE_MAP, so a mis-encoded key cannot match.
SCENES: list[tuple[list[str], list[str], str]] = [
    # ── Solo animal, single place — covers all 12 animals + all 10 places ──
    (["🐰"], ["🍄"], "sitting on a polka-dot toadstool, fireflies and floating spores drifting around, soft lantern glow"),
    (["🐱"], ["🌊"], "building a tiny sandcastle with a bucket and shells, gentle waves lapping, warm sunset sky"),
    (["🐶"], ["🏡"], "napping in a flower-filled wheelbarrow, watering can and butterflies nearby, golden afternoon light"),
    (["🦊"], ["⭐"], "curled up on a fluffy cloud cradling a tiny glowing star, glittering night sky"),
    (["🐼"], ["🌸"], "nibbling a dango skewer as petals fall, paper lanterns strung above, soft pink light"),
    (["🐨"], ["🌴"], "hugging a palm trunk with a coconut drink, striped hammock and a parrot, turquoise sea behind"),
    (["🦁"], ["🌈"], "wearing a tiny crown at the end of a rainbow, pastel clouds and floating sparkles"),
    (["🐯"], ["🏔️"], "bundled in a knitted scarf on a snowy peak planting a tiny flag, sparkling snow and faint aurora"),
    (["🐸"], ["🌺"], "perched on a giant hibiscus bloom, dewdrops glistening, big tropical leaves and warm bokeh"),
    (["🐧"], ["🌙"], "sitting on the curve of a glowing crescent moon in a knitted hat, scattered twinkling stars"),
    (["🦋"], ["🌸"], "fluttering through cherry blossoms trailing sparkles, pastel petals swirling in the breeze"),
    (["🦄"], ["⭐"], "galloping across a starry sky, rainbow mane glowing, a trail of sparkles behind"),
    (["🐶"], ["🍄"], "exploring beneath a giant mushroom with a tiny lantern, glowing toadstools and soft moss"),
    (["🐱"], ["🌙"], "curled asleep on a crescent moon wearing a nightcap, twinkling stars all around"),
    (["🐰"], ["🌊"], "splashing in shallow waves beside a starfish friend, beach pail and spade, pink sunset"),
    (["🐼"], ["🏡"], "tending a vegetable patch in a straw hat, bees and tall sunflowers, warm sun"),
    (["🐧"], ["🏔️"], "sliding down a snowy slope on its belly, scarf flying, sparkling powder snow"),
    (["🦊"], ["🌴"], "lounging in a hammock between two palms with sunglasses, coconuts and calm ocean"),
    (["🦁"], ["🌺"], "snoozing in a field of tropical flowers, a butterfly on its nose, dappled golden light"),
    (["🐯"], ["🌸"], "chasing falling cherry petals, paper lanterns above, soft pink and lilac tones"),
    (["🐸"], ["🍄"], "playing a tiny flute on a lily pad among glowing mushrooms, fireflies and reeds"),
    (["🦄"], ["🌈"], "standing proudly under a rainbow, flower garland around its neck, pastel clouds"),
    # ── Two animals, single place ──
    (["🐰", "🐱"], ["🍄"], "roasting marshmallows over a tiny campfire, fireflies and glowing mushrooms, cozy night"),
    (["🐶", "🦊"], ["🌊"], "building a sandcastle together with shell flags, gentle waves at golden hour"),
    (["🐼", "🐨"], ["🌸"], "sharing tea under a blooming cherry tree, paper lanterns, drifting petals"),
    (["🦁", "🐯"], ["🏡"], "tumbling over a ball of yarn in a cottage garden, picket fence and butterflies"),
    (["🐧", "🐰"], ["🏔️"], "ice skating on a frozen pond atop a snowy mountain, fairy lights, gentle snowfall"),
    (["🐸", "🦋"], ["🌺"], "resting together on lily pads among tropical flowers, dragonflies and warm bokeh"),
    (["🐱", "🐶"], ["🌙"], "stargazing from a crescent moon with a tiny brass telescope, soft constellations"),
    (["🦄", "🐰"], ["🌈"], "trotting side by side under a rainbow, flower garlands and floating sparkles"),
    (["🦊", "🐼"], ["🍄"], "reading a glowing storybook under a toadstool, a lantern and curious fireflies"),
    (["🐨", "🐧"], ["🌴"], "sipping coconut drinks on a tropical island, beach umbrella and gentle surf"),
    (["🐰", "🦄"], ["⭐"], "swinging on a swing hung from the stars, sparkles raining down, deep blue night"),
    (["🐱", "🐸"], ["🌊"], "collecting shells in tide pools at low tide, a little net and a pastel sunset"),
    # ── Three animals, single place ──
    (["🐰", "🐱", "🐶"], ["🍄"], "having a picnic on a checkered blanket among glowing mushrooms, lanterns and fireflies"),
    (["🦊", "🐼", "🐨"], ["🌸"], "a tea party under cherry blossoms with tiny cups, paper lanterns and drifting petals"),
    (["🦁", "🐯", "🐸"], ["🏡"], "playing tag through flower beds in a cottage garden, butterflies and warm sun"),
    (["🐧", "🐰", "🐱"], ["🏔️"], "building a snowman on a snowy peak in matching scarves, sparkling snow, aurora above"),
    (["🦄", "🦋", "🐰"], ["🌈"], "dancing under a rainbow amid sparkles and flower petals, pastel sky"),
    (["🐶", "🦊", "🐼"], ["🌊"], "surfing tiny waves together with a beach ball, palm trees and sunset glow"),
    (["🐱", "🐨", "🐸"], ["🌺"], "weaving flower crowns in a field of tropical flowers, butterflies and golden bokeh"),
    (["🦁", "🐯", "🐰"], ["⭐"], "huddled on a cloud counting sparkling stars under a shared blanket, soft glow"),
    # ── Two places ──
    (["🐰"], ["🍄", "🌈"], "hopping from a mushroom grove toward a rainbow, sparkles bridging the two, pastel light"),
    (["🐱", "🐶"], ["🌊", "🌴"], "a beach day between ocean waves and a tropical island, palm shade and scattered shells"),
    (["🦄"], ["⭐", "🌙"], "soaring past sparkling stars toward a crescent moon, a glowing rainbow trail"),
    (["🐼"], ["🌸", "🏡"], "wandering from cherry blossoms into a cosy cottage garden, petals and busy bees"),
    (["🐧", "🐰"], ["🏔️", "⭐"], "watching sparkling stars from a snowy mountain top, fairy lights and soft snow"),
    (["🦊"], ["🌺", "🌴"], "exploring tropical flowers along a tropical island shore, parrots and warm bokeh"),
    (["🐸", "🦋"], ["🌸", "🌺"], "drifting between cherry blossoms and tropical flowers, dewdrops and floating petals"),
    (["🦁"], ["🏡", "🌈"], "lazing in a cottage garden as a rainbow arcs overhead, butterflies and golden light"),
    # ── Three places ──
    (["🐰", "🐱"], ["🍄", "🌈", "⭐"], "a dreamy journey through a mushroom forest, under a rainbow and beneath sparkling stars, a glowing trail"),
    (["🦄"], ["🌙", "⭐", "🌈"], "flying past a crescent moon and sparkling stars toward a rainbow, sparkles everywhere"),
    (["🐶", "🦊", "🐼"], ["🌊", "🌴", "🌺"], "a tropical adventure across a sunny beach, a tropical island and fields of flowers, parrots and surf"),
    (["🐧", "🐰", "🐱"], ["🏔️", "⭐", "🌙"], "a starry night on a snowy peak under a crescent moon and sparkling stars, fairy lights and aurora"),
]
# LoRA trigger word — a nonsense token the style LoRA learns to associate with
# the whole NumZoo aesthetic. The app prepends it at inference once the LoRA is
# trained. Per BFL guidance, training captions = "TRIGGER. <content only>",
# WITHOUT spelling out the style (so the trigger alone summons the look).
TRIGGER = "NUMZOO"
def _scene_name(idx: int, animals: list[str], places: list[str]) -> str:
    """Build a compact log label for a scene, e.g. '03_panda_1a1p'.

    The label is the zero-padded scene index, the ANIMAL_MAP name of the
    first animal (with any "baby " text removed), and the animal/place counts.
    """
    lead_animal = ANIMAL_MAP[animals[0]].replace("baby ", "")
    counts = f"{len(animals)}a{len(places)}p"
    return f"{idx:02d}_{lead_animal}_{counts}"
def _gen_prompt(a: list[str], p: list[str], detail: str) -> str:
    """Compose the prompt sent to the image model: subject, detail, then style.

    Unlike the training caption, this one carries the explicit STYLE suffix
    because the base model needs the style spelled out.
    """
    subject = build_subject(a, p)
    return f"{subject}, {detail}, {STYLE}"
def _caption(a: list[str], p: list[str], detail: str) -> str:
    """Compose the training caption: trigger word, then content only.

    No style words are included (BFL style-LoRA convention), so the trigger
    token alone stands for the look.
    """
    subject = build_subject(a, p)
    return f"{TRIGGER}. {subject}, {detail}"
# One (log name, generation prompt, training caption) triple per scene,
# numbered from 1 in SCENES order.
PROMPTS: list[tuple[str, str, str]] = [
    (
        _scene_name(number, animals, places),
        _gen_prompt(animals, places, detail),
        _caption(animals, places, detail),
    )
    for number, (animals, places, detail) in enumerate(SCENES, start=1)
]
# ---------------------------------------------------------------------------
# Generator using Qwen-Image via HuggingFace Inference API (fal-ai provider)
# ---------------------------------------------------------------------------
# Qwen-Image chosen over FLUX.1-dev: far better at rendering DISTINCT animals in
# multi-animal scenes (critical — the app lets players pick up to 3), and a
# softer painterly storybook style closer to the NumZoo references.
# Note: FLUX.2-dev is edit-only (image-to-image) on every HF provider, so it
# cannot be used for text-to-image dataset generation.
MODEL = "Qwen/Qwen-Image"
PROVIDER = "fal-ai"
RETRIES = 4  # fal-ai occasionally returns transient 504s
def generate_image(prompt: str) -> "PIL.Image.Image":
    """Generate one 1024x1024 image for ``prompt`` via the HF Inference API.

    Calls ``InferenceClient.text_to_image`` with MODEL through PROVIDER,
    authenticating with the HF_TOKEN environment variable, and retries on any
    exception with a fixed 4-second pause between attempts.

    Args:
        prompt: Full styled text prompt to send to the model.

    Returns:
        The image object returned by ``text_to_image``.

    Raises:
        Exception: The error from the final attempt, once every attempt has
            failed.
    """
    from huggingface_hub import InferenceClient

    # HF Pro token (with "Make calls to Inference Providers" permission) covers
    # all providers — no separate provider key needed.
    hf_token = os.environ.get("HF_TOKEN")
    # The client holds only configuration, so build it once and reuse it
    # across retries instead of recreating it on every attempt.
    client = InferenceClient(provider=PROVIDER, api_key=hf_token)

    # Always make at least one attempt, so a RETRIES value below 1 cannot fall
    # through to raising None.
    attempts = max(1, RETRIES)
    last_err = None
    for attempt in range(1, attempts + 1):
        try:
            return client.text_to_image(prompt, model=MODEL, width=1024, height=1024)
        except Exception as e:
            last_err = e
            if attempt < attempts:
                print(f" ⚠️ attempt {attempt}/{attempts} failed ({str(e)[:60]}) — retrying")
                time.sleep(4)
    raise last_err  # exhausted retries
def main():
    """Parse CLI options and generate (or re-caption) the training dataset.

    Walks PROMPTS in order and, for each scene inside the selected range,
    writes ``image_NNN.jpg`` and ``image_NNN.txt`` into the ``training``
    directory that sits beside this script's parent directory.

    Options:
        --start N         First scene number to process (1-based).
        --count N         Size of the scene range starting at --start.
        --dry-run         Print truncated prompts/captions; no API calls.
        --captions-only   Rewrite captions for images already on disk.

    Exits with status 1 when HF_TOKEN is missing for a real generation run,
    and with an argparse usage error when --count is negative.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--start", type=int, default=1, help="Resume from image N (1-based)")
    parser.add_argument("--count", type=int, default=None, help="Generate at most N images then stop")
    parser.add_argument("--dry-run", action="store_true", help="Print prompts without generating")
    parser.add_argument("--captions-only", action="store_true", help="Rewrite .txt captions for existing images (no API calls)")
    args = parser.parse_args()

    if args.count is not None and args.count < 0:
        parser.error("--count must be >= 0")

    # A token is only needed when we will actually call the inference API.
    if not args.dry_run and not args.captions_only:
        if not os.environ.get("HF_TOKEN"):
            print("❌ HF_TOKEN not found. Add it to .env")
            print(" Get yours at https://huggingface.co/settings/tokens")
            sys.exit(1)

    out_dir = Path(__file__).parent.parent / "training"
    out_dir.mkdir(exist_ok=True)

    total = len(PROMPTS)
    # Compare against None explicitly so that "--count 0" means "nothing"
    # rather than being treated as "no limit".
    if args.count is None:
        end_at = total  # inclusive upper bound (1-based scene number)
        count_label = f"all {total} images"
    else:
        end_at = args.start - 1 + args.count
        count_label = f"{args.count} images"

    print(f"NumZoo dataset generator — {count_label} → {out_dir}")
    print(f"Range: {args.start}–{min(end_at, total)} of {total}")
    print()

    generated_this_run = 0
    failed_this_run = 0
    for n, (name, gen_prompt, caption) in enumerate(PROMPTS, start=1):
        if n < args.start:
            continue
        if n > end_at:
            break

        img_path = out_dir / f"image_{n:03d}.jpg"
        txt_path = out_dir / f"image_{n:03d}.txt"

        # Rewrite captions for existing images, no image generation
        if args.captions_only:
            if not img_path.exists():
                # Avoid leaving a caption file with no matching image.
                print(f"[{n:02d}/{total}] ⏭ {name} — no image on disk, caption skipped")
                continue
            txt_path.write_text(caption, encoding="utf-8")
            print(f"[{n:02d}/{total}] 📝 {name} — caption updated")
            continue

        if img_path.exists():
            print(f"[{n:02d}/{total}] ⏭ {name} — already exists, skipping")
            continue

        print(f"[{n:02d}/{total}] 🎨 {name}")
        if args.dry_run:
            print(f" gen: {gen_prompt[:90]}…")
            print(f" caption: {caption[:90]}…")
            continue

        try:
            image = generate_image(gen_prompt)
            # JPEG cannot store alpha/palette modes; normalise before saving.
            if image.mode != "RGB":
                image = image.convert("RGB")
            image.save(img_path, "JPEG", quality=95)
            txt_path.write_text(caption, encoding="utf-8")
            generated_this_run += 1
            print(f" ✅ saved {img_path.name}")
        except Exception as e:
            failed_this_run += 1
            # The image did not exist before this attempt, so anything at
            # img_path now is a partial or caption-less leftover. Remove it so
            # a later run retries instead of skipping it as "already exists".
            img_path.unlink(missing_ok=True)
            print(f" ❌ failed: {e}")
            time.sleep(5)  # brief pause on error before continuing

    total_on_disk = len(list(out_dir.glob("*.jpg")))
    print(f"\nDone. {generated_this_run} generated this run · {total_on_disk}/{total} total in {out_dir}")
    if failed_this_run:
        print(f"{failed_this_run} failed — re-run to retry the missing images")
# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()