# File size: 2,331 Bytes
# HF_Space_hipVS/seed_data.py
# =============================
# Auto-seed from a HF Dataset so the Space launches with content indexed.
# Called on first launch if AUTO_SEED=true and the default project is empty.
import logging
from config import SEED_DATASET, SEED_SPLIT, HF_TOKEN, DEFAULT_PROJECT
# Module-level logger; run() reports dataset load failures through it.
logger = logging.getLogger(__name__)
def run(project: str = DEFAULT_PROJECT, progress_callback=None) -> tuple[int, str]:
    """Seed a project with images from a HF dataset.

    Loads ``SEED_DATASET`` / ``SEED_SPLIT``, ingests every item that has an
    ``"image"`` value into ``project``, then rebuilds the ``"image_index"``
    store once after the loop.

    Args:
        project: Name of the project to seed.
        progress_callback: Optional callable, invoked after each ingested
            item as ``progress_callback(fraction, desc=...)``. Items without
            an image are skipped and do not trigger the callback.

    Returns:
        ``(count, log_text)``: ``count`` is the number of images for which
        ingestion reported success, ``log_text`` is a newline-joined summary.
        If the dataset cannot be loaded, returns ``(0, error_message)``
        instead of raising.
    """
    from datasets import load_dataset
    from ingest import ingest_image_from_pil
    from vector_store import get_store

    log = [f"Seeding project '{project}' from {SEED_DATASET} [{SEED_SPLIT}]\n"]

    try:
        ds = load_dataset(SEED_DATASET, split=SEED_SPLIT, token=HF_TOKEN or None)
        log.append(f"Loaded {len(ds)} items")
    except Exception as e:
        # Boundary handler: report the failure to the caller as text, but keep
        # the traceback in the log for diagnosis.
        msg = f"Failed to load dataset: {e}"
        logger.exception(msg)
        return 0, msg

    count = 0
    total = len(ds)
    for i, item in enumerate(ds):
        image = item.get("image")
        if image is None:
            continue

        filename = item.get("filename", f"seed_{i:05d}.jpg")
        extra = {"source": SEED_DATASET}

        # Grab any available caption as metadata (not used for embedding).
        # The first key present wins, even if it turns out to hold no caption.
        for key in ("caption", "sentences", "text"):
            if key in item:
                val = item[key]
                if not isinstance(val, list):
                    extra["caption_hint"] = str(val)
                elif val:
                    extra["caption_hint"] = val[0]
                # An empty list carries no caption: leave the hint unset
                # rather than raising IndexError on val[0].
                break

        ok, _ = ingest_image_from_pil(image, filename, extra, project=project)
        if ok:
            count += 1
            # Keep the log short: first three successes, then every 50th.
            if count <= 3 or count % 50 == 0:
                log.append(f" [{count}/{total}] {filename}")

        # NOTE(review): assumed to run once per ingested item (success or
        # not); confirm against the original indentation.
        if progress_callback:
            progress_callback((i + 1) / total, desc=f"Seeding {i+1}/{total}...")

    # Rebuild CAGRA once after all images
    store = get_store(project, "image_index")
    if store.has_data():
        store.rebuild_gpu_index()
        # NOTE(review): persisting is assumed to belong under this guard, and
        # it relies on a private method of the store — confirm both.
        store._persist()

    log.append(f"\nSeeding complete: {count} images indexed")
    log.append(f"Store: {store}")
    return count, "\n".join(log)
def is_needed() -> bool:
    """Return True when auto-seeding is enabled and the default project is empty.

    Returns:
        ``False`` whenever ``AUTO_SEED`` is falsy; otherwise ``True`` only if
        the ``"image_index"`` store of ``DEFAULT_PROJECT`` reports no data.
    """
    from config import AUTO_SEED

    # Short-circuit first: when seeding is disabled there is no reason to open
    # the vector store, and this guarantees a real bool even if AUTO_SEED is a
    # falsy non-bool value.
    if not AUTO_SEED:
        return False

    from vector_store import get_store

    store = get_store(DEFAULT_PROJECT, "image_index")
    return not store.has_data()
|