| |
| |
| |
| |
|
|
| import logging |
| from config import SEED_DATASET, SEED_SPLIT, HF_TOKEN, DEFAULT_PROJECT |
|
|
| logger = logging.getLogger(__name__) |
|
|
|
|
def _caption_hint(item):
    """Return the first usable caption-like value found in *item*, or None.

    The keys "caption", "sentences" and "text" are tried in that order.
    A list value contributes its first element as-is; any other value is
    converted with ``str``. Keys whose value is ``None`` or an empty list
    are passed over so that a later key can still supply the hint.
    """
    for key in ("caption", "sentences", "text"):
        if key not in item:
            continue
        val = item[key]
        if isinstance(val, list):
            if val:
                return val[0]
            # Empty list: nothing to take, try the next candidate key.
            continue
        if val is not None:
            return str(val)
    return None


def run(project: str = DEFAULT_PROJECT, progress_callback=None) -> tuple[int, str]:
    """Seed a project with images from a HF dataset.

    Loads ``SEED_DATASET`` / ``SEED_SPLIT``, passes every item that has an
    "image" value to ``ingest_image_from_pil``, then rebuilds and persists
    the project's "image_index" store if it reports data.

    Args:
        project: Name of the project to ingest into.
        progress_callback: Optional callable invoked once per dataset item
            as ``progress_callback(fraction, desc=...)``, where ``fraction``
            is ``(index + 1) / total``.

    Returns:
        ``(count, log_text)`` where ``count`` is the number of items for
        which ingestion reported success. If the dataset cannot be loaded,
        ``(0, error_message)`` is returned instead.
    """
    # Imported lazily, as in the original, so importing this module does not
    # pull in these dependencies.
    from datasets import load_dataset
    from ingest import ingest_image_from_pil
    from vector_store import get_store

    log = [f"Seeding project '{project}' from {SEED_DATASET} [{SEED_SPLIT}]\n"]

    try:
        ds = load_dataset(SEED_DATASET, split=SEED_SPLIT, token=HF_TOKEN or None)
        log.append(f"Loaded {len(ds)} items")
    except Exception as e:
        msg = f"Failed to load dataset: {e}"
        # logger.exception keeps the traceback alongside the message.
        logger.exception(msg)
        return 0, msg

    count = 0
    total = len(ds)

    for i, item in enumerate(ds):
        image = item.get("image")
        if image is not None:
            # Fall back to a generated name when the key is missing or
            # its value is None/empty.
            filename = item.get("filename") or f"seed_{i:05d}.jpg"
            extra = {"source": SEED_DATASET}

            hint = _caption_hint(item)
            if hint is not None:
                extra["caption_hint"] = hint

            ok, _ = ingest_image_from_pil(image, filename, extra, project=project)
            if ok:
                count += 1
                # Keep the log short: first three successes, then every 50th.
                if count <= 3 or count % 50 == 0:
                    log.append(f" [{count}/{total}] {filename}")

        # Report progress for skipped items too, so the fraction reaches 1.0
        # even when trailing items have no image.
        if progress_callback:
            progress_callback((i + 1) / total, desc=f"Seeding {i+1}/{total}...")

    store = get_store(project, "image_index")
    if store.has_data():
        store.rebuild_gpu_index()
        # NOTE(review): calls a private method of the store; confirm whether
        # vector_store exposes a public persist API.
        store._persist()

    log.append(f"\nSeeding complete: {count} images indexed")
    log.append(f"Store: {store}")
    return count, "\n".join(log)
|
|
|
|
def is_needed() -> bool:
    """Tell whether the default project should be auto-seeded.

    The result is truthy only when ``AUTO_SEED`` is truthy and the default
    project's "image_index" store reports no data via ``has_data()``.
    """
    from config import AUTO_SEED
    from vector_store import get_store

    image_store = get_store(DEFAULT_PROJECT, "image_index")
    if not AUTO_SEED:
        # Hand back the flag itself, exactly as the `and` expression would.
        return AUTO_SEED
    return not image_store.has_data()
|
|