Spaces:
Running
Running
| #!/usr/bin/env python | |
| """ | |
| reassemble_bbox_dataset_resume.py | |
| --------------------------------- | |
| Incrementally rebuilds `bbox_filled / annotated / bbox_json` columns from | |
| QA artefacts and pushes the final dataset **privately** to HF Hub. | |
| β’ Safe to ^C / rerun (uses on-disk Arrow cache) | |
| β’ When NOTHING is left to process it *just* loads the cache and pushes. | |
| β’ Uses path-only image columns (HFImage(decode=False)) to keep RAM tiny. | |
| """ | |
| import os, json | |
| from pathlib import Path | |
| from tqdm.auto import tqdm | |
| from datasets import ( | |
| load_dataset, load_from_disk, Dataset, disable_progress_bar, Features, | |
| Value, Image as HFImage | |
| ) | |
| from PIL import Image | |
| from huggingface_hub.utils import HfHubHTTPError | |
| disable_progress_bar() | |
# ────── CONFIG ──────────────────────────────────────────────────────
DATASET_NAME = "fotographerai/furniture_captioned_segment_prompt"  # source HF dataset to rebuild from
SPLIT = "train"
QA_DIR = Path("bbox_review_recaptioned") # artefacts  (per-row *_green.png / *_anno.png / *_bbox.json files)
CACHE_DIR = Path("rebuild_cache") # incremental Arrow cache  (resume point between runs)
CACHE_DIR.mkdir(exist_ok=True)
TARGET_SIDE = 1500            # side length (px) of fallback placeholder image / full-canvas bbox
GREEN_RGB = (0, 255, 0)       # solid fill colour for placeholder PNGs
BATCH_SAVE = 500              # checkpoint cadence: cache is flushed to disk every N processed rows
HUB_REPO = "fotographerai/furniture_bboxfilled_rebuild"  # private push target on HF Hub
HF_TOKEN = os.environ.get("HF_TOKEN", "").strip() # needs write+private
| # ββββββ HELPERS βββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def img_ref(p: Path) -> dict:
    """Return a lazy, path-only HF image record: no pixel bytes are loaded."""
    return {"bytes": None, "path": str(p)}
def make_green_png(p: Path):
    """Write a solid-green TARGET_SIDE x TARGET_SIDE placeholder PNG at *p*.

    No-op when the file already exists, so reruns never clobber artefacts.
    """
    if p.exists():
        return
    canvas = Image.new("RGB", (TARGET_SIDE, TARGET_SIDE), GREEN_RGB)
    canvas.save(p)
def ensure_full_bbox(p: Path):
    """Create a fallback bbox JSON at *p* covering the whole canvas.

    No-op when the file already exists (preserves real QA output).
    """
    if p.exists():
        return
    payload = {"xyxy": [[0, 0, TARGET_SIDE, TARGET_SIDE]]}
    p.write_text(json.dumps(payload))
# ────── LOAD SOURCE DATASET ─────────────────────────────────────────
# Non-streaming load so rows are randomly indexable via base_ds[i] below.
base_ds = load_dataset(DATASET_NAME, split=SPLIT, streaming=False)
N_TOTAL = len(base_ds)
print("Original rows:", N_TOTAL)
# ────── LOAD OR INIT CACHE ──────────────────────────────────────────
# Resume support: if a previous run checkpointed an Arrow dataset here,
# reload it and remember which source-row indices are already done.
if (CACHE_DIR / "dataset_info.json").exists():
    cache_ds = load_from_disk(CACHE_DIR)
    done = set(cache_ds["__row_idx__"])
    print(f"Cache found β {len(done)} rows already processed.")
    # Materialise cached columns as plain Python lists so new rows can be
    # appended below.  NOTE(review): this holds the entire cache in RAM.
    records = {k: list(v) for k, v in cache_ds.to_dict().items()}
else:
    # Fresh start: only the rebuilt columns exist; original columns are
    # added lazily via records.setdefault() in the processing loop.
    done, records = set(), {"__row_idx__": [], "bbox_filled": [],
                            "annotated": [], "bbox_json": []}
missing = [i for i in range(N_TOTAL) if i not in done]
print("Rows still to process:", len(missing))
# ────── NO WORK LEFT? push & exit ───────────────────────────────────
# Fix: this branch previously attempted push_to_hub even when HF_TOKEN was
# empty, unlike the guarded final push at the bottom of the script — the
# same token check is applied here for consistency.  `exit()` (a `site`
# convenience helper) is replaced by the canonical `raise SystemExit(0)`.
if not missing:
    print("π€ nothing new to process β pushing cached datasetβ¦")
    if not HF_TOKEN:
        print("β οΈ HF_TOKEN env-var not set β skipping push.")
    else:
        try:
            url = cache_ds.push_to_hub(
                HUB_REPO, private=True, token=HF_TOKEN, max_shard_size="500MB"
            )
            print("π dataset pushed to:", url)
        except HfHubHTTPError as e:
            # Best-effort: report the Hub error but still exit cleanly so the
            # cache on disk remains the source of truth for a retry.
            print("β push failed:", e)
    raise SystemExit(0)
# ────── PROCESS MISSING ROWS ────────────────────────────────────────
for n, i in enumerate(tqdm(missing, desc="Re-assembling")):
    # Expected QA artefacts for source row i (zero-padded 6-digit index).
    g_png = QA_DIR / f"{i:06d}_green.png"
    a_png = QA_DIR / f"{i:06d}_anno.png"
    bbox_j = QA_DIR / f"{i:06d}_bbox.json"
    if not (g_png.exists() and a_png.exists() and bbox_j.exists()):
        # Any artefact missing → substitute a solid-green placeholder for
        # BOTH image columns and a full-canvas bbox JSON.
        mask_png = QA_DIR / f"{i:06d}_mask.png"
        make_green_png(mask_png)
        g_png = a_png = mask_png
        ensure_full_bbox(bbox_j)
    row = base_ds[i] # copy original cols once
    records["__row_idx__"].append(i)
    # setdefault lets original columns appear on the first processed row
    # when starting from an empty cache.
    for k, v in row.items():
        records.setdefault(k, []).append(v)
    records["bbox_filled"].append(img_ref(g_png))
    records["annotated"].append(img_ref(a_png))
    records["bbox_json"].append(bbox_j.read_text())
    if (n + 1) % BATCH_SAVE == 0:
        # Checkpoint so ^C / crash loses at most BATCH_SAVE rows of work.
        # NOTE(review): save_to_disk into the same directory an open dataset
        # was loaded from can raise in some `datasets` versions — verify.
        Dataset.from_dict(records).save_to_disk(CACHE_DIR)
        print(f"β« cached at {n+1}/{len(missing)}")
# ────── FINAL DATASET FEATURES & SAVE ───────────────────────────────
# Explicit schema: decode=False keeps image columns as {path, bytes}
# references instead of decoding pixels, keeping memory usage small.
features = Features({
    "__row_idx__" : Value("int32"),
    "bbox_filled" : HFImage(decode=False),
    "annotated" : HFImage(decode=False),
    "bbox_json" : Value("string"),
    # original columns inferred below
})
# Carry over the source dataset's feature types for every original column
# that is not one of the rebuilt columns above.
for k in base_ds.features:
    if k not in features:
        features[k] = base_ds.features[k]
final_ds = Dataset.from_dict(records, features=features)
final_ds.save_to_disk(CACHE_DIR)
print("β cached dataset saved to", CACHE_DIR.resolve())
# ────── PUSH PRIVATE ────────────────────────────────────────────────
# Upload the rebuilt dataset as a private repo; 500MB shards keep each
# individual upload request small.
if not HF_TOKEN:
    print("β οΈ HF_TOKEN env-var not set β skipping push.")
else:
    try:
        url = final_ds.push_to_hub(
            HUB_REPO, private=True, token=HF_TOKEN, max_shard_size="500MB"
        )
        print("π dataset pushed to:", url)
    except HfHubHTTPError as e:
        # Best-effort: the on-disk cache survives, so a rerun can retry.
        print("β push failed:", e)