#!/usr/bin/env python
"""
reassemble_bbox_dataset_resume.py
---------------------------------
Incrementally rebuilds `bbox_filled / annotated / bbox_json` columns from
QA artefacts and pushes the final dataset **privately** to HF Hub.
β’ Safe to ^C / rerun (uses on-disk Arrow cache)
β’ When NOTHING is left to process it *just* loads the cache and pushes.
β’ Uses path-only image columns (HFImage(decode=False)) to keep RAM tiny.
"""
import os, json
from pathlib import Path
from tqdm.auto import tqdm
from datasets import (
    load_dataset, load_from_disk, Dataset, disable_progress_bar, Features,
    Value, Image as HFImage
)
from PIL import Image
from huggingface_hub.utils import HfHubHTTPError
disable_progress_bar()
# ────── CONFIG ──────────────────────────────────────────────────────
DATASET_NAME = "fotographerai/furniture_captioned_segment_prompt"
SPLIT = "train"
QA_DIR = Path("bbox_review_recaptioned") # artefacts
CACHE_DIR = Path("rebuild_cache") # incremental Arrow cache
CACHE_DIR.mkdir(exist_ok=True)
TARGET_SIDE = 1500
GREEN_RGB = (0, 255, 0)
BATCH_SAVE = 500
HUB_REPO = "fotographerai/furniture_bboxfilled_rebuild"
HF_TOKEN = os.environ.get("HF_TOKEN", "").strip() # needs write+private
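# QA artefacts expected per row i (naming taken from the loop below):
#   {i:06d}_green.png  -> bbox_filled image
#   {i:06d}_anno.png   -> annotated image
#   {i:06d}_bbox.json  -> bbox coordinates ({"xyxy": [...]})
#   {i:06d}_mask.png   -> solid-green fallback written when the above are missing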
# ────── HELPERS ─────────────────────────────────────────────────────
def img_ref(p: Path) -> dict:  # path-only image dict
    return {"path": str(p), "bytes": None}

def make_green_png(p: Path):
    if not p.exists():
        Image.new("RGB", (TARGET_SIDE, TARGET_SIDE), GREEN_RGB).save(p)

def ensure_full_bbox(p: Path):
    if not p.exists():
        p.write_text(json.dumps({"xyxy": [[0, 0, TARGET_SIDE, TARGET_SIDE]]}))
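# Both placeholder writers are idempotent: they create the file only if it
# does not already exist, so reruns never overwrite real QA output.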
# ────── LOAD SOURCE DATASET ─────────────────────────────────────────
base_ds = load_dataset(DATASET_NAME, split=SPLIT, streaming=False)
N_TOTAL = len(base_ds)
print("Original rows:", N_TOTAL)
# ────── LOAD OR INIT CACHE ──────────────────────────────────────────
if (CACHE_DIR / "dataset_info.json").exists():
    cache_ds = load_from_disk(CACHE_DIR)
    done = set(cache_ds["__row_idx__"])
    print(f"Cache found → {len(done)} rows already processed.")
    records = {k: list(v) for k, v in cache_ds.to_dict().items()}
else:
    done, records = set(), {"__row_idx__": [], "bbox_filled": [],
                            "annotated": [], "bbox_json": []}
missing = [i for i in range(N_TOTAL) if i not in done]
print("Rows still to process:", len(missing))
# ────── NO WORK LEFT? push & exit ───────────────────────────────────
if not missing:
    print("🤗 nothing new to process → pushing cached dataset…")
    try:
        url = cache_ds.push_to_hub(
            HUB_REPO, private=True, token=HF_TOKEN, max_shard_size="500MB"
        )
        print("🎉 dataset pushed to:", url)
    except HfHubHTTPError as e:
        print("❌ push failed:", e)
    raise SystemExit(0)
# ────── PROCESS MISSING ROWS ────────────────────────────────────────
for n, i in enumerate(tqdm(missing, desc="Re-assembling")):
    g_png  = QA_DIR / f"{i:06d}_green.png"
    a_png  = QA_DIR / f"{i:06d}_anno.png"
    bbox_j = QA_DIR / f"{i:06d}_bbox.json"
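    # If any QA artefact is absent for this row, fall back to a solid-green
    # placeholder image and a full-frame bbox so the row stays well-formed.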
    if not (g_png.exists() and a_png.exists() and bbox_j.exists()):
        mask_png = QA_DIR / f"{i:06d}_mask.png"
        make_green_png(mask_png)
        g_png = a_png = mask_png
        ensure_full_bbox(bbox_j)

    row = base_ds[i]  # copy original cols once
    records["__row_idx__"].append(i)
    for k, v in row.items():
        records.setdefault(k, []).append(v)
    records["bbox_filled"].append(img_ref(g_png))
    records["annotated"].append(img_ref(a_png))
    records["bbox_json"].append(bbox_j.read_text())

    if (n + 1) % BATCH_SAVE == 0:
        Dataset.from_dict(records).save_to_disk(CACHE_DIR)
        print(f"⏫ cached at {n+1}/{len(missing)}")
# ────── FINAL DATASET FEATURES & SAVE ───────────────────────────────
features = Features({
    "__row_idx__" : Value("int32"),
    "bbox_filled" : HFImage(decode=False),
    "annotated"   : HFImage(decode=False),
    "bbox_json"   : Value("string"),
    # original columns copied from base_ds below
})
for k in base_ds.features:
    if k not in features:
        features[k] = base_ds.features[k]

final_ds = Dataset.from_dict(records, features=features)
final_ds.save_to_disk(CACHE_DIR)
print("✅ cached dataset saved to", CACHE_DIR.resolve())
# ────── PUSH PRIVATE ────────────────────────────────────────────────
if not HF_TOKEN:
    print("⚠️ HF_TOKEN env-var not set → skipping push.")
else:
    try:
        url = final_ds.push_to_hub(
            HUB_REPO, private=True, token=HF_TOKEN, max_shard_size="500MB"
        )
        print("🎉 dataset pushed to:", url)
    except HfHubHTTPError as e:
        print("❌ push failed:", e)