Apiarist / scripts /extract_dataset.py
Apiarist Dev
polish: remove emojis and em dashes from sources and docs
238bdf6
Raw
History Blame Contribute Delete
2.01 kB
r"""
Extract Roboflow dataset zip with Windows long-path support.

Roboflow ships images with absurdly long filenames (URL slugs preserved).
Windows' default 260-char MAX_PATH limit breaks normal extraction.
We prefix output paths with \\?\, which opts them into Windows'
extended-length (long-path) handling.

Usage:
    py scripts/extract_dataset.py
"""
import os
import sys
import zipfile
from pathlib import Path
# Directory that holds the downloaded Roboflow export; the archive lives in
# it and is unpacked in place. The relative path is interpreted against the
# current working directory.
_DATASET_DIR = Path("data/raw/roboflow_honey-bee-detection-model-zgjnb_v4")

# Archive to read (kept relative, exactly as written).
ZIP_PATH = _DATASET_DIR / "roboflow.zip"

# Extraction root, resolved to an absolute path once at import time.
DEST = _DATASET_DIR.resolve()
def lp(path) -> str:
    r"""Return *path* as a string that bypasses the Windows MAX_PATH limit.

    On non-Windows platforms the result is simply ``str(path)``.

    On Windows the path is first made absolute and normalised, then given
    the extended-length prefix:

    * ``C:\dir\file``         -> ``\\?\C:\dir\file``
    * ``\\server\share\file`` -> ``\\?\UNC\server\share\file``
    * paths that already start with ``\\?\`` or ``\\.\`` are returned as-is

    Args:
        path: A ``str`` or path-like object; it may be relative.

    Returns:
        A path string suitable for ``open()`` / ``os.makedirs()``.
    """
    s = str(path)
    if sys.platform != "win32":
        return s

    # The \\?\ prefix switches off Win32 path normalisation entirely, so the
    # path must already be absolute, use backslashes only, and contain no
    # "." or ".." components. os.path.abspath() takes care of all three.
    s = os.path.abspath(s)

    if s.startswith(("\\\\?\\", "\\\\.\\")):
        # Already an extended-length or device path; prefixing again would
        # produce an invalid name.
        return s
    if s.startswith("\\\\"):
        # UNC shares use a dedicated form: \\server\share -> \\?\UNC\server\share
        return "\\\\?\\UNC\\" + s[2:]
    return "\\\\?\\" + s
def _extract_member(archive: zipfile.ZipFile, member: str) -> bool:
    """Extract one archive entry beneath DEST.

    Args:
        archive: The open zip archive being unpacked.
        member: Entry name exactly as reported by ``archive.namelist()``.

    Returns:
        True when a regular file was written, False when the entry was a
        directory (which is created but not counted as a file).

    Raises:
        ValueError: If the entry would be written outside DEST.
        Exception: Any error raised while reading the entry or writing the
            output is propagated to the caller.
    """
    # Collapse ".." components lexically so the containment check below looks
    # at the location that would really be written.
    target = Path(os.path.abspath(DEST / member))
    try:
        target.relative_to(DEST)
    except ValueError:
        # Absolute names or "../" sequences in an archive must never be
        # allowed to write outside the extraction root.
        raise ValueError("entry would be written outside the destination") from None

    if member.endswith("/"):
        # Explicit directory entry: create it so empty folders survive.
        os.makedirs(lp(target), exist_ok=True)
        return False

    # Directories hit the path-length limit too, so they are created through
    # the long-path form as well.
    os.makedirs(lp(target.parent), exist_ok=True)

    # Read the whole entry before opening the output file: a corrupt entry
    # then fails here and leaves no truncated file behind.
    with archive.open(member) as src:
        data = src.read()
    with open(lp(target), "wb") as dst:
        dst.write(data)
    return True


def main() -> None:
    """Unpack ZIP_PATH into DEST, tolerating per-entry failures.

    Progress is printed every 500 entries. The first five failures are
    reported individually; all failures are counted and the totals are
    printed at the end.

    Raises:
        SystemExit: If ZIP_PATH does not exist.
    """
    if not ZIP_PATH.exists():
        raise SystemExit(f"Missing {ZIP_PATH}")

    with zipfile.ZipFile(ZIP_PATH) as z:
        members = z.namelist()
        total = len(members)
        print(f"Extracting {total} entries from {ZIP_PATH.name} ...")
        ok = 0
        fail = 0
        for i, member in enumerate(members):
            if i % 500 == 0:
                print(f" progress: {i}/{total} (ok={ok}, fail={fail})")
            try:
                wrote_file = _extract_member(z, member)
            except Exception as e:
                # Deliberate best-effort boundary: one bad entry must not
                # abort the whole extraction, but it is always counted.
                fail += 1
                if fail <= 5:
                    print(f" [!] {member[:80]}... -> {type(e).__name__}: {e}")
            else:
                if wrote_file:
                    ok += 1
    print(f"\nDone. ok={ok}, fail={fail}")
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()