#!/usr/bin/env python3
"""
Reorganize WebSight images into subdirectories (HF 10K files/dir limit)
and update JSONL paths, then upload in batches.
"""
import json
import os
import shutil
from pathlib import Path
from huggingface_hub import HfApi
# --- Script configuration ---------------------------------------------------
# Hugging Face access token, read from the environment; a missing HF_TOKEN
# raises KeyError here, before any file is touched. It is later passed to HfApi.
TOKEN = os.environ["HF_TOKEN"] # set HF_TOKEN env var before running
# Target repository on the Hub (used below with repo_type="dataset").
REPO_ID = "Mindigenous/MINDI-1.5-training-data"
# Local directory holding the WebSight .jpg images (relative to the CWD).
IMAGES_DIR = Path("data/websight/images")
# Bucket size used to split images into numbered subdirectories.
FILES_PER_DIR = 10000 # max files per directory on HF
# Step 1: Reorganize images into subdirectories
#
# Moves every *.jpg that sits directly in IMAGES_DIR into numbered
# subdirectories ("00", "01", ...) holding at most FILES_PER_DIR files each.
# Files are assigned by their position in the sorted file list.
print("=" * 60)
print(" Step 1: Reorganizing images into subdirectories")
print("=" * 60)

# Images still sitting directly in IMAGES_DIR (i.e. not yet moved).
all_images = sorted(IMAGES_DIR.glob("*.jpg"))
print(f"Found {len(all_images)} images in flat directory")

# Subdirectories left behind by a previous (possibly interrupted) run.
# A missing IMAGES_DIR is treated like an empty one so that we reach the
# "No images found" error below instead of crashing inside iterdir().
subdirs = sorted(d for d in IMAGES_DIR.iterdir() if d.is_dir()) if IMAGES_DIR.is_dir() else []
already_moved = sum(len(list(d.glob("*.jpg"))) for d in subdirs)

if not all_images:
    if subdirs:
        # Nothing left in the flat directory: a previous run finished the job.
        print(f"Already reorganized into {len(subdirs)} subdirs with {already_moved} total images")
    else:
        print("ERROR: No images found!")
        # `exit()` is injected by the `site` module and is not guaranteed to
        # exist (e.g. `python -S`); SystemExit is always available.
        raise SystemExit(1)
else:
    if already_moved:
        print(f" Resuming: {already_moved:,} images already in subdirectories")

    # Files are moved in sorted order, so the images already inside the
    # subdirectories are exactly the first `already_moved` entries of the full
    # listing. Continuing the index from there keeps every file in the same
    # bucket it would have landed in on an uninterrupted run, and prevents a
    # resumed run from over-filling "00" beyond FILES_PER_DIR.
    current_idx = None
    for i, img in enumerate(all_images, start=already_moved):
        subdir_idx = i // FILES_PER_DIR
        subdir = IMAGES_DIR / f"{subdir_idx:02d}"
        if subdir_idx != current_idx:
            # Create each bucket once, not once per image.
            subdir.mkdir(exist_ok=True)
            current_idx = subdir_idx
        shutil.move(str(img), str(subdir / img.name))
        if (i + 1) % 10000 == 0:
            print(f" Moved {i + 1:,} images...")

    # Summary of the resulting layout.
    subdirs = sorted(d for d in IMAGES_DIR.iterdir() if d.is_dir())
    for sd in subdirs:
        count = len(list(sd.glob("*.jpg")))
        print(f" {sd.name}/: {count:,} images")
# Step 2: Update JSONL files with new paths
#
# Rewrites the "image_path" field of every entry in train.jsonl / val.jsonl so
# that it points into the numbered subdirectory layout under
# data/websight/images/. Files are rewritten in place.
print(f"\n{'=' * 60}")
print(" Step 2: Updating JSONL paths")
print("=" * 60)

# Map each image file name to the subdirectory it actually lives in on disk.
# Step 1 places files by their position in the sorted listing, which only
# matches the number embedded in the file name when names are contiguous and
# zero-padded; trusting the disk layout keeps the JSONL paths valid either way.
image_subdir = {p.name: p.parent.name for p in IMAGES_DIR.glob("*/*.jpg")}

for jsonl_name in ["train.jsonl", "val.jsonl"]:
    jsonl_path = Path("data/websight") / jsonl_name
    if not jsonl_path.exists():
        print(f" {jsonl_name}: not found, skipping")
        continue

    updated = []
    # Split on "\n" only: str.splitlines() would also split on characters such
    # as U+2028, which may appear unescaped inside JSON strings because the
    # entries are written with ensure_ascii=False.
    for line in jsonl_path.read_text(encoding="utf-8").split("\n"):
        line = line.strip()
        if not line:
            # Tolerate blank lines and empty files instead of crashing in
            # json.loads("").
            continue
        entry = json.loads(line)
        filename = os.path.basename(entry["image_path"])
        subdir_name = image_subdir.get(filename)
        if subdir_name is None:
            # Image not found on disk: fall back to deriving the bucket from
            # the number in a "ws_<number>.jpg" file name.
            num = int(filename.replace("ws_", "").replace(".jpg", ""))
            subdir_name = f"{num // FILES_PER_DIR:02d}"
        entry["image_path"] = f"data/websight/images/{subdir_name}/{filename}"
        updated.append(json.dumps(entry, ensure_ascii=False))

    # One JSON object per line, each terminated by a newline.
    jsonl_path.write_text("".join(f"{row}\n" for row in updated), encoding="utf-8")
    print(f" {jsonl_name}: updated {len(updated):,} entries")
# Step 3: Upload to HF
#
# Uploads the JSONL files, then each image subdirectory as its own commit.
# Subdirectories whose images are all present in the repo are skipped, so the
# script can be re-run to resume after a failure.
import time

print(f"\n{'=' * 60}")
print(" Step 3: Uploading to HuggingFace")
print("=" * 60)
api = HfApi(token=TOKEN)

# Upload updated JSONL files first
print("\nUploading updated JSONL files...")
for jsonl_name in ["train.jsonl", "val.jsonl"]:
    jsonl_path = Path("data/websight") / jsonl_name
    if not jsonl_path.exists():
        # Step 2 skips missing files; do the same here instead of crashing.
        print(f" {jsonl_name}: not found, skipping")
        continue
    api.upload_file(
        path_or_fileobj=str(jsonl_path),
        path_in_repo=f"websight/{jsonl_name}",
        repo_id=REPO_ID,
        repo_type="dataset",
    )
    print(f" {jsonl_name} uploaded")

# Snapshot of what is already in the repo, used to skip finished subdirs.
repo_files = set(api.list_repo_files(REPO_ID, repo_type="dataset"))

MAX_ATTEMPTS = 3
failed_subdirs = []  # names of subdirs that exhausted all attempts

# Upload each subdirectory separately
subdirs = sorted(d for d in IMAGES_DIR.iterdir() if d.is_dir())
for i, subdir in enumerate(subdirs):
    images = sorted(subdir.glob("*.jpg"))
    count = len(images)
    progress = f"[{i+1}/{len(subdirs)}]"

    if not images:
        # An empty directory has no sample file to check and nothing to send.
        print(f"\nSubdir {subdir.name}/ (0 images) {progress} — empty, skipping.")
        continue

    # Treat the subdir as done only if every image is present in the repo;
    # the lookup is a cheap set comparison against the listing fetched above.
    expected = {f"websight/images/{subdir.name}/{img.name}" for img in images}
    if expected <= repo_files:
        print(f"\nSubdir {subdir.name}/ ({count:,} images) {progress} — already uploaded, skipping.")
        continue

    for attempt in range(MAX_ATTEMPTS):
        try:
            print(f"\nUploading subdir {subdir.name}/ ({count:,} images) {progress} (attempt {attempt+1})...")
            api.upload_folder(
                folder_path=str(subdir),
                path_in_repo=f"websight/images/{subdir.name}",
                repo_id=REPO_ID,
                repo_type="dataset",
                commit_message=f"Add WebSight images subdir {subdir.name} ({count} images)",
            )
            print(f" Subdir {subdir.name} committed!")
            break
        except Exception as e:
            # Deliberately broad: any upload failure is reported and retried
            # with a linearly growing delay rather than aborting the run.
            print(f" Error: {e}")
            if attempt < MAX_ATTEMPTS - 1:
                wait = 30 * (attempt + 1)
                print(f" Retrying in {wait}s...")
                time.sleep(wait)
            else:
                print(f" FAILED after {MAX_ATTEMPTS} attempts. Run script again to resume.")
                failed_subdirs.append(subdir.name)

print(f"\n{'=' * 60}")
if failed_subdirs:
    # Do not claim success (or exit 0) when some subdirectories never made it.
    print(f" DONE WITH ERRORS: {len(failed_subdirs)} subdir(s) failed to upload: {', '.join(failed_subdirs)}")
    print("=" * 60)
    raise SystemExit(1)
print(" ALL DONE! All WebSight data uploaded to HF.")
print("=" * 60)
|