File size: 4,651 Bytes
07de2d7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env python3
"""
Reorganize WebSight images into subdirectories (HF 10K files/dir limit)
and update JSONL paths, then upload in batches.
"""

import json
import os
import shutil
from pathlib import Path
from huggingface_hub import HfApi

# Hugging Face access token, read from the HF_TOKEN environment variable.
# Indexing os.environ raises KeyError at startup when the variable is unset.
TOKEN = os.environ["HF_TOKEN"]  # set HF_TOKEN env var before running
# Target repository; the upload calls in this script address it with
# repo_type="dataset".
REPO_ID = "Mindigenous/MINDI-1.5-training-data"
# Local directory whose *.jpg files are bucketed into numbered subdirectories.
IMAGES_DIR = Path("data/websight/images")
# Bucket size used when assigning images (and JSONL paths) to subdirectories.
FILES_PER_DIR = 10000  # max files per directory on HF

# Step 1: Reorganize images into subdirectories
#
# Every image still sitting directly in IMAGES_DIR is moved into a two-digit
# bucket directory (00, 01, ...).  The bucket is derived from the number
# embedded in the file name (ws_<number>.jpg) -- the same rule Step 2 applies
# when it rewrites the JSONL paths -- instead of from the image's position in
# the sorted listing.  Position-based bucketing only agrees with the JSONL
# paths when names are zero-padded and gap-free, and it restarts from bucket
# 00 when an interrupted run is resumed.
print("=" * 60)
print("  Step 1: Reorganizing images into subdirectories")
print("=" * 60)

all_images = sorted(IMAGES_DIR.glob("*.jpg"))
print(f"Found {len(all_images)} images in flat directory")

if not all_images:
    # Nothing left in the flat directory: either an earlier run already
    # reorganized everything, or there are no images at all.  A missing
    # IMAGES_DIR is treated like an empty one rather than crashing in iterdir().
    subdirs = (
        sorted(d for d in IMAGES_DIR.iterdir() if d.is_dir())
        if IMAGES_DIR.is_dir()
        else []
    )
    if subdirs:
        total = sum(len(list(d.glob("*.jpg"))) for d in subdirs)
        print(f"Already reorganized into {len(subdirs)} subdirs with {total} total images")
    else:
        print("ERROR: No images found!")
        # SystemExit works even where the interactive-only exit() builtin
        # from the site module is unavailable.
        raise SystemExit(1)
else:
    created_dirs = set()  # bucket directories already created during this run
    for i, img in enumerate(all_images):
        try:
            # Same parsing as Step 2 so both steps always pick the same bucket.
            image_number = int(img.name.replace("ws_", "").replace(".jpg", ""))
        except ValueError:
            # Name does not follow ws_<number>.jpg: fall back to the listing
            # position so the file is still moved out of the flat directory.
            image_number = i
        subdir_idx = image_number // FILES_PER_DIR
        subdir = IMAGES_DIR / f"{subdir_idx:02d}"
        if subdir not in created_dirs:
            subdir.mkdir(exist_ok=True)
            created_dirs.add(subdir)
        shutil.move(str(img), str(subdir / img.name))
        if (i + 1) % 10000 == 0:
            print(f"  Moved {i + 1:,} images...")

    # Summarize the resulting layout, including buckets from earlier runs.
    subdirs = sorted(d for d in IMAGES_DIR.iterdir() if d.is_dir())
    for sd in subdirs:
        count = len(list(sd.glob("*.jpg")))
        print(f"  {sd.name}/: {count:,} images")

# Step 2: Update JSONL files with new paths
#
# Each entry's "image_path" is rewritten to point into the bucket directory
# computed from the number embedded in the file name (ws_<number>.jpg).
# Only the base name of the old path is used, so re-running this step on an
# already-updated file produces the same result.
print(f"\n{'=' * 60}")
print("  Step 2: Updating JSONL paths")
print("=" * 60)

for jsonl_name in ["train.jsonl", "val.jsonl"]:
    jsonl_path = Path("data/websight") / jsonl_name
    if not jsonl_path.exists():
        print(f"  {jsonl_name}: not found, skipping")
        continue

    updated = []
    # Split on "\n" only: entries are written with ensure_ascii=False, so
    # other Unicode line separators may legitimately occur inside a record.
    raw_lines = jsonl_path.read_text(encoding="utf-8").split("\n")
    for line_no, line in enumerate(raw_lines, start=1):
        if not line.strip():
            # Tolerate blank lines and a completely empty file.
            continue
        entry = json.loads(line)
        filename = os.path.basename(entry["image_path"])
        try:
            num = int(filename.replace("ws_", "").replace(".jpg", ""))
        except ValueError:
            # Same exception type as before, but say which record is at fault.
            raise ValueError(
                f"{jsonl_name} line {line_no}: cannot derive image number from {filename!r}"
            ) from None
        subdir_idx = num // FILES_PER_DIR
        entry["image_path"] = f"data/websight/images/{subdir_idx:02d}/{filename}"
        updated.append(json.dumps(entry, ensure_ascii=False))

    # Write to a sibling temp file and swap it in, so an interruption during
    # the write cannot leave a truncated JSONL behind.
    tmp_path = jsonl_path.with_name(jsonl_path.name + ".tmp")
    tmp_path.write_text("".join(f"{row}\n" for row in updated), encoding="utf-8")
    tmp_path.replace(jsonl_path)
    print(f"  {jsonl_name}: updated {len(updated):,} entries")

# Step 3: Upload to HF
#
# The JSONL files are uploaded first, then every bucket directory under
# IMAGES_DIR is uploaded with its own upload_folder call.  Buckets whose
# images are all already listed in the repo are skipped, which lets a failed
# run be resumed by simply starting the script again.
import time  # used only for the retry back-off below

print(f"\n{'=' * 60}")
print("  Step 3: Uploading to HuggingFace")
print("=" * 60)

api = HfApi(token=TOKEN)

# Upload updated JSONL files first
print("\nUploading updated JSONL files...")
for jsonl_name in ["train.jsonl", "val.jsonl"]:
    jsonl_path = Path("data/websight") / jsonl_name
    if not jsonl_path.exists():
        # Step 2 tolerates a missing split; do the same here instead of crashing.
        print(f"  {jsonl_name}: not found, skipping")
        continue
    api.upload_file(
        path_or_fileobj=str(jsonl_path),
        path_in_repo=f"websight/{jsonl_name}",
        repo_id=REPO_ID,
        repo_type="dataset",
    )
    print(f"  {jsonl_name} uploaded")

# Snapshot of the repo contents, used to detect buckets that are already uploaded
repo_files = set(api.list_repo_files(REPO_ID, repo_type="dataset"))

MAX_ATTEMPTS = 3  # upload attempts per bucket before giving up on it
failed_subdirs = []  # names of buckets that could not be uploaded

# Upload each subdirectory separately
subdirs = sorted(d for d in IMAGES_DIR.iterdir() if d.is_dir())
for i, subdir in enumerate(subdirs):
    image_names = sorted(p.name for p in subdir.glob("*.jpg"))
    count = len(image_names)
    if not image_names:
        # A directory without images has nothing to upload and no sample file to check.
        print(f"\nSubdir {subdir.name}/ [{i+1}/{len(subdirs)}] — no images, skipping.")
        continue

    # Skip only when every local image is present remotely, not just the first one.
    remote_prefix = f"websight/images/{subdir.name}"
    if all(f"{remote_prefix}/{name}" in repo_files for name in image_names):
        print(f"\nSubdir {subdir.name}/ ({count:,} images) [{i+1}/{len(subdirs)}] — already uploaded, skipping.")
        continue

    for attempt in range(MAX_ATTEMPTS):
        try:
            print(f"\nUploading subdir {subdir.name}/ ({count:,} images) [{i+1}/{len(subdirs)}] (attempt {attempt+1})...")
            api.upload_folder(
                folder_path=str(subdir),
                path_in_repo=remote_prefix,
                repo_id=REPO_ID,
                repo_type="dataset",
                commit_message=f"Add WebSight images subdir {subdir.name} ({count} images)",
            )
            print(f"  Subdir {subdir.name} committed!")
            break
        except Exception as e:
            # Broad catch is deliberate: any upload error gets the same
            # log-and-retry treatment, and the final failure is reported below.
            print(f"  Error: {e}")
            if attempt < MAX_ATTEMPTS - 1:
                wait = 30 * (attempt + 1)  # linear back-off: 30s, 60s, ...
                print(f"  Retrying in {wait}s...")
                time.sleep(wait)
            else:
                print(f"  FAILED after {MAX_ATTEMPTS} attempts. Run script again to resume.")
    else:
        # The retry loop finished without a successful break.
        failed_subdirs.append(subdir.name)

print(f"\n{'=' * 60}")
if failed_subdirs:
    # Do not claim success (or exit with status 0) when some buckets are missing.
    print(f"  INCOMPLETE: {len(failed_subdirs)} subdir(s) failed to upload: {', '.join(failed_subdirs)}")
    print("=" * 60)
    raise SystemExit(1)
print("  ALL DONE! All WebSight data uploaded to HF.")
print("=" * 60)