Spaces:
Sleeping
Sleeping
Upload dataset.py with huggingface_hub
Browse files- dataset.py +66 -24
dataset.py
CHANGED
|
@@ -109,12 +109,6 @@ def download_and_extract_dataset(cache_dir: str = "./cache") -> Path:
|
|
| 109 |
cache_dir = Path(cache_dir)
|
| 110 |
cache_dir.mkdir(exist_ok=True, parents=True)
|
| 111 |
|
| 112 |
-
extracted_path = cache_dir / "ipad_dataset"
|
| 113 |
-
|
| 114 |
-
if extracted_path.exists():
|
| 115 |
-
print(f"✅ Dataset already extracted at {extracted_path}")
|
| 116 |
-
return extracted_path
|
| 117 |
-
|
| 118 |
print("📥 Downloading dataset from HF Hub...")
|
| 119 |
zip_path = hf_hub_download(
|
| 120 |
repo_id="MSherbinii/ipad-industrial-anomaly",
|
|
@@ -123,29 +117,77 @@ def download_and_extract_dataset(cache_dir: str = "./cache") -> Path:
|
|
| 123 |
cache_dir=str(cache_dir)
|
| 124 |
)
|
| 125 |
|
| 126 |
-
print("π¦
|
|
|
|
|
|
|
| 127 |
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
zip_ref.extractall(cache_dir)
|
| 129 |
|
| 130 |
-
#
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
break
|
| 147 |
|
| 148 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
return extracted_path
|
| 150 |
|
| 151 |
|
|
|
|
| 109 |
cache_dir = Path(cache_dir)
|
| 110 |
cache_dir.mkdir(exist_ok=True, parents=True)
|
| 111 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
print("📥 Downloading dataset from HF Hub...")
|
| 113 |
zip_path = hf_hub_download(
|
| 114 |
repo_id="MSherbinii/ipad-industrial-anomaly",
|
|
|
|
| 117 |
cache_dir=str(cache_dir)
|
| 118 |
)
|
| 119 |
|
| 120 |
+
print(f"📦 Downloaded to: {zip_path}")
|
| 121 |
+
print(f"📦 Extracting dataset to {cache_dir}...")
|
| 122 |
+
|
| 123 |
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
| 124 |
+
# List first 20 files to see structure
|
| 125 |
+
all_files = zip_ref.namelist()
|
| 126 |
+
print(f"π Total files in zip: {len(all_files)}")
|
| 127 |
+
print(f"π First 20 files:")
|
| 128 |
+
for f in all_files[:20]:
|
| 129 |
+
print(f" {f}")
|
| 130 |
+
|
| 131 |
zip_ref.extractall(cache_dir)
|
| 132 |
|
| 133 |
+
# Diagnostic: Show what was actually extracted
|
| 134 |
+
print(f"\nπ Checking extracted contents in {cache_dir}:")
|
| 135 |
+
for item in sorted(cache_dir.glob("*")):
|
| 136 |
+
if item.is_dir():
|
| 137 |
+
print(f" DIR: {item.name}/")
|
| 138 |
+
# Show subdirectories
|
| 139 |
+
for subitem in sorted(item.glob("*"))[:10]:
|
| 140 |
+
if subitem.is_dir():
|
| 141 |
+
print(f" ββ DIR: {subitem.name}/")
|
| 142 |
+
else:
|
| 143 |
+
print(f" ββ FILE: {subitem.name}")
|
| 144 |
+
else:
|
| 145 |
+
print(f" FILE: {item.name}")
|
| 146 |
+
|
| 147 |
+
# Find the actual dataset root
|
| 148 |
+
extracted_path = None
|
| 149 |
+
|
| 150 |
+
# Try different possible paths
|
| 151 |
+
candidates = [
|
| 152 |
+
cache_dir / "ipad_dataset",
|
| 153 |
+
cache_dir / "ipad_dataset" / "ipad_dataset",
|
| 154 |
+
]
|
| 155 |
+
|
| 156 |
+
# Also check for any directory containing S01
|
| 157 |
+
for item in cache_dir.rglob("S01"):
|
| 158 |
+
if item.is_dir():
|
| 159 |
+
# Go up to find the root that contains device folders
|
| 160 |
+
potential_root = item.parent.parent if (item / "train").exists() or (item / "test").exists() else item.parent
|
| 161 |
+
candidates.append(potential_root)
|
| 162 |
+
|
| 163 |
+
print(f"\nπ Checking candidate paths:")
|
| 164 |
+
for candidate in candidates:
|
| 165 |
+
print(f" {candidate}")
|
| 166 |
+
if candidate.exists():
|
| 167 |
+
has_s01 = (candidate / "S01").exists()
|
| 168 |
+
has_train = (candidate / "S01" / "train").exists() if has_s01 else False
|
| 169 |
+
has_frames = (candidate / "S01" / "train" / "frames").exists() if has_train else False
|
| 170 |
+
print(f" S01: {has_s01}, train: {has_train}, frames: {has_frames}")
|
| 171 |
+
|
| 172 |
+
if has_frames:
|
| 173 |
+
extracted_path = candidate
|
| 174 |
+
print(f" ✅ VALID ROOT!")
|
| 175 |
break
|
| 176 |
|
| 177 |
+
if extracted_path is None:
|
| 178 |
+
# Last resort: find any S01 and work backwards
|
| 179 |
+
s01_paths = list(cache_dir.rglob("S01"))
|
| 180 |
+
if s01_paths:
|
| 181 |
+
print(f"\n⚠️ Found S01 at: {s01_paths[0]}")
|
| 182 |
+
print(f" Using parent as root")
|
| 183 |
+
extracted_path = s01_paths[0].parent
|
| 184 |
+
else:
|
| 185 |
+
raise ValueError(f"Could not find valid dataset structure in {cache_dir}")
|
| 186 |
+
|
| 187 |
+
print(f"\n✅ Dataset root: {extracted_path}")
|
| 188 |
+
print(f"✅ Checking final path: {extracted_path / 'S01' / 'train' / 'frames'}")
|
| 189 |
+
print(f" Exists: {(extracted_path / 'S01' / 'train' / 'frames').exists()}")
|
| 190 |
+
|
| 191 |
return extracted_path
|
| 192 |
|
| 193 |
|