Spaces:
Sleeping
Sleeping
Upload dataset.py with huggingface_hub
Browse files- dataset.py +21 -68
dataset.py
CHANGED
|
@@ -46,7 +46,9 @@ class IPADVideoDataset(Dataset):
|
|
| 46 |
self.normalize = normalize
|
| 47 |
|
| 48 |
# Construct path to device frames
|
| 49 |
-
|
|
|
|
|
|
|
| 50 |
|
| 51 |
if not self.device_path.exists():
|
| 52 |
raise ValueError(f"Dataset path not found: {self.device_path}")
|
|
@@ -103,12 +105,22 @@ def download_and_extract_dataset(cache_dir: str = "./cache") -> Path:
|
|
| 103 |
"""
|
| 104 |
Download IPAD dataset from HF Hub and extract it
|
| 105 |
|
|
|
|
|
|
|
|
|
|
| 106 |
Returns:
|
| 107 |
-
Path to extracted dataset directory
|
| 108 |
"""
|
| 109 |
cache_dir = Path(cache_dir)
|
| 110 |
cache_dir.mkdir(exist_ok=True, parents=True)
|
| 111 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
print("📥 Downloading dataset from HF Hub...")
|
| 113 |
zip_path = hf_hub_download(
|
| 114 |
repo_id="MSherbinii/ipad-industrial-anomaly",
|
|
@@ -117,77 +129,18 @@ def download_and_extract_dataset(cache_dir: str = "./cache") -> Path:
|
|
| 117 |
cache_dir=str(cache_dir)
|
| 118 |
)
|
| 119 |
|
| 120 |
-
print(f"📦 Downloaded to: {zip_path}")
|
| 121 |
print(f"📦 Extracting dataset to {cache_dir}...")
|
| 122 |
-
|
| 123 |
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
| 124 |
-
# List first 20 files to see structure
|
| 125 |
-
all_files = zip_ref.namelist()
|
| 126 |
-
print(f"π Total files in zip: {len(all_files)}")
|
| 127 |
-
print(f"π First 20 files:")
|
| 128 |
-
for f in all_files[:20]:
|
| 129 |
-
print(f" {f}")
|
| 130 |
-
|
| 131 |
zip_ref.extractall(cache_dir)
|
| 132 |
|
| 133 |
-
#
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
for subitem in sorted(item.glob("*"))[:10]:
|
| 140 |
-
if subitem.is_dir():
|
| 141 |
-
print(f" ββ DIR: {subitem.name}/")
|
| 142 |
-
else:
|
| 143 |
-
print(f" ββ FILE: {subitem.name}")
|
| 144 |
-
else:
|
| 145 |
-
print(f" FILE: {item.name}")
|
| 146 |
-
|
| 147 |
-
# Find the actual dataset root
|
| 148 |
-
extracted_path = None
|
| 149 |
-
|
| 150 |
-
# Try different possible paths
|
| 151 |
-
candidates = [
|
| 152 |
-
cache_dir / "ipad_dataset",
|
| 153 |
-
cache_dir / "ipad_dataset" / "ipad_dataset",
|
| 154 |
-
]
|
| 155 |
-
|
| 156 |
-
# Also check for any directory containing S01
|
| 157 |
-
for item in cache_dir.rglob("S01"):
|
| 158 |
-
if item.is_dir():
|
| 159 |
-
# Go up to find the root that contains device folders
|
| 160 |
-
potential_root = item.parent.parent if (item / "train").exists() or (item / "test").exists() else item.parent
|
| 161 |
-
candidates.append(potential_root)
|
| 162 |
-
|
| 163 |
-
print(f"\nπ Checking candidate paths:")
|
| 164 |
-
for candidate in candidates:
|
| 165 |
-
print(f" {candidate}")
|
| 166 |
-
if candidate.exists():
|
| 167 |
-
has_s01 = (candidate / "S01").exists()
|
| 168 |
-
has_train = (candidate / "S01" / "train").exists() if has_s01 else False
|
| 169 |
-
has_frames = (candidate / "S01" / "train" / "frames").exists() if has_train else False
|
| 170 |
-
print(f" S01: {has_s01}, train: {has_train}, frames: {has_frames}")
|
| 171 |
-
|
| 172 |
-
if has_frames:
|
| 173 |
-
extracted_path = candidate
|
| 174 |
-
print(f" ✅ VALID ROOT!")
|
| 175 |
-
break
|
| 176 |
-
|
| 177 |
-
if extracted_path is None:
|
| 178 |
-
# Last resort: find any S01 and work backwards
|
| 179 |
-
s01_paths = list(cache_dir.rglob("S01"))
|
| 180 |
-
if s01_paths:
|
| 181 |
-
print(f"\n⚠️ Found S01 at: {s01_paths[0]}")
|
| 182 |
-
print(f" Using parent as root")
|
| 183 |
-
extracted_path = s01_paths[0].parent
|
| 184 |
-
else:
|
| 185 |
-
raise ValueError(f"Could not find valid dataset structure in {cache_dir}")
|
| 186 |
-
|
| 187 |
-
print(f"\n✅ Dataset root: {extracted_path}")
|
| 188 |
-
print(f"✅ Checking final path: {extracted_path / 'S01' / 'train' / 'frames'}")
|
| 189 |
-
print(f" Exists: {(extracted_path / 'S01' / 'train' / 'frames').exists()}")
|
| 190 |
|
|
|
|
| 191 |
return extracted_path
|
| 192 |
|
| 193 |
|
|
|
|
| 46 |
self.normalize = normalize
|
| 47 |
|
| 48 |
# Construct path to device frames
|
| 49 |
+
# Note: The dataset uses "training" and "testing", not "train" and "test"
|
| 50 |
+
split_folder = "training" if split == "train" else "testing"
|
| 51 |
+
self.device_path = self.root_dir / device_name / split_folder / "frames"
|
| 52 |
|
| 53 |
if not self.device_path.exists():
|
| 54 |
raise ValueError(f"Dataset path not found: {self.device_path}")
|
|
|
|
| 105 |
"""
|
| 106 |
Download IPAD dataset from HF Hub and extract it
|
| 107 |
|
| 108 |
+
The zip contains: IPAD_dataset/S01/training/frames/...
|
| 109 |
+
We return the path to IPAD_dataset directory
|
| 110 |
+
|
| 111 |
Returns:
|
| 112 |
+
Path to extracted dataset directory (IPAD_dataset)
|
| 113 |
"""
|
| 114 |
cache_dir = Path(cache_dir)
|
| 115 |
cache_dir.mkdir(exist_ok=True, parents=True)
|
| 116 |
|
| 117 |
+
extracted_path = cache_dir / "IPAD_dataset"
|
| 118 |
+
|
| 119 |
+
# Check if already extracted
|
| 120 |
+
if extracted_path.exists() and (extracted_path / "S01" / "training" / "frames").exists():
|
| 121 |
+
print(f"✅ Dataset already extracted at {extracted_path}")
|
| 122 |
+
return extracted_path
|
| 123 |
+
|
| 124 |
print("📥 Downloading dataset from HF Hub...")
|
| 125 |
zip_path = hf_hub_download(
|
| 126 |
repo_id="MSherbinii/ipad-industrial-anomaly",
|
|
|
|
| 129 |
cache_dir=str(cache_dir)
|
| 130 |
)
|
| 131 |
|
|
|
|
| 132 |
print(f"📦 Extracting dataset to {cache_dir}...")
|
|
|
|
| 133 |
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
zip_ref.extractall(cache_dir)
|
| 135 |
|
| 136 |
+
# Verify extraction
|
| 137 |
+
if not extracted_path.exists():
|
| 138 |
+
raise ValueError(f"Expected {extracted_path} after extraction, but not found")
|
| 139 |
+
|
| 140 |
+
if not (extracted_path / "S01" / "training" / "frames").exists():
|
| 141 |
+
raise ValueError(f"Dataset structure incorrect. Missing S01/training/frames in {extracted_path}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
|
| 143 |
+
print(f"✅ Dataset extracted to {extracted_path}")
|
| 144 |
return extracted_path
|
| 145 |
|
| 146 |
|