MSherbinii committed on
Commit
0105ea4
Β·
verified Β·
1 Parent(s): b8c2b74

Upload dataset.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. dataset.py +66 -24
dataset.py CHANGED
@@ -109,12 +109,6 @@ def download_and_extract_dataset(cache_dir: str = "./cache") -> Path:
109
  cache_dir = Path(cache_dir)
110
  cache_dir.mkdir(exist_ok=True, parents=True)
111
 
112
- extracted_path = cache_dir / "ipad_dataset"
113
-
114
- if extracted_path.exists():
115
- print(f"βœ… Dataset already extracted at {extracted_path}")
116
- return extracted_path
117
-
118
  print("πŸ“₯ Downloading dataset from HF Hub...")
119
  zip_path = hf_hub_download(
120
  repo_id="MSherbinii/ipad-industrial-anomaly",
@@ -123,29 +117,77 @@ def download_and_extract_dataset(cache_dir: str = "./cache") -> Path:
123
  cache_dir=str(cache_dir)
124
  )
125
 
126
- print("πŸ“¦ Extracting dataset...")
 
 
127
  with zipfile.ZipFile(zip_path, 'r') as zip_ref:
 
 
 
 
 
 
 
128
  zip_ref.extractall(cache_dir)
129
 
130
- # Check actual structure and return correct path
131
- # The zip might extract to cache/ipad_dataset/ or cache/ipad_dataset/ipad_dataset/
132
- if (cache_dir / "ipad_dataset" / "ipad_dataset").exists():
133
- # Nested structure, use inner one
134
- extracted_path = cache_dir / "ipad_dataset" / "ipad_dataset"
135
- elif (cache_dir / "ipad_dataset").exists():
136
- # Direct structure
137
- extracted_path = cache_dir / "ipad_dataset"
138
- else:
139
- # Check what was actually extracted
140
- extracted_items = list(cache_dir.glob("*"))
141
- print(f"πŸ“ Extracted items: {[p.name for p in extracted_items if p.is_dir()]}")
142
- # Try to find a directory with device folders
143
- for item in extracted_items:
144
- if item.is_dir() and (item / "S01").exists():
145
- extracted_path = item
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  break
147
 
148
- print(f"βœ… Dataset extracted to {extracted_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  return extracted_path
150
 
151
 
 
109
  cache_dir = Path(cache_dir)
110
  cache_dir.mkdir(exist_ok=True, parents=True)
111
 
 
 
 
 
 
 
112
  print("πŸ“₯ Downloading dataset from HF Hub...")
113
  zip_path = hf_hub_download(
114
  repo_id="MSherbinii/ipad-industrial-anomaly",
 
117
  cache_dir=str(cache_dir)
118
  )
119
 
120
+ print(f"πŸ“¦ Downloaded to: {zip_path}")
121
+ print(f"πŸ“¦ Extracting dataset to {cache_dir}...")
122
+
123
  with zipfile.ZipFile(zip_path, 'r') as zip_ref:
124
+ # List first 20 files to see structure
125
+ all_files = zip_ref.namelist()
126
+ print(f"πŸ“ Total files in zip: {len(all_files)}")
127
+ print(f"πŸ“ First 20 files:")
128
+ for f in all_files[:20]:
129
+ print(f" {f}")
130
+
131
  zip_ref.extractall(cache_dir)
132
 
133
+ # Diagnostic: Show what was actually extracted
134
+ print(f"\nπŸ“ Checking extracted contents in {cache_dir}:")
135
+ for item in sorted(cache_dir.glob("*")):
136
+ if item.is_dir():
137
+ print(f" DIR: {item.name}/")
138
+ # Show subdirectories
139
+ for subitem in sorted(item.glob("*"))[:10]:
140
+ if subitem.is_dir():
141
+ print(f" └─ DIR: {subitem.name}/")
142
+ else:
143
+ print(f" └─ FILE: {subitem.name}")
144
+ else:
145
+ print(f" FILE: {item.name}")
146
+
147
+ # Find the actual dataset root
148
+ extracted_path = None
149
+
150
+ # Try different possible paths
151
+ candidates = [
152
+ cache_dir / "ipad_dataset",
153
+ cache_dir / "ipad_dataset" / "ipad_dataset",
154
+ ]
155
+
156
+ # Also check for any directory containing S01
157
+ for item in cache_dir.rglob("S01"):
158
+ if item.is_dir():
159
+ # Go up to find the root that contains device folders
160
+ potential_root = item.parent.parent if (item / "train").exists() or (item / "test").exists() else item.parent
161
+ candidates.append(potential_root)
162
+
163
+ print(f"\nπŸ” Checking candidate paths:")
164
+ for candidate in candidates:
165
+ print(f" {candidate}")
166
+ if candidate.exists():
167
+ has_s01 = (candidate / "S01").exists()
168
+ has_train = (candidate / "S01" / "train").exists() if has_s01 else False
169
+ has_frames = (candidate / "S01" / "train" / "frames").exists() if has_train else False
170
+ print(f" S01: {has_s01}, train: {has_train}, frames: {has_frames}")
171
+
172
+ if has_frames:
173
+ extracted_path = candidate
174
+ print(f" βœ… VALID ROOT!")
175
  break
176
 
177
+ if extracted_path is None:
178
+ # Last resort: find any S01 and work backwards
179
+ s01_paths = list(cache_dir.rglob("S01"))
180
+ if s01_paths:
181
+ print(f"\n⚠️ Found S01 at: {s01_paths[0]}")
182
+ print(f" Using parent as root")
183
+ extracted_path = s01_paths[0].parent
184
+ else:
185
+ raise ValueError(f"Could not find valid dataset structure in {cache_dir}")
186
+
187
+ print(f"\nβœ… Dataset root: {extracted_path}")
188
+ print(f"βœ… Checking final path: {extracted_path / 'S01' / 'train' / 'frames'}")
189
+ print(f" Exists: {(extracted_path / 'S01' / 'train' / 'frames').exists()}")
190
+
191
  return extracted_path
192
 
193