MSherbinii committed on
Commit
97b37cd
·
verified ·
1 Parent(s): 0105ea4

Upload dataset.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. dataset.py +21 -68
dataset.py CHANGED
@@ -46,7 +46,9 @@ class IPADVideoDataset(Dataset):
46
  self.normalize = normalize
47
 
48
  # Construct path to device frames
49
- self.device_path = self.root_dir / device_name / split / "frames"
 
 
50
 
51
  if not self.device_path.exists():
52
  raise ValueError(f"Dataset path not found: {self.device_path}")
@@ -103,12 +105,22 @@ def download_and_extract_dataset(cache_dir: str = "./cache") -> Path:
103
  """
104
  Download IPAD dataset from HF Hub and extract it
105
 
 
 
 
106
  Returns:
107
- Path to extracted dataset directory
108
  """
109
  cache_dir = Path(cache_dir)
110
  cache_dir.mkdir(exist_ok=True, parents=True)
111
 
 
 
 
 
 
 
 
112
  print("πŸ“₯ Downloading dataset from HF Hub...")
113
  zip_path = hf_hub_download(
114
  repo_id="MSherbinii/ipad-industrial-anomaly",
@@ -117,77 +129,18 @@ def download_and_extract_dataset(cache_dir: str = "./cache") -> Path:
117
  cache_dir=str(cache_dir)
118
  )
119
 
120
- print(f"πŸ“¦ Downloaded to: {zip_path}")
121
  print(f"πŸ“¦ Extracting dataset to {cache_dir}...")
122
-
123
  with zipfile.ZipFile(zip_path, 'r') as zip_ref:
124
- # List first 20 files to see structure
125
- all_files = zip_ref.namelist()
126
- print(f"πŸ“ Total files in zip: {len(all_files)}")
127
- print(f"πŸ“ First 20 files:")
128
- for f in all_files[:20]:
129
- print(f" {f}")
130
-
131
  zip_ref.extractall(cache_dir)
132
 
133
- # Diagnostic: Show what was actually extracted
134
- print(f"\nπŸ“ Checking extracted contents in {cache_dir}:")
135
- for item in sorted(cache_dir.glob("*")):
136
- if item.is_dir():
137
- print(f" DIR: {item.name}/")
138
- # Show subdirectories
139
- for subitem in sorted(item.glob("*"))[:10]:
140
- if subitem.is_dir():
141
- print(f" └─ DIR: {subitem.name}/")
142
- else:
143
- print(f" └─ FILE: {subitem.name}")
144
- else:
145
- print(f" FILE: {item.name}")
146
-
147
- # Find the actual dataset root
148
- extracted_path = None
149
-
150
- # Try different possible paths
151
- candidates = [
152
- cache_dir / "ipad_dataset",
153
- cache_dir / "ipad_dataset" / "ipad_dataset",
154
- ]
155
-
156
- # Also check for any directory containing S01
157
- for item in cache_dir.rglob("S01"):
158
- if item.is_dir():
159
- # Go up to find the root that contains device folders
160
- potential_root = item.parent.parent if (item / "train").exists() or (item / "test").exists() else item.parent
161
- candidates.append(potential_root)
162
-
163
- print(f"\nπŸ” Checking candidate paths:")
164
- for candidate in candidates:
165
- print(f" {candidate}")
166
- if candidate.exists():
167
- has_s01 = (candidate / "S01").exists()
168
- has_train = (candidate / "S01" / "train").exists() if has_s01 else False
169
- has_frames = (candidate / "S01" / "train" / "frames").exists() if has_train else False
170
- print(f" S01: {has_s01}, train: {has_train}, frames: {has_frames}")
171
-
172
- if has_frames:
173
- extracted_path = candidate
174
- print(f" βœ… VALID ROOT!")
175
- break
176
-
177
- if extracted_path is None:
178
- # Last resort: find any S01 and work backwards
179
- s01_paths = list(cache_dir.rglob("S01"))
180
- if s01_paths:
181
- print(f"\n⚠️ Found S01 at: {s01_paths[0]}")
182
- print(f" Using parent as root")
183
- extracted_path = s01_paths[0].parent
184
- else:
185
- raise ValueError(f"Could not find valid dataset structure in {cache_dir}")
186
-
187
- print(f"\nβœ… Dataset root: {extracted_path}")
188
- print(f"βœ… Checking final path: {extracted_path / 'S01' / 'train' / 'frames'}")
189
- print(f" Exists: {(extracted_path / 'S01' / 'train' / 'frames').exists()}")
190
 
 
191
  return extracted_path
192
 
193
 
 
46
  self.normalize = normalize
47
 
48
  # Construct path to device frames
49
+ # Note: The dataset uses "training" and "testing", not "train" and "test"
50
+ split_folder = "training" if split == "train" else "testing"
51
+ self.device_path = self.root_dir / device_name / split_folder / "frames"
52
 
53
  if not self.device_path.exists():
54
  raise ValueError(f"Dataset path not found: {self.device_path}")
 
105
  """
106
  Download IPAD dataset from HF Hub and extract it
107
 
108
+ The zip contains: IPAD_dataset/S01/training/frames/...
109
+ We return the path to IPAD_dataset directory
110
+
111
  Returns:
112
+ Path to extracted dataset directory (IPAD_dataset)
113
  """
114
  cache_dir = Path(cache_dir)
115
  cache_dir.mkdir(exist_ok=True, parents=True)
116
 
117
+ extracted_path = cache_dir / "IPAD_dataset"
118
+
119
+ # Check if already extracted
120
+ if extracted_path.exists() and (extracted_path / "S01" / "training" / "frames").exists():
121
+ print(f"βœ… Dataset already extracted at {extracted_path}")
122
+ return extracted_path
123
+
124
  print("πŸ“₯ Downloading dataset from HF Hub...")
125
  zip_path = hf_hub_download(
126
  repo_id="MSherbinii/ipad-industrial-anomaly",
 
129
  cache_dir=str(cache_dir)
130
  )
131
 
 
132
  print(f"πŸ“¦ Extracting dataset to {cache_dir}...")
 
133
  with zipfile.ZipFile(zip_path, 'r') as zip_ref:
 
 
 
 
 
 
 
134
  zip_ref.extractall(cache_dir)
135
 
136
+ # Verify extraction
137
+ if not extracted_path.exists():
138
+ raise ValueError(f"Expected {extracted_path} after extraction, but not found")
139
+
140
+ if not (extracted_path / "S01" / "training" / "frames").exists():
141
+ raise ValueError(f"Dataset structure incorrect. Missing S01/training/frames in {extracted_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
143
+ print(f"βœ… Dataset extracted to {extracted_path}")
144
  return extracted_path
145
 
146