MSherbinii committed on
Commit
97b37cd
·
verified ·
1 Parent(s): 0105ea4

Upload dataset.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. dataset.py +21 -68
dataset.py CHANGED
@@ -46,7 +46,9 @@ class IPADVideoDataset(Dataset):
46
  self.normalize = normalize
47
 
48
  # Construct path to device frames
49
- self.device_path = self.root_dir / device_name / split / "frames"
 
 
50
 
51
  if not self.device_path.exists():
52
  raise ValueError(f"Dataset path not found: {self.device_path}")
@@ -103,12 +105,22 @@ def download_and_extract_dataset(cache_dir: str = "./cache") -> Path:
103
  """
104
  Download IPAD dataset from HF Hub and extract it
105
 
 
 
 
106
  Returns:
107
- Path to extracted dataset directory
108
  """
109
  cache_dir = Path(cache_dir)
110
  cache_dir.mkdir(exist_ok=True, parents=True)
111
 
 
 
 
 
 
 
 
112
  print("πŸ“₯ Downloading dataset from HF Hub...")
113
  zip_path = hf_hub_download(
114
  repo_id="MSherbinii/ipad-industrial-anomaly",
@@ -117,77 +129,18 @@ def download_and_extract_dataset(cache_dir: str = "./cache") -> Path:
117
  cache_dir=str(cache_dir)
118
  )
119
 
120
- print(f"πŸ“¦ Downloaded to: {zip_path}")
121
  print(f"πŸ“¦ Extracting dataset to {cache_dir}...")
122
-
123
  with zipfile.ZipFile(zip_path, 'r') as zip_ref:
124
- # List first 20 files to see structure
125
- all_files = zip_ref.namelist()
126
- print(f"πŸ“ Total files in zip: {len(all_files)}")
127
- print(f"πŸ“ First 20 files:")
128
- for f in all_files[:20]:
129
- print(f" {f}")
130
-
131
  zip_ref.extractall(cache_dir)
132
 
133
- # Diagnostic: Show what was actually extracted
134
- print(f"\nπŸ“ Checking extracted contents in {cache_dir}:")
135
- for item in sorted(cache_dir.glob("*")):
136
- if item.is_dir():
137
- print(f" DIR: {item.name}/")
138
- # Show subdirectories
139
- for subitem in sorted(item.glob("*"))[:10]:
140
- if subitem.is_dir():
141
- print(f" └─ DIR: {subitem.name}/")
142
- else:
143
- print(f" └─ FILE: {subitem.name}")
144
- else:
145
- print(f" FILE: {item.name}")
146
-
147
- # Find the actual dataset root
148
- extracted_path = None
149
-
150
- # Try different possible paths
151
- candidates = [
152
- cache_dir / "ipad_dataset",
153
- cache_dir / "ipad_dataset" / "ipad_dataset",
154
- ]
155
-
156
- # Also check for any directory containing S01
157
- for item in cache_dir.rglob("S01"):
158
- if item.is_dir():
159
- # Go up to find the root that contains device folders
160
- potential_root = item.parent.parent if (item / "train").exists() or (item / "test").exists() else item.parent
161
- candidates.append(potential_root)
162
-
163
- print(f"\nπŸ” Checking candidate paths:")
164
- for candidate in candidates:
165
- print(f" {candidate}")
166
- if candidate.exists():
167
- has_s01 = (candidate / "S01").exists()
168
- has_train = (candidate / "S01" / "train").exists() if has_s01 else False
169
- has_frames = (candidate / "S01" / "train" / "frames").exists() if has_train else False
170
- print(f" S01: {has_s01}, train: {has_train}, frames: {has_frames}")
171
-
172
- if has_frames:
173
- extracted_path = candidate
174
- print(f" βœ… VALID ROOT!")
175
- break
176
-
177
- if extracted_path is None:
178
- # Last resort: find any S01 and work backwards
179
- s01_paths = list(cache_dir.rglob("S01"))
180
- if s01_paths:
181
- print(f"\n⚠️ Found S01 at: {s01_paths[0]}")
182
- print(f" Using parent as root")
183
- extracted_path = s01_paths[0].parent
184
- else:
185
- raise ValueError(f"Could not find valid dataset structure in {cache_dir}")
186
-
187
- print(f"\nβœ… Dataset root: {extracted_path}")
188
- print(f"βœ… Checking final path: {extracted_path / 'S01' / 'train' / 'frames'}")
189
- print(f" Exists: {(extracted_path / 'S01' / 'train' / 'frames').exists()}")
190
 
 
191
  return extracted_path
192
 
193
 
 
46
  self.normalize = normalize
47
 
48
  # Construct path to device frames
49
+ # Note: The dataset uses "training" and "testing", not "train" and "test"
50
+ split_folder = "training" if split == "train" else "testing"
51
+ self.device_path = self.root_dir / device_name / split_folder / "frames"
52
 
53
  if not self.device_path.exists():
54
  raise ValueError(f"Dataset path not found: {self.device_path}")
 
105
  """
106
  Download IPAD dataset from HF Hub and extract it
107
 
108
+ The zip contains: IPAD_dataset/S01/training/frames/...
109
+ We return the path to IPAD_dataset directory
110
+
111
  Returns:
112
+ Path to extracted dataset directory (IPAD_dataset)
113
  """
114
  cache_dir = Path(cache_dir)
115
  cache_dir.mkdir(exist_ok=True, parents=True)
116
 
117
+ extracted_path = cache_dir / "IPAD_dataset"
118
+
119
+ # Check if already extracted
120
+ if extracted_path.exists() and (extracted_path / "S01" / "training" / "frames").exists():
121
+ print(f"βœ… Dataset already extracted at {extracted_path}")
122
+ return extracted_path
123
+
124
  print("πŸ“₯ Downloading dataset from HF Hub...")
125
  zip_path = hf_hub_download(
126
  repo_id="MSherbinii/ipad-industrial-anomaly",
 
129
  cache_dir=str(cache_dir)
130
  )
131
 
 
132
  print(f"πŸ“¦ Extracting dataset to {cache_dir}...")
 
133
  with zipfile.ZipFile(zip_path, 'r') as zip_ref:
 
 
 
 
 
 
 
134
  zip_ref.extractall(cache_dir)
135
 
136
+ # Verify extraction
137
+ if not extracted_path.exists():
138
+ raise ValueError(f"Expected {extracted_path} after extraction, but not found")
139
+
140
+ if not (extracted_path / "S01" / "training" / "frames").exists():
141
+ raise ValueError(f"Dataset structure incorrect. Missing S01/training/frames in {extracted_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
143
+ print(f"βœ… Dataset extracted to {extracted_path}")
144
  return extracted_path
145
 
146