MSherbinii committed on
Commit
0105ea4
Β·
verified Β·
1 Parent(s): b8c2b74

Upload dataset.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. dataset.py +66 -24
dataset.py CHANGED
@@ -109,12 +109,6 @@ def download_and_extract_dataset(cache_dir: str = "./cache") -> Path:
109
  cache_dir = Path(cache_dir)
110
  cache_dir.mkdir(exist_ok=True, parents=True)
111
 
112
- extracted_path = cache_dir / "ipad_dataset"
113
-
114
- if extracted_path.exists():
115
- print(f"βœ… Dataset already extracted at {extracted_path}")
116
- return extracted_path
117
-
118
  print("πŸ“₯ Downloading dataset from HF Hub...")
119
  zip_path = hf_hub_download(
120
  repo_id="MSherbinii/ipad-industrial-anomaly",
@@ -123,29 +117,77 @@ def download_and_extract_dataset(cache_dir: str = "./cache") -> Path:
123
  cache_dir=str(cache_dir)
124
  )
125
 
126
- print("πŸ“¦ Extracting dataset...")
 
 
127
  with zipfile.ZipFile(zip_path, 'r') as zip_ref:
 
 
 
 
 
 
 
128
  zip_ref.extractall(cache_dir)
129
 
130
- # Check actual structure and return correct path
131
- # The zip might extract to cache/ipad_dataset/ or cache/ipad_dataset/ipad_dataset/
132
- if (cache_dir / "ipad_dataset" / "ipad_dataset").exists():
133
- # Nested structure, use inner one
134
- extracted_path = cache_dir / "ipad_dataset" / "ipad_dataset"
135
- elif (cache_dir / "ipad_dataset").exists():
136
- # Direct structure
137
- extracted_path = cache_dir / "ipad_dataset"
138
- else:
139
- # Check what was actually extracted
140
- extracted_items = list(cache_dir.glob("*"))
141
- print(f"πŸ“ Extracted items: {[p.name for p in extracted_items if p.is_dir()]}")
142
- # Try to find a directory with device folders
143
- for item in extracted_items:
144
- if item.is_dir() and (item / "S01").exists():
145
- extracted_path = item
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  break
147
 
148
- print(f"βœ… Dataset extracted to {extracted_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  return extracted_path
150
 
151
 
 
109
  cache_dir = Path(cache_dir)
110
  cache_dir.mkdir(exist_ok=True, parents=True)
111
 
 
 
 
 
 
 
112
  print("πŸ“₯ Downloading dataset from HF Hub...")
113
  zip_path = hf_hub_download(
114
  repo_id="MSherbinii/ipad-industrial-anomaly",
 
117
  cache_dir=str(cache_dir)
118
  )
119
 
120
+ print(f"πŸ“¦ Downloaded to: {zip_path}")
121
+ print(f"πŸ“¦ Extracting dataset to {cache_dir}...")
122
+
123
  with zipfile.ZipFile(zip_path, 'r') as zip_ref:
124
+ # List first 20 files to see structure
125
+ all_files = zip_ref.namelist()
126
+ print(f"πŸ“ Total files in zip: {len(all_files)}")
127
+ print(f"πŸ“ First 20 files:")
128
+ for f in all_files[:20]:
129
+ print(f" {f}")
130
+
131
  zip_ref.extractall(cache_dir)
132
 
133
+ # Diagnostic: Show what was actually extracted
134
+ print(f"\nπŸ“ Checking extracted contents in {cache_dir}:")
135
+ for item in sorted(cache_dir.glob("*")):
136
+ if item.is_dir():
137
+ print(f" DIR: {item.name}/")
138
+ # Show subdirectories
139
+ for subitem in sorted(item.glob("*"))[:10]:
140
+ if subitem.is_dir():
141
+ print(f" └─ DIR: {subitem.name}/")
142
+ else:
143
+ print(f" └─ FILE: {subitem.name}")
144
+ else:
145
+ print(f" FILE: {item.name}")
146
+
147
+ # Find the actual dataset root
148
+ extracted_path = None
149
+
150
+ # Try different possible paths
151
+ candidates = [
152
+ cache_dir / "ipad_dataset",
153
+ cache_dir / "ipad_dataset" / "ipad_dataset",
154
+ ]
155
+
156
+ # Also check for any directory containing S01
157
+ for item in cache_dir.rglob("S01"):
158
+ if item.is_dir():
159
+ # Go up to find the root that contains device folders
160
+ potential_root = item.parent.parent if (item / "train").exists() or (item / "test").exists() else item.parent
161
+ candidates.append(potential_root)
162
+
163
+ print(f"\nπŸ” Checking candidate paths:")
164
+ for candidate in candidates:
165
+ print(f" {candidate}")
166
+ if candidate.exists():
167
+ has_s01 = (candidate / "S01").exists()
168
+ has_train = (candidate / "S01" / "train").exists() if has_s01 else False
169
+ has_frames = (candidate / "S01" / "train" / "frames").exists() if has_train else False
170
+ print(f" S01: {has_s01}, train: {has_train}, frames: {has_frames}")
171
+
172
+ if has_frames:
173
+ extracted_path = candidate
174
+ print(f" βœ… VALID ROOT!")
175
  break
176
 
177
+ if extracted_path is None:
178
+ # Last resort: find any S01 and work backwards
179
+ s01_paths = list(cache_dir.rglob("S01"))
180
+ if s01_paths:
181
+ print(f"\n⚠️ Found S01 at: {s01_paths[0]}")
182
+ print(f" Using parent as root")
183
+ extracted_path = s01_paths[0].parent
184
+ else:
185
+ raise ValueError(f"Could not find valid dataset structure in {cache_dir}")
186
+
187
+ print(f"\nβœ… Dataset root: {extracted_path}")
188
+ print(f"βœ… Checking final path: {extracted_path / 'S01' / 'train' / 'frames'}")
189
+ print(f" Exists: {(extracted_path / 'S01' / 'train' / 'frames').exists()}")
190
+
191
  return extracted_path
192
 
193