Update app.py with integrated training interface and full functionality
app.py CHANGED
@@ -1,6 +1,7 @@
@@ -12,48 +13,53 @@ import zipfile
@@ -67,158 +73,280 @@ def quick_test(device_name="S01"):
@@ -226,15 +354,45 @@ with gr.Blocks(title="IPAD VAD Training on ZeroGPU") as demo:

Removed in these hunks: the direct `from IPAD.model.video_swin_transformer import VST` and `from IPAD.train import train_one_epoch, validate` imports, the inline dataset setup that called `hf_hub_download(repo_id="MSherbinii/ipad-industrial-anomaly", filename="ipad_dataset.zip", repo_type="dataset", cache_dir="./cache")` and unpacked the archive with `zipfile`, the old `quick_test(device_name="S01")` GPU check, an earlier `train_baseline` stub that built an Adam optimizer, recorded the run configuration, and saved a checkpoint dict (`model_state_dict`, `optimizer_state_dict`, `config`), a checkpoint-upload helper built on `HfApi`, and the previous, simpler Gradio tabs and documentation text. The updated file follows.

#!/usr/bin/env python3
"""
IPAD VAD Training Interface on HuggingFace Spaces with ZeroGPU
Updated version with integrated training infrastructure
"""
import gradio as gr
import torch
# ... (unchanged lines omitted from the diff) ...
from huggingface_hub import hf_hub_download, HfApi
import subprocess
import sys
from typing import Optional, Dict

# Import training infrastructure
from train_hf import IPADTrainer
from dataset import download_and_extract_dataset, DEVICE_NAMES, SYNTHETIC_DEVICES
import spaces  # ZeroGPU decorator

# Global state
DATASET_PATH = None
CHECKPOINT_DIR = Path("./checkpoints")
CHECKPOINT_DIR.mkdir(exist_ok=True)

def setup_dataset(progress=gr.Progress()) -> str:
    """Download and extract IPAD dataset from HF Hub"""
    global DATASET_PATH

    progress(0, desc="Downloading dataset...")

    if DATASET_PATH and DATASET_PATH.exists():
        return f"✅ Dataset already available at {DATASET_PATH}"

    try:
        DATASET_PATH = download_and_extract_dataset(cache_dir="./cache")
        progress(1.0, desc="Complete!")
        return f"✅ Dataset downloaded and extracted to {DATASET_PATH}\n🚀 Ready for training!"

    except Exception as e:
        return f"❌ Error: {str(e)}"

@spaces.GPU(duration=60)  # Request GPU for 1 minute
def quick_gpu_test() -> Dict:
    """Quick test to verify GPU access and model loading"""
    try:
        from IPAD.model.video_swin_transformer import VST

        # Check GPU
        gpu_available = torch.cuda.is_available()
        gpu_name = torch.cuda.get_device_name(0) if gpu_available else "None"

        if not gpu_available:
            return {
                "status": "⚠️ Warning",
                "message": "No GPU available",
                "gpu_available": False,
                "gpu_name": "None"
            }

        # Load model
        model = VST(mem_dim=2000, shrink_thres=0.0025)
        model = model.cuda()
        # ... (unchanged lines omitted from the diff) ...

        result = {
            "status": "✅ Success",
            "message": "GPU test passed!",
            "gpu_available": True,
            "gpu_name": gpu_name,
            "output_shape": str(output['output'].shape),
            "attention_shape": str(output['att'].shape),
            "period_shape": str(output['recon_index'].shape),
            "memory_allocated_gb": f"{torch.cuda.memory_allocated() / 1e9:.2f}",
            "memory_reserved_gb": f"{torch.cuda.memory_reserved() / 1e9:.2f}"
        }

        return result

    except Exception as e:
        return {
            "status": "❌ Error",
            "message": str(e),
            "gpu_available": torch.cuda.is_available(),
            "gpu_name": torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None"
        }

@spaces.GPU(duration=3600)  # Request GPU for 1 hour
def train_quick_baseline(
    device_name: str = "S01",
    epochs: int = 10,
    batch_size: int = 4,
    lr: float = 1e-4,
    progress=gr.Progress()
) -> str:
    """Quick baseline training (10 epochs for testing)"""
    global DATASET_PATH

    if DATASET_PATH is None or not DATASET_PATH.exists():
        return "❌ Error: Dataset not downloaded. Please download dataset first."

    progress(0, desc="Initializing trainer...")

    try:
        # Create trainer
        trainer = IPADTrainer(
            device_name=device_name,
            epochs=epochs,
            batch_size=batch_size,
            lr=lr,
            mem_dim=2000,
            checkpoint_dir=str(CHECKPOINT_DIR),
            wandb_project=None,  # Disable wandb for quick test
            hf_repo=None  # Disable auto-upload for quick test
        )

        progress(0.1, desc="Loading dataset...")

        # Train
        trainer.train(str(DATASET_PATH))

        progress(1.0, desc="Training complete!")

        # Get latest checkpoint
        checkpoints = list(CHECKPOINT_DIR.glob(f"{device_name}_*.pth"))
        latest_checkpoint = max(checkpoints, key=lambda p: p.stat().st_mtime) if checkpoints else None

        result = f"""
✅ Quick baseline training complete!

📊 Configuration:
- Device: {device_name}
- Epochs: {epochs}
- Batch Size: {batch_size}
- Learning Rate: {lr}

💾 Checkpoint:
- {latest_checkpoint.name if latest_checkpoint else 'No checkpoint saved'}

🎯 Next Steps:
1. Review training metrics
2. Run full 200-epoch training
3. Evaluate on test set
"""
        return result

    except Exception as e:
        return f"❌ Training failed: {str(e)}\n\nPlease check the logs for details."

@spaces.GPU(duration=7200)  # Request GPU for 2 hours
def train_full_baseline(
    device_name: str = "S01",
    epochs: int = 200,
    batch_size: int = 4,
    lr: float = 1e-4,
    mem_dim: int = 2000,
    enable_wandb: bool = False,
    enable_hf_upload: bool = True,
    progress=gr.Progress()
) -> str:
    """Full baseline training (200 epochs)"""
    global DATASET_PATH

    if DATASET_PATH is None or not DATASET_PATH.exists():
        return "❌ Error: Dataset not downloaded. Please download dataset first."

    progress(0, desc="Initializing full training...")

    try:
        # Create trainer
        trainer = IPADTrainer(
            device_name=device_name,
            epochs=epochs,
            batch_size=batch_size,
            lr=lr,
            mem_dim=mem_dim,
            checkpoint_dir=str(CHECKPOINT_DIR),
            wandb_project="ipad-vad" if enable_wandb else None,
            hf_repo="MSherbinii/ipad-vad-checkpoints" if enable_hf_upload else None
        )

        progress(0.05, desc="Loading dataset...")

        # Train
        trainer.train(str(DATASET_PATH))

        progress(1.0, desc="Training complete!")

        # Get final checkpoint
        checkpoints = list(CHECKPOINT_DIR.glob(f"{device_name}_*.pth"))
        latest_checkpoint = max(checkpoints, key=lambda p: p.stat().st_mtime) if checkpoints else None

        result = f"""
✅ Full baseline training complete!

📊 Configuration:
- Device: {device_name}
- Epochs: {epochs}
- Batch Size: {batch_size}
- Learning Rate: {lr}
- Memory Dimension: {mem_dim}

💾 Checkpoints:
- Total saved: {len(checkpoints)}
- Latest: {latest_checkpoint.name if latest_checkpoint else 'None'}

☁️ HuggingFace Hub:
- {'✅ Uploaded to MSherbinii/ipad-vad-checkpoints' if enable_hf_upload else '❌ Upload disabled'}

📈 WandB Logging:
- {'✅ Logged to ipad-vad project' if enable_wandb else '❌ Logging disabled'}

🎯 Expected Performance:
- Target AUC for {device_name}: Check baseline results table
- Paper baseline avg: 68.6%
"""
        return result

    except Exception as e:
        return f"❌ Training failed: {str(e)}\n\nPlease check the logs for details."

def list_checkpoints() -> str:
    """List all saved checkpoints"""
    checkpoints = sorted(CHECKPOINT_DIR.glob("*.pth"))

    if not checkpoints:
        return "📭 No checkpoints found"

    result = "💾 **Available Checkpoints:**\n\n"
    for ckpt in checkpoints:
        size_mb = ckpt.stat().st_size / (1024 * 1024)
        modified = datetime.fromtimestamp(ckpt.stat().st_mtime).strftime("%Y-%m-%d %H:%M")
        result += f"- `{ckpt.name}` ({size_mb:.1f} MB, modified {modified})\n"

    return result

# Gradio Interface
with gr.Blocks(title="IPAD VAD Training on ZeroGPU", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🏭 IPAD: Industrial Process Anomaly Detection Training")
    gr.Markdown("Train video anomaly detection models on ZeroGPU with the IPAD dataset")

    with gr.Tab("📥 Setup"):
        gr.Markdown("## 1️⃣ Download Dataset from HF Hub")
        gr.Markdown("Downloads the 8.3GB IPAD dataset. **This only needs to be done once** - the dataset is cached.")

        download_btn = gr.Button("📥 Download Dataset", variant="primary", size="lg")
        download_output = gr.Textbox(label="Download Status", lines=4)
        download_btn.click(setup_dataset, outputs=download_output)

        gr.Markdown("---")
        gr.Markdown("## 2️⃣ Test GPU Access")
        gr.Markdown("Verify that ZeroGPU is working and the model loads correctly. **No dataset required.**")

        test_btn = gr.Button("🧪 Run GPU Test", variant="secondary")
        test_output = gr.JSON(label="GPU Test Results")
        test_btn.click(quick_gpu_test, outputs=test_output)

    with gr.Tab("⚡ Quick Test (10 epochs)"):
        gr.Markdown("## Quick Baseline Test")
        gr.Markdown("Train for 10 epochs to verify everything works. Takes ~10-15 minutes.")

        with gr.Row():
            quick_device = gr.Dropdown(
                choices=SYNTHETIC_DEVICES,
                value="S01",
                label="Device"
            )
            quick_epochs = gr.Slider(5, 50, value=10, step=5, label="Epochs")

        with gr.Row():
            quick_batch = gr.Slider(1, 8, value=4, step=1, label="Batch Size")
            quick_lr = gr.Number(value=1e-4, label="Learning Rate", precision=6)

        quick_train_btn = gr.Button("🚀 Start Quick Training", variant="primary", size="lg")
        quick_output = gr.Textbox(label="Training Results", lines=15)

        quick_train_btn.click(
            train_quick_baseline,
            inputs=[quick_device, quick_epochs, quick_batch, quick_lr],
            outputs=quick_output
        )

    with gr.Tab("🎯 Full Training (200 epochs)"):
        gr.Markdown("## Full Baseline Training")
        gr.Markdown("Complete 200-epoch training to match paper results. Takes ~2-3 hours.")

        with gr.Row():
            full_device = gr.Dropdown(
                choices=SYNTHETIC_DEVICES,
                value="S01",
                label="Training Device"
            )
            full_epochs = gr.Slider(50, 300, value=200, step=10, label="Epochs")

        with gr.Row():
            full_batch = gr.Slider(1, 8, value=4, step=1, label="Batch Size")
            full_lr = gr.Number(value=1e-4, label="Learning Rate", precision=6)

        with gr.Row():
            full_mem_dim = gr.Slider(500, 2000, value=2000, step=100, label="Memory Dimension")
            full_wandb = gr.Checkbox(value=False, label="Enable WandB Logging")
            full_hf_upload = gr.Checkbox(value=True, label="Upload to HF Hub")

        full_train_btn = gr.Button("🚀 Start Full Training", variant="primary", size="lg")
        full_output = gr.Textbox(label="Training Results", lines=20)

        full_train_btn.click(
            train_full_baseline,
            inputs=[full_device, full_epochs, full_batch, full_lr, full_mem_dim, full_wandb, full_hf_upload],
            outputs=full_output
        )

    with gr.Tab("💾 Checkpoints"):
        gr.Markdown("## Checkpoint Management")

        refresh_btn = gr.Button("🔄 Refresh Checkpoint List")
        checkpoint_list = gr.Markdown(value=list_checkpoints())
        refresh_btn.click(list_checkpoints, outputs=checkpoint_list)

        gr.Markdown("### Checkpoint Info")
        gr.Markdown("""
- Checkpoints are saved every 10 epochs
- Best model (lowest val loss) is automatically selected
- Files are in PyTorch `.pth` format
- Can be loaded with `torch.load(checkpoint_path)`
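
A minimal loading sketch (this assumes the file stores a `model_state_dict` entry, as the earlier version of this app saved; adjust the keys to whatever `IPADTrainer` actually writes):

```
import torch
from IPAD.model.video_swin_transformer import VST

# Hypothetical filename; use any checkpoint listed above
ckpt = torch.load("checkpoints/S01_epoch_10.pth", map_location="cpu")

model = VST(mem_dim=2000, shrink_thres=0.0025)
model.load_state_dict(ckpt["model_state_dict"])
model.eval()
```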
""")

    with gr.Tab("📚 Documentation"):
        gr.Markdown("""
## IPAD VAD Training Guide

### Quick Start
1. **Download Dataset**: Go to "Setup" tab and download the IPAD dataset (once)
2. **GPU Test**: Verify GPU access in "Setup" tab
3. **Quick Test**: Train for 10 epochs in "Quick Test" tab to verify setup
4. **Full Training**: Launch 200-epoch training in "Full Training" tab

### Hardware
- **GPU**: NVIDIA H200 (via ZeroGPU)
- **VRAM**: 80GB HBM3
- **Duration**: 1-2 hours per full training session

### Model Architecture
- **Encoder**: Video Swin Transformer (768-dim features)
- ...
- **Period Module**: 200-class temporal position classifier
- **Decoder**: I3D-based 3D decoder
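
A minimal smoke test of this architecture, mirroring the GPU test in the Setup tab (the `(batch, channels, frames, height, width)` layout and the single-tensor forward call are assumptions; the 16-frame, 256x256 clip size comes from the training configuration below):

```
import torch
from IPAD.model.video_swin_transformer import VST

model = VST(mem_dim=2000, shrink_thres=0.0025).cuda().eval()

# One dummy clip: 16 frames of 256x256 RGB (assumed B, C, T, H, W layout)
clip = torch.randn(1, 3, 16, 256, 256, device="cuda")

with torch.no_grad():
    out = model(clip)

print(out["output"].shape)       # reconstruction
print(out["att"].shape)          # memory attention weights
print(out["recon_index"].shape)  # temporal position (period) logits
```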

### Expected Baseline Results (200 epochs)

| Device | AUC (%) | Device | AUC (%) |
|--------|---------|--------|---------|
| S01 | 69.5 | S07 | 60.6 |
| S02 | 63.9 | S08 | 85.6 |
| S03 | 70.6 | S09 | 71.2 |
| S04 | 58.3 | S10 | 62.2 |
| S05 | 86.2 | S11 | 60.9 |
| S06 | 61.2 | S12 | 67.1 |
| **Avg** | **68.6** | | |

### Training Configuration
- **Batch Size**: 4 (default, can increase with more VRAM)
- **Learning Rate**: 1e-4 (Adam optimizer)
- **Clip Length**: 16 frames
- **Frame Size**: 256×256 pixels
- **Mixed Precision**: FP16 (automatic)
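
To reproduce this configuration from a script, the trainer this app wraps can be called directly; this mirrors what the Full Training tab does behind the button:

```
from train_hf import IPADTrainer
from dataset import download_and_extract_dataset

dataset_path = download_and_extract_dataset(cache_dir="./cache")

trainer = IPADTrainer(
    device_name="S01",
    epochs=200,
    batch_size=4,
    lr=1e-4,
    mem_dim=2000,
    checkpoint_dir="./checkpoints",
    wandb_project=None,   # or "ipad-vad" to log to WandB
    hf_repo=None          # or "MSherbinii/ipad-vad-checkpoints" to auto-upload
)
trainer.train(str(dataset_path))
```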

### Loss Function
```
Total Loss = Reconstruction Loss
           + 0.0002 × Entropy Loss
           + 0.02 × Period Loss
```
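
As a sketch, that weighting combines the three terms like this; the term definitions below are illustrative stand-ins (the real ones live in the IPAD training code), only the 0.0002 and 0.02 weights are taken from the block above:

```
import torch
import torch.nn.functional as F

# Dummy stand-ins just to make the weighting concrete (shapes are illustrative)
recon = torch.randn(1, 3, 16, 256, 256)             # model reconstruction
target = torch.randn(1, 3, 16, 256, 256)            # input clip
att = torch.softmax(torch.randn(1, 2000), dim=-1)   # memory attention weights
period_logits = torch.randn(1, 200)                 # 200-class temporal position logits
period_target = torch.tensor([0])                   # ground-truth temporal position

recon_loss = F.mse_loss(recon, target)
entropy_loss = (-att * (att + 1e-12).log()).sum(dim=-1).mean()
period_loss = F.cross_entropy(period_logits, period_target)

total_loss = recon_loss + 0.0002 * entropy_loss + 0.02 * period_loss
```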

### Resources
- [Paper](https://arxiv.org/abs/2404.15033)
- [Dataset](https://huggingface.co/datasets/MSherbinii/ipad-industrial-anomaly)
- [Original Code](https://github.com/LJF1113/IPAD)
- [Checkpoints](https://huggingface.co/MSherbinii/ipad-vad-checkpoints)
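
To pull a trained checkpoint from the Hub outside this Space, `hf_hub_download` (already imported at the top of this app) is enough; the filename below is a placeholder, list the repo first to see what was actually uploaded:

```
from huggingface_hub import hf_hub_download, list_repo_files

repo_id = "MSherbinii/ipad-vad-checkpoints"
print(list_repo_files(repo_id))  # see which checkpoint files exist

# Hypothetical filename, shown for illustration only
ckpt_path = hf_hub_download(repo_id=repo_id, filename="S01_epoch_200.pth")
```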

### Next Steps (SOTA Improvements)
After baseline reproduction:
1. **Modern Transformer**: Replace Video Swin → MViTv2 (+2-4% AUC)
2. **Diffusion Decoder**: Add diffusion-based reconstruction (+3-5% AUC)
3. **Enhanced Memory**: GWN regularization (+1-3% AUC)

**Target**: 75-80% average AUC (vs 68.6% baseline)
""")

if __name__ == "__main__":