MSherbinii committed on
Commit
6c6de8d
·
verified ·
1 Parent(s): c3981cb

Update app.py with integrated training interface and full functionality

Browse files
Files changed (1) hide show
  1. app.py +286 -128
app.py CHANGED
@@ -1,6 +1,7 @@
1
  #!/usr/bin/env python3
2
  """
3
  IPAD VAD Training Interface on HuggingFace Spaces with ZeroGPU
 
4
  """
5
  import gradio as gr
6
  import torch
@@ -12,48 +13,53 @@ import zipfile
12
  from huggingface_hub import hf_hub_download, HfApi
13
  import subprocess
14
  import sys
 
15
 
16
- # Add IPAD code to path
17
- sys.path.insert(0, str(Path(__file__).parent / "IPAD"))
18
-
19
- from IPAD.model.video_swin_transformer import VST
20
- from IPAD.train import train_one_epoch, validate
21
  import spaces # ZeroGPU decorator
22
 
23
  # Global state
24
- DATASET_PATH = Path("./ipad_data")
25
  CHECKPOINT_DIR = Path("./checkpoints")
26
  CHECKPOINT_DIR.mkdir(exist_ok=True)
27
 
28
- def download_dataset(progress=gr.Progress()):
29
  """Download and extract IPAD dataset from HF Hub"""
 
 
30
  progress(0, desc="Downloading dataset...")
31
 
32
- if DATASET_PATH.exists():
33
- return "βœ… Dataset already downloaded"
34
 
35
  try:
36
- zip_path = hf_hub_download(
37
- repo_id="MSherbinii/ipad-industrial-anomaly",
38
- filename="ipad_dataset.zip",
39
- repo_type="dataset",
40
- cache_dir="./cache"
41
- )
42
-
43
- progress(0.5, desc="Extracting dataset...")
44
- with zipfile.ZipFile(zip_path, 'r') as zip_ref:
45
- zip_ref.extractall(DATASET_PATH.parent)
46
-
47
  progress(1.0, desc="Complete!")
48
- return f"βœ… Dataset downloaded and extracted to {DATASET_PATH}"
49
 
50
  except Exception as e:
51
  return f"❌ Error: {str(e)}"
52
 
53
- @spaces.GPU(duration=120) # Request GPU for 2 minutes
54
- def quick_test(device_name="S01"):
55
- """Quick test to verify model and data loading"""
56
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  # Load model
58
  model = VST(mem_dim=2000, shrink_thres=0.0025)
59
  model = model.cuda()
@@ -67,158 +73,280 @@ def quick_test(device_name="S01"):
67
 
68
  result = {
69
  "status": "βœ… Success",
 
 
 
70
  "output_shape": str(output['output'].shape),
71
  "attention_shape": str(output['att'].shape),
72
  "period_shape": str(output['recon_index'].shape),
73
- "gpu_available": torch.cuda.is_available(),
74
- "gpu_name": torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None"
75
  }
76
 
77
- return json.dumps(result, indent=2)
78
 
79
  except Exception as e:
80
- return f"❌ Error: {str(e)}"
 
 
 
 
 
81
 
82
  @spaces.GPU(duration=3600) # Request GPU for 1 hour
83
- def train_baseline(
84
- device_name="S01",
85
- epochs=10,
86
- batch_size=4,
87
- lr=1e-4,
88
- mem_dim=2000,
89
  progress=gr.Progress()
90
- ):
91
- """Train baseline IPAD model on selected device"""
 
 
 
 
92
 
93
- progress(0, desc="Initializing training...")
94
 
95
  try:
96
- # Model setup
97
- model = VST(mem_dim=mem_dim, shrink_thres=0.0025)
98
- model = model.cuda()
 
 
 
 
 
 
 
 
99
 
100
- # Optimizer
101
- optimizer = torch.optim.Adam(model.parameters(), lr=lr)
102
 
103
- # Training loop placeholder
104
- # (Full implementation requires dataset loaders from IPAD/train.py)
105
 
106
- results = {
107
- "status": "βœ… Training started",
108
- "device": device_name,
109
- "epochs": epochs,
110
- "batch_size": batch_size,
111
- "lr": lr,
112
- "mem_dim": mem_dim,
113
- "checkpoint_dir": str(CHECKPOINT_DIR)
114
- }
115
 
116
- # Save checkpoint
117
- checkpoint_path = CHECKPOINT_DIR / f"baseline_{device_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.pth"
118
- torch.save({
119
- 'model_state_dict': model.state_dict(),
120
- 'optimizer_state_dict': optimizer.state_dict(),
121
- 'config': results
122
- }, checkpoint_path)
123
 
124
- results["checkpoint"] = str(checkpoint_path)
 
125
 
126
- progress(1.0, desc="Training complete!")
127
- return json.dumps(results, indent=2)
 
 
 
 
 
 
 
 
 
 
 
 
 
128
 
129
  except Exception as e:
130
- return f"❌ Error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
- def upload_checkpoint(checkpoint_name):
133
- """Upload trained checkpoint to HF Hub"""
134
- try:
135
- api = HfApi()
136
- checkpoint_path = CHECKPOINT_DIR / checkpoint_name
137
 
138
- if not checkpoint_path.exists():
139
- return f"❌ Checkpoint not found: {checkpoint_name}"
140
 
141
- api.upload_file(
142
- path_or_fileobj=str(checkpoint_path),
143
- path_in_repo=f"checkpoints/{checkpoint_name}",
144
- repo_id="MSherbinii/ipad-vad-training",
145
- repo_type="model",
 
 
 
 
 
 
146
  )
147
 
148
- return f"βœ… Uploaded to https://huggingface.co/MSherbinii/ipad-vad-training"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
  except Exception as e:
151
- return f"❌ Error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
 
153
  # Gradio Interface
154
- with gr.Blocks(title="IPAD VAD Training on ZeroGPU") as demo:
155
  gr.Markdown("# 🏭 IPAD: Industrial Process Anomaly Detection Training")
156
  gr.Markdown("Train video anomaly detection models on ZeroGPU with the IPAD dataset")
157
 
158
- with gr.Tab("πŸ“₯ Dataset Setup"):
159
- gr.Markdown("## Download IPAD Dataset from HF Hub")
160
- download_btn = gr.Button("Download Dataset (8.3 GB)", variant="primary")
161
- download_output = gr.Textbox(label="Status", lines=3)
162
- download_btn.click(download_dataset, outputs=download_output)
163
-
164
- with gr.Tab("πŸ§ͺ Quick Test"):
165
- gr.Markdown("## Test Model Loading (No Dataset Required)")
166
- test_device = gr.Dropdown(
167
- choices=["S01", "S02", "S03", "S04", "S05", "S06", "S07", "S08", "S09", "S10", "S11", "S12"],
168
- value="S01",
169
- label="Device"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  )
171
- test_btn = gr.Button("Run Quick Test", variant="primary")
172
- test_output = gr.JSON(label="Test Results")
173
- test_btn.click(quick_test, inputs=test_device, outputs=test_output)
174
 
175
- with gr.Tab("πŸš€ Baseline Training"):
176
- gr.Markdown("## Train IPAD Baseline Model")
 
177
 
178
  with gr.Row():
179
- train_device = gr.Dropdown(
180
- choices=["S01", "S02", "S03", "S04", "S05", "S06", "S07", "S08", "S09", "S10", "S11", "S12"],
181
  value="S01",
182
  label="Training Device"
183
  )
184
- train_epochs = gr.Slider(1, 200, value=10, step=1, label="Epochs")
185
 
186
  with gr.Row():
187
- train_batch = gr.Slider(1, 8, value=4, step=1, label="Batch Size")
188
- train_lr = gr.Number(value=1e-4, label="Learning Rate")
189
- train_mem = gr.Slider(500, 2000, value=2000, step=100, label="Memory Dimension")
190
-
191
- train_btn = gr.Button("Start Training", variant="primary")
192
- train_output = gr.JSON(label="Training Results")
193
- train_btn.click(
194
- train_baseline,
195
- inputs=[train_device, train_epochs, train_batch, train_lr, train_mem],
196
- outputs=train_output
197
- )
198
 
199
- with gr.Tab("πŸ’Ύ Checkpoint Management"):
200
- gr.Markdown("## Upload Checkpoints to HF Hub")
201
- checkpoint_list = gr.Dropdown(
202
- choices=[f.name for f in CHECKPOINT_DIR.glob("*.pth")] if CHECKPOINT_DIR.exists() else [],
203
- label="Select Checkpoint"
 
 
204
  )
205
- upload_btn = gr.Button("Upload to HF Hub", variant="primary")
206
- upload_output = gr.Textbox(label="Upload Status")
207
- upload_btn.click(upload_checkpoint, inputs=checkpoint_list, outputs=upload_output)
 
 
 
 
 
 
 
 
 
 
 
 
208
 
209
  with gr.Tab("πŸ“Š Documentation"):
210
  gr.Markdown("""
211
  ## IPAD VAD Training Guide
212
 
213
  ### Quick Start
214
- 1. **Download Dataset**: Go to "Dataset Setup" tab and download the IPAD dataset
215
- 2. **Quick Test**: Verify GPU access and model loading in "Quick Test" tab
216
- 3. **Train Baseline**: Start training on any of the 12 synthetic devices
 
217
 
218
  ### Hardware
219
  - **GPU**: NVIDIA H200 (via ZeroGPU)
220
- - **Duration**: 1 hour per training session
221
- - **Memory**: 80GB HBM3
222
 
223
  ### Model Architecture
224
  - **Encoder**: Video Swin Transformer (768-dim features)
@@ -226,15 +354,45 @@ with gr.Blocks(title="IPAD VAD Training on ZeroGPU") as demo:
226
  - **Period Module**: 200-class temporal position classifier
227
  - **Decoder**: I3D-based 3D decoder
228
 
229
- ### Expected Results
230
- - **Average AUC**: ~68.6% (baseline)
231
- - **Best Device (S08)**: 85.6%
232
- - **Challenging (R03)**: 43.5%
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
 
234
  ### Resources
235
  - [Paper](https://arxiv.org/abs/2404.15033)
236
  - [Dataset](https://huggingface.co/datasets/MSherbinii/ipad-industrial-anomaly)
237
- - [Technical Analysis](https://github.com/LJF1113/IPAD)
 
 
 
 
 
 
 
 
 
238
  """)
239
 
240
  if __name__ == "__main__":
 
1
  #!/usr/bin/env python3
2
  """
3
  IPAD VAD Training Interface on HuggingFace Spaces with ZeroGPU
4
+ Updated version with integrated training infrastructure
5
  """
6
  import gradio as gr
7
  import torch
 
13
  from huggingface_hub import hf_hub_download, HfApi
14
  import subprocess
15
  import sys
16
+ from typing import Optional, Dict
17
 
18
+ # Import training infrastructure
19
+ from train_hf import IPADTrainer
20
+ from dataset import download_and_extract_dataset, DEVICE_NAMES, SYNTHETIC_DEVICES
 
 
21
  import spaces # ZeroGPU decorator
22
 
23
  # Global state
24
+ DATASET_PATH = None
25
  CHECKPOINT_DIR = Path("./checkpoints")
26
  CHECKPOINT_DIR.mkdir(exist_ok=True)
27
 
28
+ def setup_dataset(progress=gr.Progress()) -> str:
29
  """Download and extract IPAD dataset from HF Hub"""
30
+ global DATASET_PATH
31
+
32
  progress(0, desc="Downloading dataset...")
33
 
34
+ if DATASET_PATH and DATASET_PATH.exists():
35
+ return f"✅ Dataset already available at {DATASET_PATH}"
36
 
37
  try:
38
+ DATASET_PATH = download_and_extract_dataset(cache_dir="./cache")
 
 
 
 
 
 
 
 
 
 
39
  progress(1.0, desc="Complete!")
40
+ return f"✅ Dataset downloaded and extracted to {DATASET_PATH}\n📊 Ready for training!"
41
 
42
  except Exception as e:
43
  return f"❌ Error: {str(e)}"
44
 
45
+ @spaces.GPU(duration=60) # Request GPU for 1 minute
46
+ def quick_gpu_test() -> Dict:
47
+ """Quick test to verify GPU access and model loading"""
48
  try:
49
+ from IPAD.model.video_swin_transformer import VST
50
+
51
+ # Check GPU
52
+ gpu_available = torch.cuda.is_available()
53
+ gpu_name = torch.cuda.get_device_name(0) if gpu_available else "None"
54
+
55
+ if not gpu_available:
56
+ return {
57
+ "status": "⚠️ Warning",
58
+ "message": "No GPU available",
59
+ "gpu_available": False,
60
+ "gpu_name": "None"
61
+ }
62
+
63
  # Load model
64
  model = VST(mem_dim=2000, shrink_thres=0.0025)
65
  model = model.cuda()
 
73
 
74
  result = {
75
  "status": "✅ Success",
76
+ "message": "GPU test passed!",
77
+ "gpu_available": True,
78
+ "gpu_name": gpu_name,
79
  "output_shape": str(output['output'].shape),
80
  "attention_shape": str(output['att'].shape),
81
  "period_shape": str(output['recon_index'].shape),
82
+ "memory_allocated_gb": f"{torch.cuda.memory_allocated() / 1e9:.2f}",
83
+ "memory_reserved_gb": f"{torch.cuda.memory_reserved() / 1e9:.2f}"
84
  }
85
 
86
+ return result
87
 
88
  except Exception as e:
89
+ return {
90
+ "status": "❌ Error",
91
+ "message": str(e),
92
+ "gpu_available": torch.cuda.is_available(),
93
+ "gpu_name": torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None"
94
+ }
95
 
96
  @spaces.GPU(duration=3600) # Request GPU for 1 hour
97
+ def train_quick_baseline(
98
+ device_name: str = "S01",
99
+ epochs: int = 10,
100
+ batch_size: int = 4,
101
+ lr: float = 1e-4,
 
102
  progress=gr.Progress()
103
+ ) -> str:
104
+ """Quick baseline training (10 epochs for testing)"""
105
+ global DATASET_PATH
106
+
107
+ if DATASET_PATH is None or not DATASET_PATH.exists():
108
+ return "❌ Error: Dataset not downloaded. Please download dataset first."
109
 
110
+ progress(0, desc="Initializing trainer...")
111
 
112
  try:
113
+ # Create trainer
114
+ trainer = IPADTrainer(
115
+ device_name=device_name,
116
+ epochs=epochs,
117
+ batch_size=batch_size,
118
+ lr=lr,
119
+ mem_dim=2000,
120
+ checkpoint_dir=str(CHECKPOINT_DIR),
121
+ wandb_project=None, # Disable wandb for quick test
122
+ hf_repo=None # Disable auto-upload for quick test
123
+ )
124
 
125
+ progress(0.1, desc="Loading dataset...")
 
126
 
127
+ # Train
128
+ trainer.train(str(DATASET_PATH))
129
 
130
+ progress(1.0, desc="Training complete!")
 
 
 
 
 
 
 
 
131
 
132
+ # Get latest checkpoint
133
+ checkpoints = list(CHECKPOINT_DIR.glob(f"{device_name}_*.pth"))
134
+ latest_checkpoint = max(checkpoints, key=lambda p: p.stat().st_mtime) if checkpoints else None
 
 
 
 
135
 
136
+ result = f"""
137
+ ✅ Quick baseline training complete!
138
 
139
+ 📊 Configuration:
140
+ - Device: {device_name}
141
+ - Epochs: {epochs}
142
+ - Batch Size: {batch_size}
143
+ - Learning Rate: {lr}
144
+
145
+ 💾 Checkpoint:
146
+ - {latest_checkpoint.name if latest_checkpoint else 'No checkpoint saved'}
147
+
148
+ 🎯 Next Steps:
149
+ 1. Review training metrics
150
+ 2. Run full 200-epoch training
151
+ 3. Evaluate on test set
152
+ """
153
+ return result
154
 
155
  except Exception as e:
156
+ return f"❌ Training failed: {str(e)}\n\nPlease check the logs for details."
157
+
158
+ @spaces.GPU(duration=7200) # Request GPU for 2 hours
159
+ def train_full_baseline(
160
+ device_name: str = "S01",
161
+ epochs: int = 200,
162
+ batch_size: int = 4,
163
+ lr: float = 1e-4,
164
+ mem_dim: int = 2000,
165
+ enable_wandb: bool = False,
166
+ enable_hf_upload: bool = True,
167
+ progress=gr.Progress()
168
+ ) -> str:
169
+ """Full baseline training (200 epochs)"""
170
+ global DATASET_PATH
171
 
172
+ if DATASET_PATH is None or not DATASET_PATH.exists():
173
+ return "❌ Error: Dataset not downloaded. Please download dataset first."
 
 
 
174
 
175
+ progress(0, desc="Initializing full training...")
 
176
 
177
+ try:
178
+ # Create trainer
179
+ trainer = IPADTrainer(
180
+ device_name=device_name,
181
+ epochs=epochs,
182
+ batch_size=batch_size,
183
+ lr=lr,
184
+ mem_dim=mem_dim,
185
+ checkpoint_dir=str(CHECKPOINT_DIR),
186
+ wandb_project="ipad-vad" if enable_wandb else None,
187
+ hf_repo="MSherbinii/ipad-vad-checkpoints" if enable_hf_upload else None
188
  )
189
 
190
+ progress(0.05, desc="Loading dataset...")
191
+
192
+ # Train
193
+ trainer.train(str(DATASET_PATH))
194
+
195
+ progress(1.0, desc="Training complete!")
196
+
197
+ # Get final checkpoint
198
+ checkpoints = list(CHECKPOINT_DIR.glob(f"{device_name}_*.pth"))
199
+ latest_checkpoint = max(checkpoints, key=lambda p: p.stat().st_mtime) if checkpoints else None
200
+
201
+ result = f"""
202
+ ✅ Full baseline training complete!
203
+
204
+ 📊 Configuration:
205
+ - Device: {device_name}
206
+ - Epochs: {epochs}
207
+ - Batch Size: {batch_size}
208
+ - Learning Rate: {lr}
209
+ - Memory Dimension: {mem_dim}
210
+
211
+ 💾 Checkpoints:
212
+ - Total saved: {len(checkpoints)}
213
+ - Latest: {latest_checkpoint.name if latest_checkpoint else 'None'}
214
+
215
+ ☁️ HuggingFace Hub:
216
+ - {'✅ Uploaded to MSherbinii/ipad-vad-checkpoints' if enable_hf_upload else '❌ Upload disabled'}
217
+
218
+ 📈 WandB Logging:
219
+ - {'✅ Logged to ipad-vad project' if enable_wandb else '❌ Logging disabled'}
220
+
221
+ 🎯 Expected Performance:
222
+ - Target AUC for {device_name}: Check baseline results table
223
+ - Paper baseline avg: 68.6%
224
+ """
225
+ return result
226
 
227
  except Exception as e:
228
+ return f"❌ Training failed: {str(e)}\n\nPlease check the logs for details."
229
+
230
+ def list_checkpoints() -> str:
231
+ """List all saved checkpoints"""
232
+ checkpoints = sorted(CHECKPOINT_DIR.glob("*.pth"))
233
+
234
+ if not checkpoints:
235
+ return "πŸ“ No checkpoints found"
236
+
237
+ result = "💾 **Available Checkpoints:**\n\n"
238
+ for ckpt in checkpoints:
239
+ size_mb = ckpt.stat().st_size / (1024 * 1024)
240
+ modified = datetime.fromtimestamp(ckpt.stat().st_mtime).strftime("%Y-%m-%d %H:%M")
241
+ result += f"- `{ckpt.name}` ({size_mb:.1f} MB, modified {modified})\n"
242
+
243
+ return result
244
 
245
  # Gradio Interface
246
+ with gr.Blocks(title="IPAD VAD Training on ZeroGPU", theme=gr.themes.Soft()) as demo:
247
  gr.Markdown("# 🏭 IPAD: Industrial Process Anomaly Detection Training")
248
  gr.Markdown("Train video anomaly detection models on ZeroGPU with the IPAD dataset")
249
 
250
+ with gr.Tab("📥 Setup"):
251
+ gr.Markdown("## 1️⃣ Download Dataset from HF Hub")
252
+ gr.Markdown("Downloads the 8.3GB IPAD dataset. **This only needs to be done once** - the dataset is cached.")
253
+
254
+ download_btn = gr.Button("📥 Download Dataset", variant="primary", size="lg")
255
+ download_output = gr.Textbox(label="Download Status", lines=4)
256
+ download_btn.click(setup_dataset, outputs=download_output)
257
+
258
+ gr.Markdown("---")
259
+ gr.Markdown("## 2️⃣ Test GPU Access")
260
+ gr.Markdown("Verify that ZeroGPU is working and the model loads correctly. **No dataset required.**")
261
+
262
+ test_btn = gr.Button("🧪 Run GPU Test", variant="secondary")
263
+ test_output = gr.JSON(label="GPU Test Results")
264
+ test_btn.click(quick_gpu_test, outputs=test_output)
265
+
266
+ with gr.Tab("⚡ Quick Test (10 epochs)"):
267
+ gr.Markdown("## Quick Baseline Test")
268
+ gr.Markdown("Train for 10 epochs to verify everything works. Takes ~10-15 minutes.")
269
+
270
+ with gr.Row():
271
+ quick_device = gr.Dropdown(
272
+ choices=SYNTHETIC_DEVICES,
273
+ value="S01",
274
+ label="Device"
275
+ )
276
+ quick_epochs = gr.Slider(5, 50, value=10, step=5, label="Epochs")
277
+
278
+ with gr.Row():
279
+ quick_batch = gr.Slider(1, 8, value=4, step=1, label="Batch Size")
280
+ quick_lr = gr.Number(value=1e-4, label="Learning Rate", precision=6)
281
+
282
+ quick_train_btn = gr.Button("🚀 Start Quick Training", variant="primary", size="lg")
283
+ quick_output = gr.Textbox(label="Training Results", lines=15)
284
+
285
+ quick_train_btn.click(
286
+ train_quick_baseline,
287
+ inputs=[quick_device, quick_epochs, quick_batch, quick_lr],
288
+ outputs=quick_output
289
  )
 
 
 
290
 
291
+ with gr.Tab("🎯 Full Training (200 epochs)"):
292
+ gr.Markdown("## Full Baseline Training")
293
+ gr.Markdown("Complete 200-epoch training to match paper results. Takes ~2-3 hours.")
294
 
295
  with gr.Row():
296
+ full_device = gr.Dropdown(
297
+ choices=SYNTHETIC_DEVICES,
298
  value="S01",
299
  label="Training Device"
300
  )
301
+ full_epochs = gr.Slider(50, 300, value=200, step=10, label="Epochs")
302
 
303
  with gr.Row():
304
+ full_batch = gr.Slider(1, 8, value=4, step=1, label="Batch Size")
305
+ full_lr = gr.Number(value=1e-4, label="Learning Rate", precision=6)
306
+
307
+ with gr.Row():
308
+ full_mem_dim = gr.Slider(500, 2000, value=2000, step=100, label="Memory Dimension")
309
+ full_wandb = gr.Checkbox(value=False, label="Enable WandB Logging")
310
+ full_hf_upload = gr.Checkbox(value=True, label="Upload to HF Hub")
 
 
 
 
311
 
312
+ full_train_btn = gr.Button("🚀 Start Full Training", variant="primary", size="lg")
313
+ full_output = gr.Textbox(label="Training Results", lines=20)
314
+
315
+ full_train_btn.click(
316
+ train_full_baseline,
317
+ inputs=[full_device, full_epochs, full_batch, full_lr, full_mem_dim, full_wandb, full_hf_upload],
318
+ outputs=full_output
319
  )
320
+
321
+ with gr.Tab("💾 Checkpoints"):
322
+ gr.Markdown("## Checkpoint Management")
323
+
324
+ refresh_btn = gr.Button("🔄 Refresh Checkpoint List")
325
+ checkpoint_list = gr.Markdown(value=list_checkpoints())
326
+ refresh_btn.click(list_checkpoints, outputs=checkpoint_list)
327
+
328
+ gr.Markdown("### Checkpoint Info")
329
+ gr.Markdown("""
330
+ - Checkpoints are saved every 10 epochs
331
+ - Best model (lowest val loss) is automatically selected
332
+ - Files are in PyTorch `.pth` format
333
+ - Can be loaded with `torch.load(checkpoint_path)`
334
+ """)
335
 
336
  with gr.Tab("📊 Documentation"):
337
  gr.Markdown("""
338
  ## IPAD VAD Training Guide
339
 
340
  ### Quick Start
341
+ 1. **Download Dataset**: Go to "Setup" tab and download the IPAD dataset (once)
342
+ 2. **GPU Test**: Verify GPU access in "Setup" tab
343
+ 3. **Quick Test**: Train for 10 epochs in "Quick Test" tab to verify setup
344
+ 4. **Full Training**: Launch 200-epoch training in "Full Training" tab
345
 
346
  ### Hardware
347
  - **GPU**: NVIDIA H200 (via ZeroGPU)
348
+ - **VRAM**: 80GB HBM3
349
+ - **Duration**: 1-2 hours per full training session
350
 
351
  ### Model Architecture
352
  - **Encoder**: Video Swin Transformer (768-dim features)
 
354
  - **Period Module**: 200-class temporal position classifier
355
  - **Decoder**: I3D-based 3D decoder
356
 
357
+ ### Expected Baseline Results (200 epochs)
358
+
359
+ | Device | AUC (%) | Device | AUC (%) |
360
+ |--------|---------|--------|---------|
361
+ | S01 | 69.5 | S07 | 60.6 |
362
+ | S02 | 63.9 | S08 | 85.6 |
363
+ | S03 | 70.6 | S09 | 71.2 |
364
+ | S04 | 58.3 | S10 | 62.2 |
365
+ | S05 | 86.2 | S11 | 60.9 |
366
+ | S06 | 61.2 | S12 | 67.1 |
367
+ | **Avg** | **68.6** | | |
368
+
369
+ ### Training Configuration
370
+ - **Batch Size**: 4 (default, can increase with more VRAM)
371
+ - **Learning Rate**: 1e-4 (Adam optimizer)
372
+ - **Clip Length**: 16 frames
373
+ - **Frame Size**: 256×256 pixels
374
+ - **Mixed Precision**: FP16 (automatic)
375
+
376
+ ### Loss Function
377
+ ```
378
+ Total Loss = Reconstruction Loss
379
+ + 0.0002 × Entropy Loss
380
+ + 0.02 × Period Loss
381
+ ```
382
 
383
  ### Resources
384
  - [Paper](https://arxiv.org/abs/2404.15033)
385
  - [Dataset](https://huggingface.co/datasets/MSherbinii/ipad-industrial-anomaly)
386
+ - [Original Code](https://github.com/LJF1113/IPAD)
387
+ - [Checkpoints](https://huggingface.co/MSherbinii/ipad-vad-checkpoints)
388
+
389
+ ### Next Steps (SOTA Improvements)
390
+ After baseline reproduction:
391
+ 1. **Modern Transformer**: Replace Video Swin → MViTv2 (+2-4% AUC)
392
+ 2. **Diffusion Decoder**: Add diffusion-based reconstruction (+3-5% AUC)
393
+ 3. **Enhanced Memory**: GWN regularization (+1-3% AUC)
394
+
395
+ **Target**: 75-80% average AUC (vs 68.6% baseline)
396
  """)
397
 
398
  if __name__ == "__main__":