koreashin commited on
Commit
11655ae
·
verified ·
1 Parent(s): eb02aa3

Upload 4 files

Browse files
Files changed (2) hide show
  1. README.md +178 -231
  2. model.py +226 -0
README.md CHANGED
@@ -17,13 +17,13 @@ datasets:
17
  - custom
18
  ---
19
 
20
- # 🚗 Driver Abnormal Behavior Detection Model
21
 
22
  **운전자 이상행동 탐지 모델** - Video Swin Transformer 기반
23
 
24
  차량 내 카메라 영상에서 운전자의 이상행동을 실시간으로 탐지하는 딥러닝 모델입니다.
25
 
26
- ## 📊 Model Performance
27
 
28
  | Metric | Score |
29
  |--------|-------|
@@ -33,75 +33,85 @@ datasets:
33
 
34
  ### Per-Class Performance
35
 
36
- | Class | Korean | Precision | Recall | F1-Score | Support |
37
- |-------|--------|-----------|--------|----------|---------|
38
- | 0 | 정상 (Normal) | 0.93 | 0.92 | 0.92 | 159,224 |
39
- | 1 | 졸음운전 (Drowsy) | 0.99 | 0.98 | 0.98 | 619,450 |
40
- | 2 | 물건찾기 (Searching) | 0.90 | 0.94 | 0.92 | 261,435 |
41
- | 3 | 휴대폰 사용 (Phone) | 0.91 | 0.88 | 0.90 | 150,981 |
42
- | 4 | 운전자 폭행 (Assault) | 1.00 | 1.00 | 1.00 | 179,972 |
43
 
44
  ---
45
 
46
- ## 🛠️ Installation
47
 
48
- ```bash
49
- # PyTorch 2.0+ 필요
50
- pip install torch torchvision
 
 
 
 
51
 
52
- # 추가 dependencies
53
- pip install opencv-python numpy
54
 
55
- # (선택) HuggingFace에서 다운로드
56
- pip install huggingface_hub
 
 
 
 
 
57
  ```
58
 
59
  ---
60
 
61
- ## 🚀 Quick Start
62
 
63
- ### 1. 모델 다운로드 및 로드
64
 
65
- ```python
66
- import torch
67
- from torchvision.models.video import swin3d_t
68
 
69
- # ===== 방법 1: 로컬 파일에서 로드 =====
70
- model = swin3d_t(weights=None)
71
- model.head = torch.nn.Linear(model.head.in_features, 5) # 5 classes
 
72
 
73
- state_dict = torch.load("pytorch_model.bin", map_location="cpu", weights_only=True)
74
- model.load_state_dict(state_dict)
75
- model.eval()
76
 
77
- # ===== 방법 2: HuggingFace Hub에서 로드 =====
78
- from huggingface_hub import hf_hub_download
 
79
 
80
- model_path = hf_hub_download(
81
- repo_id="YOUR_USERNAME/driver-behavior-swin-t",
82
- filename="pytorch_model.bin"
83
- )
84
- state_dict = torch.load(model_path, map_location="cpu", weights_only=True)
85
 
86
- model = swin3d_t(weights=None)
87
- model.head = torch.nn.Linear(model.head.in_features, 5)
 
 
 
88
  model.load_state_dict(state_dict)
89
  model.eval()
 
 
90
  ```
91
 
92
- ### 2. 단일 비디오 추론
93
 
94
  ```python
95
  import cv2
96
  import torch
97
  import numpy as np
98
 
99
- # 클래스 정의
100
  CLASS_NAMES = ["정상", "졸음운전", "물건찾기", "휴대폰 사용", "운전자 폭행"]
101
  CLASS_NAMES_EN = ["Normal", "Drowsy Driving", "Searching Objects", "Phone Usage", "Driver Assault"]
102
 
103
- def load_video_frames(video_path, num_frames=30, size=(224, 224)):
104
- """비디오에서 프레임 추출 및 전처리"""
105
  cap = cv2.VideoCapture(video_path)
106
  frames = []
107
 
@@ -109,12 +119,9 @@ def load_video_frames(video_path, num_frames=30, size=(224, 224)):
109
  ret, frame = cap.read()
110
  if not ret:
111
  break
112
- # BGR -> RGB
113
  frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
114
- # Resize
115
  frame = cv2.resize(frame, size)
116
  frames.append(frame)
117
-
118
  cap.release()
119
 
120
  # 프레임 부족 시 마지막 프레임 복제
@@ -123,10 +130,7 @@ def load_video_frames(video_path, num_frames=30, size=(224, 224)):
123
 
124
  # [T, H, W, C] -> [C, T, H, W]
125
  frames = np.array(frames[:num_frames], dtype=np.float32)
126
- frames = frames.transpose(3, 0, 1, 2) # [C, T, H, W]
127
-
128
- # Normalize to [0, 1]
129
- frames = frames / 255.0
130
 
131
  # ImageNet normalization
132
  mean = np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1, 1)
@@ -135,16 +139,15 @@ def load_video_frames(video_path, num_frames=30, size=(224, 224)):
135
 
136
  return torch.FloatTensor(frames)
137
 
 
138
  def predict(model, video_path, device="cuda"):
139
  """단일 비디오 추론"""
140
  model = model.to(device)
141
  model.eval()
142
 
143
- # 프레임 로드
144
- frames = load_video_frames(video_path)
145
- frames = frames.unsqueeze(0).to(device) # [1, C, T, H, W]
146
 
147
- # 추론
148
  with torch.no_grad():
149
  outputs = model(frames)
150
  probs = torch.softmax(outputs, dim=1)
@@ -156,20 +159,19 @@ def predict(model, video_path, device="cuda"):
156
  "class_name_ko": CLASS_NAMES[pred_idx],
157
  "class_name_en": CLASS_NAMES_EN[pred_idx],
158
  "confidence": confidence,
159
- "all_probabilities": {
160
- CLASS_NAMES[i]: probs[0, i].item()
161
- for i in range(len(CLASS_NAMES))
162
- }
163
  }
164
 
 
165
  # 사용 예시
166
- result = predict(model, "test_video.mp4")
167
- print(f"예측: {result['class_name_ko']} ({result['confidence']:.2%})")
 
168
  ```
169
 
170
  ---
171
 
172
- ## 📹 Real-time Inference (실시간 추론)
173
 
174
  ```python
175
  import cv2
@@ -177,103 +179,77 @@ import torch
177
  import numpy as np
178
  from collections import deque
179
 
180
- class RealtimeDriverBehaviorDetector:
181
  """실시간 운전자 이상행동 탐지기"""
182
 
183
  CLASS_NAMES = ["정상", "졸음운전", "물건찾기", "휴대폰 사용", "운전자 폭행"]
 
 
 
 
 
 
 
184
 
185
- def __init__(self, model_path, device="cuda", window_size=30, stride=15):
186
- """
187
- Args:
188
- model_path: pytorch_model.bin 경로
189
- device: 'cuda' 또는 'cpu'
190
- window_size: 분석할 프레임 수 (기본 30 = 1초 @30fps)
191
- stride: 슬라이딩 윈도우 간격 (기본 15 = 0.5초)
192
- """
193
  self.device = device
194
  self.window_size = window_size
195
  self.stride = stride
196
 
197
  # 모델 로드
198
- from torchvision.models.video import swin3d_t
199
- self.model = swin3d_t(weights=None)
200
- self.model.head = torch.nn.Linear(self.model.head.in_features, 5)
201
 
202
- state_dict = torch.load(model_path, map_location="cpu", weights_only=True)
 
 
203
  self.model.load_state_dict(state_dict)
204
  self.model.to(device)
205
  self.model.eval()
206
 
207
  # 프레임 버퍼
208
- self.frame_buffer = deque(maxlen=window_size)
209
  self.frame_count = 0
210
 
211
- # Normalization 파라미터
212
  self.mean = np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1, 1)
213
  self.std = np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1, 1)
214
 
215
- def preprocess_frame(self, frame):
216
- """단일 프레임 전처리"""
217
- frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
218
- frame = cv2.resize(frame, (224, 224))
219
- return frame
 
 
220
 
221
- def predict(self):
222
- """현재 버퍼의 프레임으로 추론"""
223
- if len(self.frame_buffer) < self.window_size:
224
- return None
225
 
226
- # [T, H, W, C] -> [C, T, H, W]
227
- frames = np.array(list(self.frame_buffer), dtype=np.float32)
228
  frames = frames.transpose(3, 0, 1, 2) / 255.0
229
  frames = (frames - self.mean) / self.std
230
 
231
- # 추론
232
  with torch.no_grad():
233
  inputs = torch.FloatTensor(frames).unsqueeze(0).to(self.device)
234
  outputs = self.model(inputs)
235
  probs = torch.softmax(outputs, dim=1)
236
  pred_idx = torch.argmax(probs, dim=1).item()
237
- confidence = probs[0, pred_idx].item()
238
 
239
  return {
240
  "class_id": pred_idx,
241
  "class_name": self.CLASS_NAMES[pred_idx],
242
- "confidence": confidence,
243
- "is_abnormal": pred_idx != 0, # 0 = 정상
244
- "probabilities": probs[0].cpu().numpy()
245
- }
246
-
247
- def process_frame(self, frame):
248
- """프레임 처리 (stride마다 추론)"""
249
- processed = self.preprocess_frame(frame)
250
- self.frame_buffer.append(processed)
251
- self.frame_count += 1
252
-
253
- # stride마다 추론
254
- if self.frame_count % self.stride == 0:
255
- return self.predict()
256
- return None
257
-
258
- def run_on_video(self, video_source=0, show_display=True):
259
- """
260
- 비디오 소스에서 실시간 추론
261
-
262
- Args:
263
- video_source: 웹캠(0) 또는 비디오 파일 경로
264
- show_display: 화면 출력 여부
265
- """
266
- cap = cv2.VideoCapture(video_source)
267
-
268
- # 색상 정의 (BGR)
269
- colors = {
270
- "정상": (0, 255, 0), # 초록
271
- "졸음운전": (0, 165, 255), # 주황
272
- "물건찾기": (0, 255, 255), # 노랑
273
- "휴대폰 사용": (0, 0, 255), # 빨강
274
- "운전자 폭행": (255, 0, 255) # 보라
275
  }
276
 
 
 
 
277
  current_result = None
278
 
279
  while True:
@@ -281,29 +257,24 @@ class RealtimeDriverBehaviorDetector:
281
  if not ret:
282
  break
283
 
284
- # 추론
285
  result = self.process_frame(frame)
286
  if result:
287
  current_result = result
288
 
289
- # 화면 출력
290
- if show_display and current_result:
291
  label = current_result["class_name"]
292
  conf = current_result["confidence"]
293
- color = colors.get(label, (255, 255, 255))
294
 
295
- # 상태 표시
296
- text = f"{label}: {conf:.1%}"
297
- cv2.putText(frame, text, (10, 40),
298
  cv2.FONT_HERSHEY_SIMPLEX, 1.2, color, 3)
299
 
300
- # 경고 (이상행동 탐지 시)
301
  if current_result["is_abnormal"]:
302
  cv2.putText(frame, "WARNING!", (10, 80),
303
  cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 0, 255), 2)
304
 
305
- cv2.imshow("Driver Behavior Detection", frame)
306
-
307
  if cv2.waitKey(1) & 0xFF == ord('q'):
308
  break
309
 
@@ -311,19 +282,15 @@ class RealtimeDriverBehaviorDetector:
311
  cv2.destroyAllWindows()
312
 
313
 
314
- # ===== 사용 예시 =====
315
-
316
- # 1. 웹캠 실시간 추론
317
- detector = RealtimeDriverBehaviorDetector("pytorch_model.bin", device="cuda")
318
- detector.run_on_video(video_source=0) # 웹캠
319
-
320
- # 2. 비디오 파일 추론
321
- detector.run_on_video(video_source="test_video.mp4")
322
  ```
323
 
324
  ---
325
 
326
- ## 🔧 Batch Inference (배치 추론)
327
 
328
  ```python
329
  import torch
@@ -331,8 +298,6 @@ from pathlib import Path
331
  from torch.utils.data import Dataset, DataLoader
332
 
333
  class VideoDataset(Dataset):
334
- """비디오 파일 배치 처리용 Dataset"""
335
-
336
  def __init__(self, video_paths, num_frames=30):
337
  self.video_paths = video_paths
338
  self.num_frames = num_frames
@@ -343,9 +308,8 @@ class VideoDataset(Dataset):
343
  return len(self.video_paths)
344
 
345
  def __getitem__(self, idx):
346
- video_path = self.video_paths[idx]
347
-
348
- cap = cv2.VideoCapture(str(video_path))
349
  frames = []
350
 
351
  while len(frames) < self.num_frames:
@@ -355,7 +319,6 @@ class VideoDataset(Dataset):
355
  frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
356
  frame = cv2.resize(frame, (224, 224))
357
  frames.append(frame)
358
-
359
  cap.release()
360
 
361
  while len(frames) < self.num_frames:
@@ -365,130 +328,120 @@ class VideoDataset(Dataset):
365
  frames = frames.transpose(3, 0, 1, 2) / 255.0
366
  frames = (frames - self.mean) / self.std
367
 
368
- return torch.FloatTensor(frames), str(video_path)
369
 
370
 
371
- def batch_inference(model, video_folder, batch_size=8, device="cuda"):
372
- """
373
- 폴더 내 모든 비디오 배치 추론
374
-
375
- Args:
376
- model: 로드된 모델
377
- video_folder: 비디오 폴더 경로
378
- batch_size: 배치 크기
379
- device: 'cuda' 또는 'cpu'
380
-
381
- Returns:
382
- List of (video_path, prediction) tuples
383
- """
384
  CLASS_NAMES = ["정상", "졸음운전", "물건찾기", "휴대폰 사용", "운전자 폭행"]
385
 
386
- video_folder = Path(video_folder)
387
- video_paths = list(video_folder.glob("*.mp4")) + list(video_folder.glob("*.avi"))
388
-
389
  dataset = VideoDataset(video_paths)
390
- dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=4)
391
 
392
  model = model.to(device)
393
  model.eval()
394
 
395
  results = []
396
-
397
  with torch.no_grad():
398
- for frames, paths in dataloader:
399
  frames = frames.to(device)
400
  outputs = model(frames)
401
  probs = torch.softmax(outputs, dim=1)
402
  preds = torch.argmax(probs, dim=1)
403
 
404
- for path, pred_idx, prob in zip(paths, preds, probs):
405
  results.append({
406
- "video_path": path,
407
- "class_id": pred_idx.item(),
408
- "class_name": CLASS_NAMES[pred_idx.item()],
409
- "confidence": prob[pred_idx].item()
410
  })
411
 
412
  return results
413
 
 
414
  # 사용 예시
415
- results = batch_inference(model, "./videos/", batch_size=16)
416
  for r in results:
417
- print(f"{r['video_path']}: {r['class_name']} ({r['confidence']:.2%})")
418
  ```
419
 
420
  ---
421
 
422
- ## 📐 Input/Output Specification
423
 
424
- ### Input Format
425
 
426
  | Parameter | Value |
427
  |-----------|-------|
428
- | **Shape** | `[batch, 3, 30, 224, 224]` |
429
- | **Format** | `[B, C, T, H, W]` (Batch, Channel, Time, Height, Width) |
430
- | **Channels** | RGB (not BGR) |
431
- | **Normalization** | ImageNet (mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) |
432
- | **Value Range** | After normalization: approximately [-2.5, 2.5] |
433
 
434
- ### Output Format
435
 
436
  | Parameter | Value |
437
  |-----------|-------|
438
- | **Shape** | `[batch, 5]` |
439
- | **Format** | Raw logits (use softmax for probabilities) |
440
- | **Classes** | 0=정상, 1=졸음운전, 2=물건찾기, 3=휴대폰사용, 4=운전자폭행 |
441
 
442
  ---
443
 
444
- ## ⚙️ Model Architecture
445
 
446
  ```
447
- VideoSwinTransformer (swin3d_t)
448
- ├── patch_embed: PatchEmbed3d
449
- │ └── proj: Conv3d(3, 96, kernel_size=(2,4,4), stride=(2,4,4))
450
- ├── layers: Sequential
451
- │ ├── BasicLayer (depth=2, heads=3, dim=96)
452
- │ ├── BasicLayer (depth=2, heads=6, dim=192)
453
- │ ├── BasicLayer (depth=6, heads=12, dim=384)
454
- └── BasicLayer (depth=2, heads=24, dim=768)
455
- ├── norm: LayerNorm(768)
456
- ├── avgpool: AdaptiveAvgPool3d(1)
457
- └── head: Linear(768, 5) # Modified for 5 classes
458
-
459
- Total Parameters: 27,855,851
460
- Trainable Parameters: 27,855,851
 
 
 
 
461
  ```
462
 
463
  ---
464
 
465
- ## 🏋️ Training Details
466
 
467
  | Parameter | Value |
468
  |-----------|-------|
469
- | **Base Model** | swin3d_t (Kinetics-400 pretrained) |
470
- | **Framework** | PyTorch 2.0+ |
471
- | **GPUs** | 2x NVIDIA A6000 (48GB each) |
472
- | **Training Method** | DistributedDataParallel (DDP) |
473
- | **Batch Size** | 128 effective (16 per GPU × 2 GPUs × 4 accumulation) |
474
- | **Optimizer** | AdamW (lr=1e-3, weight_decay=1e-4) |
475
- | **Scheduler** | OneCycleLR (pct_start=0.2, anneal=cosine) |
476
- | **Mixed Precision** | FP16 (torch.amp) |
477
- | **Epochs** | 1 (of 5 total) |
478
 
479
  ---
480
 
481
- ## 📁 Dataset Information
482
 
483
  | Property | Value |
484
  |----------|-------|
485
- | **Name** | Korean Driver Behavior Dataset |
486
- | **Total Videos** | 243,979 |
487
- | **Total Samples** | 1,371,062 (sliding window) |
488
- | **Window Size** | 30 frames |
489
- | **Stride** | 15 frames |
490
- | **Resolution** | Various (resized to 224×224) |
491
- | **FPS** | 30 |
492
 
493
  ### Class Distribution
494
 
@@ -502,34 +455,28 @@ Trainable Parameters: 27,855,851
502
 
503
  ---
504
 
505
- ## ⚠️ Limitations & Considerations
506
 
507
- 1. **카메라 위치**: 운전석 정면 또는 측면 카메라에 최적화됨
508
- 2. **조명 조건**: 야간/터널 저조도 환경에서 성능 저하 가능
509
- 3. **가림 현상**: 선글라스, 마스크 착용 시 정확도 감소 가능
510
- 4. **실시간 요구사항**: GPU 필요 (CPU에서는 느림)
511
 
512
  ---
513
 
514
- ## 📜 License
515
 
516
  Apache 2.0
517
 
518
  ---
519
 
520
- ## 🔗 Citation
521
 
522
  ```bibtex
523
- @misc{driver-behavior-detection-2025,
524
  title={Driver Abnormal Behavior Detection using Video Swin Transformer},
525
  author={C-Team},
526
  year={2025},
527
- howpublished={\url{https://huggingface.co/YOUR_USERNAME/driver-behavior-swin-t}}
528
  }
529
  ```
530
-
531
- ---
532
-
533
- ## 📞 Contact
534
-
535
- Issues and questions: [GitHub Issues](https://github.com/YOUR_USERNAME/driver-behavior-detection/issues)
 
17
  - custom
18
  ---
19
 
20
+ # Driver Abnormal Behavior Detection Model
21
 
22
  **운전자 이상행동 탐지 모델** - Video Swin Transformer 기반
23
 
24
  차량 내 카메라 영상에서 운전자의 이상행동을 실시간으로 탐지하는 딥러닝 모델입니다.
25
 
26
+ ## Model Performance
27
 
28
  | Metric | Score |
29
  |--------|-------|
 
33
 
34
  ### Per-Class Performance
35
 
36
+ | Class ID | Korean | English | Precision | Recall | F1-Score |
37
+ |----------|--------|---------|-----------|--------|----------|
38
+ | 0 | 정상 | Normal | 0.93 | 0.92 | 0.92 |
39
+ | 1 | 졸음운전 | Drowsy Driving | 0.99 | 0.98 | 0.98 |
40
+ | 2 | 물건찾기 | Searching Objects | 0.90 | 0.94 | 0.92 |
41
+ | 3 | 휴대폰 사용 | Phone Usage | 0.91 | 0.88 | 0.90 |
42
+ | 4 | 운전자 폭행 | Driver Assault | 1.00 | 1.00 | 1.00 |
43
 
44
  ---
45
 
46
+ ## Files in This Repository
47
 
48
+ ```
49
+ driver-behavior-model-epoch1/
50
+ ├── pytorch_model.bin # 모델 가중치 (120MB)
51
+ ├── model.py # 모델 클래스 정의 (필수!)
52
+ ├── config.json # 설정 파일
53
+ └── README.md # 이 파일
54
+ ```
55
 
56
+ **중요: `model.py`와 `pytorch_model.bin` 둘 다 필요합니다!**
 
57
 
58
+ ---
59
+
60
+ ## Installation
61
+
62
+ ```bash
63
+ pip install torch torchvision opencv-python numpy
64
+ pip install huggingface_hub # HuggingFace에서 다운로드 시
65
  ```
66
 
67
  ---
68
 
69
+ ## Quick Start
70
 
71
+ ### 1. 모델 다운로드
72
 
73
+ ```bash
74
+ # HuggingFace CLI로 다운로드
75
+ huggingface-cli download YOUR_USERNAME/driver-behavior-swin-t --local-dir ./model
76
 
77
+ # 또는 Python으로
78
+ from huggingface_hub import snapshot_download
79
+ snapshot_download(repo_id="YOUR_USERNAME/driver-behavior-swin-t", local_dir="./model")
80
+ ```
81
 
82
+ ### 2. 모델 로드
 
 
83
 
84
+ ```python
85
+ import torch
86
+ import sys
87
 
88
+ # model.py가 있는 경로 추가
89
+ sys.path.insert(0, "./model")
90
+ from model import DriverBehaviorModel
 
 
91
 
92
+ # 모델 생성 (pretrained=False: Kinetics 가중치 다운로드 안함)
93
+ model = DriverBehaviorModel(num_classes=5, pretrained=False)
94
+
95
+ # 학습된 가중치 로드
96
+ state_dict = torch.load("./model/pytorch_model.bin", map_location="cpu", weights_only=True)
97
  model.load_state_dict(state_dict)
98
  model.eval()
99
+
100
+ print("모델 로드 완료!")
101
  ```
102
 
103
+ ### 3. 단일 비디오 추론
104
 
105
  ```python
106
  import cv2
107
  import torch
108
  import numpy as np
109
 
 
110
  CLASS_NAMES = ["정상", "졸음운전", "물건찾기", "휴대폰 사용", "운전자 폭행"]
111
  CLASS_NAMES_EN = ["Normal", "Drowsy Driving", "Searching Objects", "Phone Usage", "Driver Assault"]
112
 
113
+ def preprocess_video(video_path, num_frames=30, size=(224, 224)):
114
+ """비디오 전처리"""
115
  cap = cv2.VideoCapture(video_path)
116
  frames = []
117
 
 
119
  ret, frame = cap.read()
120
  if not ret:
121
  break
 
122
  frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
 
123
  frame = cv2.resize(frame, size)
124
  frames.append(frame)
 
125
  cap.release()
126
 
127
  # 프레임 부족 시 마지막 프레임 복제
 
130
 
131
  # [T, H, W, C] -> [C, T, H, W]
132
  frames = np.array(frames[:num_frames], dtype=np.float32)
133
+ frames = frames.transpose(3, 0, 1, 2) / 255.0
 
 
 
134
 
135
  # ImageNet normalization
136
  mean = np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1, 1)
 
139
 
140
  return torch.FloatTensor(frames)
141
 
142
+
143
  def predict(model, video_path, device="cuda"):
144
  """단일 비디오 추론"""
145
  model = model.to(device)
146
  model.eval()
147
 
148
+ frames = preprocess_video(video_path)
149
+ frames = frames.unsqueeze(0).to(device) # [1, 3, 30, 224, 224]
 
150
 
 
151
  with torch.no_grad():
152
  outputs = model(frames)
153
  probs = torch.softmax(outputs, dim=1)
 
159
  "class_name_ko": CLASS_NAMES[pred_idx],
160
  "class_name_en": CLASS_NAMES_EN[pred_idx],
161
  "confidence": confidence,
162
+ "probabilities": {name: probs[0, i].item() for i, name in enumerate(CLASS_NAMES)}
 
 
 
163
  }
164
 
165
+
166
  # 사용 예시
167
+ result = predict(model, "test_video.mp4", device="cuda")
168
+ print(f"예측: {result['class_name_ko']} ({result['confidence']:.1%})")
169
+ print(f"전체 확률: {result['probabilities']}")
170
  ```
171
 
172
  ---
173
 
174
+ ## Real-time Inference (실시간 추론)
175
 
176
  ```python
177
  import cv2
 
179
  import numpy as np
180
  from collections import deque
181
 
182
+ class RealtimeDetector:
183
  """실시간 운전자 이상행동 탐지기"""
184
 
185
  CLASS_NAMES = ["정상", "졸음운전", "물건찾기", "휴대폰 사용", "운전자 폭행"]
186
+ COLORS = {
187
+ "정상": (0, 255, 0), # 초록
188
+ "졸음운전": (0, 165, 255), # 주황
189
+ "물건찾기": (0, 255, 255), # 노랑
190
+ "휴대폰 사용": (0, 0, 255), # 빨강
191
+ "운전자 폭행": (255, 0, 255) # 보라
192
+ }
193
 
194
+ def __init__(self, model_dir, device="cuda", window_size=30, stride=15):
 
 
 
 
 
 
 
195
  self.device = device
196
  self.window_size = window_size
197
  self.stride = stride
198
 
199
  # 모델 로드
200
+ import sys
201
+ sys.path.insert(0, model_dir)
202
+ from model import DriverBehaviorModel
203
 
204
+ self.model = DriverBehaviorModel(num_classes=5, pretrained=False)
205
+ state_dict = torch.load(f"{model_dir}/pytorch_model.bin",
206
+ map_location="cpu", weights_only=True)
207
  self.model.load_state_dict(state_dict)
208
  self.model.to(device)
209
  self.model.eval()
210
 
211
  # 프레임 버퍼
212
+ self.buffer = deque(maxlen=window_size)
213
  self.frame_count = 0
214
 
215
+ # Normalization
216
  self.mean = np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1, 1)
217
  self.std = np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1, 1)
218
 
219
+ def process_frame(self, frame):
220
+ """프레임 처리 및 추론"""
221
+ # 전처리
222
+ processed = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
223
+ processed = cv2.resize(processed, (224, 224))
224
+ self.buffer.append(processed)
225
+ self.frame_count += 1
226
 
227
+ # stride마다 추론
228
+ if self.frame_count % self.stride == 0 and len(self.buffer) == self.window_size:
229
+ return self._predict()
230
+ return None
231
 
232
+ def _predict(self):
233
+ frames = np.array(list(self.buffer), dtype=np.float32)
234
  frames = frames.transpose(3, 0, 1, 2) / 255.0
235
  frames = (frames - self.mean) / self.std
236
 
 
237
  with torch.no_grad():
238
  inputs = torch.FloatTensor(frames).unsqueeze(0).to(self.device)
239
  outputs = self.model(inputs)
240
  probs = torch.softmax(outputs, dim=1)
241
  pred_idx = torch.argmax(probs, dim=1).item()
 
242
 
243
  return {
244
  "class_id": pred_idx,
245
  "class_name": self.CLASS_NAMES[pred_idx],
246
+ "confidence": probs[0, pred_idx].item(),
247
+ "is_abnormal": pred_idx != 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  }
249
 
250
+ def run(self, source=0):
251
+ """실시간 추론 실행 (source: 0=웹캠, 또는 비디오 경로)"""
252
+ cap = cv2.VideoCapture(source)
253
  current_result = None
254
 
255
  while True:
 
257
  if not ret:
258
  break
259
 
 
260
  result = self.process_frame(frame)
261
  if result:
262
  current_result = result
263
 
264
+ # 화면 표시
265
+ if current_result:
266
  label = current_result["class_name"]
267
  conf = current_result["confidence"]
268
+ color = self.COLORS.get(label, (255, 255, 255))
269
 
270
+ cv2.putText(frame, f"{label}: {conf:.1%}", (10, 40),
 
 
271
  cv2.FONT_HERSHEY_SIMPLEX, 1.2, color, 3)
272
 
 
273
  if current_result["is_abnormal"]:
274
  cv2.putText(frame, "WARNING!", (10, 80),
275
  cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 0, 255), 2)
276
 
277
+ cv2.imshow("Driver Behavior Detection", frame)
 
278
  if cv2.waitKey(1) & 0xFF == ord('q'):
279
  break
280
 
 
282
  cv2.destroyAllWindows()
283
 
284
 
285
+ # 사용 예시
286
+ detector = RealtimeDetector("./model", device="cuda")
287
+ detector.run(source=0) # 웹캠
288
+ # detector.run(source="video.mp4") # 비디오 파일
 
 
 
 
289
  ```
290
 
291
  ---
292
 
293
+ ## Batch Inference (대량 처리)
294
 
295
  ```python
296
  import torch
 
298
  from torch.utils.data import Dataset, DataLoader
299
 
300
  class VideoDataset(Dataset):
 
 
301
  def __init__(self, video_paths, num_frames=30):
302
  self.video_paths = video_paths
303
  self.num_frames = num_frames
 
308
  return len(self.video_paths)
309
 
310
  def __getitem__(self, idx):
311
+ path = str(self.video_paths[idx])
312
+ cap = cv2.VideoCapture(path)
 
313
  frames = []
314
 
315
  while len(frames) < self.num_frames:
 
319
  frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
320
  frame = cv2.resize(frame, (224, 224))
321
  frames.append(frame)
 
322
  cap.release()
323
 
324
  while len(frames) < self.num_frames:
 
328
  frames = frames.transpose(3, 0, 1, 2) / 255.0
329
  frames = (frames - self.mean) / self.std
330
 
331
+ return torch.FloatTensor(frames), path
332
 
333
 
334
+ def batch_predict(model, video_folder, batch_size=8, device="cuda"):
335
+ """폴더 내 모든 비디오 배치 추론"""
 
 
 
 
 
 
 
 
 
 
 
336
  CLASS_NAMES = ["정상", "졸음운전", "물건찾기", "휴대폰 사용", "운전자 폭행"]
337
 
338
+ video_paths = list(Path(video_folder).glob("*.mp4")) + list(Path(video_folder).glob("*.avi"))
 
 
339
  dataset = VideoDataset(video_paths)
340
+ loader = DataLoader(dataset, batch_size=batch_size, num_workers=4)
341
 
342
  model = model.to(device)
343
  model.eval()
344
 
345
  results = []
 
346
  with torch.no_grad():
347
+ for frames, paths in loader:
348
  frames = frames.to(device)
349
  outputs = model(frames)
350
  probs = torch.softmax(outputs, dim=1)
351
  preds = torch.argmax(probs, dim=1)
352
 
353
+ for path, pred, prob in zip(paths, preds, probs):
354
  results.append({
355
+ "path": path,
356
+ "class_id": pred.item(),
357
+ "class_name": CLASS_NAMES[pred.item()],
358
+ "confidence": prob[pred].item()
359
  })
360
 
361
  return results
362
 
363
+
364
  # 사용 예시
365
+ results = batch_predict(model, "./videos/", batch_size=16)
366
  for r in results:
367
+ print(f"{r['path']}: {r['class_name']} ({r['confidence']:.1%})")
368
  ```
369
 
370
  ---
371
 
372
+ ## Input/Output Specification
373
 
374
+ ### Input
375
 
376
  | Parameter | Value |
377
  |-----------|-------|
378
+ | Shape | `[batch, 3, 30, 224, 224]` |
379
+ | Format | `[B, C, T, H, W]` (Batch, Channel, Time, Height, Width) |
380
+ | Color | RGB (not BGR!) |
381
+ | Normalization | ImageNet: mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] |
382
+ | Frame Count | 30 frames (1 second at 30fps) |
383
 
384
+ ### Output
385
 
386
  | Parameter | Value |
387
  |-----------|-------|
388
+ | Shape | `[batch, 5]` |
389
+ | Type | Raw logits (use `softmax` for probabilities) |
390
+ | Classes | 0=정상, 1=졸음운전, 2=물건찾기, 3=휴대폰사용, 4=운전자폭행 |
391
 
392
  ---
393
 
394
+ ## Model Architecture
395
 
396
  ```
397
+ DriverBehaviorModel
398
+ └── backbone: SwinTransformer3d (swin3d_t)
399
+ ├── patch_embed: Conv3d(3, 96, kernel=(2,4,4), stride=(2,4,4))
400
+ ├── features: Sequential
401
+ │ ├── BasicLayer (depth=2, heads=3, dim=96)
402
+ │ ├── PatchMerging
403
+ │ ├── BasicLayer (depth=2, heads=6, dim=192)
404
+ ├── PatchMerging
405
+ ├── BasicLayer (depth=6, heads=12, dim=384)
406
+ ├── PatchMerging
407
+ └── BasicLayer (depth=2, heads=24, dim=768)
408
+ ├── norm: LayerNorm(768)
409
+ ├── avgpool: AdaptiveAvgPool3d(1)
410
+ └── head: Sequential
411
+ ├── LayerNorm(768)
412
+ └── Linear(768, 5)
413
+
414
+ Parameters: 29,699,819
415
  ```
416
 
417
  ---
418
 
419
+ ## Training Details
420
 
421
  | Parameter | Value |
422
  |-----------|-------|
423
+ | Base Model | swin3d_t (Kinetics-400 pretrained) |
424
+ | Framework | PyTorch 2.0+ |
425
+ | GPUs | 2x NVIDIA A6000 (48GB each) |
426
+ | Training | DistributedDataParallel (DDP) |
427
+ | Batch Size | 128 effective (16/GPU × 2 GPUs × 4 accum) |
428
+ | Optimizer | AdamW (lr=1e-3, weight_decay=1e-4) |
429
+ | Scheduler | OneCycleLR (pct_start=0.2) |
430
+ | Mixed Precision | FP16 |
431
+ | Epochs | 1 (of 5 total) |
432
 
433
  ---
434
 
435
+ ## Dataset
436
 
437
  | Property | Value |
438
  |----------|-------|
439
+ | Name | Korean Driver Behavior Dataset |
440
+ | Videos | 243,979 |
441
+ | Samples | 1,371,062 (sliding window) |
442
+ | Window | 30 frames |
443
+ | Stride | 15 frames |
444
+ | Classes | 5 |
 
445
 
446
  ### Class Distribution
447
 
 
455
 
456
  ---
457
 
458
+ ## Limitations
459
 
460
+ 1. **Camera Position**: Optimized for front/side dashboard cameras
461
+ 2. **Lighting**: May degrade in low-light conditions (night, tunnels)
462
+ 3. **Occlusion**: Sunglasses, masks may reduce accuracy
463
+ 4. **Hardware**: GPU recommended for real-time inference
464
 
465
  ---
466
 
467
+ ## License
468
 
469
  Apache 2.0
470
 
471
  ---
472
 
473
+ ## Citation
474
 
475
  ```bibtex
476
+ @misc{driver-behavior-2025,
477
  title={Driver Abnormal Behavior Detection using Video Swin Transformer},
478
  author={C-Team},
479
  year={2025},
480
+ publisher={HuggingFace}
481
  }
482
  ```
 
 
 
 
 
 
model.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 운전자 이상행동 감지 모델
3
+
4
+ - 백본: TorchVision Video Swin-T (Kinetics-400 사전학습)
5
+ - 입력: [B, 3, 30, 224, 224] (배치, 채널, 프레임, 높이, 너비)
6
+ - 출력: 5클래스 분류 (정상, 졸음운전, 물건찾기, 휴대폰 사용, 운전자 폭행)
7
+ """
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ from torchvision.models.video import swin3d_t, Swin3D_T_Weights
13
+ from typing import Dict, Optional
14
+
15
+
16
class DriverBehaviorModel(nn.Module):
    """Driver abnormal-behavior detection model.

    Wraps TorchVision's Video Swin-T (``swin3d_t``) backbone and replaces
    its Kinetics-400 head with a ``LayerNorm -> Linear(num_classes)`` head.

    Args:
        num_classes: Number of output classes (default 5, the full label set).
        pretrained: Whether to load Kinetics-400 pretrained backbone weights.
        freeze_backbone: Freeze backbone parameters (head stays trainable),
            for transfer-learning setups.
    """

    # Full 5-class label set (Korean):
    # Normal, Drowsy Driving, Searching Objects, Phone Usage, Driver Assault
    CLASS_NAMES = ["정상", "졸음운전", "물건찾기", "휴대폰 사용", "운전자 폭행"]

    def __init__(
        self,
        num_classes: int = 5,
        pretrained: bool = True,
        freeze_backbone: bool = False,
    ):
        super().__init__()

        self.num_classes = num_classes

        # Build the TorchVision Video Swin-T backbone.
        if pretrained:
            print("Loading Kinetics-400 pretrained weights...")
            weights = Swin3D_T_Weights.KINETICS400_V1
        else:
            weights = None
        self.backbone = swin3d_t(weights=weights)

        # Swap the original head (Kinetics-400: nn.Linear(768, 400))
        # for a LayerNorm + Linear head sized to our label set.
        feat_dim = self.backbone.head.in_features  # 768 for swin3d_t
        self.backbone.head = nn.Sequential(
            nn.LayerNorm(feat_dim),
            nn.Linear(feat_dim, num_classes),
        )

        # Optionally freeze everything except the head.
        if freeze_backbone:
            self._freeze_backbone()

        # Re-initialize the freshly created head weights.
        self._init_head()

    def _freeze_backbone(self):
        """Freeze every backbone parameter except those under the head."""
        for name, param in self.backbone.named_parameters():
            if "head" in name:
                continue
            param.requires_grad = False
        print("Backbone parameters frozen (head trainable)")

    def _init_head(self):
        """Truncated-normal init for head Linear weights; zero the biases."""
        for module in self.backbone.head.modules():
            if not isinstance(module, nn.Linear):
                continue
            nn.init.trunc_normal_(module.weight, std=0.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass.

        Args:
            x: Video tensor of shape [B, C, T, H, W]
               (batch, channels=3, frames=30, height=224, width=224).

        Returns:
            Logits of shape [B, num_classes].
        """
        return self.backbone(x)

    def predict(self, x: torch.Tensor) -> Dict:
        """Single-sample inference.

        Args:
            x: Video tensor of shape [1, 3, 30, 224, 224].

        Returns:
            Dict with "class" (int), "confidence" (float in [0, 1]),
            and "class_name" (str).
        """
        # NOTE: switches the module to eval mode as a side effect.
        self.eval()
        with torch.no_grad():
            probs = F.softmax(self.forward(x), dim=-1)[0]

        top = probs.argmax().item()

        return {
            "class": top,
            "confidence": probs[top].item(),
            "class_name": self.CLASS_NAMES[top],
        }

    def get_all_probs(self, x: torch.Tensor) -> Dict:
        """Return per-class probabilities, sorted descending.

        Args:
            x: Video tensor of shape [1, 3, 30, 224, 224].

        Returns:
            Dict with "predictions" (list of {"class", "class_name",
            "probability"} dicts sorted by probability, descending),
            "top_class" (int) and "top_confidence" (float).
        """
        # NOTE: switches the module to eval mode as a side effect.
        self.eval()
        with torch.no_grad():
            probs = F.softmax(self.forward(x), dim=-1)[0]

        predictions = [
            {
                "class": idx,
                "class_name": self.CLASS_NAMES[idx],
                "probability": prob.item(),
            }
            for idx, prob in enumerate(probs)
        ]
        # Highest-probability class first.
        predictions.sort(key=lambda entry: entry["probability"], reverse=True)

        return {
            "predictions": predictions,
            "top_class": predictions[0]["class"],
            "top_confidence": predictions[0]["probability"],
        }
155
+
156
+
157
def create_model(
    num_classes: int = 3,
    pretrained: bool = True,
    freeze_backbone: bool = False,
    checkpoint_path: Optional[str] = None,
) -> DriverBehaviorModel:
    """Build a DriverBehaviorModel, optionally loading trained weights.

    Args:
        num_classes: Number of output classes. NOTE(review): this default (3)
            differs from DriverBehaviorModel's default of 5 and from the
            released 5-class weights — pass num_classes=5 explicitly when
            loading the published checkpoint. Kept at 3 for backward
            compatibility with existing callers.
        pretrained: Load Kinetics-400 pretrained backbone weights.
        freeze_backbone: Freeze backbone parameters (head stays trainable).
        checkpoint_path: Optional checkpoint path. Accepts either a dict
            with a "model" entry (trainer checkpoint) or a bare state_dict.

    Returns:
        A DriverBehaviorModel instance (in train mode; call .eval() before
        inference).

    Raises:
        RuntimeError: If the checkpoint's state_dict does not match the
            model (e.g. num_classes mismatch).
    """
    model = DriverBehaviorModel(
        num_classes=num_classes,
        pretrained=pretrained,
        freeze_backbone=freeze_backbone,
    )

    if checkpoint_path:
        print(f"Loading checkpoint from {checkpoint_path}...")
        # weights_only=True prevents arbitrary code execution from untrusted
        # pickle payloads (matches the loading examples in the README).
        checkpoint = torch.load(
            checkpoint_path, map_location="cpu", weights_only=True
        )
        # Support both {"model": state_dict} trainer checkpoints and files
        # that contain the state_dict directly.
        if isinstance(checkpoint, dict) and "model" in checkpoint:
            state_dict = checkpoint["model"]
        else:
            state_dict = checkpoint
        model.load_state_dict(state_dict)
        print("Checkpoint loaded successfully")

    return model
188
+
189
+
190
if __name__ == "__main__":
    # Smoke test: build the model, count parameters, run dummy inference.
    print("=" * 60)
    # Fixed: the banner previously said "3 classes - Demo" although the
    # code below builds the full 5-class model.
    print("Model Test (5 classes)")
    print("=" * 60)

    # Build the full 5-class model. pretrained=True downloads the
    # Kinetics-400 backbone weights on first run (network required).
    model = DriverBehaviorModel(num_classes=5, pretrained=True)

    # Report parameter counts.
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")

    # Batched forward pass with a dummy clip [B, C, T, H, W].
    dummy_input = torch.randn(2, 3, 30, 224, 224)
    print(f"\nInput shape: {dummy_input.shape}")

    model.eval()
    with torch.no_grad():
        output = model(dummy_input)
    print(f"Output shape: {output.shape}")

    # Single-sample prediction helper.
    single_input = torch.randn(1, 3, 30, 224, 224)
    prediction = model.predict(single_input)
    print(f"\nPrediction: {prediction}")

    # Full per-class probability listing (sorted descending).
    all_probs = model.get_all_probs(single_input)
    print("\nAll probabilities:")
    for pred in all_probs["predictions"]:
        print(f"  {pred['class_name']}: {pred['probability']:.4f}")

    print("\nModel test passed!")