koreashin committed on
Commit
8c039c2
·
verified ·
1 Parent(s): 11655ae

Upload 4 files

Browse files
Files changed (4) hide show
  1. README.md +93 -482
  2. config.json +32 -52
  3. model.py +1 -0
  4. pytorch_model.bin +1 -1
README.md CHANGED
@@ -1,482 +1,93 @@
1
- ---
2
- license: apache-2.0
3
- language:
4
- - ko
5
- tags:
6
- - video-classification
7
- - driver-behavior
8
- - video-swin-transformer
9
- - pytorch
10
- - safety
11
- - autonomous-driving
12
- metrics:
13
- - accuracy
14
- - f1
15
- pipeline_tag: video-classification
16
- datasets:
17
- - custom
18
- ---
19
-
20
- # Driver Abnormal Behavior Detection Model
21
-
22
- **운전자 이상행동 탐지 모델** - Video Swin Transformer 기반
23
-
24
- 차량 카메라 영상에서 운전자의 이상행동을 실시간으로 탐지하는 딥러닝 모델입니다.
25
-
26
- ## Model Performance
27
-
28
- | Metric | Score |
29
- |--------|-------|
30
- | **Accuracy** | 95.51% |
31
- | **Macro F1** | 0.9436 |
32
- | **Inference Speed** | ~30 FPS (RTX 3090) |
33
-
34
- ### Per-Class Performance
35
-
36
- | Class ID | Korean | English | Precision | Recall | F1-Score |
37
- |----------|--------|---------|-----------|--------|----------|
38
- | 0 | 정상 | Normal | 0.93 | 0.92 | 0.92 |
39
- | 1 | 졸음운전 | Drowsy Driving | 0.99 | 0.98 | 0.98 |
40
- | 2 | 물건찾기 | Searching Objects | 0.90 | 0.94 | 0.92 |
41
- | 3 | 휴대폰 사용 | Phone Usage | 0.91 | 0.88 | 0.90 |
42
- | 4 | 운전자 폭행 | Driver Assault | 1.00 | 1.00 | 1.00 |
43
-
44
- ---
45
-
46
- ## Files in This Repository
47
-
48
- ```
49
- driver-behavior-model-epoch1/
50
- ├── pytorch_model.bin # 모델 가중치 (120MB)
51
- ├── model.py # 모델 클래스 정의 (필수!)
52
- ├── config.json # 설정 파일
53
- └── README.md # 이 파일
54
- ```
55
-
56
- **중요: `model.py`와 `pytorch_model.bin` 둘 다 필요합니다!**
57
-
58
- ---
59
-
60
- ## Installation
61
-
62
- ```bash
63
- pip install torch torchvision opencv-python numpy
64
- pip install huggingface_hub # HuggingFace에서 다운로드 시
65
- ```
66
-
67
- ---
68
-
69
- ## Quick Start
70
-
71
- ### 1. 모델 다운로드
72
-
73
- ```bash
74
- # HuggingFace CLI로 다운로드
75
- huggingface-cli download YOUR_USERNAME/driver-behavior-swin-t --local-dir ./model
76
-
77
- # 또는 Python으로
78
- from huggingface_hub import snapshot_download
79
- snapshot_download(repo_id="YOUR_USERNAME/driver-behavior-swin-t", local_dir="./model")
80
- ```
81
-
82
- ### 2. 모델 로드
83
-
84
- ```python
85
- import torch
86
- import sys
87
-
88
- # model.py가 있는 경로 추가
89
- sys.path.insert(0, "./model")
90
- from model import DriverBehaviorModel
91
-
92
- # 모델 생성 (pretrained=False: Kinetics 가중치 다운로드 안함)
93
- model = DriverBehaviorModel(num_classes=5, pretrained=False)
94
-
95
- # 학습된 가중치 로드
96
- state_dict = torch.load("./model/pytorch_model.bin", map_location="cpu", weights_only=True)
97
- model.load_state_dict(state_dict)
98
- model.eval()
99
-
100
- print("모델 로드 완료!")
101
- ```
102
-
103
- ### 3. 단일 비디오 추론
104
-
105
- ```python
106
- import cv2
107
- import torch
108
- import numpy as np
109
-
110
- CLASS_NAMES = ["정상", "졸음운전", "물건찾기", "휴대폰 사용", "운전자 폭행"]
111
- CLASS_NAMES_EN = ["Normal", "Drowsy Driving", "Searching Objects", "Phone Usage", "Driver Assault"]
112
-
113
- def preprocess_video(video_path, num_frames=30, size=(224, 224)):
114
- """비디오 전처리"""
115
- cap = cv2.VideoCapture(video_path)
116
- frames = []
117
-
118
- while len(frames) < num_frames:
119
- ret, frame = cap.read()
120
- if not ret:
121
- break
122
- frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
123
- frame = cv2.resize(frame, size)
124
- frames.append(frame)
125
- cap.release()
126
-
127
- # 프레임 부족 시 마지막 프레임 복제
128
- while len(frames) < num_frames:
129
- frames.append(frames[-1] if frames else np.zeros((*size, 3), dtype=np.uint8))
130
-
131
- # [T, H, W, C] -> [C, T, H, W]
132
- frames = np.array(frames[:num_frames], dtype=np.float32)
133
- frames = frames.transpose(3, 0, 1, 2) / 255.0
134
-
135
- # ImageNet normalization
136
- mean = np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1, 1)
137
- std = np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1, 1)
138
- frames = (frames - mean) / std
139
-
140
- return torch.FloatTensor(frames)
141
-
142
-
143
- def predict(model, video_path, device="cuda"):
144
- """단일 비디오 추론"""
145
- model = model.to(device)
146
- model.eval()
147
-
148
- frames = preprocess_video(video_path)
149
- frames = frames.unsqueeze(0).to(device) # [1, 3, 30, 224, 224]
150
-
151
- with torch.no_grad():
152
- outputs = model(frames)
153
- probs = torch.softmax(outputs, dim=1)
154
- pred_idx = torch.argmax(probs, dim=1).item()
155
- confidence = probs[0, pred_idx].item()
156
-
157
- return {
158
- "class_id": pred_idx,
159
- "class_name_ko": CLASS_NAMES[pred_idx],
160
- "class_name_en": CLASS_NAMES_EN[pred_idx],
161
- "confidence": confidence,
162
- "probabilities": {name: probs[0, i].item() for i, name in enumerate(CLASS_NAMES)}
163
- }
164
-
165
-
166
- # 사용 예시
167
- result = predict(model, "test_video.mp4", device="cuda")
168
- print(f"예측: {result['class_name_ko']} ({result['confidence']:.1%})")
169
- print(f"전체 확률: {result['probabilities']}")
170
- ```
171
-
172
- ---
173
-
174
- ## Real-time Inference (실시간 추론)
175
-
176
- ```python
177
- import cv2
178
- import torch
179
- import numpy as np
180
- from collections import deque
181
-
182
- class RealtimeDetector:
183
- """실시간 운전자 이상행동 탐지기"""
184
-
185
- CLASS_NAMES = ["정상", "졸음운전", "물건찾기", "휴대폰 사용", "운전자 폭행"]
186
- COLORS = {
187
- "정상": (0, 255, 0), # 초록
188
- "졸음운전": (0, 165, 255), # 주황
189
- "물건찾기": (0, 255, 255), # 노랑
190
- "휴대폰 사용": (0, 0, 255), # 빨강
191
- "운전자 폭행": (255, 0, 255) # 보라
192
- }
193
-
194
- def __init__(self, model_dir, device="cuda", window_size=30, stride=15):
195
- self.device = device
196
- self.window_size = window_size
197
- self.stride = stride
198
-
199
- # 모델 로드
200
- import sys
201
- sys.path.insert(0, model_dir)
202
- from model import DriverBehaviorModel
203
-
204
- self.model = DriverBehaviorModel(num_classes=5, pretrained=False)
205
- state_dict = torch.load(f"{model_dir}/pytorch_model.bin",
206
- map_location="cpu", weights_only=True)
207
- self.model.load_state_dict(state_dict)
208
- self.model.to(device)
209
- self.model.eval()
210
-
211
- # 프레임 버퍼
212
- self.buffer = deque(maxlen=window_size)
213
- self.frame_count = 0
214
-
215
- # Normalization
216
- self.mean = np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1, 1)
217
- self.std = np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1, 1)
218
-
219
- def process_frame(self, frame):
220
- """프레임 처리 및 추론"""
221
- # 전처리
222
- processed = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
223
- processed = cv2.resize(processed, (224, 224))
224
- self.buffer.append(processed)
225
- self.frame_count += 1
226
-
227
- # stride마다 추론
228
- if self.frame_count % self.stride == 0 and len(self.buffer) == self.window_size:
229
- return self._predict()
230
- return None
231
-
232
- def _predict(self):
233
- frames = np.array(list(self.buffer), dtype=np.float32)
234
- frames = frames.transpose(3, 0, 1, 2) / 255.0
235
- frames = (frames - self.mean) / self.std
236
-
237
- with torch.no_grad():
238
- inputs = torch.FloatTensor(frames).unsqueeze(0).to(self.device)
239
- outputs = self.model(inputs)
240
- probs = torch.softmax(outputs, dim=1)
241
- pred_idx = torch.argmax(probs, dim=1).item()
242
-
243
- return {
244
- "class_id": pred_idx,
245
- "class_name": self.CLASS_NAMES[pred_idx],
246
- "confidence": probs[0, pred_idx].item(),
247
- "is_abnormal": pred_idx != 0
248
- }
249
-
250
- def run(self, source=0):
251
- """실시간 추론 실행 (source: 0=웹캠, 또는 비디오 경로)"""
252
- cap = cv2.VideoCapture(source)
253
- current_result = None
254
-
255
- while True:
256
- ret, frame = cap.read()
257
- if not ret:
258
- break
259
-
260
- result = self.process_frame(frame)
261
- if result:
262
- current_result = result
263
-
264
- # 화면 표시
265
- if current_result:
266
- label = current_result["class_name"]
267
- conf = current_result["confidence"]
268
- color = self.COLORS.get(label, (255, 255, 255))
269
-
270
- cv2.putText(frame, f"{label}: {conf:.1%}", (10, 40),
271
- cv2.FONT_HERSHEY_SIMPLEX, 1.2, color, 3)
272
-
273
- if current_result["is_abnormal"]:
274
- cv2.putText(frame, "WARNING!", (10, 80),
275
- cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 0, 255), 2)
276
-
277
- cv2.imshow("Driver Behavior Detection", frame)
278
- if cv2.waitKey(1) & 0xFF == ord('q'):
279
- break
280
-
281
- cap.release()
282
- cv2.destroyAllWindows()
283
-
284
-
285
- # 사용 예시
286
- detector = RealtimeDetector("./model", device="cuda")
287
- detector.run(source=0) # 웹캠
288
- # detector.run(source="video.mp4") # 비디오 파일
289
- ```
290
-
291
- ---
292
-
293
- ## Batch Inference (대량 처리)
294
-
295
- ```python
296
- import torch
297
- from pathlib import Path
298
- from torch.utils.data import Dataset, DataLoader
299
-
300
- class VideoDataset(Dataset):
301
- def __init__(self, video_paths, num_frames=30):
302
- self.video_paths = video_paths
303
- self.num_frames = num_frames
304
- self.mean = np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1, 1)
305
- self.std = np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1, 1)
306
-
307
- def __len__(self):
308
- return len(self.video_paths)
309
-
310
- def __getitem__(self, idx):
311
- path = str(self.video_paths[idx])
312
- cap = cv2.VideoCapture(path)
313
- frames = []
314
-
315
- while len(frames) < self.num_frames:
316
- ret, frame = cap.read()
317
- if not ret:
318
- break
319
- frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
320
- frame = cv2.resize(frame, (224, 224))
321
- frames.append(frame)
322
- cap.release()
323
-
324
- while len(frames) < self.num_frames:
325
- frames.append(frames[-1] if frames else np.zeros((224, 224, 3), dtype=np.uint8))
326
-
327
- frames = np.array(frames[:self.num_frames], dtype=np.float32)
328
- frames = frames.transpose(3, 0, 1, 2) / 255.0
329
- frames = (frames - self.mean) / self.std
330
-
331
- return torch.FloatTensor(frames), path
332
-
333
-
334
- def batch_predict(model, video_folder, batch_size=8, device="cuda"):
335
- """폴더 내 모든 비디오 배치 추론"""
336
- CLASS_NAMES = ["정상", "졸음운전", "물건찾기", "휴대폰 사용", "운전자 폭행"]
337
-
338
- video_paths = list(Path(video_folder).glob("*.mp4")) + list(Path(video_folder).glob("*.avi"))
339
- dataset = VideoDataset(video_paths)
340
- loader = DataLoader(dataset, batch_size=batch_size, num_workers=4)
341
-
342
- model = model.to(device)
343
- model.eval()
344
-
345
- results = []
346
- with torch.no_grad():
347
- for frames, paths in loader:
348
- frames = frames.to(device)
349
- outputs = model(frames)
350
- probs = torch.softmax(outputs, dim=1)
351
- preds = torch.argmax(probs, dim=1)
352
-
353
- for path, pred, prob in zip(paths, preds, probs):
354
- results.append({
355
- "path": path,
356
- "class_id": pred.item(),
357
- "class_name": CLASS_NAMES[pred.item()],
358
- "confidence": prob[pred].item()
359
- })
360
-
361
- return results
362
-
363
-
364
- # 사용 예시
365
- results = batch_predict(model, "./videos/", batch_size=16)
366
- for r in results:
367
- print(f"{r['path']}: {r['class_name']} ({r['confidence']:.1%})")
368
- ```
369
-
370
- ---
371
-
372
- ## Input/Output Specification
373
-
374
- ### Input
375
-
376
- | Parameter | Value |
377
- |-----------|-------|
378
- | Shape | `[batch, 3, 30, 224, 224]` |
379
- | Format | `[B, C, T, H, W]` (Batch, Channel, Time, Height, Width) |
380
- | Color | RGB (not BGR!) |
381
- | Normalization | ImageNet: mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] |
382
- | Frame Count | 30 frames (1 second at 30fps) |
383
-
384
- ### Output
385
-
386
- | Parameter | Value |
387
- |-----------|-------|
388
- | Shape | `[batch, 5]` |
389
- | Type | Raw logits (use `softmax` for probabilities) |
390
- | Classes | 0=정상, 1=졸음운전, 2=물건찾기, 3=휴대폰사용, 4=운전자폭행 |
391
-
392
- ---
393
-
394
- ## Model Architecture
395
-
396
- ```
397
- DriverBehaviorModel
398
- └── backbone: SwinTransformer3d (swin3d_t)
399
- ├── patch_embed: Conv3d(3, 96, kernel=(2,4,4), stride=(2,4,4))
400
- ├── features: Sequential
401
- │ ├── BasicLayer (depth=2, heads=3, dim=96)
402
- │ ├── PatchMerging
403
- │ ├── BasicLayer (depth=2, heads=6, dim=192)
404
- │ ├── PatchMerging
405
- │ ├── BasicLayer (depth=6, heads=12, dim=384)
406
- │ ├── PatchMerging
407
- │ └── BasicLayer (depth=2, heads=24, dim=768)
408
- ├── norm: LayerNorm(768)
409
- ├── avgpool: AdaptiveAvgPool3d(1)
410
- └── head: Sequential
411
- ├── LayerNorm(768)
412
- └── Linear(768, 5)
413
-
414
- Parameters: 29,699,819
415
- ```
416
-
417
- ---
418
-
419
- ## Training Details
420
-
421
- | Parameter | Value |
422
- |-----------|-------|
423
- | Base Model | swin3d_t (Kinetics-400 pretrained) |
424
- | Framework | PyTorch 2.0+ |
425
- | GPUs | 2x NVIDIA A6000 (48GB each) |
426
- | Training | DistributedDataParallel (DDP) |
427
- | Batch Size | 128 effective (16/GPU × 2 GPUs × 4 accum) |
428
- | Optimizer | AdamW (lr=1e-3, weight_decay=1e-4) |
429
- | Scheduler | OneCycleLR (pct_start=0.2) |
430
- | Mixed Precision | FP16 |
431
- | Epochs | 1 (of 5 total) |
432
-
433
- ---
434
-
435
- ## Dataset
436
-
437
- | Property | Value |
438
- |----------|-------|
439
- | Name | Korean Driver Behavior Dataset |
440
- | Videos | 243,979 |
441
- | Samples | 1,371,062 (sliding window) |
442
- | Window | 30 frames |
443
- | Stride | 15 frames |
444
- | Classes | 5 |
445
-
446
- ### Class Distribution
447
-
448
- | Class | Samples | Percentage |
449
- |-------|---------|------------|
450
- | 정상 | 159,224 | 11.6% |
451
- | 졸음운전 | 619,450 | 45.2% |
452
- | 물건찾기 | 261,435 | 19.1% |
453
- | 휴대폰 사용 | 150,981 | 11.0% |
454
- | 운전자 폭행 | 179,972 | 13.1% |
455
-
456
- ---
457
-
458
- ## Limitations
459
-
460
- 1. **Camera Position**: Optimized for front/side dashboard cameras
461
- 2. **Lighting**: May degrade in low-light conditions (night, tunnels)
462
- 3. **Occlusion**: Sunglasses, masks may reduce accuracy
463
- 4. **Hardware**: GPU recommended for real-time inference
464
-
465
- ---
466
-
467
- ## License
468
-
469
- Apache 2.0
470
-
471
- ---
472
-
473
- ## Citation
474
-
475
- ```bibtex
476
- @misc{driver-behavior-2025,
477
- title={Driver Abnormal Behavior Detection using Video Swin Transformer},
478
- author={C-Team},
479
- year={2025},
480
- publisher={HuggingFace}
481
- }
482
- ```
 
1
+ # Driver Behavior Detection Model (Epoch 2)
2
+
3
+ 운전자 이상행동 감지를 위한 Video Swin Transformer 기반 모델입니다.
4
+
5
+ ## Model Description
6
+
7
+ - **Architecture**: Video Swin Transformer Tiny (swin3d_t)
8
+ - **Backbone Pretrained**: Kinetics-400
9
+ - **Parameters**: 27.85M
10
+ - **Input**: [B, 3, 30, 224, 224] (batch, channels, frames, height, width)
11
+
12
+ ## Classes (5)
13
+
14
+ | Label | Class | F1-Score |
15
+ |:-----:|-------|:--------:|
16
+ | 0 | 정상 (Normal) | 0.93 |
17
+ | 1 | 졸음운전 (Drowsy Driving) | 0.98 |
18
+ | 2 | 물건찾기 (Reaching/Searching) | 0.90 |
19
+ | 3 | 휴대폰 사용 (Phone Usage) | 0.88 |
20
+ | 4 | 운전자 폭행 (Driver Assault) | 1.00 |
21
+
22
+ ## Performance (Epoch 2)
23
+
24
+ | Metric | Value |
25
+ |--------|-------|
26
+ | **Accuracy** | 95.15% |
27
+ | **Macro F1** | 0.9392 |
28
+ | **Validation Samples** | 1,371,062 |
29
+
30
+ ## Training Configuration
31
+
32
+ | Parameter | Value |
33
+ |-----------|-------|
34
+ | Hardware | 2x NVIDIA RTX A6000 (48GB) |
35
+ | Distributed | DDP (DistributedDataParallel) |
36
+ | Batch Size | 32 (16 × 2 GPU) |
37
+ | Gradient Accumulation | 4 |
38
+ | Effective Batch | 128 |
39
+ | Optimizer | AdamW (lr=1e-3, wd=0.05) |
40
+ | Scheduler | OneCycleLR |
41
+ | Mixed Precision | FP16 |
42
+ | Loss | CrossEntropy + Label Smoothing (0.1) |
43
+ | Regularization | Mixup (α=0.4), Dropout (0.3) |
44
+
45
+ ## Usage
46
+
47
+ ```python
48
+ import torch
49
+ from model import DriverBehaviorModel
50
+
51
+ # Load model
52
+ model = DriverBehaviorModel(num_classes=5, pretrained=False)
53
+ checkpoint = torch.load("pytorch_model.bin", map_location="cpu")
54
+ model.load_state_dict(checkpoint.get("model", checkpoint))  # works for a raw state_dict or a {"model": ...} checkpoint
55
+ model.eval()
56
+
57
+ # Inference
58
+ # input: [1, 3, 30, 224, 224] - 30 frames, 224x224, RGB normalized
59
+ with torch.no_grad():
60
+ output = model(video_tensor)
61
+ prediction = output.argmax(dim=1)
62
+ ```
63
+
64
+ ## Dataset
65
+
66
+ - **Total Videos**: 243,979
67
+ - **Total Samples (windows)**: 1,371,062
68
+ - **Window Size**: 30 frames
69
+ - **Stride**: 15 frames
70
+ - **Resolution**: 224×224
71
+
72
+ ## Augmentation (Training)
73
+
74
+ - RandomResizedCrop (scale 0.8-1.0)
75
+ - HorizontalFlip (p=0.5)
76
+ - ColorJitter, HueSaturationValue
77
+ - Temporal Augmentation (speed change, frame drop)
78
+ - Mixup (α=0.4)
79
+ - CoarseDropout
80
+
81
+ ## License
82
+
83
+ This model is for research purposes only.
84
+
85
+ ## Citation
86
+
87
+ ```bibtex
88
+ @misc{driver-behavior-detection-2026,
89
+ title={Driver Behavior Detection using Video Swin Transformer},
90
+ author={C-Team},
91
+ year={2026}
92
+ }
93
+ ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
config.json CHANGED
@@ -1,52 +1,32 @@
1
- {
2
- "architectures": [
3
- "VideoSwinTransformer"
4
- ],
5
- "model_type": "video-swin-transformer",
6
- "backbone": "swin3d_t",
7
- "pretrained_source": "kinetics400",
8
- "num_classes": 5,
9
- "class_names": [
10
- "정상",
11
- "졸음운전",
12
- "물건찾기",
13
- "휴대폰 사용",
14
- "운전자 폭행"
15
- ],
16
- "input_size": {
17
- "frames": 30,
18
- "height": 224,
19
- "width": 224,
20
- "channels": 3
21
- },
22
- "input_format": "CTHW",
23
- "training": {
24
- "epochs_trained": 1,
25
- "total_epochs": 5,
26
- "batch_size": 16,
27
- "effective_batch_size": 128,
28
- "learning_rate": 0.001,
29
- "optimizer": "AdamW",
30
- "scheduler": "OneCycleLR",
31
- "mixed_precision": true,
32
- "gradient_accumulation_steps": 4
33
- },
34
- "metrics": {
35
- "accuracy": 0.9551,
36
- "macro_f1": 0.9436,
37
- "per_class_f1": {
38
- "정상": 0.92,
39
- "졸음운전": 0.98,
40
- "물건찾기": 0.92,
41
- "휴대폰 사용": 0.9,
42
- "운전자 폭행": 1.0
43
- }
44
- },
45
- "dataset": {
46
- "name": "Korean Driver Behavior Dataset",
47
- "total_samples": 1371062,
48
- "num_videos": 243979,
49
- "sliding_window": 30,
50
- "stride": 15
51
- }
52
- }
 
1
+ {
2
+ "architectures": ["DriverBehaviorModel"],
3
+ "model_type": "video-swin-transformer",
4
+ "backbone": "swin3d_t",
5
+ "num_classes": 5,
6
+ "class_names": ["정상", "졸음운전", "물건찾기", "휴대폰 사용", "운전자 폭행"],
7
+ "input_size": [3, 30, 224, 224],
8
+ "pretrained_backbone": "Kinetics-400",
9
+ "head": {
10
+ "type": "Sequential",
11
+ "layers": ["LayerNorm(768)", "Dropout(0.3)", "Linear(768, 5)"]
12
+ },
13
+ "training": {
14
+ "epoch": 2,
15
+ "accuracy": 0.9515,
16
+ "macro_f1": 0.9392,
17
+ "batch_size": 32,
18
+ "optimizer": "AdamW",
19
+ "learning_rate": 1e-3,
20
+ "weight_decay": 0.05,
21
+ "scheduler": "OneCycleLR",
22
+ "mixed_precision": "fp16",
23
+ "augmentation": ["Mixup(0.4)", "RandomResizedCrop", "HorizontalFlip", "ColorJitter", "TemporalAugmentation"]
24
+ },
25
+ "performance": {
26
+ "정상": {"precision": 0.91, "recall": 0.95, "f1": 0.93},
27
+ "졸음운전": {"precision": 0.99, "recall": 0.97, "f1": 0.98},
28
+ "물건찾기": {"precision": 0.92, "recall": 0.88, "f1": 0.90},
29
+ "휴대폰 사용": {"precision": 0.84, "recall": 0.93, "f1": 0.88},
30
+ "운전자 폭행": {"precision": 1.00, "recall": 1.00, "f1": 1.00}
31
+ }
32
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
model.py CHANGED
@@ -48,6 +48,7 @@ class DriverBehaviorModel(nn.Module):
48
  in_features = self.backbone.head.in_features # 768
49
  self.backbone.head = nn.Sequential(
50
  nn.LayerNorm(in_features),
 
51
  nn.Linear(in_features, num_classes),
52
  )
53
 
 
48
  in_features = self.backbone.head.in_features # 768
49
  self.backbone.head = nn.Sequential(
50
  nn.LayerNorm(in_features),
51
+ nn.Dropout(p=0.3), # 오버피팅 방지
52
  nn.Linear(in_features, num_classes),
53
  )
54
 
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc7eb66a00e43a79a4db83cad13a36dc97b87d500a1a6f0bcec72779d22fdaf9
3
  size 126244047
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae9125be6e38460b5519ca5fc0bad96e952297b1858a95bd15ebaa7d0a772f3f
3
  size 126244047