koreashin
/

Driver_monitoring

@@ -1,482 +1,93 @@
----
-license: apache-2.0
-language:
-- ko
-tags:
-- video-classification
-- driver-behavior
-- video-swin-transformer
-- pytorch
-- safety
-- autonomous-driving
-metrics:
-- accuracy
-- f1
-pipeline_tag: video-classification
-datasets:
-- custom
----
-# Driver Abnormal Behavior Detection Model
-**운전자 이상행동 탐지 모델** - Video Swin Transformer 기반
-차량 내 카메라 영상에서 운전자의 이상행동을 실시간으로 탐지하는 딥러닝 모델입니다.
-## Model Performance
-| Metric | Score |
-|--------|-------|
-| **Accuracy** | 95.51% |
-| **Macro F1** | 0.9436 |
-| **Inference Speed** | ~30 FPS (RTX 3090) |
-### Per-Class Performance
-| Class ID | Korean | English | Precision | Recall | F1-Score |
-|----------|--------|---------|-----------|--------|----------|
-| 0 | 정상 | Normal | 0.93 | 0.92 | 0.92 |
-| 1 | 졸음운전 | Drowsy Driving | 0.99 | 0.98 | 0.98 |
-| 2 | 물건찾기 | Searching Objects | 0.90 | 0.94 | 0.92 |
-| 3 | 휴대폰 사용 | Phone Usage | 0.91 | 0.88 | 0.90 |
-| 4 | 운전자 폭행 | Driver Assault | 1.00 | 1.00 | 1.00 |
----
-## Files in This Repository
-```
-driver-behavior-model-epoch1/
-├── pytorch_model.bin   # 모델 가중치 (120MB)
-├── model.py            # 모델 클래스 정의 (필수!)
-├── config.json         # 설정 파일
-└── README.md           # 이 파일
-```
-**중요: `model.py`와 `pytorch_model.bin` 둘 다 필요합니다!**
----
-## Installation
-```bash
-pip install torch torchvision opencv-python numpy
-pip install huggingface_hub  # HuggingFace에서 다운로드 시
-```
----
-## Quick Start
-### 1. 모델 다운로드
-```bash
-# HuggingFace CLI로 다운로드
-huggingface-cli download YOUR_USERNAME/driver-behavior-swin-t --local-dir ./model
-# 또는 Python으로
-from huggingface_hub import snapshot_download
-snapshot_download(repo_id="YOUR_USERNAME/driver-behavior-swin-t", local_dir="./model")
-```
-### 2. 모델 로드
-```python
-import torch
-import sys
-# model.py가 있는 경로 추가
-sys.path.insert(0, "./model")
-from model import DriverBehaviorModel
-# 모델 생성 (pretrained=False: Kinetics 가중치 다운로드 안함)
-model = DriverBehaviorModel(num_classes=5, pretrained=False)
-# 학습된 가중치 로드
-state_dict = torch.load("./model/pytorch_model.bin", map_location="cpu", weights_only=True)
-model.load_state_dict(state_dict)
-model.eval()
-print("모델 로드 완료!")
-```
-### 3. 단일 비디오 추론
-```python
-import cv2
-import torch
-import numpy as np
-CLASS_NAMES = ["정상", "졸음운전", "물건찾기", "휴대폰 사용", "운전자 폭행"]
-CLASS_NAMES_EN = ["Normal", "Drowsy Driving", "Searching Objects", "Phone Usage", "Driver Assault"]
-def preprocess_video(video_path, num_frames=30, size=(224, 224)):
-    """비디오 전처리"""
-    cap = cv2.VideoCapture(video_path)
-    frames = []
-    while len(frames) < num_frames:
-        ret, frame = cap.read()
-        if not ret:
-            break
-        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-        frame = cv2.resize(frame, size)
-        frames.append(frame)
-    cap.release()
-    # 프레임 부족 시 마지막 프레임 복제
-    while len(frames) < num_frames:
-        frames.append(frames[-1] if frames else np.zeros((*size, 3), dtype=np.uint8))
-    # [T, H, W, C] -> [C, T, H, W]
-    frames = np.array(frames[:num_frames], dtype=np.float32)
-    frames = frames.transpose(3, 0, 1, 2) / 255.0
-    # ImageNet normalization
-    mean = np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1, 1)
-    std = np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1, 1)
-    frames = (frames - mean) / std
-    return torch.FloatTensor(frames)
-def predict(model, video_path, device="cuda"):
-    """단일 비디오 추론"""
-    model = model.to(device)
-    model.eval()
-    frames = preprocess_video(video_path)
-    frames = frames.unsqueeze(0).to(device)  # [1, 3, 30, 224, 224]
-    with torch.no_grad():
-        outputs = model(frames)
-        probs = torch.softmax(outputs, dim=1)
-        pred_idx = torch.argmax(probs, dim=1).item()
-        confidence = probs[0, pred_idx].item()
-    return {
-        "class_id": pred_idx,
-        "class_name_ko": CLASS_NAMES[pred_idx],
-        "class_name_en": CLASS_NAMES_EN[pred_idx],
-        "confidence": confidence,
-        "probabilities": {name: probs[0, i].item() for i, name in enumerate(CLASS_NAMES)}
-    }
-# 사용 예시
-result = predict(model, "test_video.mp4", device="cuda")
-print(f"예측: {result['class_name_ko']} ({result['confidence']:.1%})")
-print(f"전체 확률: {result['probabilities']}")
-```
----
-## Real-time Inference (실시간 추론)
-```python
-import cv2
-import torch
-import numpy as np
-from collections import deque
-class RealtimeDetector:
-    """실시간 운전자 이상행동 탐지기"""
-    CLASS_NAMES = ["정상", "졸음운전", "물건찾기", "휴대폰 사용", "운전자 폭행"]
-    COLORS = {
-        "정상": (0, 255, 0),        # 초록
-        "졸음운전": (0, 165, 255),   # 주황
-        "물건찾기": (0, 255, 255),   # 노랑
-        "휴대폰 사용": (0, 0, 255),  # 빨강
-        "운전자 폭행": (255, 0, 255) # 보라
-    }
-    def __init__(self, model_dir, device="cuda", window_size=30, stride=15):
-        self.device = device
-        self.window_size = window_size
-        self.stride = stride
-        # 모델 로드
-        import sys
-        sys.path.insert(0, model_dir)
-        from model import DriverBehaviorModel
-        self.model = DriverBehaviorModel(num_classes=5, pretrained=False)
-        state_dict = torch.load(f"{model_dir}/pytorch_model.bin",
-                               map_location="cpu", weights_only=True)
-        self.model.load_state_dict(state_dict)
-        self.model.to(device)
-        self.model.eval()
-        # 프레임 버퍼
-        self.buffer = deque(maxlen=window_size)
-        self.frame_count = 0
-        # Normalization
-        self.mean = np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1, 1)
-        self.std = np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1, 1)
-    def process_frame(self, frame):
-        """프레임 처리 및 추론"""
-        # 전처리
-        processed = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-        processed = cv2.resize(processed, (224, 224))
-        self.buffer.append(processed)
-        self.frame_count += 1
-        # stride마다 추론
-        if self.frame_count % self.stride == 0 and len(self.buffer) == self.window_size:
-            return self._predict()
-        return None
-    def _predict(self):
-        frames = np.array(list(self.buffer), dtype=np.float32)
-        frames = frames.transpose(3, 0, 1, 2) / 255.0
-        frames = (frames - self.mean) / self.std
-        with torch.no_grad():
-            inputs = torch.FloatTensor(frames).unsqueeze(0).to(self.device)
-            outputs = self.model(inputs)
-            probs = torch.softmax(outputs, dim=1)
-            pred_idx = torch.argmax(probs, dim=1).item()
-        return {
-            "class_id": pred_idx,
-            "class_name": self.CLASS_NAMES[pred_idx],
-            "confidence": probs[0, pred_idx].item(),
-            "is_abnormal": pred_idx != 0
-        }
-    def run(self, source=0):
-        """실시간 추론 실행 (source: 0=웹캠, 또는 비디오 경로)"""
-        cap = cv2.VideoCapture(source)
-        current_result = None
-        while True:
-            ret, frame = cap.read()
-            if not ret:
-                break
-            result = self.process_frame(frame)
-            if result:
-                current_result = result
-            # 화면 표시
-            if current_result:
-                label = current_result["class_name"]
-                conf = current_result["confidence"]
-                color = self.COLORS.get(label, (255, 255, 255))
-                cv2.putText(frame, f"{label}: {conf:.1%}", (10, 40),
-                           cv2.FONT_HERSHEY_SIMPLEX, 1.2, color, 3)
-                if current_result["is_abnormal"]:
-                    cv2.putText(frame, "WARNING!", (10, 80),
-                               cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 0, 255), 2)
-            cv2.imshow("Driver Behavior Detection", frame)
-            if cv2.waitKey(1) & 0xFF == ord('q'):
-                break
-        cap.release()
-        cv2.destroyAllWindows()
-# 사용 예시
-detector = RealtimeDetector("./model", device="cuda")
-detector.run(source=0)  # 웹캠
-# detector.run(source="video.mp4")  # 비디오 파일
-```
----
-## Batch Inference (대량 처리)
-```python
-import torch
-from pathlib import Path
-from torch.utils.data import Dataset, DataLoader
-class VideoDataset(Dataset):
-    def __init__(self, video_paths, num_frames=30):
-        self.video_paths = video_paths
-        self.num_frames = num_frames
-        self.mean = np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1, 1)
-        self.std = np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1, 1)
-    def __len__(self):
-        return len(self.video_paths)
-    def __getitem__(self, idx):
-        path = str(self.video_paths[idx])
-        cap = cv2.VideoCapture(path)
-        frames = []
-        while len(frames) < self.num_frames:
-            ret, frame = cap.read()
-            if not ret:
-                break
-            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-            frame = cv2.resize(frame, (224, 224))
-            frames.append(frame)
-        cap.release()
-        while len(frames) < self.num_frames:
-            frames.append(frames[-1] if frames else np.zeros((224, 224, 3), dtype=np.uint8))
-        frames = np.array(frames[:self.num_frames], dtype=np.float32)
-        frames = frames.transpose(3, 0, 1, 2) / 255.0
-        frames = (frames - self.mean) / self.std
-        return torch.FloatTensor(frames), path
-def batch_predict(model, video_folder, batch_size=8, device="cuda"):
-    """폴더 내 모든 비디오 배치 추론"""
-    CLASS_NAMES = ["정상", "졸음운전", "물건찾기", "휴대폰 사용", "운전자 폭행"]
-    video_paths = list(Path(video_folder).glob("*.mp4")) + list(Path(video_folder).glob("*.avi"))
-    dataset = VideoDataset(video_paths)
-    loader = DataLoader(dataset, batch_size=batch_size, num_workers=4)
-    model = model.to(device)
-    model.eval()
-    results = []
-    with torch.no_grad():
-        for frames, paths in loader:
-            frames = frames.to(device)
-            outputs = model(frames)
-            probs = torch.softmax(outputs, dim=1)
-            preds = torch.argmax(probs, dim=1)
-            for path, pred, prob in zip(paths, preds, probs):
-                results.append({
-                    "path": path,
-                    "class_id": pred.item(),
-                    "class_name": CLASS_NAMES[pred.item()],
-                    "confidence": prob[pred].item()
-                })
-    return results
-# 사용 예시
-results = batch_predict(model, "./videos/", batch_size=16)
-for r in results:
-    print(f"{r['path']}: {r['class_name']} ({r['confidence']:.1%})")
-```
----
-## Input/Output Specification
-### Input
-| Parameter | Value |
-|-----------|-------|
-| Shape | `[batch, 3, 30, 224, 224]` |
-| Format | `[B, C, T, H, W]` (Batch, Channel, Time, Height, Width) |
-| Color | RGB (not BGR!) |
-| Normalization | ImageNet: mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] |
-| Frame Count | 30 frames (1 second at 30fps) |
-### Output
-| Parameter | Value |
-|-----------|-------|
-| Shape | `[batch, 5]` |
-| Type | Raw logits (use `softmax` for probabilities) |
-| Classes | 0=정상, 1=졸음운전, 2=물건찾기, 3=휴대폰사용, 4=운전자폭행 |
----
-## Model Architecture
-```
-DriverBehaviorModel
-└── backbone: SwinTransformer3d (swin3d_t)
-    ├── patch_embed: Conv3d(3, 96, kernel=(2,4,4), stride=(2,4,4))
-    ├── features: Sequential
-    │   ├── BasicLayer (depth=2, heads=3, dim=96)
-    │   ├── PatchMerging
-    │   ├── BasicLayer (depth=2, heads=6, dim=192)
-    │   ├── PatchMerging
-    │   ├── BasicLayer (depth=6, heads=12, dim=384)
-    │   ├── PatchMerging
-    │   └── BasicLayer (depth=2, heads=24, dim=768)
-    ├── norm: LayerNorm(768)
-    ├── avgpool: AdaptiveAvgPool3d(1)
-    └── head: Sequential
-        ├── LayerNorm(768)
-        └── Linear(768, 5)
-Parameters: 29,699,819
-```
----
-## Training Details
-| Parameter | Value |
-|-----------|-------|
-| Base Model | swin3d_t (Kinetics-400 pretrained) |
-| Framework | PyTorch 2.0+ |
-| GPUs | 2x NVIDIA A6000 (48GB each) |
-| Training | DistributedDataParallel (DDP) |
-| Batch Size | 128 effective (16/GPU × 2 GPUs × 4 accum) |
-| Optimizer | AdamW (lr=1e-3, weight_decay=1e-4) |
-| Scheduler | OneCycleLR (pct_start=0.2) |
-| Mixed Precision | FP16 |
-| Epochs | 1 (of 5 total) |
----
-## Dataset
-| Property | Value |
-|----------|-------|
-| Name | Korean Driver Behavior Dataset |
-| Videos | 243,979 |
-| Samples | 1,371,062 (sliding window) |
-| Window | 30 frames |
-| Stride | 15 frames |
-| Classes | 5 |
-### Class Distribution
-| Class | Samples | Percentage |
-|-------|---------|------------|
-| 정상 | 159,224 | 11.6% |
-| 졸음운전 | 619,450 | 45.2% |
-| 물건찾기 | 261,435 | 19.1% |
-| 휴대폰 사용 | 150,981 | 11.0% |
-| 운전자 폭행 | 179,972 | 13.1% |
----
-## Limitations
-1. **Camera Position**: Optimized for front/side dashboard cameras
-2. **Lighting**: May degrade in low-light conditions (night, tunnels)
-3. **Occlusion**: Sunglasses, masks may reduce accuracy
-4. **Hardware**: GPU recommended for real-time inference
----
-## License
-Apache 2.0
----
-## Citation
-```bibtex
-@misc{driver-behavior-2025,
-  title={Driver Abnormal Behavior Detection using Video Swin Transformer},
-  author={C-Team},
-  year={2025},
-  publisher={HuggingFace}
-}
-```

+# Driver Behavior Detection Model (Epoch 2)
+운전자 이상행동 감지를 위한 Video Swin Transformer 기반 모델입니다.
+## Model Description
+- **Architecture**: Video Swin Transformer Tiny (swin3d_t)
+- **Backbone Pretrained**: Kinetics-400
+- **Parameters**: 27.85M
+- **Input**: [B, 3, 30, 224, 224] (batch, channels, frames, height, width)
+## Classes (5)
+| Label | Class | F1-Score |
+|:-----:|-------|:--------:|
+| 0 | 정상 (Normal) | 0.93 |
+| 1 | 졸음운전 (Drowsy Driving) | 0.98 |
+| 2 | 물건찾기 (Reaching/Searching) | 0.90 |
+| 3 | 휴대폰 사용 (Phone Usage) | 0.88 |
+| 4 | 운전자 폭행 (Driver Assault) | 1.00 |
+## Performance (Epoch 2)
+| Metric | Value |
+|--------|-------|
+| **Accuracy** | 95.15% |
+| **Macro F1** | 0.9392 |
+| **Validation Samples** | 1,371,062 |
+## Training Configuration
+| Parameter | Value |
+|-----------|-------|
+| Hardware | 2x NVIDIA RTX A6000 (48GB) |
+| Distributed | DDP (DistributedDataParallel) |
+| Batch Size | 32 (16 × 2 GPU) |
+| Gradient Accumulation | 4 |
+| Effective Batch | 128 |
+| Optimizer | AdamW (lr=1e-3, wd=0.05) |
+| Scheduler | OneCycleLR |
+| Mixed Precision | FP16 |
+| Loss | CrossEntropy + Label Smoothing (0.1) |
+| Regularization | Mixup (α=0.4), Dropout (0.3) |
+## Usage
+```python
+import torch
+from model import DriverBehaviorModel
+# Load model
+model = DriverBehaviorModel(num_classes=5, pretrained=False)
+checkpoint = torch.load("pytorch_model.bin", map_location="cpu")
+model.load_state_dict(checkpoint["model"])
+model.eval()
+# Inference
+# video_tensor: preprocessed input you provide, shape [1, 3, 30, 224, 224] - 30 frames, 224x224, RGB normalized
+with torch.no_grad():
+    output = model(video_tensor)
+    prediction = output.argmax(dim=1)
+```
+## Dataset
+- **Total Videos**: 243,979
+- **Total Samples (windows)**: 1,371,062
+- **Window Size**: 30 frames
+- **Stride**: 15 frames
+- **Resolution**: 224×224
+## Augmentation (Training)
+- RandomResizedCrop (scale 0.8-1.0)
+- HorizontalFlip (p=0.5)
+- ColorJitter, HueSaturationValue
+- Temporal Augmentation (speed change, frame drop)
+- Mixup (α=0.4)
+- CoarseDropout
+## License
+This model is for research purposes only.
+## Citation
+```bibtex
+@misc{driver-behavior-detection-2026,
+  title={Driver Behavior Detection using Video Swin Transformer},
+  author={C-Team},
+  year={2026}
+}
+```

config.json CHANGED Viewed

@@ -1,52 +1,32 @@
-{
-  "architectures": [
-    "VideoSwinTransformer"
-  ],
-  "model_type": "video-swin-transformer",
-  "backbone": "swin3d_t",
-  "pretrained_source": "kinetics400",
-  "num_classes": 5,
-  "class_names": [
-    "정상",
-    "졸음운전",
-    "물건찾기",
-    "휴대폰 사용",
-    "운전자 폭행"
-  ],
-  "input_size": {
-    "frames": 30,
-    "height": 224,
-    "width": 224,
-    "channels": 3
-  },
-  "input_format": "CTHW",
-  "training": {
-    "epochs_trained": 1,
-    "total_epochs": 5,
-    "batch_size": 16,
-    "effective_batch_size": 128,
-    "learning_rate": 0.001,
-    "optimizer": "AdamW",
-    "scheduler": "OneCycleLR",
-    "mixed_precision": true,
-    "gradient_accumulation_steps": 4
-  },
-  "metrics": {
-    "accuracy": 0.9551,
-    "macro_f1": 0.9436,
-    "per_class_f1": {
-      "정상": 0.92,
-      "졸음운전": 0.98,
-      "물건찾기": 0.92,
-      "휴대폰 사용": 0.9,
-      "운전자 폭행": 1.0
-    }
-  },
-  "dataset": {
-    "name": "Korean Driver Behavior Dataset",
-    "total_samples": 1371062,
-    "num_videos": 243979,
-    "sliding_window": 30,
-    "stride": 15
-  }
-}

+{
+  "architectures": ["DriverBehaviorModel"],
+  "model_type": "video-swin-transformer",
+  "backbone": "swin3d_t",
+  "num_classes": 5,
+  "class_names": ["정상", "졸음운전", "물건찾기", "휴대폰 사용", "운전자 폭행"],
+  "input_size": [3, 30, 224, 224],
+  "pretrained_backbone": "Kinetics-400",
+  "head": {
+    "type": "Sequential",
+    "layers": ["LayerNorm(768)", "Dropout(0.3)", "Linear(768, 5)"]
+  },
+  "training": {
+    "epoch": 2,
+    "accuracy": 0.9515,
+    "macro_f1": 0.9392,
+    "batch_size": 32,
+    "optimizer": "AdamW",
+    "learning_rate": 1e-3,
+    "weight_decay": 0.05,
+    "scheduler": "OneCycleLR",
+    "mixed_precision": "fp16",
+    "augmentation": ["Mixup(0.4)", "RandomResizedCrop", "HorizontalFlip", "ColorJitter", "TemporalAugmentation"]
+  },
+  "performance": {
+    "정상": {"precision": 0.91, "recall": 0.95, "f1": 0.93},
+    "졸음운전": {"precision": 0.99, "recall": 0.97, "f1": 0.98},
+    "물건찾기": {"precision": 0.92, "recall": 0.88, "f1": 0.90},
+    "휴대폰 사용": {"precision": 0.84, "recall": 0.93, "f1": 0.88},
+    "운전자 폭행": {"precision": 1.00, "recall": 1.00, "f1": 1.00}
+  }
+}

model.py CHANGED Viewed

@@ -48,6 +48,7 @@ class DriverBehaviorModel(nn.Module):
         in_features = self.backbone.head.in_features  # 768
         self.backbone.head = nn.Sequential(
             nn.LayerNorm(in_features),
             nn.Linear(in_features, num_classes),
         )

         in_features = self.backbone.head.in_features  # 768
         self.backbone.head = nn.Sequential(
             nn.LayerNorm(in_features),
+            nn.Dropout(p=0.3),  # 오버피팅 방지
             nn.Linear(in_features, num_classes),
         )

pytorch_model.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dc7eb66a00e43a79a4db83cad13a36dc97b87d500a1a6f0bcec72779d22fdaf9
 size 126244047

 version https://git-lfs.github.com/spec/v1
+oid sha256:ae9125be6e38460b5519ca5fc0bad96e952297b1858a95bd15ebaa7d0a772f3f
 size 126244047