koreashin commited on
Commit
11655ae
·
verified ·
1 Parent(s): eb02aa3

Upload 4 files

Browse files
Files changed (2) hide show
  1. README.md +178 -231
  2. model.py +226 -0
README.md CHANGED
@@ -17,13 +17,13 @@ datasets:
17
  - custom
18
  ---
19
 
20
- # 🚗 Driver Abnormal Behavior Detection Model
21
 
22
  **운전자 이상행동 탐지 모델** - Video Swin Transformer 기반
23
 
24
  차량 내 카메라 영상에서 운전자의 이상행동을 실시간으로 탐지하는 딥러닝 모델입니다.
25
 
26
- ## 📊 Model Performance
27
 
28
  | Metric | Score |
29
  |--------|-------|
@@ -33,75 +33,85 @@ datasets:
33
 
34
  ### Per-Class Performance
35
 
36
- | Class | Korean | Precision | Recall | F1-Score | Support |
37
- |-------|--------|-----------|--------|----------|---------|
38
- | 0 | 정상 (Normal) | 0.93 | 0.92 | 0.92 | 159,224 |
39
- | 1 | 졸음운전 (Drowsy) | 0.99 | 0.98 | 0.98 | 619,450 |
40
- | 2 | 물건찾기 (Searching) | 0.90 | 0.94 | 0.92 | 261,435 |
41
- | 3 | 휴대폰 사용 (Phone) | 0.91 | 0.88 | 0.90 | 150,981 |
42
- | 4 | 운전자 폭행 (Assault) | 1.00 | 1.00 | 1.00 | 179,972 |
43
 
44
  ---
45
 
46
- ## 🛠️ Installation
47
 
48
- ```bash
49
- # PyTorch 2.0+ 필요
50
- pip install torch torchvision
 
 
 
 
51
 
52
- # 추가 dependencies
53
- pip install opencv-python numpy
54
 
55
- # (선택) HuggingFace에서 다운로드
56
- pip install huggingface_hub
 
 
 
 
 
57
  ```
58
 
59
  ---
60
 
61
- ## 🚀 Quick Start
62
 
63
- ### 1. 모델 다운로드 및 로드
64
 
65
- ```python
66
- import torch
67
- from torchvision.models.video import swin3d_t
68
 
69
- # ===== 방법 1: 로컬 파일에서 로드 =====
70
- model = swin3d_t(weights=None)
71
- model.head = torch.nn.Linear(model.head.in_features, 5) # 5 classes
 
72
 
73
- state_dict = torch.load("pytorch_model.bin", map_location="cpu", weights_only=True)
74
- model.load_state_dict(state_dict)
75
- model.eval()
76
 
77
- # ===== 방법 2: HuggingFace Hub에서 로드 =====
78
- from huggingface_hub import hf_hub_download
 
79
 
80
- model_path = hf_hub_download(
81
- repo_id="YOUR_USERNAME/driver-behavior-swin-t",
82
- filename="pytorch_model.bin"
83
- )
84
- state_dict = torch.load(model_path, map_location="cpu", weights_only=True)
85
 
86
- model = swin3d_t(weights=None)
87
- model.head = torch.nn.Linear(model.head.in_features, 5)
 
 
 
88
  model.load_state_dict(state_dict)
89
  model.eval()
 
 
90
  ```
91
 
92
- ### 2. 단일 비디오 추론
93
 
94
  ```python
95
  import cv2
96
  import torch
97
  import numpy as np
98
 
99
- # 클래스 정의
100
  CLASS_NAMES = ["정상", "졸음운전", "물건찾기", "휴대폰 사용", "운전자 폭행"]
101
  CLASS_NAMES_EN = ["Normal", "Drowsy Driving", "Searching Objects", "Phone Usage", "Driver Assault"]
102
 
103
- def load_video_frames(video_path, num_frames=30, size=(224, 224)):
104
- """비디오에서 프레임 추출 및 전처리"""
105
  cap = cv2.VideoCapture(video_path)
106
  frames = []
107
 
@@ -109,12 +119,9 @@ def load_video_frames(video_path, num_frames=30, size=(224, 224)):
109
  ret, frame = cap.read()
110
  if not ret:
111
  break
112
- # BGR -> RGB
113
  frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
114
- # Resize
115
  frame = cv2.resize(frame, size)
116
  frames.append(frame)
117
-
118
  cap.release()
119
 
120
  # 프레임 부족 시 마지막 프레임 복제
@@ -123,10 +130,7 @@ def load_video_frames(video_path, num_frames=30, size=(224, 224)):
123
 
124
  # [T, H, W, C] -> [C, T, H, W]
125
  frames = np.array(frames[:num_frames], dtype=np.float32)
126
- frames = frames.transpose(3, 0, 1, 2) # [C, T, H, W]
127
-
128
- # Normalize to [0, 1]
129
- frames = frames / 255.0
130
 
131
  # ImageNet normalization
132
  mean = np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1, 1)
@@ -135,16 +139,15 @@ def load_video_frames(video_path, num_frames=30, size=(224, 224)):
135
 
136
  return torch.FloatTensor(frames)
137
 
 
138
  def predict(model, video_path, device="cuda"):
139
  """단일 비디오 추론"""
140
  model = model.to(device)
141
  model.eval()
142
 
143
- # 프레임 로드
144
- frames = load_video_frames(video_path)
145
- frames = frames.unsqueeze(0).to(device) # [1, C, T, H, W]
146
 
147
- # 추론
148
  with torch.no_grad():
149
  outputs = model(frames)
150
  probs = torch.softmax(outputs, dim=1)
@@ -156,20 +159,19 @@ def predict(model, video_path, device="cuda"):
156
  "class_name_ko": CLASS_NAMES[pred_idx],
157
  "class_name_en": CLASS_NAMES_EN[pred_idx],
158
  "confidence": confidence,
159
- "all_probabilities": {
160
- CLASS_NAMES[i]: probs[0, i].item()
161
- for i in range(len(CLASS_NAMES))
162
- }
163
  }
164
 
 
165
  # 사용 예시
166
- result = predict(model, "test_video.mp4")
167
- print(f"예측: {result['class_name_ko']} ({result['confidence']:.2%})")
 
168
  ```
169
 
170
  ---
171
 
172
- ## 📹 Real-time Inference (실시간 추론)
173
 
174
  ```python
175
  import cv2
@@ -177,103 +179,77 @@ import torch
177
  import numpy as np
178
  from collections import deque
179
 
180
- class RealtimeDriverBehaviorDetector:
181
  """실시간 운전자 이상행동 탐지기"""
182
 
183
  CLASS_NAMES = ["정상", "졸음운전", "물건찾기", "휴대폰 사용", "운전자 폭행"]
 
 
 
 
 
 
 
184
 
185
- def __init__(self, model_path, device="cuda", window_size=30, stride=15):
186
- """
187
- Args:
188
- model_path: pytorch_model.bin 경로
189
- device: 'cuda' 또는 'cpu'
190
- window_size: 분석할 프레임 수 (기본 30 = 1초 @30fps)
191
- stride: 슬라이딩 윈도우 간격 (기본 15 = 0.5초)
192
- """
193
  self.device = device
194
  self.window_size = window_size
195
  self.stride = stride
196
 
197
  # 모델 로드
198
- from torchvision.models.video import swin3d_t
199
- self.model = swin3d_t(weights=None)
200
- self.model.head = torch.nn.Linear(self.model.head.in_features, 5)
201
 
202
- state_dict = torch.load(model_path, map_location="cpu", weights_only=True)
 
 
203
  self.model.load_state_dict(state_dict)
204
  self.model.to(device)
205
  self.model.eval()
206
 
207
  # 프레임 버퍼
208
- self.frame_buffer = deque(maxlen=window_size)
209
  self.frame_count = 0
210
 
211
- # Normalization 파라미터
212
  self.mean = np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1, 1)
213
  self.std = np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1, 1)
214
 
215
- def preprocess_frame(self, frame):
216
- """단일 프레임 전처리"""
217
- frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
218
- frame = cv2.resize(frame, (224, 224))
219
- return frame
 
 
220
 
221
- def predict(self):
222
- """현재 버퍼의 프레임으로 추론"""
223
- if len(self.frame_buffer) < self.window_size:
224
- return None
225
 
226
- # [T, H, W, C] -> [C, T, H, W]
227
- frames = np.array(list(self.frame_buffer), dtype=np.float32)
228
  frames = frames.transpose(3, 0, 1, 2) / 255.0
229
  frames = (frames - self.mean) / self.std
230
 
231
- # 추론
232
  with torch.no_grad():
233
  inputs = torch.FloatTensor(frames).unsqueeze(0).to(self.device)
234
  outputs = self.model(inputs)
235
  probs = torch.softmax(outputs, dim=1)
236
  pred_idx = torch.argmax(probs, dim=1).item()
237
- confidence = probs[0, pred_idx].item()
238
 
239
  return {
240
  "class_id": pred_idx,
241
  "class_name": self.CLASS_NAMES[pred_idx],
242
- "confidence": confidence,
243
- "is_abnormal": pred_idx != 0, # 0 = 정상
244
- "probabilities": probs[0].cpu().numpy()
245
- }
246
-
247
- def process_frame(self, frame):
248
- """프레임 처리 (stride마다 추론)"""
249
- processed = self.preprocess_frame(frame)
250
- self.frame_buffer.append(processed)
251
- self.frame_count += 1
252
-
253
- # stride마다 추론
254
- if self.frame_count % self.stride == 0:
255
- return self.predict()
256
- return None
257
-
258
- def run_on_video(self, video_source=0, show_display=True):
259
- """
260
- 비디오 소스에서 실시간 추론
261
-
262
- Args:
263
- video_source: 웹캠(0) 또는 비디오 파일 경로
264
- show_display: 화면 출력 여부
265
- """
266
- cap = cv2.VideoCapture(video_source)
267
-
268
- # 색상 정의 (BGR)
269
- colors = {
270
- "정상": (0, 255, 0), # 초록
271
- "졸음운전": (0, 165, 255), # 주황
272
- "물건찾기": (0, 255, 255), # 노랑
273
- "휴대폰 사용": (0, 0, 255), # 빨강
274
- "운전자 폭행": (255, 0, 255) # 보라
275
  }
276
 
 
 
 
277
  current_result = None
278
 
279
  while True:
@@ -281,29 +257,24 @@ class RealtimeDriverBehaviorDetector:
281
  if not ret:
282
  break
283
 
284
- # 추론
285
  result = self.process_frame(frame)
286
  if result:
287
  current_result = result
288
 
289
- # 화면 출력
290
- if show_display and current_result:
291
  label = current_result["class_name"]
292
  conf = current_result["confidence"]
293
- color = colors.get(label, (255, 255, 255))
294
 
295
- # 상태 표시
296
- text = f"{label}: {conf:.1%}"
297
- cv2.putText(frame, text, (10, 40),
298
  cv2.FONT_HERSHEY_SIMPLEX, 1.2, color, 3)
299
 
300
- # 경고 (이상행동 탐지 시)
301
  if current_result["is_abnormal"]:
302
  cv2.putText(frame, "WARNING!", (10, 80),
303
  cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 0, 255), 2)
304
 
305
- cv2.imshow("Driver Behavior Detection", frame)
306
-
307
  if cv2.waitKey(1) & 0xFF == ord('q'):
308
  break
309
 
@@ -311,19 +282,15 @@ class RealtimeDriverBehaviorDetector:
311
  cv2.destroyAllWindows()
312
 
313
 
314
- # ===== 사용 예시 =====
315
-
316
- # 1. 웹캠 실시간 추론
317
- detector = RealtimeDriverBehaviorDetector("pytorch_model.bin", device="cuda")
318
- detector.run_on_video(video_source=0) # 웹캠
319
-
320
- # 2. 비디오 파일 추론
321
- detector.run_on_video(video_source="test_video.mp4")
322
  ```
323
 
324
  ---
325
 
326
- ## 🔧 Batch Inference (배치 추론)
327
 
328
  ```python
329
  import torch
@@ -331,8 +298,6 @@ from pathlib import Path
331
  from torch.utils.data import Dataset, DataLoader
332
 
333
  class VideoDataset(Dataset):
334
- """비디오 파일 배치 처리용 Dataset"""
335
-
336
  def __init__(self, video_paths, num_frames=30):
337
  self.video_paths = video_paths
338
  self.num_frames = num_frames
@@ -343,9 +308,8 @@ class VideoDataset(Dataset):
343
  return len(self.video_paths)
344
 
345
  def __getitem__(self, idx):
346
- video_path = self.video_paths[idx]
347
-
348
- cap = cv2.VideoCapture(str(video_path))
349
  frames = []
350
 
351
  while len(frames) < self.num_frames:
@@ -355,7 +319,6 @@ class VideoDataset(Dataset):
355
  frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
356
  frame = cv2.resize(frame, (224, 224))
357
  frames.append(frame)
358
-
359
  cap.release()
360
 
361
  while len(frames) < self.num_frames:
@@ -365,130 +328,120 @@ class VideoDataset(Dataset):
365
  frames = frames.transpose(3, 0, 1, 2) / 255.0
366
  frames = (frames - self.mean) / self.std
367
 
368
- return torch.FloatTensor(frames), str(video_path)
369
 
370
 
371
- def batch_inference(model, video_folder, batch_size=8, device="cuda"):
372
- """
373
- 폴더 내 모든 비디오 배치 추론
374
-
375
- Args:
376
- model: 로드된 모델
377
- video_folder: 비디오 폴더 경로
378
- batch_size: 배치 크기
379
- device: 'cuda' 또는 'cpu'
380
-
381
- Returns:
382
- List of (video_path, prediction) tuples
383
- """
384
  CLASS_NAMES = ["정상", "졸음운전", "물건찾기", "휴대폰 사용", "운전자 폭행"]
385
 
386
- video_folder = Path(video_folder)
387
- video_paths = list(video_folder.glob("*.mp4")) + list(video_folder.glob("*.avi"))
388
-
389
  dataset = VideoDataset(video_paths)
390
- dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=4)
391
 
392
  model = model.to(device)
393
  model.eval()
394
 
395
  results = []
396
-
397
  with torch.no_grad():
398
- for frames, paths in dataloader:
399
  frames = frames.to(device)
400
  outputs = model(frames)
401
  probs = torch.softmax(outputs, dim=1)
402
  preds = torch.argmax(probs, dim=1)
403
 
404
- for path, pred_idx, prob in zip(paths, preds, probs):
405
  results.append({
406
- "video_path": path,
407
- "class_id": pred_idx.item(),
408
- "class_name": CLASS_NAMES[pred_idx.item()],
409
- "confidence": prob[pred_idx].item()
410
  })
411
 
412
  return results
413
 
 
414
  # 사용 예시
415
- results = batch_inference(model, "./videos/", batch_size=16)
416
  for r in results:
417
- print(f"{r['video_path']}: {r['class_name']} ({r['confidence']:.2%})")
418
  ```
419
 
420
  ---
421
 
422
- ## 📐 Input/Output Specification
423
 
424
- ### Input Format
425
 
426
  | Parameter | Value |
427
  |-----------|-------|
428
- | **Shape** | `[batch, 3, 30, 224, 224]` |
429
- | **Format** | `[B, C, T, H, W]` (Batch, Channel, Time, Height, Width) |
430
- | **Channels** | RGB (not BGR) |
431
- | **Normalization** | ImageNet (mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) |
432
- | **Value Range** | After normalization: approximately [-2.5, 2.5] |
433
 
434
- ### Output Format
435
 
436
  | Parameter | Value |
437
  |-----------|-------|
438
- | **Shape** | `[batch, 5]` |
439
- | **Format** | Raw logits (use softmax for probabilities) |
440
- | **Classes** | 0=정상, 1=졸음운전, 2=물건찾기, 3=휴대폰사용, 4=운전자폭행 |
441
 
442
  ---
443
 
444
- ## ⚙️ Model Architecture
445
 
446
  ```
447
- VideoSwinTransformer (swin3d_t)
448
- ├── patch_embed: PatchEmbed3d
449
- │ └── proj: Conv3d(3, 96, kernel_size=(2,4,4), stride=(2,4,4))
450
- ├── layers: Sequential
451
- │ ├── BasicLayer (depth=2, heads=3, dim=96)
452
- │ ├── BasicLayer (depth=2, heads=6, dim=192)
453
- │ ├── BasicLayer (depth=6, heads=12, dim=384)
454
- └── BasicLayer (depth=2, heads=24, dim=768)
455
- ├── norm: LayerNorm(768)
456
- ├── avgpool: AdaptiveAvgPool3d(1)
457
- └── head: Linear(768, 5) # Modified for 5 classes
458
-
459
- Total Parameters: 27,855,851
460
- Trainable Parameters: 27,855,851
 
 
 
 
461
  ```
462
 
463
  ---
464
 
465
- ## 🏋️ Training Details
466
 
467
  | Parameter | Value |
468
  |-----------|-------|
469
- | **Base Model** | swin3d_t (Kinetics-400 pretrained) |
470
- | **Framework** | PyTorch 2.0+ |
471
- | **GPUs** | 2x NVIDIA A6000 (48GB each) |
472
- | **Training Method** | DistributedDataParallel (DDP) |
473
- | **Batch Size** | 128 effective (16 per GPU × 2 GPUs × 4 accumulation) |
474
- | **Optimizer** | AdamW (lr=1e-3, weight_decay=1e-4) |
475
- | **Scheduler** | OneCycleLR (pct_start=0.2, anneal=cosine) |
476
- | **Mixed Precision** | FP16 (torch.amp) |
477
- | **Epochs** | 1 (of 5 total) |
478
 
479
  ---
480
 
481
- ## 📁 Dataset Information
482
 
483
  | Property | Value |
484
  |----------|-------|
485
- | **Name** | Korean Driver Behavior Dataset |
486
- | **Total Videos** | 243,979 |
487
- | **Total Samples** | 1,371,062 (sliding window) |
488
- | **Window Size** | 30 frames |
489
- | **Stride** | 15 frames |
490
- | **Resolution** | Various (resized to 224×224) |
491
- | **FPS** | 30 |
492
 
493
  ### Class Distribution
494
 
@@ -502,34 +455,28 @@ Trainable Parameters: 27,855,851
502
 
503
  ---
504
 
505
- ## ⚠️ Limitations & Considerations
506
 
507
- 1. **카메라 위치**: 운전석 정면 또는 측면 카메라에 최적화됨
508
- 2. **조명 조건**: 야간/터널 저조도 환경에서 성능 저하 가능
509
- 3. **가림 현상**: 선글라스, 마스크 착용 시 정확도 감소 가능
510
- 4. **실시간 요구사항**: GPU 필요 (CPU에서는 느림)
511
 
512
  ---
513
 
514
- ## 📜 License
515
 
516
  Apache 2.0
517
 
518
  ---
519
 
520
- ## 🔗 Citation
521
 
522
  ```bibtex
523
- @misc{driver-behavior-detection-2025,
524
  title={Driver Abnormal Behavior Detection using Video Swin Transformer},
525
  author={C-Team},
526
  year={2025},
527
- howpublished={\url{https://huggingface.co/YOUR_USERNAME/driver-behavior-swin-t}}
528
  }
529
  ```
530
-
531
- ---
532
-
533
- ## 📞 Contact
534
-
535
- Issues and questions: [GitHub Issues](https://github.com/YOUR_USERNAME/driver-behavior-detection/issues)
 
17
  - custom
18
  ---
19
 
20
+ # Driver Abnormal Behavior Detection Model
21
 
22
  **운전자 이상행동 탐지 모델** - Video Swin Transformer 기반
23
 
24
  차량 내 카메라 영상에서 운전자의 이상행동을 실시간으로 탐지하는 딥러닝 모델입니다.
25
 
26
+ ## Model Performance
27
 
28
  | Metric | Score |
29
  |--------|-------|
 
33
 
34
  ### Per-Class Performance
35
 
36
+ | Class ID | Korean | English | Precision | Recall | F1-Score |
37
+ |----------|--------|---------|-----------|--------|----------|
38
+ | 0 | 정상 | Normal | 0.93 | 0.92 | 0.92 |
39
+ | 1 | 졸음운전 | Drowsy Driving | 0.99 | 0.98 | 0.98 |
40
+ | 2 | 물건찾기 | Searching Objects | 0.90 | 0.94 | 0.92 |
41
+ | 3 | 휴대폰 사용 | Phone Usage | 0.91 | 0.88 | 0.90 |
42
+ | 4 | 운전자 폭행 | Driver Assault | 1.00 | 1.00 | 1.00 |
43
 
44
  ---
45
 
46
+ ## Files in This Repository
47
 
48
+ ```
49
+ driver-behavior-model-epoch1/
50
+ ├── pytorch_model.bin # 모델 가중치 (120MB)
51
+ ├── model.py # 모델 클래스 정의 (필수!)
52
+ ├── config.json # 설정 파일
53
+ └── README.md # 이 파일
54
+ ```
55
 
56
+ **중요: `model.py`와 `pytorch_model.bin` 둘 다 필요합니다!**
 
57
 
58
+ ---
59
+
60
+ ## Installation
61
+
62
+ ```bash
63
+ pip install torch torchvision opencv-python numpy
64
+ pip install huggingface_hub # HuggingFace에서 다운로드 시
65
  ```
66
 
67
  ---
68
 
69
+ ## Quick Start
70
 
71
+ ### 1. 모델 다운로드
72
 
73
+ ```bash
74
+ # HuggingFace CLI로 다운로드
75
+ huggingface-cli download YOUR_USERNAME/driver-behavior-swin-t --local-dir ./model
76
 
77
+ # 또는 Python으로
78
+ from huggingface_hub import snapshot_download
79
+ snapshot_download(repo_id="YOUR_USERNAME/driver-behavior-swin-t", local_dir="./model")
80
+ ```
81
 
82
+ ### 2. 모델 로드
 
 
83
 
84
+ ```python
85
+ import torch
86
+ import sys
87
 
88
+ # model.py가 있는 경로 추가
89
+ sys.path.insert(0, "./model")
90
+ from model import DriverBehaviorModel
 
 
91
 
92
+ # 모델 생성 (pretrained=False: Kinetics 가중치 다운로드 안함)
93
+ model = DriverBehaviorModel(num_classes=5, pretrained=False)
94
+
95
+ # 학습된 가중치 로드
96
+ state_dict = torch.load("./model/pytorch_model.bin", map_location="cpu", weights_only=True)
97
  model.load_state_dict(state_dict)
98
  model.eval()
99
+
100
+ print("모델 로드 완료!")
101
  ```
102
 
103
+ ### 3. 단일 비디오 추론
104
 
105
  ```python
106
  import cv2
107
  import torch
108
  import numpy as np
109
 
 
110
  CLASS_NAMES = ["정상", "졸음운전", "물건찾기", "휴대폰 사용", "운전자 폭행"]
111
  CLASS_NAMES_EN = ["Normal", "Drowsy Driving", "Searching Objects", "Phone Usage", "Driver Assault"]
112
 
113
+ def preprocess_video(video_path, num_frames=30, size=(224, 224)):
114
+ """비디오 전처리"""
115
  cap = cv2.VideoCapture(video_path)
116
  frames = []
117
 
 
119
  ret, frame = cap.read()
120
  if not ret:
121
  break
 
122
  frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
 
123
  frame = cv2.resize(frame, size)
124
  frames.append(frame)
 
125
  cap.release()
126
 
127
  # 프레임 부족 시 마지막 프레임 복제
 
130
 
131
  # [T, H, W, C] -> [C, T, H, W]
132
  frames = np.array(frames[:num_frames], dtype=np.float32)
133
+ frames = frames.transpose(3, 0, 1, 2) / 255.0
 
 
 
134
 
135
  # ImageNet normalization
136
  mean = np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1, 1)
 
139
 
140
  return torch.FloatTensor(frames)
141
 
142
+
143
  def predict(model, video_path, device="cuda"):
144
  """단일 비디오 추론"""
145
  model = model.to(device)
146
  model.eval()
147
 
148
+ frames = preprocess_video(video_path)
149
+ frames = frames.unsqueeze(0).to(device) # [1, 3, 30, 224, 224]
 
150
 
 
151
  with torch.no_grad():
152
  outputs = model(frames)
153
  probs = torch.softmax(outputs, dim=1)
 
159
  "class_name_ko": CLASS_NAMES[pred_idx],
160
  "class_name_en": CLASS_NAMES_EN[pred_idx],
161
  "confidence": confidence,
162
+ "probabilities": {name: probs[0, i].item() for i, name in enumerate(CLASS_NAMES)}
 
 
 
163
  }
164
 
165
+
166
  # 사용 예시
167
+ result = predict(model, "test_video.mp4", device="cuda")
168
+ print(f"예측: {result['class_name_ko']} ({result['confidence']:.1%})")
169
+ print(f"전체 확률: {result['probabilities']}")
170
  ```
171
 
172
  ---
173
 
174
+ ## Real-time Inference (실시간 추론)
175
 
176
  ```python
177
  import cv2
 
179
  import numpy as np
180
  from collections import deque
181
 
182
+ class RealtimeDetector:
183
  """실시간 운전자 이상행동 탐지기"""
184
 
185
  CLASS_NAMES = ["정상", "졸음운전", "물건찾기", "휴대폰 사용", "운전자 폭행"]
186
+ COLORS = {
187
+ "정상": (0, 255, 0), # 초록
188
+ "졸음운전": (0, 165, 255), # 주황
189
+ "물건찾기": (0, 255, 255), # 노랑
190
+ "휴대폰 사용": (0, 0, 255), # 빨강
191
+ "운전자 폭행": (255, 0, 255) # 보라
192
+ }
193
 
194
+ def __init__(self, model_dir, device="cuda", window_size=30, stride=15):
 
 
 
 
 
 
 
195
  self.device = device
196
  self.window_size = window_size
197
  self.stride = stride
198
 
199
  # 모델 로드
200
+ import sys
201
+ sys.path.insert(0, model_dir)
202
+ from model import DriverBehaviorModel
203
 
204
+ self.model = DriverBehaviorModel(num_classes=5, pretrained=False)
205
+ state_dict = torch.load(f"{model_dir}/pytorch_model.bin",
206
+ map_location="cpu", weights_only=True)
207
  self.model.load_state_dict(state_dict)
208
  self.model.to(device)
209
  self.model.eval()
210
 
211
  # 프레임 버퍼
212
+ self.buffer = deque(maxlen=window_size)
213
  self.frame_count = 0
214
 
215
+ # Normalization
216
  self.mean = np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1, 1)
217
  self.std = np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1, 1)
218
 
219
+ def process_frame(self, frame):
220
+ """프레임 처리 및 추론"""
221
+ # 전처리
222
+ processed = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
223
+ processed = cv2.resize(processed, (224, 224))
224
+ self.buffer.append(processed)
225
+ self.frame_count += 1
226
 
227
+ # stride마다 추론
228
+ if self.frame_count % self.stride == 0 and len(self.buffer) == self.window_size:
229
+ return self._predict()
230
+ return None
231
 
232
+ def _predict(self):
233
+ frames = np.array(list(self.buffer), dtype=np.float32)
234
  frames = frames.transpose(3, 0, 1, 2) / 255.0
235
  frames = (frames - self.mean) / self.std
236
 
 
237
  with torch.no_grad():
238
  inputs = torch.FloatTensor(frames).unsqueeze(0).to(self.device)
239
  outputs = self.model(inputs)
240
  probs = torch.softmax(outputs, dim=1)
241
  pred_idx = torch.argmax(probs, dim=1).item()
 
242
 
243
  return {
244
  "class_id": pred_idx,
245
  "class_name": self.CLASS_NAMES[pred_idx],
246
+ "confidence": probs[0, pred_idx].item(),
247
+ "is_abnormal": pred_idx != 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  }
249
 
250
+ def run(self, source=0):
251
+ """실시간 추론 실행 (source: 0=웹캠, 또는 비디오 경로)"""
252
+ cap = cv2.VideoCapture(source)
253
  current_result = None
254
 
255
  while True:
 
257
  if not ret:
258
  break
259
 
 
260
  result = self.process_frame(frame)
261
  if result:
262
  current_result = result
263
 
264
+ # 화면 표시
265
+ if current_result:
266
  label = current_result["class_name"]
267
  conf = current_result["confidence"]
268
+ color = self.COLORS.get(label, (255, 255, 255))
269
 
270
+ cv2.putText(frame, f"{label}: {conf:.1%}", (10, 40),
 
 
271
  cv2.FONT_HERSHEY_SIMPLEX, 1.2, color, 3)
272
 
 
273
  if current_result["is_abnormal"]:
274
  cv2.putText(frame, "WARNING!", (10, 80),
275
  cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 0, 255), 2)
276
 
277
+ cv2.imshow("Driver Behavior Detection", frame)
 
278
  if cv2.waitKey(1) & 0xFF == ord('q'):
279
  break
280
 
 
282
  cv2.destroyAllWindows()
283
 
284
 
285
+ # 사용 예시
286
+ detector = RealtimeDetector("./model", device="cuda")
287
+ detector.run(source=0) # 웹캠
288
+ # detector.run(source="video.mp4") # 비디오 파일
 
 
 
 
289
  ```
290
 
291
  ---
292
 
293
+ ## Batch Inference (대량 처리)
294
 
295
  ```python
296
  import torch
 
298
  from torch.utils.data import Dataset, DataLoader
299
 
300
  class VideoDataset(Dataset):
 
 
301
  def __init__(self, video_paths, num_frames=30):
302
  self.video_paths = video_paths
303
  self.num_frames = num_frames
 
308
  return len(self.video_paths)
309
 
310
  def __getitem__(self, idx):
311
+ path = str(self.video_paths[idx])
312
+ cap = cv2.VideoCapture(path)
 
313
  frames = []
314
 
315
  while len(frames) < self.num_frames:
 
319
  frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
320
  frame = cv2.resize(frame, (224, 224))
321
  frames.append(frame)
 
322
  cap.release()
323
 
324
  while len(frames) < self.num_frames:
 
328
  frames = frames.transpose(3, 0, 1, 2) / 255.0
329
  frames = (frames - self.mean) / self.std
330
 
331
+ return torch.FloatTensor(frames), path
332
 
333
 
334
+ def batch_predict(model, video_folder, batch_size=8, device="cuda"):
335
+ """폴더 내 모든 비디오 배치 추론"""
 
 
 
 
 
 
 
 
 
 
 
336
  CLASS_NAMES = ["정상", "졸음운전", "물건찾기", "휴대폰 사용", "운전자 폭행"]
337
 
338
+ video_paths = list(Path(video_folder).glob("*.mp4")) + list(Path(video_folder).glob("*.avi"))
 
 
339
  dataset = VideoDataset(video_paths)
340
+ loader = DataLoader(dataset, batch_size=batch_size, num_workers=4)
341
 
342
  model = model.to(device)
343
  model.eval()
344
 
345
  results = []
 
346
  with torch.no_grad():
347
+ for frames, paths in loader:
348
  frames = frames.to(device)
349
  outputs = model(frames)
350
  probs = torch.softmax(outputs, dim=1)
351
  preds = torch.argmax(probs, dim=1)
352
 
353
+ for path, pred, prob in zip(paths, preds, probs):
354
  results.append({
355
+ "path": path,
356
+ "class_id": pred.item(),
357
+ "class_name": CLASS_NAMES[pred.item()],
358
+ "confidence": prob[pred].item()
359
  })
360
 
361
  return results
362
 
363
+
364
  # 사용 예시
365
+ results = batch_predict(model, "./videos/", batch_size=16)
366
  for r in results:
367
+ print(f"{r['path']}: {r['class_name']} ({r['confidence']:.1%})")
368
  ```
369
 
370
  ---
371
 
372
+ ## Input/Output Specification
373
 
374
+ ### Input
375
 
376
  | Parameter | Value |
377
  |-----------|-------|
378
+ | Shape | `[batch, 3, 30, 224, 224]` |
379
+ | Format | `[B, C, T, H, W]` (Batch, Channel, Time, Height, Width) |
380
+ | Color | RGB (not BGR!) |
381
+ | Normalization | ImageNet: mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] |
382
+ | Frame Count | 30 frames (1 second at 30fps) |
383
 
384
+ ### Output
385
 
386
  | Parameter | Value |
387
  |-----------|-------|
388
+ | Shape | `[batch, 5]` |
389
+ | Type | Raw logits (use `softmax` for probabilities) |
390
+ | Classes | 0=정상, 1=졸음운전, 2=물건찾기, 3=휴대폰사용, 4=운전자폭행 |
391
 
392
  ---
393
 
394
+ ## Model Architecture
395
 
396
  ```
397
+ DriverBehaviorModel
398
+ └── backbone: SwinTransformer3d (swin3d_t)
399
+ ├── patch_embed: Conv3d(3, 96, kernel=(2,4,4), stride=(2,4,4))
400
+ ├── features: Sequential
401
+ │ ├── BasicLayer (depth=2, heads=3, dim=96)
402
+ │ ├── PatchMerging
403
+ │ ├── BasicLayer (depth=2, heads=6, dim=192)
404
+ ├── PatchMerging
405
+ ├── BasicLayer (depth=6, heads=12, dim=384)
406
+ ├── PatchMerging
407
+ └── BasicLayer (depth=2, heads=24, dim=768)
408
+ ├── norm: LayerNorm(768)
409
+ ├── avgpool: AdaptiveAvgPool3d(1)
410
+ └── head: Sequential
411
+ ├── LayerNorm(768)
412
+ └── Linear(768, 5)
413
+
414
+ Parameters: 29,699,819
415
  ```
416
 
417
  ---
418
 
419
+ ## Training Details
420
 
421
  | Parameter | Value |
422
  |-----------|-------|
423
+ | Base Model | swin3d_t (Kinetics-400 pretrained) |
424
+ | Framework | PyTorch 2.0+ |
425
+ | GPUs | 2x NVIDIA A6000 (48GB each) |
426
+ | Training | DistributedDataParallel (DDP) |
427
+ | Batch Size | 128 effective (16/GPU × 2 GPUs × 4 accum) |
428
+ | Optimizer | AdamW (lr=1e-3, weight_decay=1e-4) |
429
+ | Scheduler | OneCycleLR (pct_start=0.2) |
430
+ | Mixed Precision | FP16 |
431
+ | Epochs | 1 (of 5 total) |
432
 
433
  ---
434
 
435
+ ## Dataset
436
 
437
  | Property | Value |
438
  |----------|-------|
439
+ | Name | Korean Driver Behavior Dataset |
440
+ | Videos | 243,979 |
441
+ | Samples | 1,371,062 (sliding window) |
442
+ | Window | 30 frames |
443
+ | Stride | 15 frames |
444
+ | Classes | 5 |
 
445
 
446
  ### Class Distribution
447
 
 
455
 
456
  ---
457
 
458
+ ## Limitations
459
 
460
+ 1. **Camera Position**: Optimized for front/side dashboard cameras
461
+ 2. **Lighting**: May degrade in low-light conditions (night, tunnels)
462
+ 3. **Occlusion**: Sunglasses, masks may reduce accuracy
463
+ 4. **Hardware**: GPU recommended for real-time inference
464
 
465
  ---
466
 
467
+ ## License
468
 
469
  Apache 2.0
470
 
471
  ---
472
 
473
+ ## Citation
474
 
475
  ```bibtex
476
+ @misc{driver-behavior-2025,
477
  title={Driver Abnormal Behavior Detection using Video Swin Transformer},
478
  author={C-Team},
479
  year={2025},
480
+ publisher={HuggingFace}
481
  }
482
  ```
 
 
 
 
 
 
model.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 운전자 이상행동 감지 모델
3
+
4
+ - 백본: TorchVision Video Swin-T (Kinetics-400 사전학습)
5
+ - 입력: [B, 3, 30, 224, 224] (배치, 채널, 프레임, 높이, 너비)
6
+ - 출력: 5클래스 분류 (정상, 졸음운전, 물건찾기, 휴대폰 사용, 운전자 폭행)
7
+ """
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ from torchvision.models.video import swin3d_t, Swin3D_T_Weights
13
+ from typing import Dict, Optional
14
+
15
+
16
class DriverBehaviorModel(nn.Module):
    """Driver abnormal-behavior detection model.

    Wraps TorchVision's Video Swin-T (``swin3d_t``) backbone and replaces
    its Kinetics-400 head with a ``LayerNorm -> Linear(num_classes)`` head.

    Args:
        num_classes: Number of output classes (default 5, the full label set).
        pretrained: Whether to load Kinetics-400 pretrained backbone weights.
        freeze_backbone: Freeze backbone parameters (head stays trainable),
            for transfer-learning setups.
    """

    # Full 5-class label set (Korean):
    # Normal, Drowsy Driving, Searching Objects, Phone Usage, Driver Assault
    CLASS_NAMES = ["정상", "졸음운전", "물건찾기", "휴대폰 사용", "운전자 폭행"]

    def __init__(
        self,
        num_classes: int = 5,
        pretrained: bool = True,
        freeze_backbone: bool = False,
    ):
        super().__init__()

        self.num_classes = num_classes

        # Build the TorchVision Video Swin-T backbone.
        if pretrained:
            print("Loading Kinetics-400 pretrained weights...")
            weights = Swin3D_T_Weights.KINETICS400_V1
        else:
            weights = None
        self.backbone = swin3d_t(weights=weights)

        # Swap the original head (Kinetics-400: nn.Linear(768, 400))
        # for a LayerNorm + Linear head sized to our label set.
        feat_dim = self.backbone.head.in_features  # 768 for swin3d_t
        self.backbone.head = nn.Sequential(
            nn.LayerNorm(feat_dim),
            nn.Linear(feat_dim, num_classes),
        )

        # Optionally freeze everything except the head.
        if freeze_backbone:
            self._freeze_backbone()

        # Re-initialize the freshly created head weights.
        self._init_head()

    def _freeze_backbone(self):
        """Freeze every backbone parameter except those under the head."""
        for name, param in self.backbone.named_parameters():
            if "head" in name:
                continue
            param.requires_grad = False
        print("Backbone parameters frozen (head trainable)")

    def _init_head(self):
        """Truncated-normal init for head Linear weights; zero the biases."""
        for module in self.backbone.head.modules():
            if not isinstance(module, nn.Linear):
                continue
            nn.init.trunc_normal_(module.weight, std=0.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass.

        Args:
            x: Video tensor of shape [B, C, T, H, W]
               (batch, channels=3, frames=30, height=224, width=224).

        Returns:
            Logits of shape [B, num_classes].
        """
        return self.backbone(x)

    def predict(self, x: torch.Tensor) -> Dict:
        """Single-sample inference.

        Args:
            x: Video tensor of shape [1, 3, 30, 224, 224].

        Returns:
            Dict with "class" (int), "confidence" (float in [0, 1]),
            and "class_name" (str).
        """
        # NOTE: switches the module to eval mode as a side effect.
        self.eval()
        with torch.no_grad():
            probs = F.softmax(self.forward(x), dim=-1)[0]

        top = probs.argmax().item()

        return {
            "class": top,
            "confidence": probs[top].item(),
            "class_name": self.CLASS_NAMES[top],
        }

    def get_all_probs(self, x: torch.Tensor) -> Dict:
        """Return per-class probabilities, sorted descending.

        Args:
            x: Video tensor of shape [1, 3, 30, 224, 224].

        Returns:
            Dict with "predictions" (list of {"class", "class_name",
            "probability"} dicts sorted by probability, descending),
            "top_class" (int) and "top_confidence" (float).
        """
        # NOTE: switches the module to eval mode as a side effect.
        self.eval()
        with torch.no_grad():
            probs = F.softmax(self.forward(x), dim=-1)[0]

        predictions = [
            {
                "class": idx,
                "class_name": self.CLASS_NAMES[idx],
                "probability": prob.item(),
            }
            for idx, prob in enumerate(probs)
        ]
        # Highest-probability class first.
        predictions.sort(key=lambda entry: entry["probability"], reverse=True)

        return {
            "predictions": predictions,
            "top_class": predictions[0]["class"],
            "top_confidence": predictions[0]["probability"],
        }
155
+
156
+
157
def create_model(
    num_classes: int = 3,
    pretrained: bool = True,
    freeze_backbone: bool = False,
    checkpoint_path: Optional[str] = None,
) -> DriverBehaviorModel:
    """Build a DriverBehaviorModel, optionally loading trained weights.

    Args:
        num_classes: Number of output classes. NOTE(review): this default (3)
            differs from DriverBehaviorModel's default of 5 and from the
            released 5-class weights — pass num_classes=5 explicitly when
            loading the published checkpoint. Kept at 3 for backward
            compatibility with existing callers.
        pretrained: Load Kinetics-400 pretrained backbone weights.
        freeze_backbone: Freeze backbone parameters (head stays trainable).
        checkpoint_path: Optional checkpoint path. Accepts either a dict
            with a "model" entry (trainer checkpoint) or a bare state_dict.

    Returns:
        A DriverBehaviorModel instance (in train mode; call .eval() before
        inference).

    Raises:
        RuntimeError: If the checkpoint's state_dict does not match the
            model (e.g. num_classes mismatch).
    """
    model = DriverBehaviorModel(
        num_classes=num_classes,
        pretrained=pretrained,
        freeze_backbone=freeze_backbone,
    )

    if checkpoint_path:
        print(f"Loading checkpoint from {checkpoint_path}...")
        # weights_only=True prevents arbitrary code execution from untrusted
        # pickle payloads (matches the loading examples in the README).
        checkpoint = torch.load(
            checkpoint_path, map_location="cpu", weights_only=True
        )
        # Support both {"model": state_dict} trainer checkpoints and files
        # that contain the state_dict directly.
        if isinstance(checkpoint, dict) and "model" in checkpoint:
            state_dict = checkpoint["model"]
        else:
            state_dict = checkpoint
        model.load_state_dict(state_dict)
        print("Checkpoint loaded successfully")

    return model
188
+
189
+
190
if __name__ == "__main__":
    # Smoke test: build the model, count parameters, run dummy inference.
    print("=" * 60)
    # Fixed: the banner previously said "3 classes - Demo" although the
    # code below builds the full 5-class model.
    print("Model Test (5 classes)")
    print("=" * 60)

    # Build the full 5-class model. pretrained=True downloads the
    # Kinetics-400 backbone weights on first run (network required).
    model = DriverBehaviorModel(num_classes=5, pretrained=True)

    # Report parameter counts.
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")

    # Batched forward pass with a dummy clip [B, C, T, H, W].
    dummy_input = torch.randn(2, 3, 30, 224, 224)
    print(f"\nInput shape: {dummy_input.shape}")

    model.eval()
    with torch.no_grad():
        output = model(dummy_input)
    print(f"Output shape: {output.shape}")

    # Single-sample prediction helper.
    single_input = torch.randn(1, 3, 30, 224, 224)
    prediction = model.predict(single_input)
    print(f"\nPrediction: {prediction}")

    # Full per-class probability listing (sorted descending).
    all_probs = model.get_all_probs(single_input)
    print("\nAll probabilities:")
    for pred in all_probs["predictions"]:
        print(f"  {pred['class_name']}: {pred['probability']:.4f}")

    print("\nModel test passed!")