File size: 4,143 Bytes
65ec53e
 
 
 
2a3d108
5ba2d63
65ec53e
 
 
5ba2d63
 
 
2a3d108
 
 
 
 
 
65ec53e
5ba2d63
65ec53e
 
 
5ba2d63
65ec53e
 
 
 
2a3d108
 
65ec53e
 
ef376b2
65ec53e
 
 
 
 
ef376b2
65ec53e
 
 
 
 
 
2a3d108
 
65ec53e
 
2a3d108
 
 
 
65ec53e
 
 
2a3d108
 
65ec53e
 
2a3d108
 
65ec53e
 
 
2a3d108
65ec53e
 
 
 
 
5ba2d63
65ec53e
 
 
 
 
 
2a3d108
 
65ec53e
 
 
2a3d108
65ec53e
2a3d108
 
65ec53e
 
 
2a3d108
65ec53e
 
 
14677da
65ec53e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b45ccc7
 
 
65ec53e
 
 
 
2a3d108
 
 
5ba2d63
2a3d108
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import gradio as gr
import os
import cv2
import numpy as np
import torch
import spaces
from ultralytics import YOLO
from tqdm import tqdm

# Fix for Ultralytics config write error in Hugging Face environment
# (the default config dir is read-only in Spaces; /tmp is always writable)
os.environ["YOLO_CONFIG_DIR"] = "/tmp"

# Use GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load models onto the appropriate device.
# NOTE(review): "best.pt" is presumably a custom board-detection checkpoint
# shipped alongside this file — confirm it exists in the working directory.
extract_model = YOLO("best.pt").to(device)       # detects the "board" class
detect_model  = YOLO("yolov8n.pt").to(device)    # stock COCO model, used for "person"

def _extract_board_frames(video_path):
    """Return the frames of *video_path* in which the board detector sees a "board".

    Each kept frame is also written to frames/frame_<idx>.jpg for inspection.
    The capture handle is always released, even if inference raises.
    """
    cap = cv2.VideoCapture(video_path)
    frames, idx = [], 0
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            # verbose=False keeps per-frame inference logs quiet,
            # consistent with the person-detection pass below.
            results = extract_model(frame, verbose=False)
            labels = [extract_model.names[int(c)] for c in results[0].boxes.cls.cpu().numpy()]
            if "board" in labels:
                frames.append(frame)
                cv2.imwrite(f"frames/frame_{idx:04d}.jpg", frame)
            idx += 1
    finally:
        cap.release()
    return frames


def _align_pair(ref, tgt):
    """Warp *tgt* onto *ref* using ORB feature matching + RANSAC homography.

    Returns the warped image, or None when descriptors are missing, there are
    fewer than 10 matches, or no homography can be estimated.
    """
    orb = cv2.ORB_create(500)
    k1, d1 = orb.detectAndCompute(ref, None)
    k2, d2 = orb.detectAndCompute(tgt, None)
    if d1 is None or d2 is None:
        return None
    matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    matches = matcher.match(d1, d2)
    if len(matches) < 10:
        return None
    src = np.float32([k2[m.trainIdx].pt for m in matches]).reshape(-1, 1, 2)
    dst = np.float32([k1[m.queryIdx].pt for m in matches]).reshape(-1, 1, 2)
    H, _ = cv2.findHomography(src, dst, cv2.RANSAC)
    return None if H is None else cv2.warpPerspective(tgt, H, (ref.shape[1], ref.shape[0]))


def _mask_persons_and_fuse(aligned):
    """Per-pixel average of *aligned* frames, excluding pixels covered by persons.

    Pixels that are occluded in every frame keep a count of 1 to avoid
    division by zero (they end up black rather than NaN).
    """
    sum_img = np.zeros_like(aligned[0], dtype=np.float32)
    count = np.zeros(aligned[0].shape[:2], dtype=np.float32)
    for f in tqdm(aligned, desc="Masking persons"):
        res = detect_model(f, verbose=False)
        mask = np.zeros(f.shape[:2], dtype=np.uint8)
        for box in res[0].boxes:
            if detect_model.names[int(box.cls)] == "person":
                x1, y1, x2, y2 = map(int, box.xyxy[0])
                cv2.rectangle(mask, (x1, y1), (x2, y2), 255, -1)
        keep = cv2.bitwise_not(mask)
        sum_img += cv2.bitwise_and(f, f, mask=keep).astype(np.float32)
        count += (keep > 0).astype(np.float32)
    count[count == 0] = 1  # avoid division by zero where every frame had a person
    return (sum_img / count[:, :, None]).astype(np.uint8)


@spaces.GPU
def process_video(video_path):
    """Distill a classroom video into one clean, sharpened image of the board.

    Pipeline: (1) keep only frames where the board detector finds a "board";
    (2) align each kept frame to the first one via ORB + homography;
    (3) median-fuse the aligned frames (suppresses transient occluders);
    (4) mask detected persons and average the remaining pixels;
    (5) unsharp-mask the result.

    Args:
        video_path: Filesystem path to the input video (.mp4).

    Returns:
        Path to the sharpened output image ("sharpened_board_color.jpg").
        Intermediate results are also written to "clean_board.jpg" and
        "fused_board_selective.jpg" in the working directory.

    Raises:
        RuntimeError: If no frame contains a detected "board".
    """
    os.makedirs("frames", exist_ok=True)

    # Step 1: Extract board-only frames
    frames = _extract_board_frames(video_path)
    if not frames:
        raise RuntimeError("No frames with 'board' found.")

    # Step 2: Align everything onto the first board frame
    base = frames[0]
    aligned = [base]
    for f in tqdm(frames[1:], desc="Aligning"):
        warped = _align_pair(base, f)
        if warped is not None:
            aligned.append(warped)
    # Defensive only: `aligned` always contains `base`, so this cannot fire.
    if not aligned:
        raise RuntimeError("Alignment failed for all frames.")

    # Step 3: Median-fuse
    stack = np.stack(aligned, axis=0).astype(np.float32)
    median_board = np.median(stack, axis=0).astype(np.uint8)
    cv2.imwrite("clean_board.jpg", median_board)

    # Step 4: Mask persons & selective fuse
    selective = _mask_persons_and_fuse(aligned)
    cv2.imwrite("fused_board_selective.jpg", selective)

    # Step 5: Sharpen (unsharp mask: boost original, subtract blur)
    blur = cv2.GaussianBlur(selective, (5, 5), 0)
    sharp = cv2.addWeighted(selective, 1.5, blur, -0.5, 0)
    cv2.imwrite("sharpened_board_color.jpg", sharp)

    return "sharpened_board_color.jpg"


# Gradio UI: single video file in, single sharpened board image out.
# The description previously promised "three stages" of output, but the
# interface exposes exactly one image — wording fixed to match reality.
demo = gr.Interface(
    fn=process_video,
    inputs=[
        gr.File(
            label="Upload Classroom Video (.mp4)",
            file_types=['.mp4'],
            file_count="single",
            type="filepath"
        )
    ],
    outputs=[
        gr.Image(label="Sharpened Final Board")
    ],
    title="📹 Classroom Board Cleaner",
    description=(
        "Upload your classroom video (.mp4). \n"
        "Automatic extraction, alignment, masking, fusion & sharpening. \n"
        "Returns the final cleaned and sharpened board image."
    )
)

if __name__ == "__main__":
    # Report which device the models landed on before serving the UI.
    if device != "cuda":
        print("[INFO] ⚠️ Using CPU (GPU not available or not assigned)")
    else:
        gpu_name = torch.cuda.get_device_name(0)
        print(f"[INFO] ✅ Using GPU: {gpu_name}")
    demo.launch()