Spaces:
Sleeping
Sleeping
File size: 3,341 Bytes
81185a4 f2f00f4 4478675 0a834b8 81185a4 d1bb125 f2f00f4 81185a4 f2f00f4 560fe44 f2f00f4 d1bb125 560fe44 f2f00f4 d1bb125 f2f00f4 d1bb125 f2f00f4 d1bb125 f2f00f4 d1bb125 f2f00f4 d1bb125 f2f00f4 d1bb125 f2f00f4 d1bb125 f2f00f4 d1bb125 0a834b8 f2f00f4 d1bb125 40b38be d78bcd5 f2f00f4 d1bb125 f2f00f4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
import gradio as gr
import os
import cv2
import numpy as np
import torch
import spaces
from ultralytics import YOLO
from tqdm import tqdm
from PIL import Image
# Redirect Ultralytics' settings/config directory to a writable location —
# presumably the default home dir is read-only in the Spaces container; TODO confirm.
os.environ["YOLO_CONFIG_DIR"] = "/tmp"
# Run on GPU when available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the two detectors at import time:
#   extract_model — custom weights ("best.pt"); used to find frames containing a
#                   "board" without a "person" (see process_video).
#   detect_model  — stock YOLOv8-nano; used to mask out "person" boxes later.
extract_model = YOLO("best.pt").to(device)
detect_model = YOLO("yolov8n.pt").to(device)
@spaces.GPU
def process_video(video_path):
    """Recover a clean, sharpened image of the board from a classroom video.

    Pipeline:
      1. Keep only frames where the extractor detects a "board" and no "person".
      2. Align all kept frames to the first one via ORB features + homography.
      3. Mask out any residual "person" boxes (stock YOLOv8) and average the
         unmasked pixels per location, then unsharp-mask the result.

    Args:
        video_path: Filesystem path to the input video (Gradio passes a
            filepath from the File input).

    Returns:
        Path to the sharpened output image ("sharpened_board_color.jpg").
        Returning the path is required so the gr.Image output can display it
        (the original version returned None, leaving the UI blank).

    Raises:
        RuntimeError: If no usable (board-only) frame is found.
    """
    os.makedirs("frames", exist_ok=True)
    cap = cv2.VideoCapture(video_path)
    frames, idx = [], 0
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            results = extract_model(frame)
            labels = [extract_model.names[int(c)]
                      for c in results[0].boxes.cls.cpu().numpy()]
            # Keep frames that show the board unobstructed by a person.
            if "board" in labels and "person" not in labels:
                frames.append(frame)
                cv2.imwrite(f"frames/frame_{idx:04d}.jpg", frame)
                idx += 1  # numbered per saved frame so files are contiguous
    finally:
        # Release the capture even if inference raises mid-loop.
        cap.release()

    if not frames:
        raise RuntimeError("No frames with only 'board' and no 'person' found.")

    def _align(ref, tgt):
        """Warp tgt onto ref via ORB matches + RANSAC homography; None on failure."""
        orb = cv2.ORB_create(500)
        kp_ref, des_ref = orb.detectAndCompute(ref, None)
        kp_tgt, des_tgt = orb.detectAndCompute(tgt, None)
        if des_ref is None or des_tgt is None:
            return None
        matches = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True).match(des_ref, des_tgt)
        if len(matches) < 10:  # too few matches for a reliable homography
            return None
        src = np.float32([kp_tgt[m.trainIdx].pt for m in matches]).reshape(-1, 1, 2)
        dst = np.float32([kp_ref[m.queryIdx].pt for m in matches]).reshape(-1, 1, 2)
        H, _ = cv2.findHomography(src, dst, cv2.RANSAC)
        if H is None:
            return None
        return cv2.warpPerspective(tgt, H, (ref.shape[1], ref.shape[0]))

    base = frames[0]
    aligned = [base]
    for f in tqdm(frames[1:], desc="Aligning"):
        warped = _align(base, f)
        if warped is not None:
            aligned.append(warped)

    # Temporal median is robust to transient occlusions; saved for debugging.
    stack = np.stack(aligned, axis=0).astype(np.float32)
    median_board = np.median(stack, axis=0).astype(np.uint8)
    cv2.imwrite("clean_board.jpg", median_board)

    # Person-masked average: accumulate only pixels outside detected person
    # boxes, then divide by the per-pixel count of contributions.
    sum_img = np.zeros_like(aligned[0], dtype=np.float32)
    count = np.zeros(aligned[0].shape[:2], dtype=np.float32)
    for f in tqdm(aligned, desc="Masking persons"):
        res = detect_model(f, verbose=False)
        person_mask = np.zeros(f.shape[:2], dtype=np.uint8)
        for box in res[0].boxes:
            if detect_model.names[int(box.cls)] == "person":
                x1, y1, x2, y2 = map(int, box.xyxy[0])
                cv2.rectangle(person_mask, (x1, y1), (x2, y2), 255, -1)
        keep = cv2.bitwise_not(person_mask)
        masked = cv2.bitwise_and(f, f, mask=keep)
        sum_img += masked.astype(np.float32)
        count += (keep > 0).astype(np.float32)
    count[count == 0] = 1  # avoid division by zero where every frame was masked
    selective = (sum_img / count[:, :, None]).astype(np.uint8)

    # Unsharp mask: 2*image - blur sharpens edges (text strokes on the board).
    blur = cv2.GaussianBlur(selective, (3, 3), 0)
    sharp = cv2.addWeighted(selective, 2.0, blur, -1.0, 0)
    out_img = "sharpened_board_color.jpg"
    cv2.imwrite(out_img, sharp)
    return out_img
# Gradio UI: a single .mp4 upload mapped straight into process_video.
# type="filepath" makes Gradio hand the function a path string, matching
# cv2.VideoCapture's expectation.
# NOTE(review): gr.Image output displays whatever process_video returns —
# verify the function actually returns an image/path, or the panel stays blank.
demo = gr.Interface(
    fn=process_video,
    inputs=[gr.File(label="Upload Classroom Video (.mp4)", file_types=['.mp4'], file_count="single", type="filepath")],
    outputs=[gr.Image(label="Sharpened Final Board")],
    title="Obstruction remover",
    description="Remove the obstructions while retaining the exact text on the board!"
)
# Script entry point: report the compute mode chosen at import, then start
# the Gradio server (blocks until shut down).
if __name__=="__main__":
    print(f"[INFO] {'GPU' if device=='cuda' else 'CPU'} mode")
    demo.launch()
|