import gradio as gr import os import cv2 import numpy as np import torch import spaces from ultralytics import YOLO from tqdm import tqdm from PIL import Image from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer # Fix for Ultralytics config write error in Hugging Face environment os.environ["YOLO_CONFIG_DIR"] = "/tmp" # Use GPU if available device = "cuda" if torch.cuda.is_available() else "cpu" # Load detection models extract_model = YOLO("best.pt").to(device) detect_model = YOLO("yolov8n.pt").to(device) # Load captioning model (lightweight + free) caption_model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning").to(device) caption_processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning") caption_tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning") # Captioning function def caption_image(image_path): image = Image.open(image_path).convert("RGB") pixel_values = caption_processor(images=image, return_tensors="pt").pixel_values.to(device) output_ids = caption_model.generate(pixel_values, max_length=50, num_beams=4) caption = caption_tokenizer.decode(output_ids[0], skip_special_tokens=True) return caption @spaces.GPU def process_video(video_path): os.makedirs("frames", exist_ok=True) # Step 1: Extract board-only frames cap = cv2.VideoCapture(video_path) frames, idx = [], 0 while cap.isOpened(): ret, frame = cap.read() if not ret: break results = extract_model(frame) labels = [extract_model.names[int(c)] for c in results[0].boxes.cls.cpu().numpy()] if "board" in labels and "person" not in labels: frames.append(frame) cv2.imwrite(f"frames/frame_{idx:04d}.jpg", frame) idx += 1 cap.release() if not frames: raise RuntimeError("No frames with only 'board' and no 'person' found.") # Step 2: Align def align_frames(ref, tgt): orb = cv2.ORB_create(500) k1, d1 = orb.detectAndCompute(ref, None) k2, d2 = orb.detectAndCompute(tgt, None) if d1 is None or d2 is None: return None matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True) matches = matcher.match(d1, d2) if len(matches) < 10: return None src = np.float32([k2[m.trainIdx].pt for m in matches]).reshape(-1, 1, 2) dst = np.float32([k1[m.queryIdx].pt for m in matches]).reshape(-1, 1, 2) H, _ = cv2.findHomography(src, dst, cv2.RANSAC) return None if H is None else cv2.warpPerspective(tgt, H, (ref.shape[1], ref.shape[0])) base = frames[0] aligned = [base] for f in tqdm(frames[1:], desc="Aligning"): a = align_frames(base, f) if a is not None: aligned.append(a) if not aligned: raise RuntimeError("Alignment failed for all frames.") # Step 3: Median-fuse stack = np.stack(aligned, axis=0).astype(np.float32) median_board = np.median(stack, axis=0).astype(np.uint8) cv2.imwrite("clean_board.jpg", median_board) # Step 4: Mask persons & selective fuse sum_img = np.zeros_like(aligned[0], dtype=np.float32) count = np.zeros(aligned[0].shape[:2], dtype=np.float32) for f in tqdm(aligned, desc="Masking persons"): res = detect_model(f, verbose=False) m = np.zeros(f.shape[:2], dtype=np.uint8) for box in res[0].boxes: if detect_model.names[int(box.cls)] == "person": x1, y1, x2, y2 = map(int, box.xyxy[0]) cv2.rectangle(m, (x1, y1), (x2, y2), 255, -1) inv = cv2.bitwise_not(m) masked = cv2.bitwise_and(f, f, mask=inv) sum_img += masked.astype(np.float32) count += (inv > 0).astype(np.float32) count[count == 0] = 1 selective = (sum_img / count[:, :, None]).astype(np.uint8) cv2.imwrite("fused_board_selective.jpg", selective) # Step 5: Sharpen blur = cv2.GaussianBlur(selective, (3, 3), 0) sharp = cv2.addWeighted(selective, 2.0, blur, -1.0, 0) output_image_path = "sharpened_board_color.jpg" cv2.imwrite(output_image_path, sharp) # Step 6: Generate caption caption = caption_image(output_image_path) return output_image_path, caption demo = gr.Interface( fn=process_video, inputs=[ gr.File( label="Upload Classroom Video (.mp4)", file_types=['.mp4'], file_count="single", type="filepath" ) ], outputs=[ gr.Image(label="Sharpened Final Board"), gr.Textbox(label="Generated Caption") ], title="📹 Classroom Board Cleaner + 🧠 Captioning", description=( "1️⃣ Upload your classroom video (.mp4)\n" "2️⃣ AI extracts, aligns, fuses, sharpens and removes people\n" "3️⃣ Get a clean board image and automatic caption" ) ) if __name__ == "__main__": if device == "cuda": print(f"[INFO] ✅ Using GPU: {torch.cuda.get_device_name(0)}") else: print("[INFO] ⚠️ Using CPU (GPU not available or not assigned)") demo.launch()