imperiusrex committed on
Commit
d1bb125
·
verified ·
1 Parent(s): 8d55cf2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -93
app.py CHANGED
@@ -7,43 +7,41 @@ import spaces
7
  from ultralytics import YOLO
8
  from tqdm import tqdm
9
  from PIL import Image
10
- from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
11
 
12
- # Fix for Ultralytics config write error in Hugging Face environment
13
  os.environ["YOLO_CONFIG_DIR"] = "/tmp"
14
 
15
- # Use GPU if available
16
  device = "cuda" if torch.cuda.is_available() else "cpu"
17
 
18
  # Load detection models
19
  extract_model = YOLO("best.pt").to(device)
20
- detect_model = YOLO("yolov8n.pt").to(device)
21
 
22
- # Load captioning model (lightweight + free)
23
- caption_model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning").to(device)
24
- caption_processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
25
- caption_tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
 
 
 
26
 
27
- # Captioning function
28
- def caption_image(image_path):
29
  image = Image.open(image_path).convert("RGB")
30
- pixel_values = caption_processor(images=image, return_tensors="pt").pixel_values.to(device)
31
- output_ids = caption_model.generate(pixel_values, max_length=50, num_beams=4)
32
- caption = caption_tokenizer.decode(output_ids[0], skip_special_tokens=True)
 
33
  return caption
34
 
35
-
36
  @spaces.GPU
37
  def process_video(video_path):
38
  os.makedirs("frames", exist_ok=True)
39
-
40
- # Step 1: Extract board-only frames
41
  cap = cv2.VideoCapture(video_path)
42
  frames, idx = [], 0
43
  while cap.isOpened():
44
  ret, frame = cap.read()
45
- if not ret:
46
- break
47
  results = extract_model(frame)
48
  labels = [extract_model.names[int(c)] for c in results[0].boxes.cls.cpu().numpy()]
49
  if "board" in labels and "person" not in labels:
@@ -53,93 +51,56 @@ def process_video(video_path):
53
  cap.release()
54
  if not frames:
55
  raise RuntimeError("No frames with only 'board' and no 'person' found.")
56
-
57
- # Step 2: Align
58
- def align_frames(ref, tgt):
59
- orb = cv2.ORB_create(500)
60
- k1, d1 = orb.detectAndCompute(ref, None)
61
- k2, d2 = orb.detectAndCompute(tgt, None)
62
- if d1 is None or d2 is None:
63
- return None
64
- matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
65
- matches = matcher.match(d1, d2)
66
- if len(matches) < 10:
67
- return None
68
- src = np.float32([k2[m.trainIdx].pt for m in matches]).reshape(-1, 1, 2)
69
- dst = np.float32([k1[m.queryIdx].pt for m in matches]).reshape(-1, 1, 2)
70
- H, _ = cv2.findHomography(src, dst, cv2.RANSAC)
71
- return None if H is None else cv2.warpPerspective(tgt, H, (ref.shape[1], ref.shape[0]))
72
-
73
  base = frames[0]
74
  aligned = [base]
 
 
 
 
 
 
 
 
 
 
 
 
75
  for f in tqdm(frames[1:], desc="Aligning"):
76
- a = align_frames(base, f)
77
- if a is not None:
78
- aligned.append(a)
79
- if not aligned:
80
- raise RuntimeError("Alignment failed for all frames.")
81
-
82
- # Step 3: Median-fuse
83
- stack = np.stack(aligned, axis=0).astype(np.float32)
84
- median_board = np.median(stack, axis=0).astype(np.uint8)
85
  cv2.imwrite("clean_board.jpg", median_board)
86
-
87
- # Step 4: Mask persons & selective fuse
88
- sum_img = np.zeros_like(aligned[0], dtype=np.float32)
89
- count = np.zeros(aligned[0].shape[:2], dtype=np.float32)
90
  for f in tqdm(aligned, desc="Masking persons"):
91
  res = detect_model(f, verbose=False)
92
- m = np.zeros(f.shape[:2], dtype=np.uint8)
93
  for box in res[0].boxes:
94
- if detect_model.names[int(box.cls)] == "person":
95
- x1, y1, x2, y2 = map(int, box.xyxy[0])
96
- cv2.rectangle(m, (x1, y1), (x2, y2), 255, -1)
97
  inv = cv2.bitwise_not(m)
98
- masked = cv2.bitwise_and(f, f, mask=inv)
99
  sum_img += masked.astype(np.float32)
100
- count += (inv > 0).astype(np.float32)
101
-
102
- count[count == 0] = 1
103
- selective = (sum_img / count[:, :, None]).astype(np.uint8)
104
- cv2.imwrite("fused_board_selective.jpg", selective)
105
-
106
- # Step 5: Sharpen
107
- blur = cv2.GaussianBlur(selective, (3, 3), 0)
108
- sharp = cv2.addWeighted(selective, 2.0, blur, -1.0, 0)
109
- output_image_path = "sharpened_board_color.jpg"
110
- cv2.imwrite(output_image_path, sharp)
111
-
112
- # Step 6: Generate caption
113
- caption = caption_image(output_image_path)
114
-
115
- return output_image_path, caption
116
-
117
 
118
  demo = gr.Interface(
119
  fn=process_video,
120
- inputs=[
121
- gr.File(
122
- label="Upload Classroom Video (.mp4)",
123
- file_types=['.mp4'],
124
- file_count="single",
125
- type="filepath"
126
- )
127
- ],
128
- outputs=[
129
- gr.Image(label="Sharpened Final Board"),
130
- gr.Textbox(label="Generated Caption")
131
- ],
132
- title="📹 Classroom Board Cleaner + 🧠 Captioning",
133
- description=(
134
- "1️⃣ Upload your classroom video (.mp4)\n"
135
- "2️⃣ AI extracts, aligns, fuses, sharpens and removes people\n"
136
- "3️⃣ Get a clean board image and automatic caption"
137
- )
138
  )
139
 
140
- if __name__ == "__main__":
141
- if device == "cuda":
142
- print(f"[INFO] ✅ Using GPU: {torch.cuda.get_device_name(0)}")
143
- else:
144
- print("[INFO] ⚠️ Using CPU (GPU not available or not assigned)")
145
  demo.launch()
 
7
  from ultralytics import YOLO
8
  from tqdm import tqdm
9
  from PIL import Image
10
+ from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
11
 
12
+ # Prevent config warnings
13
  os.environ["YOLO_CONFIG_DIR"] = "/tmp"
14
 
 
15
  device = "cuda" if torch.cuda.is_available() else "cpu"
16
 
17
  # Load detection models
18
  extract_model = YOLO("best.pt").to(device)
19
+ detect_model = YOLO("yolov8n.pt").to(device)
20
 
21
+ # Load LLaVA-HF LLM and processor
22
+ processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.5-7b-hf")
23
+ llava = LlavaNextForConditionalGeneration.from_pretrained(
24
+ "llava-hf/llava-v1.5-7b-hf",
25
+ torch_dtype=torch.float16 if device == "cuda" else torch.float32,
26
+ low_cpu_mem_usage=True
27
+ ).to(device)
28
 
29
+ def caption_image_with_llava(image_path):
 
30
  image = Image.open(image_path).convert("RGB")
31
+ prompt = "[INST] <image>\nDescribe what is visible in the image in a concise, factual sentence. [/INST]"
32
+ inputs = processor(prompt, images=image, return_tensors="pt").to(device)
33
+ outputs = llava.generate(**inputs, max_new_tokens=100, do_sample=False)
34
+ caption = processor.decode(outputs[0], skip_special_tokens=True)
35
  return caption
36
 
 
37
  @spaces.GPU
38
  def process_video(video_path):
39
  os.makedirs("frames", exist_ok=True)
 
 
40
  cap = cv2.VideoCapture(video_path)
41
  frames, idx = [], 0
42
  while cap.isOpened():
43
  ret, frame = cap.read()
44
+ if not ret: break
 
45
  results = extract_model(frame)
46
  labels = [extract_model.names[int(c)] for c in results[0].boxes.cls.cpu().numpy()]
47
  if "board" in labels and "person" not in labels:
 
51
  cap.release()
52
  if not frames:
53
  raise RuntimeError("No frames with only 'board' and no 'person' found.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  base = frames[0]
55
  aligned = [base]
56
+ def align(ref, tgt):
57
+ orb = cv2.ORB_create(500)
58
+ k1,d1 = orb.detectAndCompute(ref,None)
59
+ k2,d2 = orb.detectAndCompute(tgt,None)
60
+ if d1 is None or d2 is None: return None
61
+ m = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True).match(d1,d2)
62
+ if len(m)<10: return None
63
+ src = np.float32([k2[m.trainIdx].pt for m in m]).reshape(-1,1,2)
64
+ dst = np.float32([k1[m.queryIdx].pt for m in m]).reshape(-1,1,2)
65
+ H,_ = cv2.findHomography(src,dst,cv2.RANSAC)
66
+ return None if H is None else cv2.warpPerspective(tgt,H,(ref.shape[1],ref.shape[0]))
67
+ from tqdm import tqdm
68
  for f in tqdm(frames[1:], desc="Aligning"):
69
+ a = align(base, f)
70
+ if a is not None: aligned.append(a)
71
+ stack = np.stack(aligned,axis=0).astype(np.float32)
72
+ median_board = np.median(stack,axis=0).astype(np.uint8)
 
 
 
 
 
73
  cv2.imwrite("clean_board.jpg", median_board)
74
+ sum_img = np.zeros_like(aligned[0],dtype=np.float32)
75
+ count = np.zeros(aligned[0].shape[:2],dtype=np.float32)
 
 
76
  for f in tqdm(aligned, desc="Masking persons"):
77
  res = detect_model(f, verbose=False)
78
+ m = np.zeros(f.shape[:2],dtype=np.uint8)
79
  for box in res[0].boxes:
80
+ if detect_model.names[int(box.cls)]=="person":
81
+ x1,y1,x2,y2 = map(int,box.xyxy[0])
82
+ cv2.rectangle(m,(x1,y1),(x2,y2),255,-1)
83
  inv = cv2.bitwise_not(m)
84
+ masked = cv2.bitwise_and(f,f,mask=inv)
85
  sum_img += masked.astype(np.float32)
86
+ count += (inv>0).astype(np.float32)
87
+ count[count==0] = 1
88
+ selective = (sum_img/count[:,:,None]).astype(np.uint8)
89
+ blur = cv2.GaussianBlur(selective,(3,3),0)
90
+ sharp = cv2.addWeighted(selective,2.0,blur,-1.0,0)
91
+ out_img = "sharpened_board_color.jpg"
92
+ cv2.imwrite(out_img, sharp)
93
+ caption = caption_image_with_llava(out_img)
94
+ return out_img, caption
 
 
 
 
 
 
 
 
95
 
96
  demo = gr.Interface(
97
  fn=process_video,
98
+ inputs=[gr.File(label="Upload Classroom Video (.mp4)", file_types=['.mp4'], file_count="single", type="filepath")],
99
+ outputs=[gr.Image(label="Sharpened Final Board"), gr.Textbox(label="LLaVA Caption")],
100
+ title="Board Cleaner + LLaVA Captioning",
101
+ description="Clean the board from video and generate a descriptive caption with LLaVA."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  )
103
 
104
+ if __name__=="__main__":
105
+ print(f"[INFO] {'GPU' if device=='cuda' else 'CPU'} mode")
 
 
 
106
  demo.launch()