imperiusrex committed on
Commit
560fe44
·
verified ·
1 Parent(s): cf2d588

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -18
app.py CHANGED
@@ -7,7 +7,7 @@ import spaces
7
  from ultralytics import YOLO
8
  from tqdm import tqdm
9
  from PIL import Image
10
- from transformers import BlipProcessor, BlipForConditionalGeneration
11
 
12
  # Fix for Ultralytics config write error in Hugging Face environment
13
  os.environ["YOLO_CONFIG_DIR"] = "/tmp"
@@ -15,13 +15,23 @@ os.environ["YOLO_CONFIG_DIR"] = "/tmp"
15
  # Use GPU if available
16
  device = "cuda" if torch.cuda.is_available() else "cpu"
17
 
18
- # Load YOLO models onto the appropriate device
19
  extract_model = YOLO("best.pt").to(device)
20
- detect_model = YOLO("yolov8n.pt").to(device)
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
- # Load BLIP captioning model and processor
23
- processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
24
- caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)
25
 
26
  @spaces.GPU
27
  def process_video(video_path):
@@ -96,18 +106,15 @@ def process_video(video_path):
96
  # Step 5: Sharpen
97
  blur = cv2.GaussianBlur(selective, (3, 3), 0)
98
  sharp = cv2.addWeighted(selective, 2.0, blur, -1.0, 0)
99
- cv2.imwrite("sharpened_board_color.jpg", sharp)
 
100
 
101
- # Step 6: Generate Caption
102
- image = Image.open("sharpened_board_color.jpg").convert("RGB")
103
- inputs = processor(images=image, return_tensors="pt").to(device)
104
- out = caption_model.generate(**inputs, max_new_tokens=30)
105
- caption = processor.decode(out[0], skip_special_tokens=True)
106
 
107
- return "sharpened_board_color.jpg", caption
108
 
109
 
110
- # Build Gradio interface
111
  demo = gr.Interface(
112
  fn=process_video,
113
  inputs=[
@@ -120,13 +127,13 @@ demo = gr.Interface(
120
  ],
121
  outputs=[
122
  gr.Image(label="Sharpened Final Board"),
123
- gr.Textbox(label="Generated Caption (BLIP)")
124
  ],
125
- title="📹 Classroom Board Cleaner + Captioning",
126
  description=(
127
  "1️⃣ Upload your classroom video (.mp4)\n"
128
- "2️⃣ Extracts, aligns, masks, fuses, sharpens board frames\n"
129
- "3️⃣ Generates a caption describing the cleaned board output"
130
  )
131
  )
132
 
 
7
  from ultralytics import YOLO
8
  from tqdm import tqdm
9
  from PIL import Image
10
+ from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
11
 
12
  # Fix for Ultralytics config write error in Hugging Face environment
13
  os.environ["YOLO_CONFIG_DIR"] = "/tmp"
 
15
  # Use GPU if available
16
  device = "cuda" if torch.cuda.is_available() else "cpu"
17
 
18
# Load YOLO detection models onto the selected device.
# NOTE(review): "best.pt" is presumably the fine-tuned board-extraction
# checkpoint and "yolov8n.pt" the stock nano model used for person
# detection — confirm against the Space's model files.
extract_model = YOLO("best.pt").to(device)
detect_model = YOLO("yolov8n.pt").to(device)

# Load captioning model (lightweight + free).
# The model id is hoisted into one constant so the three
# from_pretrained() calls cannot drift out of sync.
CAPTION_MODEL_ID = "nlpconnect/vit-gpt2-image-captioning"
caption_model = VisionEncoderDecoderModel.from_pretrained(CAPTION_MODEL_ID).to(device)
caption_processor = ViTImageProcessor.from_pretrained(CAPTION_MODEL_ID)
caption_tokenizer = AutoTokenizer.from_pretrained(CAPTION_MODEL_ID)
27
# Captioning function
def caption_image(image_path):
    """Generate a natural-language caption for the image at *image_path*.

    Args:
        image_path: Path to an image file readable by PIL.

    Returns:
        str: Caption decoded from the ViT-GPT2 captioning model
        (beam search, num_beams=4, up to 50 tokens).
    """
    # Image.open is lazy and would otherwise leave the file handle open;
    # the context manager closes it as soon as the pixels are converted.
    with Image.open(image_path) as img:
        image = img.convert("RGB")
    pixel_values = caption_processor(images=image, return_tensors="pt").pixel_values.to(device)
    # Inference only — no_grad avoids building an autograd graph.
    with torch.no_grad():
        output_ids = caption_model.generate(pixel_values, max_length=50, num_beams=4)
    return caption_tokenizer.decode(output_ids[0], skip_special_tokens=True)
34
 
 
 
 
35
 
36
  @spaces.GPU
37
  def process_video(video_path):
 
106
  # Step 5: Sharpen
107
  blur = cv2.GaussianBlur(selective, (3, 3), 0)
108
  sharp = cv2.addWeighted(selective, 2.0, blur, -1.0, 0)
109
+ output_image_path = "sharpened_board_color.jpg"
110
+ cv2.imwrite(output_image_path, sharp)
111
 
112
+ # Step 6: Generate caption
113
+ caption = caption_image(output_image_path)
 
 
 
114
 
115
+ return output_image_path, caption
116
 
117
 
 
118
  demo = gr.Interface(
119
  fn=process_video,
120
  inputs=[
 
127
  ],
128
  outputs=[
129
  gr.Image(label="Sharpened Final Board"),
130
+ gr.Textbox(label="Generated Caption")
131
  ],
132
+ title="📹 Classroom Board Cleaner + 🧠 Captioning",
133
  description=(
134
  "1️⃣ Upload your classroom video (.mp4)\n"
135
+ "2️⃣ AI extracts, aligns, fuses, sharpens and removes people\n"
136
+ "3️⃣ Get a clean board image and automatic caption"
137
  )
138
  )
139