Spaces:

imperiusrex
/

classroomAPI

Sleeping

App Files Files Community

imperiusrex committed on Aug 2, 2025

Commit

560fe44

verified ·

1 Parent(s): cf2d588

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -18

app.py CHANGED Viewed

@@ -7,7 +7,7 @@ import spaces
 from ultralytics import YOLO
 from tqdm import tqdm
 from PIL import Image
-from transformers import BlipProcessor, BlipForConditionalGeneration
 # Fix for Ultralytics config write error in Hugging Face environment
 os.environ["YOLO_CONFIG_DIR"] = "/tmp"
@@ -15,13 +15,23 @@ os.environ["YOLO_CONFIG_DIR"] = "/tmp"
 # Use GPU if available
 device = "cuda" if torch.cuda.is_available() else "cpu"
-# Load YOLO models onto the appropriate device
 extract_model = YOLO("best.pt").to(device)
-detect_model  = YOLO("yolov8n.pt").to(device)
-# Load BLIP captioning model and processor
-processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
-caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)
 @spaces.GPU
 def process_video(video_path):
@@ -96,18 +106,15 @@ def process_video(video_path):
     # Step 5: Sharpen
     blur = cv2.GaussianBlur(selective, (3, 3), 0)
     sharp = cv2.addWeighted(selective, 2.0, blur, -1.0, 0)
-    cv2.imwrite("sharpened_board_color.jpg", sharp)
-    # Step 6: Generate Caption
-    image = Image.open("sharpened_board_color.jpg").convert("RGB")
-    inputs = processor(images=image, return_tensors="pt").to(device)
-    out = caption_model.generate(**inputs, max_new_tokens=30)
-    caption = processor.decode(out[0], skip_special_tokens=True)
-    return "sharpened_board_color.jpg", caption
-# Build Gradio interface
 demo = gr.Interface(
     fn=process_video,
     inputs=[
@@ -120,13 +127,13 @@ demo = gr.Interface(
     ],
     outputs=[
         gr.Image(label="Sharpened Final Board"),
-        gr.Textbox(label="Generated Caption (BLIP)")
     ],
-    title="📹 Classroom Board Cleaner + Captioning",
     description=(
         "1️⃣ Upload your classroom video (.mp4)\n"
-        "2️⃣ Extracts, aligns, masks, fuses, sharpens board frames\n"
-        "3️⃣ Generates a caption describing the cleaned board output"
     )
 )

 from ultralytics import YOLO
 from tqdm import tqdm
 from PIL import Image
+from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
 # Fix for Ultralytics config write error in Hugging Face environment
 os.environ["YOLO_CONFIG_DIR"] = "/tmp"
 # Use GPU if available
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# Load detection models
 extract_model = YOLO("best.pt").to(device)
+detect_model = YOLO("yolov8n.pt").to(device)
+# Load captioning model (lightweight + free)
+caption_model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning").to(device)
+caption_processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+caption_tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+# Captioning function
+def caption_image(image_path):
+    image = Image.open(image_path).convert("RGB")
+    pixel_values = caption_processor(images=image, return_tensors="pt").pixel_values.to(device)
+    output_ids = caption_model.generate(pixel_values, max_length=50, num_beams=4)
+    caption = caption_tokenizer.decode(output_ids[0], skip_special_tokens=True)
+    return caption
 @spaces.GPU
 def process_video(video_path):
     # Step 5: Sharpen
     blur = cv2.GaussianBlur(selective, (3, 3), 0)
     sharp = cv2.addWeighted(selective, 2.0, blur, -1.0, 0)
+    output_image_path = "sharpened_board_color.jpg"
+    cv2.imwrite(output_image_path, sharp)
+    # Step 6: Generate caption
+    caption = caption_image(output_image_path)
+    return output_image_path, caption
 demo = gr.Interface(
     fn=process_video,
     inputs=[
     ],
     outputs=[
         gr.Image(label="Sharpened Final Board"),
+        gr.Textbox(label="Generated Caption")
     ],
+    title="📹 Classroom Board Cleaner + 🧠 Captioning",
     description=(
         "1️⃣ Upload your classroom video (.mp4)\n"
+        "2️⃣ AI extracts, aligns, fuses, sharpens and removes people\n"
+        "3️⃣ Get a clean board image and automatic caption"
     )
 )