Spaces:

owinymarvin
/

SW_AI_deployment

Sleeping

App Files Files Community

owinymarvin committed on May 23

Commit

6f472e5

1 Parent(s): 9d0ee1c

latest changes

Browse files

Files changed (1) hide show

app.py +79 -58

app.py CHANGED Viewed

@@ -12,10 +12,19 @@ from collections import deque
 HF_MODEL_REPO_ID = "owinymarvin/timesformer-crime-detection"
 # These must match the values used during your training
-NUM_FRAMES = 16 # Still expecting 16 frames for a batch
 TARGET_IMAGE_HEIGHT = 224
 TARGET_IMAGE_WIDTH = 224
 # --- Load Model and Processor ---
 print(f"Loading model and image processor from {HF_MODEL_REPO_ID}...")
 try:
@@ -23,7 +32,6 @@ try:
     model = TimesformerForVideoClassification.from_pretrained(HF_MODEL_REPO_ID)
 except Exception as e:
     print(f"Error loading model from Hugging Face Hub: {e}")
-    # Handle error - exit or raise exception for Space to fail gracefully
     exit()
 model.eval() # Set model to evaluation mode
@@ -32,75 +40,81 @@ model.to(device)
 print(f"Model loaded successfully on {device}.")
 print(f"Model's class labels: {model.config.id2label}")
-# Initialize a global buffer for frames that the webcam continuously captures
-# This buffer will hold the *latest* NUM_FRAMES.
-# We use a global variable to persist state across Gradio calls.
 captured_frames_buffer = deque(maxlen=NUM_FRAMES)
-# This flag will control the 5-minute wait (if still needed for testing)
-wait_duration_seconds = 300 # 5 minutes
-# --- Function to continuously capture frames (without immediate processing) ---
-def capture_frame_into_buffer(image_np_array):
-    global captured_frames_buffer
     # Convert Gradio's numpy array (RGB) to PIL Image
     pil_image = Image.fromarray(image_np_array)
     captured_frames_buffer.append(pil_image)
-    # Return a message showing how many frames are buffered
-    return f"Frames buffered: {len(captured_frames_buffer)}/{NUM_FRAMES}"
-# --- Function to trigger prediction with the buffered frames ---
-def make_prediction_from_buffer():
-    global captured_frames_buffer
-    if len(captured_frames_buffer) < NUM_FRAMES:
-        return "Not enough frames buffered yet. Please capture more frames."
-    # Take a snapshot of the current frames in the buffer for prediction
-    # Convert deque to a list for the processor
-    frames_for_prediction = list(captured_frames_buffer)
-    # --- Perform Inference ---
-    print(f"Triggered inference on {len(frames_for_prediction)} frames...")
-    processed_input = processor(images=frames_for_prediction, return_tensors="pt")
-    pixel_values = processed_input.pixel_values.to(device)
-    with torch.no_grad():
-        outputs = model(pixel_values)
-        logits = outputs.logits
-    predicted_class_id = logits.argmax(-1).item()
-    predicted_label = model.config.id2label[predicted_class_id]
-    confidence = torch.nn.functional.softmax(logits, dim=-1)[0][predicted_class_id].item()
-    prediction_text = f"Predicted: {predicted_label} ({confidence:.2f})"
-    print(prediction_text) # Print to Space logs
-    # Clear the buffer after prediction if you want to capture a *new* set of frames for the next click
-    # captured_frames_buffer.clear()
-    # If you *don't* clear, the next click will re-predict on the same last 16 frames.
-    # --- Introduce the artificial 5-minute wait (if still desired) ---
-    # This will pause the *return* from this function, effectively blocking the UI update
-    # If you remove this, the prediction will show immediately.
-    # print(f"Initiating {wait_duration_seconds} second wait...")
-    # time.sleep(wait_duration_seconds)
-    # print("Wait finished.")
-    return prediction_text
 # --- Gradio Interface ---
 with gr.Blocks() as demo:
     gr.Markdown(
         f"""
-        # TimesFormer Crime Detection Live Demo (Manual Trigger)
         This demo uses a finetuned TimesFormer model ({HF_MODEL_REPO_ID}) to predict crime actions from a live webcam feed.
-        It continuously buffers frames, but **only makes a prediction when you click the 'Predict' button**.
-        The model requires **{NUM_FRAMES} frames** for a prediction.
         Please allow webcam access.
         """
     )
@@ -111,22 +125,29 @@ with gr.Blocks() as demo:
                 streaming=True,
                 label="Live Webcam Feed"
             )
-            # This textbox will show the buffering status dynamically
-            buffer_status = gr.Textbox(label="Frame Buffer Status", value=f"Frames buffered: 0/{NUM_FRAMES}")
-            # Button to trigger prediction
-            predict_button = gr.Button("Predict Latest Frames")
         with gr.Column():
-            prediction_output = gr.Textbox(label="Prediction Result", value="Click 'Predict Latest Frames' to start.")
     # Define actions
-    # This continuously updates the buffer_status as frames come in
-    webcam_input.stream(capture_frame_into_buffer, inputs=[webcam_input], outputs=[buffer_status])
-    # This triggers the prediction when the button is clicked
-    predict_button.click(make_prediction_from_buffer, inputs=[], outputs=[prediction_output])
 if __name__ == "__main__":
     demo.launch()

 HF_MODEL_REPO_ID = "owinymarvin/timesformer-crime-detection"
 # These must match the values used during your training
+NUM_FRAMES = 8 # Changed back to 8 as that was your original training setup for this model
 TARGET_IMAGE_HEIGHT = 224
 TARGET_IMAGE_WIDTH = 224
+# --- Prediction Timing ---
+# How long to record (in seconds) before making a prediction
+RECORDING_DURATION_SECONDS = 3.0
+# Minimum time (in seconds) that must have passed since last_prediction_time before
+# another prediction may run. That timestamp is also set at start-up and on reset, so a
+# very high number (like 9999) blocks predictions for that long after either event.
+# Leave it at 1.0 if you want every completed recording to be able to trigger a prediction.
+INFERENCE_INTERVAL_SECONDS = 1.0 # This will be the minimum time between predictions if not controlled by reset.
 # --- Load Model and Processor ---
 print(f"Loading model and image processor from {HF_MODEL_REPO_ID}...")
 try:
     model = TimesformerForVideoClassification.from_pretrained(HF_MODEL_REPO_ID)
 except Exception as e:
     print(f"Error loading model from Hugging Face Hub: {e}")
     exit()
 model.eval() # Set model to evaluation mode
 print(f"Model loaded successfully on {device}.")
 print(f"Model's class labels: {model.config.id2label}")
+# --- Global State Variables ---
+# Global bounded deque of captured frames: once it holds NUM_FRAMES, each append drops the oldest frame
 captured_frames_buffer = deque(maxlen=NUM_FRAMES)
+recording_start_time = None # To track when recording for a clip started
+last_prediction_time = time.time() # To control prediction frequency after recording
+# --- Functions for Gradio Interface ---
+def process_frame_and_predict(image_np_array):
+    global captured_frames_buffer, recording_start_time, last_prediction_time
+    # Initialize recording_start_time if it's the first frame for a new recording cycle
+    if recording_start_time is None:
+        recording_start_time = time.time()
+        captured_frames_buffer.clear() # Clear buffer to start a new clip
     # Convert Gradio's numpy array (RGB) to PIL Image
     pil_image = Image.fromarray(image_np_array)
     captured_frames_buffer.append(pil_image)
+    current_time = time.time()
+    elapsed_recording_time = current_time - recording_start_time
+    output_status = f"Recording: {elapsed_recording_time:.1f}/{RECORDING_DURATION_SECONDS}s | Frames: {len(captured_frames_buffer)}/{NUM_FRAMES}"
+    prediction_text = "Recording..." # Default text while recording
+    # Check if enough time has passed and we have enough frames
+    if elapsed_recording_time >= RECORDING_DURATION_SECONDS and len(captured_frames_buffer) >= NUM_FRAMES:
+        if (current_time - last_prediction_time) >= INFERENCE_INTERVAL_SECONDS: # Limit prediction frequency
+            # --- Perform Inference ---
+            print(f"Triggered inference on {len(captured_frames_buffer)} frames after {RECORDING_DURATION_SECONDS}s recording...")
+            frames_for_prediction = list(captured_frames_buffer) # Take a snapshot
+            # The image_processor will handle the resizing to TARGET_IMAGE_HEIGHT x TARGET_IMAGE_WIDTH
+            processed_input = processor(images=frames_for_prediction, return_tensors="pt")
+            pixel_values = processed_input.pixel_values.to(device)
+            with torch.no_grad():
+                outputs = model(pixel_values)
+                logits = outputs.logits
+            predicted_class_id = logits.argmax(-1).item()
+            predicted_label = model.config.id2label[predicted_class_id]
+            confidence = torch.nn.functional.softmax(logits, dim=-1)[0][predicted_class_id].item()
+            prediction_text = f"Predicted: {predicted_label} ({confidence:.2f})"
+            print(prediction_text) # Print to Space logs
+            last_prediction_time = current_time # Update time of last successful prediction
+            # Reset recording_start_time to allow a new recording cycle
+            recording_start_time = None
+            captured_frames_buffer.clear() # Clear buffer for next clip
+        else:
+            prediction_text = "Prediction done. Waiting for next interval..." # Message if prediction recently made
+    return output_status, prediction_text
+def reset_app_state():
+    """Resets the global state variables to start a new recording/prediction cycle."""
+    global captured_frames_buffer, recording_start_time, last_prediction_time
+    captured_frames_buffer.clear()
+    recording_start_time = None
+    last_prediction_time = time.time()
+    print("App state reset.")
+    return "Ready to record...", "Ready for new prediction."
 # --- Gradio Interface ---
 with gr.Blocks() as demo:
     gr.Markdown(
         f"""
+        # TimesFormer Crime Detection Live Demo (Auto-Triggered Clip Prediction)
         This demo uses a finetuned TimesFormer model ({HF_MODEL_REPO_ID}) to predict crime actions from a live webcam feed.
+        It records **{RECORDING_DURATION_SECONDS} seconds** of video, then automatically triggers a prediction.
+        The model processes **{NUM_FRAMES} frames** per prediction.
         Please allow webcam access.
         """
     )
                 streaming=True,
                 label="Live Webcam Feed"
             )
+            # Textboxes for status and prediction
+            status_output = gr.Textbox(label="Status", value="Ready to record...")
+            # Reset Button
+            reset_button = gr.Button("Reset / Start New Recording Cycle")
         with gr.Column():
+            prediction_output = gr.Textbox(label="Prediction Result", value="Recording will start automatically.")
     # Define actions
+    # This continuously processes frames from the webcam
+    webcam_input.stream(
+        process_frame_and_predict,
+        inputs=[webcam_input],
+        outputs=[status_output, prediction_output] # Now outputs both status and prediction
+    )
+    # This triggers the reset function when the button is clicked
+    reset_button.click(
+        reset_app_state,
+        inputs=[],
+        outputs=[status_output, prediction_output] # Updates both output textboxes
+    )
 if __name__ == "__main__":
     demo.launch()