Spaces:

Reaper200
/

ContextAwareObjectDetector

Runtime error

App Files Files Community

Reaper200 committed on Oct 29, 2024

Commit

26de1f0

verified ·

1 Parent(s): 2cf4d1c

Update app.py

Browse files

Files changed (1) hide show

app.py +72 -94

app.py CHANGED Viewed

@@ -1,95 +1,73 @@
-import streamlit as st
-from PIL import Image
-import numpy as np
 import torch
-from transformers import DetrImageProcessor, DetrForObjectDetection
-from gtts import gTTS
-# Load the model and processor
-@st.cache_resource  # Cache the model to improve performance
-def load_model():
-    processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
-    model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
-    return processor, model
-# Function to detect objects in the image
-def detect_objects(image, processor, model):
-    # Preprocess the image and make predictions
-    inputs = processor(images=image, return_tensors="pt")
-    with torch.no_grad():
-        outputs = model(**inputs)
-    # Process the outputs
-    target_sizes = torch.tensor([image.size[::-1]])  # Convert to (height, width)
-    results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.9)[0]
-    detected_objects = []
-    for score, label in zip(results["scores"], results["labels"]):
-        if score > 0.9:  # Confidence threshold
-            detected_objects.append(label.item())
-    return detected_objects, results
-# Function to convert label IDs to class names
-def get_object_names(class_ids):
-    # Sample mapping (extend this according to your model's output labels)
-    COCO_INSTANCE_CATEGORY_NAMES = [
-        "N/A", "person", "bicycle", "car", "motorcycle", "airplane",
-        "bus", "train", "truck", "boat", "traffic light", "fire hydrant",
-        "stop sign", "parking meter", "bench", "bird", "cat", "dog",
-        "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
-        "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
-        "skis", "snowboard", "sports ball", "kite", "baseball bat",
-        "baseball glove", "skateboard", "surfboard", "tennis racket",
-        "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
-        "banana", "apple", "sandwich", "orange", "broccoli", "carrot",
-        "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant",
-        "bed", "dining table", "toilet", "TV", "laptop", "mouse", "remote",
-        "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
-        "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
-        "hair drier", "toothbrush"
-    ]
-    return [COCO_INSTANCE_CATEGORY_NAMES[id] for id in class_ids]
-# Mock summarization function
-def generate_summary(relevant_objects):
-    st.write("Generating summary for relevant objects...")
-    summary = f"This is an {len(relevant_objects)}-item scene including: {', '.join(relevant_objects)}."
-    return summary
-# Mock text-to-speech function
-def text_to_speech(text):
-    st.write("Converting summary to speech...")
-    tts = gTTS(text)
-    tts.save("summary.mp3")
-    st.audio("summary.mp3")
-# Streamlit app main function
-def main():
-    st.title("Context-Aware Object Detection App with Hugging Face")
-    # Load model
-    processor, model = load_model()
-    # Step 1: Capture Image from Camera
-    captured_image = st.camera_input("Take a picture")
-    if captured_image is not None:
-        # Open the captured image
-        image = Image.open(captured_image)
-        st.image(image, caption="Captured Image", use_column_width=True)
-        # Step 2: Detect Objects
-        detected_ids, results = detect_objects(image, processor, model)
-        detected_objects = get_object_names(detected_ids)
-        st.write(f"Detected Objects: {detected_objects}")
-        # Step 3: Generate Summary
-        summary = generate_summary(detected_objects)
-        st.write(f"Summary: {summary}")
-        # Step 4: Convert Summary to Speech
-        text_to_speech(summary)
-if __name__ == "__main__":
-    main()

 import torch
+import cv2
+import pyttsx3
+import random
+# Load the 'yolov5n' model entry point from the 'ultralytics/yolov5' repository
+# via torch.hub (this needs network access the first time it runs).
+model = torch.hub.load('ultralytics/yolov5', 'yolov5n')
+# Open the video file that the detection loop reads frames from
+cap = cv2.VideoCapture('cars.mp4')
+# Initialize the text-to-speech engine used by speak()
+engine = pyttsx3.init()
+# Simulated GPS location (latitude, longitude); a fixed placeholder, not read from a device
+gps_location = (37.7749, -122.4194)  # Example coordinates for San Francisco
+# Speak a message aloud through the module-level text-to-speech engine
+def speak(text: str) -> None:
+    """Queue ``text`` on ``engine`` and run the engine until the queue is processed.
+
+    ``engine.runAndWait()`` does not return until the queued speech has been
+    handled, so the caller is blocked for the duration of the announcement.
+    """
+    engine.say(text)
+    engine.runAndWait()
+# Main loop: read frames, run detection, draw boxes/labels, announce cars, show the frame.
+# speak() blocks until the speech has finished, so announcing every car in every
+# frame would freeze playback. Announce at most once per cooldown window instead.
+ANNOUNCE_COOLDOWN_FRAMES = 60
+# Start at the cooldown value so the first detected car is announced immediately.
+frames_since_announcement = ANNOUNCE_COOLDOWN_FRAMES
+# Smallest y at which a label is still readable when a box touches the top edge.
+MIN_LABEL_Y = 20
+try:
+    if not cap.isOpened():
+        # Fail loudly instead of silently falling through an empty loop.
+        raise SystemExit("Could not open the video source")
+    while True:
+        ret, img = cap.read()
+        if not ret:
+            # No frame was returned: end of the video or a read failure.
+            break
+        frames_since_announcement += 1
+        # Perform detection on the image
+        result = model(img)
+        print('result: ', result)
+        # Convert detected result to pandas DataFrame (one row per detection)
+        data_frame = result.pandas().xyxy[0]
+        print('data_frame:')
+        print(data_frame)
+        car_in_frame = False
+        for row in data_frame.itertuples(index=False):
+            # Top-left and bottom-right corners of the bounding box
+            x1, y1 = int(row.xmin), int(row.ymin)
+            x2, y2 = int(row.xmax), int(row.ymax)
+            # Label name and confidence score
+            label = row.name
+            conf = row.confidence
+            text = f"{label} {conf:.2f}"
+            # Draw bounding box and label on the image; clamp the label's
+            # baseline so it is not drawn above the top of the frame.
+            cv2.rectangle(img, (x1, y1), (x2, y2), (255, 255, 0), 2)
+            cv2.putText(img, text, (x1, max(y1 - 5, MIN_LABEL_Y)), cv2.FONT_HERSHEY_PLAIN, 2, (255, 255, 0), 2)
+            # Context-aware actions based on detected objects
+            if label == "car" and conf > 0.5:
+                car_in_frame = True
+                # Here you can add more context-based features (e.g., alerting, saving data, etc.)
+        if car_in_frame and frames_since_announcement >= ANNOUNCE_COOLDOWN_FRAMES:
+            # Announce detected car and GPS location (rate-limited, once per frame at most)
+            speak(f"Car detected at GPS location: {gps_location[0]}, {gps_location[1]}")
+            frames_since_announcement = 0
+        # Display GPS coordinates on the image
+        gps_text = f"GPS: {gps_location[0]:.4f}, {gps_location[1]:.4f}"
+        cv2.putText(img, gps_text, (10, 30), cv2.FONT_HERSHEY_PLAIN, 1, (0, 255, 0), 2)
+        # Show the processed image; press 'q' to quit
+        cv2.imshow('IMAGE', img)
+        if cv2.waitKey(1) & 0xFF == ord('q'):
+            break
+finally:
+    # Release resources even if detection or drawing raised an exception
+    cap.release()
+    cv2.destroyAllWindows()