Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,56 +1,56 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
-
from PIL import Image
|
| 3 |
-
import cv2
|
| 4 |
import numpy as np
|
|
|
|
|
|
|
| 5 |
from gtts import gTTS
|
| 6 |
import os
|
| 7 |
|
| 8 |
-
# Load
|
|
|
|
| 9 |
def load_model():
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
output_layers = [layer_names[i[0] - 1] for i in net.getUnconnectedOutLayers()]
|
| 14 |
-
return net, output_layers
|
| 15 |
|
| 16 |
-
#
|
| 17 |
-
def detect_objects(image,
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
net.setInput(blob)
|
| 23 |
-
outputs = net.forward(output_layers)
|
| 24 |
|
| 25 |
# Process the outputs
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
for detection in out:
|
| 29 |
-
scores = detection[5:] # Get the scores for detected objects
|
| 30 |
-
class_id = np.argmax(scores)
|
| 31 |
-
confidence = scores[class_id]
|
| 32 |
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
|
|
|
| 36 |
|
| 37 |
-
return detected_objects
|
| 38 |
|
| 39 |
-
#
|
| 40 |
def get_object_names(class_ids):
|
| 41 |
-
# Sample mapping (extend this according to your
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
# Mock summarization function
|
| 56 |
def generate_summary(relevant_objects):
|
|
@@ -65,17 +65,12 @@ def text_to_speech(text):
|
|
| 65 |
tts.save("summary.mp3")
|
| 66 |
st.audio("summary.mp3")
|
| 67 |
|
| 68 |
-
# Mock GPS navigation function
|
| 69 |
-
def get_distance_to_object(address):
|
| 70 |
-
st.write(f"Calculating distance to address: {address}")
|
| 71 |
-
return "5 km", "15 mins"
|
| 72 |
-
|
| 73 |
# Streamlit app main function
|
| 74 |
def main():
|
| 75 |
-
st.title("Context-Aware Object Detection App")
|
| 76 |
|
| 77 |
-
# Load
|
| 78 |
-
|
| 79 |
|
| 80 |
# Step 1: Capture Image from Camera
|
| 81 |
captured_image = st.camera_input("Take a picture")
|
|
@@ -83,31 +78,19 @@ def main():
|
|
| 83 |
if captured_image is not None:
|
| 84 |
# Open the captured image
|
| 85 |
image = Image.open(captured_image)
|
| 86 |
-
image_np = np.array(image) # Convert PIL image to numpy array
|
| 87 |
st.image(image, caption="Captured Image", use_column_width=True)
|
| 88 |
|
| 89 |
# Step 2: Detect Objects
|
| 90 |
-
detected_ids = detect_objects(
|
| 91 |
detected_objects = get_object_names(detected_ids)
|
| 92 |
st.write(f"Detected Objects: {detected_objects}")
|
| 93 |
|
| 94 |
-
# Step 3:
|
| 95 |
-
|
| 96 |
-
relevant_objects = filter_relevant_objects(detected_objects, setting)
|
| 97 |
-
st.write(f"Relevant Objects: {relevant_objects}")
|
| 98 |
-
|
| 99 |
-
# Step 4: Generate Summary
|
| 100 |
-
summary = generate_summary(relevant_objects)
|
| 101 |
st.write(f"Summary: {summary}")
|
| 102 |
|
| 103 |
-
# Step
|
| 104 |
text_to_speech(summary)
|
| 105 |
|
| 106 |
-
# Step 6: GPS Navigation (simulated)
|
| 107 |
-
address = st.text_input("Enter Object's Address", "1600 Amphitheatre Parkway, Mountain View, CA")
|
| 108 |
-
if st.button("Get Distance to Object"):
|
| 109 |
-
distance, duration = get_distance_to_object(address)
|
| 110 |
-
st.write(f"Distance to Object: {distance}, Duration: {duration}")
|
| 111 |
-
|
| 112 |
if __name__ == "__main__":
|
| 113 |
main()
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
+
from PIL import Image, ImageDraw
|
|
|
|
| 3 |
import numpy as np
|
| 4 |
+
import torch
|
| 5 |
+
from transformers import DetrImageProcessor, DetrForObjectDetection
|
| 6 |
from gtts import gTTS
|
| 7 |
import os
|
| 8 |
|
| 9 |
+
# Load the DETR processor and model once per process; Streamlit re-runs the
# whole script on every interaction, so caching avoids re-instantiating them.
@st.cache_resource  # Cache the model to improve performance
def load_model():
    """Return a (processor, model) pair for facebook/detr-resnet-50 detection."""
    checkpoint = "facebook/detr-resnet-50"
    image_processor = DetrImageProcessor.from_pretrained(checkpoint)
    detector = DetrForObjectDetection.from_pretrained(checkpoint)
    return image_processor, detector
|
|
|
|
|
|
|
| 15 |
|
| 16 |
+
# Function to detect objects in the image
def detect_objects(image, processor, model, threshold=0.9):
    """Run DETR object detection on a PIL image.

    Args:
        image: PIL.Image to analyze.
        processor: DetrImageProcessor returned by load_model().
        model: DetrForObjectDetection returned by load_model().
        threshold: minimum confidence for a detection to be kept
            (default 0.9, matching the previously hard-coded value).

    Returns:
        (label_ids, results): a list of integer class-label ids for each
        confident detection, plus the raw post-processed results dict
        (scores/labels/boxes tensors) for callers that need boxes.
    """
    # Preprocess the image and run the model without tracking gradients.
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    # Process the outputs.
    # PIL reports (width, height); post-processing expects (height, width).
    target_sizes = torch.tensor([image.size[::-1]])
    # post_process_object_detection already discards detections with
    # score <= threshold, so the old per-item `if score > 0.9` re-check
    # was redundant and has been folded into this single call.
    results = processor.post_process_object_detection(
        outputs, target_sizes=target_sizes, threshold=threshold
    )[0]

    detected_objects = [label.item() for label in results["labels"]]
    return detected_objects, results
|
| 33 |
|
| 34 |
+
# Function to convert label IDs to class names
def get_object_names(class_ids):
    """Map DETR/COCO integer label ids to human-readable class names.

    DETR (facebook/detr-resnet-50) emits labels in the original 91-id COCO
    numbering, which contains gaps (ids that were never annotated). The
    lookup table below keeps "N/A" placeholders at those gaps so that e.g.
    id 13 correctly maps to "stop sign"; the previous table omitted the
    placeholders, silently shifting every name after id 11 and raising
    IndexError for the highest ids.

    Args:
        class_ids: iterable of integer label ids.

    Returns:
        List of class-name strings, one per input id. Ids outside the
        table map to "unknown(<id>)" instead of raising IndexError.
    """
    COCO_INSTANCE_CATEGORY_NAMES = [
        "N/A", "person", "bicycle", "car", "motorcycle", "airplane",
        "bus", "train", "truck", "boat", "traffic light", "fire hydrant",
        "N/A", "stop sign", "parking meter", "bench", "bird", "cat", "dog",
        "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
        "N/A", "backpack", "umbrella", "N/A", "N/A", "handbag", "tie",
        "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite",
        "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "N/A", "wine glass", "cup", "fork",
        "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
        "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
        "couch", "potted plant", "bed", "N/A", "dining table", "N/A", "N/A",
        "toilet", "N/A", "TV", "laptop", "mouse", "remote", "keyboard",
        "cell phone", "microwave", "oven", "toaster", "sink",
        "refrigerator", "N/A", "book", "clock", "vase", "scissors",
        "teddy bear", "hair drier", "toothbrush",
    ]
    return [
        COCO_INSTANCE_CATEGORY_NAMES[i]
        if 0 <= i < len(COCO_INSTANCE_CATEGORY_NAMES)
        else f"unknown({i})"
        for i in class_ids
    ]
|
| 54 |
|
| 55 |
# Mock summarization function
|
| 56 |
def generate_summary(relevant_objects):
|
|
|
|
| 65 |
tts.save("summary.mp3")
|
| 66 |
st.audio("summary.mp3")
|
| 67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
# Streamlit app main function
def main():
    """Drive the app: capture an image, detect objects, summarize, and speak."""
    st.title("Context-Aware Object Detection App with Hugging Face")

    # Load model (cached across Streamlit reruns by @st.cache_resource)
    processor, model = load_model()

    # Step 1: Capture Image from Camera
    captured_image = st.camera_input("Take a picture")
    if captured_image is None:
        return  # nothing to do until the user takes a picture

    # Open and display the captured image
    image = Image.open(captured_image)
    st.image(image, caption="Captured Image", use_column_width=True)

    # Step 2: Detect Objects
    detected_ids, results = detect_objects(image, processor, model)
    detected_objects = get_object_names(detected_ids)
    st.write(f"Detected Objects: {detected_objects}")

    # Step 3: Generate Summary
    summary = generate_summary(detected_objects)
    st.write(f"Summary: {summary}")

    # Step 4: Convert Summary to Speech
    text_to_speech(summary)


if __name__ == "__main__":
    main()
|