Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,56 +1,56 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
-
from PIL import Image
|
| 3 |
-
import cv2
|
| 4 |
import numpy as np
|
|
|
|
|
|
|
| 5 |
from gtts import gTTS
|
| 6 |
import os
|
| 7 |
|
| 8 |
-
# Load
|
|
|
|
| 9 |
def load_model():
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
output_layers = [layer_names[i[0] - 1] for i in net.getUnconnectedOutLayers()]
|
| 14 |
-
return net, output_layers
|
| 15 |
|
| 16 |
-
#
|
| 17 |
-
def detect_objects(image,
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
net.setInput(blob)
|
| 23 |
-
outputs = net.forward(output_layers)
|
| 24 |
|
| 25 |
# Process the outputs
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
for detection in out:
|
| 29 |
-
scores = detection[5:] # Get the scores for detected objects
|
| 30 |
-
class_id = np.argmax(scores)
|
| 31 |
-
confidence = scores[class_id]
|
| 32 |
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
|
|
|
| 36 |
|
| 37 |
-
return detected_objects
|
| 38 |
|
| 39 |
-
#
|
| 40 |
def get_object_names(class_ids):
|
| 41 |
-
# Sample mapping (extend this according to your
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
# Mock summarization function
|
| 56 |
def generate_summary(relevant_objects):
|
|
@@ -65,17 +65,12 @@ def text_to_speech(text):
|
|
| 65 |
tts.save("summary.mp3")
|
| 66 |
st.audio("summary.mp3")
|
| 67 |
|
| 68 |
-
# Mock GPS navigation function
|
| 69 |
-
def get_distance_to_object(address):
|
| 70 |
-
st.write(f"Calculating distance to address: {address}")
|
| 71 |
-
return "5 km", "15 mins"
|
| 72 |
-
|
| 73 |
# Streamlit app main function
|
| 74 |
def main():
|
| 75 |
-
st.title("Context-Aware Object Detection App")
|
| 76 |
|
| 77 |
-
# Load
|
| 78 |
-
|
| 79 |
|
| 80 |
# Step 1: Capture Image from Camera
|
| 81 |
captured_image = st.camera_input("Take a picture")
|
|
@@ -83,31 +78,19 @@ def main():
|
|
| 83 |
if captured_image is not None:
|
| 84 |
# Open the captured image
|
| 85 |
image = Image.open(captured_image)
|
| 86 |
-
image_np = np.array(image) # Convert PIL image to numpy array
|
| 87 |
st.image(image, caption="Captured Image", use_column_width=True)
|
| 88 |
|
| 89 |
# Step 2: Detect Objects
|
| 90 |
-
detected_ids = detect_objects(
|
| 91 |
detected_objects = get_object_names(detected_ids)
|
| 92 |
st.write(f"Detected Objects: {detected_objects}")
|
| 93 |
|
| 94 |
-
# Step 3:
|
| 95 |
-
|
| 96 |
-
relevant_objects = filter_relevant_objects(detected_objects, setting)
|
| 97 |
-
st.write(f"Relevant Objects: {relevant_objects}")
|
| 98 |
-
|
| 99 |
-
# Step 4: Generate Summary
|
| 100 |
-
summary = generate_summary(relevant_objects)
|
| 101 |
st.write(f"Summary: {summary}")
|
| 102 |
|
| 103 |
-
# Step
|
| 104 |
text_to_speech(summary)
|
| 105 |
|
| 106 |
-
# Step 6: GPS Navigation (simulated)
|
| 107 |
-
address = st.text_input("Enter Object's Address", "1600 Amphitheatre Parkway, Mountain View, CA")
|
| 108 |
-
if st.button("Get Distance to Object"):
|
| 109 |
-
distance, duration = get_distance_to_object(address)
|
| 110 |
-
st.write(f"Distance to Object: {distance}, Duration: {duration}")
|
| 111 |
-
|
| 112 |
if __name__ == "__main__":
|
| 113 |
main()
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
+
from PIL import Image, ImageDraw
|
|
|
|
| 3 |
import numpy as np
|
| 4 |
+
import torch
|
| 5 |
+
from transformers import DetrImageProcessor, DetrForObjectDetection
|
| 6 |
from gtts import gTTS
|
| 7 |
import os
|
| 8 |
|
| 9 |
+
# Load the DETR processor and model once per process; Streamlit re-runs the
# whole script on every interaction, so caching avoids re-instantiating them.
@st.cache_resource  # Cache the model to improve performance
def load_model():
    """Return a (processor, model) pair for facebook/detr-resnet-50 detection."""
    checkpoint = "facebook/detr-resnet-50"
    image_processor = DetrImageProcessor.from_pretrained(checkpoint)
    detector = DetrForObjectDetection.from_pretrained(checkpoint)
    return image_processor, detector
|
|
|
|
|
|
|
| 15 |
|
| 16 |
+
# Function to detect objects in the image
def detect_objects(image, processor, model, threshold=0.9):
    """Run DETR object detection on a PIL image.

    Args:
        image: PIL.Image to analyze.
        processor: DetrImageProcessor returned by load_model().
        model: DetrForObjectDetection returned by load_model().
        threshold: minimum confidence for a detection to be kept
            (default 0.9, matching the previously hard-coded value).

    Returns:
        (label_ids, results): a list of integer class-label ids for each
        confident detection, plus the raw post-processed results dict
        (scores/labels/boxes tensors) for callers that need boxes.
    """
    # Preprocess the image and run the model without tracking gradients.
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    # Process the outputs.
    # PIL reports (width, height); post-processing expects (height, width).
    target_sizes = torch.tensor([image.size[::-1]])
    # post_process_object_detection already discards detections with
    # score <= threshold, so the old per-item `if score > 0.9` re-check
    # was redundant and has been folded into this single call.
    results = processor.post_process_object_detection(
        outputs, target_sizes=target_sizes, threshold=threshold
    )[0]

    detected_objects = [label.item() for label in results["labels"]]
    return detected_objects, results
|
| 33 |
|
| 34 |
+
# Function to convert label IDs to class names
def get_object_names(class_ids):
    """Map DETR/COCO integer label ids to human-readable class names.

    DETR (facebook/detr-resnet-50) emits labels in the original 91-id COCO
    numbering, which contains gaps (ids that were never annotated). The
    lookup table below keeps "N/A" placeholders at those gaps so that e.g.
    id 13 correctly maps to "stop sign"; the previous table omitted the
    placeholders, silently shifting every name after id 11 and raising
    IndexError for the highest ids.

    Args:
        class_ids: iterable of integer label ids.

    Returns:
        List of class-name strings, one per input id. Ids outside the
        table map to "unknown(<id>)" instead of raising IndexError.
    """
    COCO_INSTANCE_CATEGORY_NAMES = [
        "N/A", "person", "bicycle", "car", "motorcycle", "airplane",
        "bus", "train", "truck", "boat", "traffic light", "fire hydrant",
        "N/A", "stop sign", "parking meter", "bench", "bird", "cat", "dog",
        "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
        "N/A", "backpack", "umbrella", "N/A", "N/A", "handbag", "tie",
        "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite",
        "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "N/A", "wine glass", "cup", "fork",
        "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
        "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
        "couch", "potted plant", "bed", "N/A", "dining table", "N/A", "N/A",
        "toilet", "N/A", "TV", "laptop", "mouse", "remote", "keyboard",
        "cell phone", "microwave", "oven", "toaster", "sink",
        "refrigerator", "N/A", "book", "clock", "vase", "scissors",
        "teddy bear", "hair drier", "toothbrush",
    ]
    return [
        COCO_INSTANCE_CATEGORY_NAMES[i]
        if 0 <= i < len(COCO_INSTANCE_CATEGORY_NAMES)
        else f"unknown({i})"
        for i in class_ids
    ]
|
| 54 |
|
| 55 |
# Mock summarization function
|
| 56 |
def generate_summary(relevant_objects):
|
|
|
|
| 65 |
tts.save("summary.mp3")
|
| 66 |
st.audio("summary.mp3")
|
| 67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
# Streamlit app main function
def main():
    """Drive the app: capture an image, detect objects, summarize, and speak."""
    st.title("Context-Aware Object Detection App with Hugging Face")

    # Load model (cached across Streamlit reruns by @st.cache_resource)
    processor, model = load_model()

    # Step 1: Capture Image from Camera
    captured_image = st.camera_input("Take a picture")
    if captured_image is None:
        return  # nothing to do until the user takes a picture

    # Open and display the captured image
    image = Image.open(captured_image)
    st.image(image, caption="Captured Image", use_column_width=True)

    # Step 2: Detect Objects
    detected_ids, results = detect_objects(image, processor, model)
    detected_objects = get_object_names(detected_ids)
    st.write(f"Detected Objects: {detected_objects}")

    # Step 3: Generate Summary
    summary = generate_summary(detected_objects)
    st.write(f"Summary: {summary}")

    # Step 4: Convert Summary to Speech
    text_to_speech(summary)


if __name__ == "__main__":
    main()
|