Reaper200 committed on
Commit
3b94370
·
verified ·
1 Parent(s): e988fbf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -64
app.py CHANGED
@@ -1,56 +1,56 @@
1
  import streamlit as st
2
- from PIL import Image
3
- import cv2
4
  import numpy as np
 
 
5
  from gtts import gTTS
6
  import os
7
 
8
- # Load pre-trained model and classes
 
9
  def load_model():
10
- # Load YOLO model from OpenCV
11
- net = cv2.dnn.readNet("yolov3.weights", "yolov3.cfg") # Ensure these files are in the working directory
12
- layer_names = net.getLayerNames()
13
- output_layers = [layer_names[i[0] - 1] for i in net.getUnconnectedOutLayers()]
14
- return net, output_layers
15
 
16
- # Object detection function
17
- def detect_objects(image, net, output_layers):
18
- height, width, _ = image.shape
19
-
20
- # Prepare the image for detection
21
- blob = cv2.dnn.blobFromImage(image, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
22
- net.setInput(blob)
23
- outputs = net.forward(output_layers)
24
 
25
  # Process the outputs
26
- detected_objects = []
27
- for out in outputs:
28
- for detection in out:
29
- scores = detection[5:] # Get the scores for detected objects
30
- class_id = np.argmax(scores)
31
- confidence = scores[class_id]
32
 
33
- # Filter out weak predictions
34
- if confidence > 0.5: # Adjust threshold as needed
35
- detected_objects.append(class_id)
 
36
 
37
- return detected_objects
38
 
39
- # Mock function to convert class IDs to object names
40
  def get_object_names(class_ids):
41
- # Sample mapping (extend this according to your class IDs)
42
- class_names = {0: "person", 1: "bicycle", 2: "car", 3: "motorcycle", 4: "airplane",
43
- 5: "bus", 6: "train", 7: "truck", 8: "boat", 9: "traffic light",
44
- 10: "fire hydrant", 11: "stop sign", 12: "parking meter", 13: "bench",
45
- 14: "bird", 15: "cat", 16: "dog", 17: "horse", 18: "sheep", 19: "cow"}
46
- return [class_names[id] for id in class_ids if id in class_names]
47
-
48
- # Mock context-aware filter function
49
- def filter_relevant_objects(detected_objects, setting):
50
- st.write(f"Filtering relevant objects for setting: {setting}")
51
- if setting == "indoor":
52
- return [obj for obj in detected_objects if obj in ["table", "lamp"]]
53
- return detected_objects
 
 
 
 
 
54
 
55
  # Mock summarization function
56
  def generate_summary(relevant_objects):
@@ -65,17 +65,12 @@ def text_to_speech(text):
65
  tts.save("summary.mp3")
66
  st.audio("summary.mp3")
67
 
68
- # Mock GPS navigation function
69
- def get_distance_to_object(address):
70
- st.write(f"Calculating distance to address: {address}")
71
- return "5 km", "15 mins"
72
-
73
  # Streamlit app main function
74
  def main():
75
- st.title("Context-Aware Object Detection App")
76
 
77
- # Load the YOLO model
78
- net, output_layers = load_model()
79
 
80
  # Step 1: Capture Image from Camera
81
  captured_image = st.camera_input("Take a picture")
@@ -83,31 +78,19 @@ def main():
83
  if captured_image is not None:
84
  # Open the captured image
85
  image = Image.open(captured_image)
86
- image_np = np.array(image) # Convert PIL image to numpy array
87
  st.image(image, caption="Captured Image", use_column_width=True)
88
 
89
  # Step 2: Detect Objects
90
- detected_ids = detect_objects(image_np, net, output_layers)
91
  detected_objects = get_object_names(detected_ids)
92
  st.write(f"Detected Objects: {detected_objects}")
93
 
94
- # Step 3: Filter Relevant Objects
95
- setting = st.selectbox("Select Setting", ["indoor", "outdoor"], index=0)
96
- relevant_objects = filter_relevant_objects(detected_objects, setting)
97
- st.write(f"Relevant Objects: {relevant_objects}")
98
-
99
- # Step 4: Generate Summary
100
- summary = generate_summary(relevant_objects)
101
  st.write(f"Summary: {summary}")
102
 
103
- # Step 5: Convert Summary to Speech
104
  text_to_speech(summary)
105
 
106
- # Step 6: GPS Navigation (simulated)
107
- address = st.text_input("Enter Object's Address", "1600 Amphitheatre Parkway, Mountain View, CA")
108
- if st.button("Get Distance to Object"):
109
- distance, duration = get_distance_to_object(address)
110
- st.write(f"Distance to Object: {distance}, Duration: {duration}")
111
-
112
  if __name__ == "__main__":
113
  main()
 
1
  import streamlit as st
2
+ from PIL import Image, ImageDraw
 
3
  import numpy as np
4
+ import torch
5
+ from transformers import DetrImageProcessor, DetrForObjectDetection
6
  from gtts import gTTS
7
  import os
8
 
9
# Load the model and processor
@st.cache_resource  # Cache the model to improve performance
def load_model():
    """Load the DETR detector and its matching image processor.

    Returns:
        tuple: (processor, model) — a pretrained ``DetrImageProcessor`` and
        ``DetrForObjectDetection`` from the "facebook/detr-resnet-50" checkpoint.
    """
    # NOTE(review): the first call downloads weights from the Hugging Face
    # Hub (needs network access); st.cache_resource reuses them afterwards.
    checkpoint = "facebook/detr-resnet-50"
    image_processor = DetrImageProcessor.from_pretrained(checkpoint)
    detector = DetrForObjectDetection.from_pretrained(checkpoint)
    return image_processor, detector
 
 
15
 
16
# Function to detect objects in the image
def detect_objects(image, processor, model, threshold=0.9):
    """Run DETR object detection on a PIL image.

    Args:
        image: PIL.Image to analyze (the processor handles resizing and
            normalization).
        processor: DetrImageProcessor matching ``model``.
        model: DetrForObjectDetection instance.
        threshold: confidence cutoff in [0, 1]; detections scoring at or
            below it are dropped (default 0.9, the previous hard-coded value).

    Returns:
        tuple: (detected_objects, results) — a list of integer label ids,
        and the raw post-processed dict with "scores", "labels" and "boxes".
    """
    # Preprocess the image and make predictions
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():  # inference only — no gradients needed
        outputs = model(**inputs)

    # Map predictions back onto the original image size.
    # PIL reports (width, height); the processor expects (height, width).
    target_sizes = torch.tensor([image.size[::-1]])
    # post_process_object_detection already discards detections with
    # score <= threshold, so the old second `score > 0.9` check was
    # redundant and has been removed.
    results = processor.post_process_object_detection(
        outputs, target_sizes=target_sizes, threshold=threshold
    )[0]

    detected_objects = [label.item() for label in results["labels"]]

    return detected_objects, results
33
 
34
# Function to convert label IDs to class names
def get_object_names(class_ids):
    """Translate integer label ids into human-readable class names.

    Args:
        class_ids: iterable of integer label ids produced by the detector.

    Returns:
        list[str]: matching class names, in input order. Ids outside the
        table are silently skipped instead of raising IndexError — DETR's
        checkpoint can emit sparse COCO ids above 80 (e.g. 84), which the
        original direct indexing would crash on. Skipping unknown ids also
        matches the behavior of the earlier dict-based implementation.
    """
    # Sample mapping (extend this according to your model's output labels)
    # NOTE(review): DETR uses the sparse 91-id COCO scheme (see
    # model.config.id2label); this contiguous 81-entry table only covers
    # ids 0-80 — confirm names above "cow" against the model config.
    COCO_INSTANCE_CATEGORY_NAMES = [
        "N/A", "person", "bicycle", "car", "motorcycle", "airplane",
        "bus", "train", "truck", "boat", "traffic light", "fire hydrant",
        "stop sign", "parking meter", "bench", "bird", "cat", "dog",
        "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
        "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat",
        "baseball glove", "skateboard", "surfboard", "tennis racket",
        "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
        "banana", "apple", "sandwich", "orange", "broccoli", "carrot",
        "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant",
        "bed", "dining table", "toilet", "TV", "laptop", "mouse", "remote",
        "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
        "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush"
    ]
    return [
        COCO_INSTANCE_CATEGORY_NAMES[id]
        for id in class_ids
        if 0 <= id < len(COCO_INSTANCE_CATEGORY_NAMES)
    ]
54
 
55
  # Mock summarization function
56
  def generate_summary(relevant_objects):
 
65
  tts.save("summary.mp3")
66
  st.audio("summary.mp3")
67
 
 
 
 
 
 
68
# Streamlit app main function
def main():
    """Drive the Streamlit UI: capture a photo, detect the objects in it,
    and read a spoken summary of what was found."""
    st.title("Context-Aware Object Detection App with Hugging Face")

    # Load model (cached across reruns by st.cache_resource)
    processor, model = load_model()

    # Step 1: Capture Image from Camera
    snapshot = st.camera_input("Take a picture")
    if snapshot is None:
        return  # nothing captured yet — wait for the next rerun

    # Open the captured image and echo it back to the user
    photo = Image.open(snapshot)
    st.image(photo, caption="Captured Image", use_column_width=True)

    # Step 2: Detect Objects
    label_ids, _results = detect_objects(photo, processor, model)
    object_names = get_object_names(label_ids)
    st.write(f"Detected Objects: {object_names}")

    # Step 3: Generate Summary
    summary = generate_summary(object_names)
    st.write(f"Summary: {summary}")

    # Step 4: Convert Summary to Speech
    text_to_speech(summary)


if __name__ == "__main__":
    main()