# Source: Hugging Face Space file "app.py" (revision 05dfa7e).
# The raw-file page header ("Update app.py / raw / history blame / 3.77 kB")
# was extraction residue, commented out here so the file parses as Python.
import streamlit as st
from PIL import Image
import numpy as np
import torch
from transformers import DetrImageProcessor, DetrForObjectDetection
from gtts import gTTS
# Load the DETR processor and model once per session.
@st.cache_resource  # Cache the model to improve performance
def load_model():
    """Return a (processor, model) pair for DETR object detection.

    Downloads (or reuses the local cache of) the ``facebook/detr-resnet-50``
    checkpoint. Streamlit's ``cache_resource`` keeps one shared instance
    alive across reruns, so the heavy load happens only once.
    """
    checkpoint = "facebook/detr-resnet-50"
    processor = DetrImageProcessor.from_pretrained(checkpoint)
    model = DetrForObjectDetection.from_pretrained(checkpoint)
    return processor, model
# Function to detect objects in the image
def detect_objects(image, processor, model, threshold=0.9):
    """Run DETR object detection on a PIL image.

    Args:
        image: PIL.Image to analyze.
        processor: DetrImageProcessor used for pre- and post-processing.
        model: DetrForObjectDetection model.
        threshold: confidence cutoff for keeping detections (default 0.9,
            matching the original hard-coded value).

    Returns:
        A ``(detected_objects, results)`` tuple: a list of integer label ids
        for each kept detection, and the full post-processed result dict
        (``scores`` / ``labels`` / ``boxes``) for the image.
    """
    # Preprocess the image into model tensors.
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():  # inference only; no gradients needed
        outputs = model(**inputs)
    # PIL reports (width, height); post-processing expects (height, width).
    target_sizes = torch.tensor([image.size[::-1]])
    # post_process_object_detection already drops detections with
    # score <= threshold, so the original per-score re-check (score > 0.9)
    # was redundant and has been removed.
    results = processor.post_process_object_detection(
        outputs, target_sizes=target_sizes, threshold=threshold
    )[0]
    detected_objects = [label.item() for label in results["labels"]]
    return detected_objects, results
# Function to convert label IDs to class names
def get_object_names(class_ids):
    """Map DETR/COCO label ids to human-readable class names.

    ``facebook/detr-resnet-50`` emits labels in the original 91-id COCO
    scheme, which contains unused gap ids (12, 26, 29, 30, 45, 66, 68, 69,
    71, 83). The previous table omitted those gaps, so every name from
    "stop sign" (id 13) onward was shifted, and ids >= 82 raised IndexError.
    This table restores the gaps as "N/A" placeholders.

    Args:
        class_ids: iterable of integer label ids.

    Returns:
        A list of class-name strings; out-of-range ids map to "N/A".
    """
    COCO_INSTANCE_CATEGORY_NAMES = [
        "N/A", "person", "bicycle", "car", "motorcycle", "airplane",
        "bus", "train", "truck", "boat", "traffic light", "fire hydrant",
        "N/A", "stop sign", "parking meter", "bench", "bird", "cat", "dog",
        "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
        "N/A", "backpack", "umbrella", "N/A", "N/A", "handbag", "tie",
        "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite",
        "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "N/A", "wine glass", "cup", "fork",
        "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
        "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
        "couch", "potted plant", "bed", "N/A", "dining table", "N/A", "N/A",
        "toilet", "N/A", "TV", "laptop", "mouse", "remote", "keyboard",
        "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator",
        "N/A", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush",
    ]
    return [
        COCO_INSTANCE_CATEGORY_NAMES[i]
        if 0 <= i < len(COCO_INSTANCE_CATEGORY_NAMES)
        else "N/A"  # guard: ids outside the table no longer crash
        for i in class_ids
    ]
# Mock summarization function
def generate_summary(relevant_objects):
    """Build a one-sentence English summary of the detected objects.

    Fixes two defects in the original: the hard-coded article produced
    ungrammatical text ("This is an 5-item scene ..."), and an empty
    detection list produced "including: ." with a dangling comma-join.

    Args:
        relevant_objects: list of object-name strings.

    Returns:
        A human-readable summary string.
    """
    st.write("Generating summary for relevant objects...")
    if not relevant_objects:
        return "No relevant objects were detected in this scene."
    count = len(relevant_objects)
    noun = "item" if count == 1 else "items"
    # Phrased to avoid the a/an article problem in front of a numeral.
    summary = f"This scene contains {count} {noun}: {', '.join(relevant_objects)}."
    return summary
# Mock text-to-speech function
def text_to_speech(text):
    """Speak *text* in the app: synthesize an MP3 with gTTS and play it.

    Side effects: writes ``summary.mp3`` in the working directory and
    renders an audio player in the Streamlit page.
    """
    st.write("Converting summary to speech...")
    audio_path = "summary.mp3"
    gTTS(text).save(audio_path)
    st.audio(audio_path)
# Streamlit app main function
def main():
    """Streamlit entry point: capture a photo, detect objects, summarize, speak."""
    st.title("Context-Aware Object Detection App with Hugging Face")

    # Load model (cached across reruns).
    processor, model = load_model()

    # Step 1: Capture Image from Camera
    captured_image = st.camera_input("Take a picture")
    if captured_image is None:
        return  # nothing more to do until the user takes a picture

    image = Image.open(captured_image)
    st.image(image, caption="Captured Image", use_column_width=True)

    # Step 2: Detect Objects
    detected_ids, results = detect_objects(image, processor, model)
    detected_objects = get_object_names(detected_ids)
    st.write(f"Detected Objects: {detected_objects}")

    # Step 3: Generate Summary
    summary = generate_summary(detected_objects)
    st.write(f"Summary: {summary}")

    # Step 4: Convert Summary to Speech
    text_to_speech(summary)


if __name__ == "__main__":
    main()