# Streamlit app: camera capture -> DETR object detection -> text summary -> speech (gTTS).
import numpy as np
import streamlit as st
import torch
from gtts import gTTS
from PIL import Image
from transformers import DetrImageProcessor, DetrForObjectDetection
# Load the model and processor.
# Cached so Streamlit reruns (every widget interaction) do not re-download and
# re-instantiate the model — the original comment promised caching but had no decorator.
@st.cache_resource
def load_model():
    """Load and cache the pretrained DETR processor and model.

    Returns:
        tuple: (DetrImageProcessor, DetrForObjectDetection) for
        "facebook/detr-resnet-50".
    """
    processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
    model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
    return processor, model
# Function to detect objects in the image.
def detect_objects(image, processor, model, threshold=0.9):
    """Run DETR object detection on a PIL image.

    Args:
        image: PIL.Image to analyze.
        processor: DetrImageProcessor for pre/post-processing.
        model: DetrForObjectDetection instance.
        threshold: minimum confidence score to keep a detection
            (default 0.9, matching the original hard-coded cutoff).

    Returns:
        tuple: (list of integer class-label ids, post-processed results dict
        with "scores", "labels", and "boxes" tensors).
    """
    # Preprocess the image and run inference without gradient tracking.
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # PIL's .size is (width, height); post-processing expects (height, width).
    target_sizes = torch.tensor([image.size[::-1]])
    # post_process_object_detection already filters by `threshold`, so the
    # original's second `score > 0.9` pass was redundant and is removed.
    results = processor.post_process_object_detection(
        outputs, target_sizes=target_sizes, threshold=threshold
    )[0]
    detected_objects = [label.item() for label in results["labels"]]
    return detected_objects, results
# Function to convert label IDs to class names.
def get_object_names(class_ids):
    """Map DETR/COCO label ids to human-readable class names.

    DETR ("facebook/detr-resnet-50") emits labels in COCO's original 91-id
    space, which contains unused gap ids. The original 81-entry list omitted
    those gaps, so every id >= 12 mapped to the wrong name (e.g. 13 "stop sign"
    became "parking meter") and high ids raised IndexError.

    Args:
        class_ids: iterable of integer label ids.

    Returns:
        list[str]: one name per id; "unknown" for out-of-range ids.
    """
    COCO_INSTANCE_CATEGORY_NAMES = [
        "N/A", "person", "bicycle", "car", "motorcycle", "airplane",
        "bus", "train", "truck", "boat", "traffic light", "fire hydrant",
        "N/A", "stop sign", "parking meter", "bench", "bird", "cat", "dog",
        "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
        "N/A", "backpack", "umbrella", "N/A", "N/A", "handbag", "tie",
        "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite",
        "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "N/A", "wine glass", "cup", "fork",
        "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
        "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
        "couch", "potted plant", "bed", "N/A", "dining table", "N/A", "N/A",
        "toilet", "N/A", "TV", "laptop", "mouse", "remote", "keyboard",
        "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator",
        "N/A", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush",
    ]
    # Guard against ids outside the table instead of raising IndexError.
    return [
        COCO_INSTANCE_CATEGORY_NAMES[i]
        if 0 <= i < len(COCO_INSTANCE_CATEGORY_NAMES)
        else "unknown"
        for i in class_ids
    ]
# Mock summarization function.
def generate_summary(relevant_objects):
    """Build a short natural-language summary of the detected object names.

    Fixes the original f-string, which produced ungrammatical text for most
    counts ("This is an 3-item scene...") and a dangling "including: ." when
    nothing was detected.

    Args:
        relevant_objects: list of object-name strings (may be empty).

    Returns:
        str: a one-sentence summary.
    """
    st.write("Generating summary for relevant objects...")
    if not relevant_objects:
        return "No objects were detected in the scene."
    noun = "item" if len(relevant_objects) == 1 else "items"
    return f"This scene contains {len(relevant_objects)} {noun}: {', '.join(relevant_objects)}."
# Mock text-to-speech function.
def text_to_speech(text):
    """Synthesize `text` with gTTS, write it to an mp3, and play it in the app.

    Side effects: writes "summary.mp3" into the working directory and renders
    a Streamlit audio player for it.
    """
    st.write("Converting summary to speech...")
    audio_path = "summary.mp3"
    gTTS(text).save(audio_path)
    st.audio(audio_path)
# Streamlit app main function.
def main():
    """App entry point: capture a photo, detect objects, summarize, and speak."""
    st.title("Context-Aware Object Detection App with Hugging Face")

    processor, model = load_model()

    # Step 1: capture an image from the user's camera; bail out early until
    # the user has actually taken a picture.
    captured_image = st.camera_input("Take a picture")
    if captured_image is None:
        return

    image = Image.open(captured_image)
    # NOTE(review): use_column_width is deprecated in recent Streamlit
    # releases — confirm the deployed version before switching to
    # use_container_width.
    st.image(image, caption="Captured Image", use_column_width=True)

    # Step 2: run detection and map label ids to readable names.
    detected_ids, results = detect_objects(image, processor, model)
    detected_objects = get_object_names(detected_ids)
    st.write(f"Detected Objects: {detected_objects}")

    # Step 3: generate a textual summary of what was seen.
    summary = generate_summary(detected_objects)
    st.write(f"Summary: {summary}")

    # Step 4: read the summary aloud.
    text_to_speech(summary)


if __name__ == "__main__":
    main()