# Source: Hugging Face Space file "app.py" (revision 05dfa7e).
# The raw-file page header ("Update app.py / raw / history blame / 3.77 kB")
# was extraction residue, commented out here so the file parses as Python.
import streamlit as st
from PIL import Image
import numpy as np
import torch
from transformers import DetrImageProcessor, DetrForObjectDetection
from gtts import gTTS
# Load the DETR processor and model once per session.
@st.cache_resource  # Cache the model to improve performance
def load_model():
    """Return a (processor, model) pair for DETR object detection.

    Downloads (or reuses the local cache of) the ``facebook/detr-resnet-50``
    checkpoint. Streamlit's ``cache_resource`` keeps one shared instance
    alive across reruns, so the heavy load happens only once.
    """
    checkpoint = "facebook/detr-resnet-50"
    processor = DetrImageProcessor.from_pretrained(checkpoint)
    model = DetrForObjectDetection.from_pretrained(checkpoint)
    return processor, model
# Function to detect objects in the image
def detect_objects(image, processor, model, threshold=0.9):
    """Run DETR object detection on a PIL image.

    Args:
        image: PIL.Image to analyze.
        processor: DetrImageProcessor used for pre- and post-processing.
        model: DetrForObjectDetection model.
        threshold: confidence cutoff for keeping detections (default 0.9,
            matching the original hard-coded value).

    Returns:
        A ``(detected_objects, results)`` tuple: a list of integer label ids
        for each kept detection, and the full post-processed result dict
        (``scores`` / ``labels`` / ``boxes``) for the image.
    """
    # Preprocess the image into model tensors.
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():  # inference only; no gradients needed
        outputs = model(**inputs)
    # PIL reports (width, height); post-processing expects (height, width).
    target_sizes = torch.tensor([image.size[::-1]])
    # post_process_object_detection already drops detections with
    # score <= threshold, so the original per-score re-check (score > 0.9)
    # was redundant and has been removed.
    results = processor.post_process_object_detection(
        outputs, target_sizes=target_sizes, threshold=threshold
    )[0]
    detected_objects = [label.item() for label in results["labels"]]
    return detected_objects, results
# Function to convert label IDs to class names
def get_object_names(class_ids):
    """Map DETR/COCO label ids to human-readable class names.

    ``facebook/detr-resnet-50`` emits labels in the original 91-id COCO
    scheme, which contains unused gap ids (12, 26, 29, 30, 45, 66, 68, 69,
    71, 83). The previous table omitted those gaps, so every name from
    "stop sign" (id 13) onward was shifted, and ids >= 82 raised IndexError.
    This table restores the gaps as "N/A" placeholders.

    Args:
        class_ids: iterable of integer label ids.

    Returns:
        A list of class-name strings; out-of-range ids map to "N/A".
    """
    COCO_INSTANCE_CATEGORY_NAMES = [
        "N/A", "person", "bicycle", "car", "motorcycle", "airplane",
        "bus", "train", "truck", "boat", "traffic light", "fire hydrant",
        "N/A", "stop sign", "parking meter", "bench", "bird", "cat", "dog",
        "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
        "N/A", "backpack", "umbrella", "N/A", "N/A", "handbag", "tie",
        "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite",
        "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "N/A", "wine glass", "cup", "fork",
        "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
        "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
        "couch", "potted plant", "bed", "N/A", "dining table", "N/A", "N/A",
        "toilet", "N/A", "TV", "laptop", "mouse", "remote", "keyboard",
        "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator",
        "N/A", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush",
    ]
    return [
        COCO_INSTANCE_CATEGORY_NAMES[i]
        if 0 <= i < len(COCO_INSTANCE_CATEGORY_NAMES)
        else "N/A"  # guard: ids outside the table no longer crash
        for i in class_ids
    ]
# Mock summarization function
def generate_summary(relevant_objects):
    """Build a one-sentence English summary of the detected objects.

    Fixes two defects in the original: the hard-coded article produced
    ungrammatical text ("This is an 5-item scene ..."), and an empty
    detection list produced "including: ." with a dangling comma-join.

    Args:
        relevant_objects: list of object-name strings.

    Returns:
        A human-readable summary string.
    """
    st.write("Generating summary for relevant objects...")
    if not relevant_objects:
        return "No relevant objects were detected in this scene."
    count = len(relevant_objects)
    noun = "item" if count == 1 else "items"
    # Phrased to avoid the a/an article problem in front of a numeral.
    summary = f"This scene contains {count} {noun}: {', '.join(relevant_objects)}."
    return summary
# Mock text-to-speech function
def text_to_speech(text):
    """Speak *text* in the app: synthesize an MP3 with gTTS and play it.

    Side effects: writes ``summary.mp3`` in the working directory and
    renders an audio player in the Streamlit page.
    """
    st.write("Converting summary to speech...")
    audio_path = "summary.mp3"
    gTTS(text).save(audio_path)
    st.audio(audio_path)
# Streamlit app main function
def main():
    """Streamlit entry point: capture a photo, detect objects, summarize, speak."""
    st.title("Context-Aware Object Detection App with Hugging Face")

    # Load model (cached across reruns).
    processor, model = load_model()

    # Step 1: Capture Image from Camera
    captured_image = st.camera_input("Take a picture")
    if captured_image is None:
        return  # nothing more to do until the user takes a picture

    image = Image.open(captured_image)
    st.image(image, caption="Captured Image", use_column_width=True)

    # Step 2: Detect Objects
    detected_ids, results = detect_objects(image, processor, model)
    detected_objects = get_object_names(detected_ids)
    st.write(f"Detected Objects: {detected_objects}")

    # Step 3: Generate Summary
    summary = generate_summary(detected_objects)
    st.write(f"Summary: {summary}")

    # Step 4: Convert Summary to Speech
    text_to_speech(summary)


if __name__ == "__main__":
    main()