Spaces:

Saiky2k
/

DepthPro_CVProject

Sleeping

App Files Files Community

DepthPro_CVProject / app.py

Saiky2k

Update app.py

606a3bb verified about 1 year ago

raw

history blame

19.4 kB

	# app.py
	import streamlit as st
	from PIL import Image
	import cv2
	import numpy as np
	import torch
	from ultralytics import YOLO
	import time
	import tempfile
	import os
	import requests
	from io import BytesIO

	# Page configuration: browser-tab title and a wide layout.
	st.set_page_config(page_title="Phát hiện người và độ sâu", layout="wide")

	# Giả lập module depth_pro
	class Depth_pro:
	    """Stand-in for the ``depth_pro`` module, backed by the GLPN-NYU model.

	    Only the small surface the app relies on is provided:
	    ``create_model_and_transforms()`` returning ``(model, transform)``, where
	    ``model.infer(image, f_px=...)`` returns ``{"depth": tensor}``.
	    """

	    @staticmethod
	    def _normalize_depth(predicted_depth, max_depth=10.0):
	        """Min-max rescale *predicted_depth* into ``[0, max_depth]``.

	        The output is a relative rescaling of the network prediction, not a
	        calibrated metric distance. A constant map (max == min) returns all
	        zeros; dividing by the zero range would otherwise produce NaN.
	        """
	        depth_min = torch.min(predicted_depth)
	        depth_range = torch.max(predicted_depth) - depth_min
	        if depth_range <= 0:
	            return torch.zeros_like(predicted_depth)
	        return (predicted_depth - depth_min) / depth_range * max_depth

	    @staticmethod
	    def create_model_and_transforms():
	        """Load the GLPN-NYU depth model and build its input transform.

	        Returns:
	            A ``(model, transform)`` pair. ``model`` is in eval mode and has
	            an extra ``infer`` method attached; ``transform`` maps an image
	            to the ``pixel_values`` tensor produced by the model's processor.
	        """
	        # Imported lazily so the dependency is only needed when a depth
	        # model is actually requested.
	        from transformers import AutoImageProcessor, AutoModelForDepthEstimation

	        processor = AutoImageProcessor.from_pretrained("vinvino02/glpn-nyu")
	        model = AutoModelForDepthEstimation.from_pretrained("vinvino02/glpn-nyu")
	        model.eval()

	        def transform(image):
	            """Convert *image* into the model's ``pixel_values`` tensor."""
	            return processor(images=image, return_tensors="pt").pixel_values

	        def infer_method(self, image, f_px=None):
	            """Run the model on *image* and return ``{"depth": tensor}``.

	            ``f_px`` is accepted only so callers written against the
	            ``depth_pro`` API keep working; it is not used here.
	            """
	            with torch.no_grad():
	                outputs = self(image)
	                depth = Depth_pro._normalize_depth(outputs.predicted_depth)
	            return {"depth": depth}

	        # Bind infer_method to this model instance so it can be called as
	        # model.infer(...).
	        model.infer = infer_method.__get__(model)

	        return model, transform

	# Tải model YOLO và cache
	@st.cache_resource
	def load_yolo_model():
	    """Return the YOLO detector built from the ``yolov8n.pt`` weights.

	    The function is wrapped in ``st.cache_resource``, so the model object
	    is reused instead of being rebuilt on each call.
	    """
	    return YOLO("yolov8n.pt")

	# Tạo bản đồ màu từ ảnh độ sâu
	def create_depth_colormap(depth_map):
	    """Render a depth map as an RGB TURBO colormap image.

	    The map is min-max normalized and inverted before coloring, so small
	    depth values (near) get the high end of the colormap and large values
	    (far) the low end.

	    Args:
	        depth_map: 2-D numpy array of depth values.

	    Returns:
	        ``uint8`` RGB image with the same height and width as *depth_map*.
	    """
	    depth_min = np.min(depth_map)
	    depth_range = np.max(depth_map) - depth_min

	    if depth_range > 0:
	        normalized = (depth_map - depth_min) / depth_range
	    else:
	        # Constant map: dividing by a zero range would give NaN and an
	        # undefined uint8 cast, so render it as one uniform value instead.
	        normalized = np.zeros(np.shape(depth_map), dtype=np.float32)

	    inverted = 1 - normalized  # near = high value, far = low value

	    colormap_bgr = cv2.applyColorMap((inverted * 255).astype(np.uint8), cv2.COLORMAP_TURBO)

	    # OpenCV produces BGR; the rest of the app displays RGB.
	    return cv2.cvtColor(colormap_bgr, cv2.COLOR_BGR2RGB)

	# Vẽ nhãn trên ảnh
	def draw_depth_label(image, text, position):
	    """Draw *text* on a filled green background next to a bounding box.

	    The label normally sits just above *position* (the box's top-left
	    corner). When there is not enough room above or to the left, it is
	    moved to start at the frame's top or left edge so it stays visible.

	    Args:
	        image: Image array to draw on; modified in place.
	        text: Label text. OpenCV's Hershey fonts only render ASCII.
	        position: ``(x1, y1)`` top-left corner of the bounding box.
	    """
	    x1, y1 = position
	    font = cv2.FONT_HERSHEY_SIMPLEX
	    font_scale = 0.7
	    font_thickness = 2
	    padding = 5
	    (text_w, text_h), _ = cv2.getTextSize(text, font, font_scale, font_thickness)

	    # Keep the background rectangle's left edge at x >= 0.
	    text_x = max(x1, padding)

	    text_y = y1 - 10
	    if text_y - text_h - padding < 0:
	        # No room above the box: put the label just below its top edge.
	        text_y = max(y1, 0) + text_h + padding

	    # Filled background rectangle
	    top_left = (text_x - padding, text_y - text_h - padding)
	    bottom_right = (text_x + text_w + padding, text_y + padding)
	    cv2.rectangle(image, top_left, bottom_right, (0, 255, 0), -1)

	    # Label text
	    cv2.putText(image, text, (text_x, text_y), font, font_scale, (0, 0, 0), font_thickness)

	# Chức năng xử lý ảnh
	def process_image():
	    """Streamlit page: detect people in one image and label each with a depth.

	    The user uploads an image or uses the sample image, picks a confidence
	    threshold and presses the button. The page then shows the annotated
	    image next to a colorized depth map.
	    """
	    st.header("Phát hiện người và Ước tính độ sâu trong Ảnh")

	    # Image source selection
	    upload_option = st.radio("Chọn nguồn ảnh:", ["Tải lên ảnh", "Sử dụng ảnh mẫu"])

	    image = None
	    image_np = None

	    if upload_option == "Tải lên ảnh":
	        uploaded_image = st.file_uploader("Tải lên ảnh", type=["jpg", "jpeg", "png"])
	        if uploaded_image is not None:
	            # Force 3-channel RGB: PNG uploads may be RGBA, palette or
	            # grayscale, which the detector and drawing code do not expect.
	            image = Image.open(uploaded_image).convert("RGB")
	            image_np = np.array(image)

	            st.image(image_np, caption="Ảnh đã tải lên", use_container_width=True)
	    else:
	        sample_img_url = "https://storage.googleapis.com/sfr-vision-language-research/DINO/ground_truth_images/000000014439.jpg"
	        try:
	            # A timeout keeps the page from hanging; raise_for_status stops
	            # an HTTP error page from being parsed as an image.
	            response = requests.get(sample_img_url, timeout=30)
	            response.raise_for_status()
	            image = Image.open(BytesIO(response.content)).convert("RGB")
	            image_np = np.array(image)

	            st.image(image_np, caption="Ảnh mẫu", use_container_width=True)
	        except Exception as e:
	            image = None
	            st.error(f"Không thể tải ảnh mẫu: {e}")

	    # Detection confidence threshold
	    confidence = st.slider("Ngưỡng tin cậy:", 0.0, 1.0, 0.5, 0.05)

	    # The button is only rendered once an image is available.
	    if image is None or not st.button("Xử lý Ảnh"):
	        return

	    with st.spinner("Đang xử lý ảnh..."):
	        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	        st.info(f"Đang sử dụng thiết bị: {device}")

	        # Load models
	        yolo_model = load_yolo_model()
	        depth_pro = Depth_pro()
	        depth_model, transform = depth_pro.create_model_and_transforms()
	        depth_model.to(device)

	        # Pass the PIL image (RGB) rather than the numpy array: Ultralytics
	        # interprets numpy input as BGR, which would swap the channels.
	        results = yolo_model(image, conf=confidence)

	        # Copy to draw on, so the displayed original stays untouched
	        output_image = image_np.copy()

	        # The image is already RGB, so it goes to the depth model as is.
	        depth_input = transform(image).to(device)

	        # Passed through to infer() for API compatibility with depth_pro
	        focal_length_px = torch.tensor([max(image_np.shape[1], image_np.shape[0])], device=device)
	        with torch.no_grad():
	            predictions = depth_model.infer(depth_input, f_px=focal_length_px)
	        depth_np = predictions["depth"].squeeze().cpu().numpy()

	        # Bring the depth map to the image resolution so box coordinates
	        # index it directly.
	        if depth_np.shape[:2] != image_np.shape[:2]:
	            depth_np = cv2.resize(depth_np, (image_np.shape[1], image_np.shape[0]), interpolation=cv2.INTER_LINEAR)

	        depth_colormap = create_depth_colormap(depth_np)

	        person_count = 0
	        max_y, max_x = depth_np.shape[0] - 1, depth_np.shape[1] - 1

	        for result in results:
	            boxes = result.boxes.xyxy.cpu().numpy()
	            classes = result.boxes.cls.cpu().numpy()
	            confs = result.boxes.conf.cpu().numpy()

	            for box, cls, conf in zip(boxes, classes, confs):
	                if result.names[int(cls)] != "person" or conf <= confidence:
	                    continue

	                person_count += 1
	                x1, y1, x2, y2 = map(int, box[:4])

	                cv2.rectangle(output_image, (x1, y1), (x2, y2), (0, 255, 0), 2)

	                # Sample the depth at the box center, clamped to the map
	                center_x = min(max((x1 + x2) // 2, 0), max_x)
	                center_y = min(max((y1 + y2) // 2, 0), max_y)
	                depth_value = depth_np[center_y, center_x]

	                # ASCII-only label: cv2.putText cannot render Vietnamese
	                # diacritics and would draw "?" in their place.
	                text = f"Do sau: {depth_value:.2f}m"
	                draw_depth_label(output_image, text, (x1, y1))

	        # Show results
	        st.success(f"Đã phát hiện {person_count} người trong ảnh")

	        col1, col2 = st.columns(2)
	        col1.image(output_image, caption="Phát hiện người với độ sâu", use_container_width=True)
	        col2.image(depth_colormap, caption="Bản đồ độ sâu", use_container_width=True)

	# Chức năng xử lý video
	_VIDEO_TEMP_STATE_KEY = "_input_video_temp"


	def _download_bytes(url, timeout=60):
	    """Fetch *url* and return the response body, raising on HTTP errors."""
	    response = requests.get(url, timeout=timeout)
	    response.raise_for_status()
	    return response.content


	def _input_video_path(source_key, load_bytes):
	    """Return the path of a temp ``.mp4`` file holding the selected video.

	    Streamlit re-runs the script on every widget interaction. The path is
	    remembered in ``st.session_state`` under *source_key*, so the same video
	    is not downloaded or written again on each rerun. When the source
	    changes, the previous temp file is removed.
	    """
	    cached = st.session_state.get(_VIDEO_TEMP_STATE_KEY)
	    if cached is not None:
	        cached_key, cached_path = cached
	        if cached_key == source_key and os.path.exists(cached_path):
	            return cached_path
	        # Different source (or file already gone): drop the stale copy.
	        try:
	            os.unlink(cached_path)
	        except OSError:
	            pass

	    # Load first, so a failed download does not leave an empty temp file.
	    data = load_bytes()
	    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as temp_file:
	        temp_file.write(data)
	        video_path = temp_file.name
	    st.session_state[_VIDEO_TEMP_STATE_KEY] = (source_key, video_path)
	    return video_path


	def process_video():
	    """Streamlit page: detect people in a video and label each with a depth.

	    The user uploads a video or uses the sample video, picks a confidence
	    threshold and how often the depth map is refreshed, and presses the
	    button. Every frame is run through YOLO; the depth map is recomputed
	    every ``depth_update_interval`` frames and reused in between. Two
	    videos are produced (annotated frames and colorized depth) and offered
	    for preview and download.
	    """
	    st.header("Phát hiện người và Ước tính độ sâu trong Video")

	    # Video source selection
	    upload_option = st.radio("Chọn nguồn video:", ["Tải lên video", "Sử dụng video mẫu"])

	    video_path = None

	    if upload_option == "Tải lên video":
	        uploaded_video = st.file_uploader("Tải lên video", type=["mp4", "avi", "mov"])
	        if uploaded_video is not None:
	            source_key = (
	                "upload",
	                getattr(uploaded_video, "file_id", None),
	                uploaded_video.name,
	                uploaded_video.size,
	            )
	            # getvalue() does not depend on the stream position.
	            video_path = _input_video_path(source_key, uploaded_video.getvalue)

	            st.video(video_path)
	    else:
	        sample_video_url = "https://huggingface.co/spaces/Nupoor/SampleVideoDataset/resolve/main/pexels-richard-de-souza-1635985.mp4"
	        try:
	            video_path = _input_video_path(
	                ("sample", sample_video_url),
	                lambda: _download_bytes(sample_video_url),
	            )

	            st.video(video_path)
	        except Exception as e:
	            video_path = None
	            st.error(f"Không thể tải video mẫu: {e}")

	    # Detection confidence threshold
	    confidence = st.slider("Ngưỡng tin cậy:", 0.0, 1.0, 0.5, 0.05)

	    # Recompute the depth map every N frames
	    depth_update_interval = st.slider("Cập nhật độ sâu mỗi (số khung hình):", 1, 10, 5)

	    # The button is only rendered once a video is available.
	    if video_path is None or not st.button("Xử lý Video"):
	        return

	    progress_bar = st.progress(0)
	    status_text = st.empty()

	    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	    st.info(f"Đang sử dụng thiết bị: {device}")

	    # Load models
	    with st.spinner('Đang tải mô hình YOLO...'):
	        yolo_model = load_yolo_model()
	        if device.type == 'cuda':
	            yolo_model.to(device)

	    with st.spinner('Đang tải mô hình độ sâu...'):
	        depth_pro = Depth_pro()
	        depth_model, transform = depth_pro.create_model_and_transforms()
	        depth_model.to(device)

	    cap = cv2.VideoCapture(video_path)
	    if not cap.isOpened():
	        cap.release()
	        st.error("Không thể mở video.")
	        return

	    # Video properties
	    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
	    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
	    fps = cap.get(cv2.CAP_PROP_FPS)
	    if not fps > 0:
	        # Some containers report 0 or NaN; the writer needs a positive rate.
	        fps = 30.0
	    # May be 0 or inaccurate for some containers; guarded where it is used.
	    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

	    # Output files
	    temp_output_dir = tempfile.mkdtemp()
	    output_video_path = os.path.join(temp_output_dir, "person_detection_with_depth.mp4")
	    output_depth_path = os.path.join(temp_output_dir, "depth_colormap.mp4")

	    # 'mp4v' is the fourcc that matches the .mp4 container used above.
	    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
	    out_detection = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
	    out_depth = cv2.VideoWriter(output_depth_path, fourcc, fps, (width, height))
	    if not (out_detection.isOpened() and out_depth.isOpened()):
	        cap.release()
	        out_detection.release()
	        out_depth.release()
	        st.error("Không thể tạo tệp video đầu ra.")
	        return

	    # Passed through to infer() for API compatibility with depth_pro
	    focal_length_px = torch.tensor([max(width, height)], device=device)

	    # Live preview placeholders
	    preview_col1, preview_col2 = st.columns(2)
	    detection_placeholder = preview_col1.empty()
	    depth_placeholder = preview_col2.empty()

	    frame_counter = 0
	    start_time = time.time()
	    depth_np = None
	    depth_colormap = None

	    try:
	        while cap.isOpened():
	            ret, frame = cap.read()
	            if not ret:
	                break

	            frame_counter += 1

	            # Progress: skip when the frame count is unknown and clamp to
	            # 100 because the reported count can be lower than the real one.
	            if total_frames > 0:
	                progress_bar.progress(min(100, int(frame_counter / total_frames * 100)))

	            if frame_counter % 10 == 0:
	                elapsed_time = time.time() - start_time
	                frames_left = max(total_frames - frame_counter, 0)
	                est_time_left = (elapsed_time / frame_counter) * frames_left
	                status_text.text(f"Đang xử lý khung hình {frame_counter}/{total_frames} - Thời gian còn lại: {est_time_left:.2f}s")

	            # Person detection (OpenCV frames are BGR, as YOLO expects for
	            # numpy input).
	            results = yolo_model(frame, conf=confidence)

	            person_boxes = []
	            for result in results:
	                boxes = result.boxes.xyxy.cpu().numpy()
	                classes = result.boxes.cls.cpu().numpy()
	                confs = result.boxes.conf.cpu().numpy()

	                for box, cls, conf in zip(boxes, classes, confs):
	                    if result.names[int(cls)] == "person" and conf > confidence:
	                        person_boxes.append(tuple(map(int, box[:4])))

	            # Refresh the depth map BEFORE drawing anything on the frame,
	            # so the overlays do not end up in the depth model's input.
	            if depth_np is None or frame_counter % depth_update_interval == 0:
	                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
	                depth_input = transform(Image.fromarray(rgb_frame)).to(device)

	                with torch.no_grad():
	                    predictions = depth_model.infer(depth_input, f_px=focal_length_px)
	                depth_np = predictions["depth"].squeeze().cpu().numpy()

	                # Match the frame resolution so box coordinates index the
	                # depth map directly.
	                frame_h, frame_w = frame.shape[:2]
	                if depth_np.shape[:2] != (frame_h, frame_w):
	                    depth_np = cv2.resize(depth_np, (frame_w, frame_h), interpolation=cv2.INTER_LINEAR)

	                depth_colormap = create_depth_colormap(depth_np)

	            # Draw boxes and depth labels
	            max_y, max_x = depth_np.shape[0] - 1, depth_np.shape[1] - 1
	            for x1, y1, x2, y2 in person_boxes:
	                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

	                center_x = min(max((x1 + x2) // 2, 0), max_x)
	                center_y = min(max((y1 + y2) // 2, 0), max_y)
	                depth_value = depth_np[center_y, center_x]

	                # ASCII-only label: cv2.putText cannot render Vietnamese
	                # diacritics and would draw "?" in their place.
	                text = f"Do sau: {depth_value:.2f}m"
	                draw_depth_label(frame, text, (x1, y1))

	            # Update the live preview only every few frames
	            if frame_counter % 5 == 0 or frame_counter == 1:
	                detection_placeholder.image(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), caption="Phát hiện người", use_container_width=True)
	                depth_placeholder.image(depth_colormap, caption="Bản đồ độ sâu", use_container_width=True)

	            # Write both output streams (the colormap is RGB; the writer
	            # expects BGR).
	            out_detection.write(frame)
	            out_depth.write(cv2.cvtColor(depth_colormap, cv2.COLOR_RGB2BGR))

	    finally:
	        # Always release capture/writers so the output files are finalized.
	        cap.release()
	        out_detection.release()
	        out_depth.release()

	    total_time = time.time() - start_time
	    average_fps = frame_counter / total_time if total_time > 0 else 0.0
	    st.success(f"Xử lý hoàn tất! Tổng thời gian: {total_time:.2f}s")
	    st.success(f"FPS trung bình: {average_fps:.2f}")

	    # Read the results once, closing the files, and hand Streamlit the
	    # bytes so the temp files can be removed afterwards.
	    with open(output_video_path, 'rb') as detection_file:
	        detection_bytes = detection_file.read()
	    with open(output_depth_path, 'rb') as depth_file:
	        depth_bytes = depth_file.read()

	    st.subheader("Videos kết quả")

	    col1, col2 = st.columns(2)
	    with col1:
	        st.video(detection_bytes)
	        st.download_button(
	            label="Tải xuống video phát hiện",
	            data=detection_bytes,
	            file_name="person_detection_with_depth.mp4",
	            mime="video/mp4"
	        )

	    with col2:
	        st.video(depth_bytes)
	        st.download_button(
	            label="Tải xuống bản đồ độ sâu",
	            data=depth_bytes,
	            file_name="depth_colormap.mp4",
	            mime="video/mp4"
	        )

	    # Best-effort cleanup of temp files; a failure here must not break
	    # the page, so only OS-level errors are ignored.
	    for leftover in (output_video_path, output_depth_path, video_path):
	        try:
	            os.unlink(leftover)
	        except OSError:
	            pass
	    try:
	        os.rmdir(temp_output_dir)
	    except OSError:
	        pass
	    st.session_state.pop(_VIDEO_TEMP_STATE_KEY, None)

	# Giao diện chính
	def main():
	    """Render the app shell and dispatch to the selected processing page.

	    Shows the title, a sidebar mode selector and a sidebar info panel, then
	    calls ``process_image`` for the image mode and ``process_video``
	    otherwise.
	    """
	    st.title("Ứng dụng Phát hiện Người và Ước tính Độ sâu")

	    sidebar = st.sidebar

	    # Processing mode selector
	    app_mode = sidebar.selectbox("Chọn chế độ:", ["Xử lý Ảnh", "Xử lý Video"])

	    # Info panel: models in use and the depth-map color legend
	    sidebar.header("Thông tin")
	    info_text = """
	Mô hình sử dụng:
	- Phát hiện người: YOLOv8n
	- Độ sâu: depth_pro (GLPN-NYU)

	Màu sắc trong bản đồ độ sâu:
	- Màu đỏ/vàng: Gần
	- Màu xanh: Xa
	"""
	    sidebar.info(info_text)

	    # Run the page that matches the selected mode
	    page = process_image if app_mode == "Xử lý Ảnh" else process_video
	    page()

	# Tạo tệp requirements.txt cho Hugging Face Space
	def create_requirements():
	    """Return the package list for the Space's ``requirements.txt``.

	    The result is a newline-separated string with one package name per
	    line.
	    """
	    requirements_text = """
	streamlit
	numpy
	Pillow
	opencv-python-headless
	torch
	torchvision
	transformers
	ultralytics
	requests
	"""
	    return requirements_text

	# Entry point: build the UI only when this file is run as the main module.
	if __name__ == "__main__":
	    main()