Spaces:

Saiky2k
/

DepthPro_CVProject

Sleeping

App Files Files Community

Saiky2k committed on Mar 21, 2025

Commit

75b70fe

verified ·

1 Parent(s): 1fbf5cf

Update app.py

Browse files

Files changed (1) hide show

app.py +316 -243

app.py CHANGED Viewed

@@ -4,119 +4,151 @@ from PIL import Image
 import cv2
 import numpy as np
 import torch
-from ultralytics import YOLO
-import time
 import tempfile
 import os
 import requests
 from io import BytesIO
-# Tạo module depth_pro đơn giản (để thay thế module gốc)
-class DepthPro:
-    @staticmethod
-    def create_model_and_transforms():
-        # Nhập các thư viện cần thiết ở đây để tránh lỗi khi khởi tạo
-        import torch
-        from transformers import AutoImageProcessor, AutoModelForDepthEstimation
-        # Tải mô hình depth estimation từ Hugging Face
-        processor = AutoImageProcessor.from_pretrained("vinvino02/glpn-nyu")
-        model = AutoModelForDepthEstimation.from_pretrained("vinvino02/glpn-nyu")
-        # Tạo hàm transform đơn giản
-        def transform(image):
-            return processor(images=image, return_tensors="pt").pixel_values
-        # Mở rộng model với phương thức infer
-        def infer_method(self, image, f_px=None):
-            with torch.no_grad():
-                outputs = self(image)
-                predicted_depth = outputs.predicted_depth
-            # Chuẩn hóa độ sâu
-            depth_min = torch.min(predicted_depth)
-            depth_max = torch.max(predicted_depth)
-            predicted_depth = (predicted_depth - depth_min) / (depth_max - depth_min)
-            predicted_depth = predicted_depth * 10  # Nhân với 10 để có giá trị mét hợp lý hơn
-            return {"depth": predicted_depth}
-        # Thêm phương thức infer vào model
-        model.infer = infer_method.__get__(model)
-        return model, transform
-# Hàm tải mô hình YOLO từ Hugging Face
 @st.cache_resource
 def load_yolo_model():
-    # Sử dụng mô hình YOLOv8n từ Hugging Face
     model = YOLO("yolov8n.pt")
     return model
-# Hàm tải và chuẩn bị mô hình độ sâu
-@st.cache_resource
-def load_depth_model():
-    depth_pro = DepthPro()
-    model, transform = depth_pro.create_model_and_transforms()
-    return model, transform
-# Hàm xử lý video
-def process_video(video_path):
-    # Kiểm tra CUDA
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-    st.info(f"Đang sử dụng thiết bị: {device}")
-    # Tải mô hình YOLO
-    with st.spinner('Đang tải mô hình YOLO...'):
-        yolo_model = load_yolo_model()
-        if device.type == 'cuda':
-            yolo_model.to(device)
-    # Tải mô hình độ sâu
-    with st.spinner('Đang tải mô hình độ sâu...'):
-        depth_model, transform = load_depth_model()
-        depth_model.eval()
-        if device.type == 'cuda':
-            depth_model.to(device)
-    # Mở video để xử lý
     cap = cv2.VideoCapture(video_path)
-    # Lấy thuộc tính video cho đầu ra
     width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
     height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
     fps = cap.get(cv2.CAP_PROP_FPS)
     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-    # Tạo tệp tạm thời cho video đầu ra
     temp_output_dir = tempfile.mkdtemp()
-    output_video_path = os.path.join(temp_output_dir, "person_detection_with_depth.mp4")
-    output_depth_path = os.path.join(temp_output_dir, "depth_colormap.mp4")
-    # Sử dụng codec phù hợp với môi trường Hugging Face
-    fourcc = cv2.VideoWriter_fourcc(*'XVID')  # Thay đổi từ mp4v sang XVID cho tương thích tốt hơn
-    out_detection = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
-    out_depth = cv2.VideoWriter(output_depth_path, fourcc, fps, (width, height))
-    # Ước tính chiều dài tiêu cự và chuyển đổi sang tensor
-    focal_length_px = torch.tensor([max(width, height)], device=device)
-    # Hiển thị thanh tiến trình
-    progress_bar = st.progress(0)
-    progress_text = st.empty()
     frame_counter = 0
-    start_time = time.time()
-    # Tạo cột để hiển thị khung video
-    col1, col2 = st.columns(2)
-    detection_placeholder = col1.empty()
-    depth_placeholder = col2.empty()
-    # Giảm kích thước frame để tăng tốc độ xử lý
-    target_width = 640  # Kích thước đích
-    scale_factor = target_width / width
-    target_height = int(height * scale_factor)
     try:
         while cap.isOpened():
@@ -127,208 +159,249 @@ def process_video(video_path):
             frame_counter += 1
             # Cập nhật tiến trình
-            progress = int(frame_counter / total_frames * 100)
-            progress_bar.progress(progress)
-            if frame_counter % 10 == 0:  # Hiển thị tiến trình mỗi 10 khung hình
-                elapsed_time = time.time() - start_time
-                frames_left = total_frames - frame_counter
-                est_time_left = (elapsed_time / frame_counter) * frames_left if frame_counter > 0 else 0
-                progress_text.text(f"Đang xử lý khung hình {frame_counter}/{total_frames} - Thời gian còn lại: {est_time_left:.2f}s")
-            # Giảm kích thước khung hình để tăng tốc xử lý
-            if scale_factor < 1:
-                frame_resized = cv2.resize(frame, (target_width, target_height))
-            else:
-                frame_resized = frame
-            # Phát hiện YOLO
-            results = yolo_model(frame_resized)
-            person_boxes = []
-            for result in results:
-                boxes = result.boxes.xyxy.cpu().numpy()
-                classes = result.boxes.cls.cpu().numpy()
-                confs = result.boxes.conf.cpu().numpy()
-                for box, cls, conf in zip(boxes, classes, confs):
-                    if result.names[int(cls)] == "person" and conf > 0.5:  # Thêm ngưỡng tin cậy
-                        if scale_factor < 1:  # Điều chỉnh lại khung giới hạn nếu đã thay đổi kích thước
-                            x1, y1, x2, y2 = map(int, [box[0]/scale_factor, box[1]/scale_factor,
-                                                      box[2]/scale_factor, box[3]/scale_factor])
-                        else:
-                            x1, y1, x2, y2 = map(int, box[:4])
-                        person_boxes.append((x1, y1, x2, y2))
-                        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
-            # Chuyển đổi khung hình cho đầu vào mô hình độ sâu
-            rgb_frame = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
-            pil_image = Image.fromarray(rgb_frame)
-            depth_input = transform(pil_image)
-            if device.type == 'cuda':
-                depth_input = depth_input.to(device)
-            # Ước tính độ sâu
-            with torch.no_grad():
-                predictions = depth_model.infer(depth_input, f_px=focal_length_px)
-                depth = predictions["depth"]  # Độ sâu theo [m]
-            depth_np = depth.squeeze().cpu().numpy()
-            # Điều chỉnh lại kích thước bản đồ độ sâu
-            if scale_factor < 1:
-                depth_np = cv2.resize(depth_np, (width, height), interpolation=cv2.INTER_LINEAR)
             # Tạo bản đồ màu độ sâu
-            depth_np_normalized = (depth_np - depth_np.min()) / (depth_np.max() - depth_np.min())
-            inv_depth_np_normalized = 1 - depth_np_normalized
-            depth_colormap = cv2.applyColorMap((inv_depth_np_normalized * 255).astype(np.uint8), cv2.COLORMAP_TURBO)
-            # Thêm giá trị độ sâu cho người được phát hiện
-            for x1, y1, x2, y2 in person_boxes:
                 center_x = (x1 + x2) // 2
                 center_y = (y1 + y2) // 2
                 # Đảm bảo tọa độ nằm trong giới hạn
-                center_x = min(center_x, depth_np.shape[1] - 1)
-                center_y = min(center_y, depth_np.shape[0] - 1)
-                depth_value = depth_np[center_y, center_x]
-                text = f"Độ sâu: {depth_value:.2f} m"
-                font = cv2.FONT_HERSHEY_SIMPLEX
-                font_scale = 0.8  # Giảm kích thước font để phù hợp
-                font_thickness = 2
-                text_size = cv2.getTextSize(text, font, font_scale, font_thickness)[0]
-                text_x = x1
-                text_y = y1 - 10
-                rect_x1 = text_x - 5
-                rect_y1 = text_y - text_size[1] - 10
-                rect_x2 = text_x + text_size[0] + 5
-                rect_y2 = text_y + 5
-                cv2.rectangle(frame, (rect_x1, rect_y1), (rect_x2, rect_y2), (0, 255, 0), -1)
-                cv2.putText(frame, text, (text_x, text_y), font, font_scale, (0, 0, 0), font_thickness)
-            # Hiển thị khung hình trong Streamlit (cập nhật mỗi 5 khung hình để tránh làm chậm)
             if frame_counter % 5 == 0:
                 detection_placeholder.image(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), caption="Phát hiện người", use_column_width=True)
                 depth_placeholder.image(depth_colormap, caption="Bản đồ độ sâu", use_column_width=True)
-            # Ghi khung hình vào video đầu ra
-            out_detection.write(frame)
-            out_depth.write(depth_colormap)
     finally:
         # Giải phóng tài nguyên
         cap.release()
-        out_detection.release()
-        out_depth.release()
-        total_time = time.time() - start_time
-        st.success(f"Xử lý hoàn tất! Tổng thời gian: {total_time:.2f}s")
-        st.success(f"FPS trung bình: {frame_counter / total_time:.2f}")
-        return output_video_path, output_depth_path
-# Giao diện Streamlit chính
 def main():
-    st.title("Ứng dụng Phát hiện Người và Ước tính Độ sâu")
-    st.write("Tải lên video để phát hiện người và hiển thị thông tin độ sâu")
-    # Tùy chọn video mẫu
     st.sidebar.header("Tùy chọn")
-    use_sample = st.sidebar.checkbox("Sử dụng video mẫu")
-    video_path = None
-    if use_sample:
-        st.info("Đang sử dụng video mẫu...")
-        # URL của video mẫu (đặt URL video mẫu của bạn ở đây)
-        sample_video_url = "https://huggingface.co/spaces/Nupoor/SampleVideoDataset/resolve/main/pexels-richard-de-souza-1635985.mp4"
-        try:
-            # Tải video mẫu
-            response = requests.get(sample_video_url)
-            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4')
-            temp_file.write(response.content)
-            video_path = temp_file.name
-            temp_file.close()
-            st.video(video_path)
-        except Exception as e:
-            st.error(f"Không thể tải video mẫu: {e}")
-            video_path = None
     else:
-        # Tải lên tệp video
-        uploaded_file = st.file_uploader("Chọn một tệp video", type=['mp4', 'avi', 'mov'])
-        if uploaded_file is not None:
-            # Lưu tệp đã tải lên vào thư mục tạm thời
-            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4')
-            temp_file.write(uploaded_file.read())
-            video_path = temp_file.name
-            temp_file.close()
-            st.video(video_path)
-    # Hiển thị thông tin về mô hình
-    st.sidebar.header("Thông tin mô hình")
     st.sidebar.markdown("""
     - Phát hiện người: YOLOv8n
-    - Ước tính độ sâu: GLPN-NYU từ HuggingFace
-    """)
-    # Thêm tùy chọn cho độ tin cậy phát hiện
-    confidence = st.sidebar.slider("Ngưỡng tin cậy", 0.0, 1.0, 0.5)
-    # Nút để bắt đầu xử lý
-    if video_path and st.button("Xử lý Video"):
-        with st.spinner("Đang xử lý video..."):
-            detection_video_path, depth_video_path = process_video(video_path)
-        # Hiển thị video đã xử lý
-        st.subheader("Video đã xử lý")
-        col1, col2 = st.columns(2)
-        with col1:
-            st.video(detection_video_path)
-            st.download_button(
-                label="Tải xuống video phát hiện",
-                data=open(detection_video_path, 'rb').read(),
-                file_name="person_detection_with_depth.mp4",
-                mime="video/mp4"
-            )
-        with col2:
-            st.video(depth_video_path)
-            st.download_button(
-                label="Tải xuống bản đồ độ sâu",
-                data=open(depth_video_path, 'rb').read(),
-                file_name="depth_colormap.mp4",
-                mime="video/mp4"
-            )
-        # Xóa tệp tạm thời
-        os.unlink(video_path)
-# Tệp requirements.txt
 def create_requirements():
-    requirements = """
-    streamlit
-    numpy
-    Pillow
-    opencv-python
-    torch
-    torchvision
-    transformers
-    ultralytics
-    requests
-    opencv-python
     """
-    return requirements
 if __name__ == "__main__":
     main()

 import cv2
 import numpy as np
 import torch
 import tempfile
 import os
 import requests
 from io import BytesIO
+# Cấu hình trang
+st.set_page_config(page_title="Phát hiện người và độ sâu", layout="wide")
+# Tạo module độ sâu đơn giản
class DepthEstimator:
    """Lazy wrapper around the GLPN-NYU monocular depth model.

    The Hugging Face processor and model are loaded on first use and kept on
    the instance. ``predict_depth`` returns a 2-D float array rescaled to the
    0..``DEPTH_SCALE`` range. The values are min-max normalised per image, so
    they are relative depths rather than calibrated metric distances.
    """

    MODEL_ID = "vinvino02/glpn-nyu"
    DEPTH_SCALE = 10.0

    def __init__(self):
        self.model = None
        self.processor = None
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def load_model(self):
        """Load the processor and model on first call; return ``(model, processor)``."""
        if self.model is None:
            # Imported lazily so that constructing the estimator stays cheap.
            from transformers import AutoImageProcessor, AutoModelForDepthEstimation
            self.processor = AutoImageProcessor.from_pretrained(self.MODEL_ID)
            self.model = AutoModelForDepthEstimation.from_pretrained(self.MODEL_ID)
            self.model.to(self.device)
            self.model.eval()
        return self.model, self.processor

    @staticmethod
    def _rescale_depth(predicted_depth, size=None, scale=10.0):
        """Resize a raw depth tensor and min-max normalise it to ``0..scale``.

        Args:
            predicted_depth: depth tensor with 2 to 4 dimensions whose last
                two dimensions are (height, width).
            size: optional ``(height, width)`` to resample the map to, so that
                callers can index it with input-image pixel coordinates.
            scale: value the largest depth is mapped to.

        Returns:
            A NumPy array with singleton dimensions squeezed out. A flat (or
            non-finite) prediction yields all zeros instead of NaNs.
        """
        depth = predicted_depth.detach().float()
        # interpolate() needs (N, C, H, W); pad leading dimensions as required.
        while depth.dim() < 4:
            depth = depth.unsqueeze(0)
        if size is not None and tuple(depth.shape[-2:]) != tuple(size):
            depth = torch.nn.functional.interpolate(
                depth, size=tuple(size), mode="bilinear", align_corners=False
            )
        # Normalise after resizing so the output range is exact.
        low = depth.min()
        span = depth.max() - low
        if float(span) > 0:
            depth = (depth - low) / span
        else:
            # max == min would divide by zero; report a uniform map instead.
            depth = torch.zeros_like(depth)
        return (depth * scale).squeeze().cpu().numpy()

    def predict_depth(self, image):
        """Estimate a relative depth map for ``image``.

        Args:
            image: either a NumPy array in OpenCV layout (2-D grayscale,
                3-channel BGR or 4-channel BGRA), or any other object the
                processor accepts (e.g. a PIL image), which is passed through
                unchanged.

        Returns:
            2-D float array in ``0..DEPTH_SCALE``. When the input size is
            known, the map is resampled to the input's height and width.
        """
        model, processor = self.load_model()
        target_size = None
        if isinstance(image, np.ndarray):
            # The processor expects RGB, so convert from the OpenCV layouts.
            if image.ndim == 2:
                image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
            elif image.shape[2] == 4:
                image = cv2.cvtColor(image, cv2.COLOR_BGRA2RGB)
            elif image.shape[2] == 3:
                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            target_size = tuple(image.shape[:2])
            pil_image = Image.fromarray(image)
        else:
            pil_image = image
            # PIL images expose ``size`` as (width, height); other inputs
            # (e.g. tensors, where ``size`` is a method) are left unresized.
            pil_size = getattr(image, "size", None)
            if isinstance(pil_size, tuple) and len(pil_size) == 2:
                target_size = (pil_size[1], pil_size[0])
        inputs = processor(images=pil_image, return_tensors="pt").to(self.device)
        with torch.no_grad():
            outputs = model(**inputs)
            predicted_depth = outputs.predicted_depth
        return self._rescale_depth(predicted_depth, size=target_size, scale=self.DEPTH_SCALE)
+# Tải và cache mô hình YOLO
 @st.cache_resource
 def load_yolo_model():
+    from ultralytics import YOLO
     model = YOLO("yolov8n.pt")
     return model
+# Phát hiện người trong ảnh
def detect_people(image, confidence_threshold=0.5):
    """Run the cached YOLO model on ``image`` and keep only "person" hits.

    Args:
        image: image handed straight to the YOLO model.
        confidence_threshold: passed to the model as ``conf`` and re-applied
            as a strict lower bound on each detection's score.

    Returns:
        List of ``(x1, y1, x2, y2, conf)`` tuples with integer corner
        coordinates and the detection score.
    """
    detector = load_yolo_model()
    people = []
    for prediction in detector(image, conf=confidence_threshold):
        corners = prediction.boxes.xyxy.cpu().numpy()
        labels = prediction.boxes.cls.cpu().numpy()
        scores = prediction.boxes.conf.cpu().numpy()
        for corner, label, score in zip(corners, labels, scores):
            if prediction.names[int(label)] != "person":
                continue
            if not (score > confidence_threshold):
                continue
            left, top, right, bottom = map(int, corner[:4])
            people.append((left, top, right, bottom, score))
    return people
+# Xử lý ảnh
@st.cache_resource
def _get_depth_estimator():
    """Return one shared DepthEstimator so the depth model is not reloaded per call."""
    return DepthEstimator()


def process_image(image, confidence=0.5):
    """Detect people in a still image and annotate each with a depth reading.

    Args:
        image: uint8 NumPy image, either H x W x 3 in RGB order or 2-D
            grayscale. NOTE(review): RGB is what ``main()`` passes
            (``np.array`` of a PIL image); confirm for any other caller.
        confidence: detection score threshold forwarded to ``detect_people``.

    Returns:
        ``(annotated_image, depth_colormap, person_count)``. The annotated
        image is a copy of the input with boxes and labels drawn on it.
    """
    if image.ndim == 2:
        # Promote grayscale so the colour conversions below are valid.
        image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
    display_image = image.copy()
    height, width = image.shape[:2]

    # Ultralytics interprets NumPy input as BGR, so swap channels for it.
    person_boxes = detect_people(cv2.cvtColor(image, cv2.COLOR_RGB2BGR), confidence)

    # Passing a PIL image avoids predict_depth's BGR->RGB swap, which would
    # scramble the channels of an array that is already RGB.
    depth_map = _get_depth_estimator().predict_depth(Image.fromarray(image))
    if depth_map.shape[:2] != (height, width):
        # Bring the map to image resolution so box coordinates index the
        # matching depth pixel.
        depth_map = cv2.resize(depth_map, (width, height), interpolation=cv2.INTER_LINEAR)

    depth_colormap = create_depth_colormap(depth_map)

    for x1, y1, x2, y2, conf in person_boxes:
        cv2.rectangle(display_image, (x1, y1), (x2, y2), (0, 255, 0), 2)
        # Sample the depth at the box centre, clamped to the image bounds.
        center_x = min(max((x1 + x2) // 2, 0), width - 1)
        center_y = min(max((y1 + y2) // 2, 0), height - 1)
        depth_value = depth_map[center_y, center_x]
        text = f"Độ sâu: {depth_value:.2f}m ({conf:.2f})"
        draw_label(display_image, text, (x1, y1))
    return display_image, depth_colormap, len(person_boxes)
+# Xử lý video
+def process_video(video_path, confidence=0.5, progress_bar=None, progress_text=None):
+    # Mở video
     cap = cv2.VideoCapture(video_path)
+    # Lấy thuộc tính video
     width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
     height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
     fps = cap.get(cv2.CAP_PROP_FPS)
     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    # Tạo tệp đầu ra
     temp_output_dir = tempfile.mkdtemp()
+    output_video_path = os.path.join(temp_output_dir, "detection_depth.mp4")
+    # Thiết lập writer
+    fourcc = cv2.VideoWriter_fourcc(*'XVID')
+    out = cv2.VideoWriter(output_video_path, fourcc, fps, (width * 2, height))
+    # Đối tượng phát hiện và ước tính độ sâu
+    depth_estimator = DepthEstimator()
+    # Biến đếm
     frame_counter = 0
+    person_count = 0
+    # Tạo cột để hiển thị khung hình
+    preview_col1, preview_col2 = st.columns(2)
+    detection_placeholder = preview_col1.empty()
+    depth_placeholder = preview_col2.empty()
     try:
         while cap.isOpened():
             frame_counter += 1
             # Cập nhật tiến trình
+            if progress_bar:
+                progress = int(frame_counter / total_frames * 100)
+                progress_bar.progress(progress)
+            if frame_counter % 10 == 0 and progress_text:
+                progress_text.text(f"Đang xử lý: {frame_counter}/{total_frames} khung hình")
+            # Phát hiện người
+            person_boxes = detect_people(frame, confidence)
+            person_count += len(person_boxes)
+            # Ước tính độ sâu (chỉ xử lý mỗi 5 khung hình để tăng tốc độ)
+            if frame_counter % 5 == 0 or frame_counter == 1:
+                depth_map = depth_estimator.predict_depth(frame)
             # Tạo bản đồ màu độ sâu
+            depth_colormap = create_depth_colormap(depth_map)
+            # Vẽ khung giới hạn và thông tin độ sâu
+            for x1, y1, x2, y2, conf in person_boxes:
+                # Vẽ khung giới hạn
+                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
+                # Tính toán độ sâu tại vị trí trung tâm
                 center_x = (x1 + x2) // 2
                 center_y = (y1 + y2) // 2
                 # Đảm bảo tọa độ nằm trong giới hạn
+                center_x = min(center_x, depth_map.shape[1] - 1) if center_x < depth_map.shape[1] else depth_map.shape[1] // 2
+                center_y = min(center_y, depth_map.shape[0] - 1) if center_y < depth_map.shape[0] else depth_map.shape[0] // 2
+                depth_value = depth_map[center_y, center_x]
+                # Vẽ nhãn độ sâu
+                text = f"Độ sâu: {depth_value:.2f}m ({conf:.2f})"
+                draw_label(frame, text, (x1, y1))
+            # Ghép hai khung hình lại với nhau
+            combined_frame = np.hstack((frame, cv2.cvtColor(depth_colormap, cv2.COLOR_RGB2BGR)))
+            # Ghi khung hình
+            out.write(combined_frame)
+            # Hiển thị khung hình trong Streamlit (cập nhật mỗi 5 khung hình)
             if frame_counter % 5 == 0:
                 detection_placeholder.image(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), caption="Phát hiện người", use_column_width=True)
                 depth_placeholder.image(depth_colormap, caption="Bản đồ độ sâu", use_column_width=True)
     finally:
         # Giải phóng tài nguyên
         cap.release()
+        out.release()
+        # Tính trung bình số người phát hiện được
+        avg_persons = person_count / frame_counter if frame_counter > 0 else 0
+        return output_video_path, avg_persons
+# Hàm tiện ích
def create_depth_colormap(depth_map):
    """Render a depth map as an RGB TURBO colour image.

    The map is min-max normalised and inverted before colouring, so the
    smallest depth gets the top of the colormap. A constant (or non-finite)
    map has no range to normalise; it is rendered as a uniform image instead
    of dividing by zero.

    Args:
        depth_map: 2-D numeric array.

    Returns:
        H x W x 3 uint8 array in RGB channel order.
    """
    depth = np.asarray(depth_map)
    low = np.min(depth)
    span = np.max(depth) - low
    if span > 0:  # also False when the map contains NaN
        normalized = (depth - low) / span
    else:
        normalized = np.zeros(depth.shape, dtype=np.float32)
    # Invert so that near pixels map to the high end of the colormap.
    inv_depth = 1 - normalized
    levels = np.clip(inv_depth * 255, 0, 255).astype(np.uint8)
    colored = cv2.applyColorMap(levels, cv2.COLORMAP_TURBO)
    # applyColorMap returns BGR; callers display the result as RGB.
    return cv2.cvtColor(colored, cv2.COLOR_BGR2RGB)
def _ascii_label(text):
    """Transliterate ``text`` to plain ASCII for OpenCV's Hershey fonts."""
    import unicodedata

    # "Đ"/"đ" have no Unicode decomposition, so map them explicitly.
    text = text.replace("Đ", "D").replace("đ", "d")
    decomposed = unicodedata.normalize("NFKD", text)
    without_marks = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    return without_marks.encode("ascii", "replace").decode("ascii")


def draw_label(image, text, position):
    """Draw ``text`` on a filled green background next to ``position``.

    The label normally sits just above ``position`` (the top-left corner of a
    bounding box). If it would start above the top edge of the image, it is
    drawn just below ``position`` instead so that it stays visible.

    Args:
        image: image array, modified in place.
        text: label text. Hershey fonts cannot draw non-ASCII characters
            (they come out as question marks), so the text is transliterated
            to ASCII first.
        position: ``(x, y)`` anchor in pixel coordinates.
    """
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.7
    font_thickness = 2
    text = _ascii_label(text)
    text_width, text_height = cv2.getTextSize(text, font, font_scale, font_thickness)[0]
    x, y = position
    text_x = max(x, 0)
    text_y = y - 10
    if text_y - text_height - 5 < 0:
        # Not enough room above the anchor: move the label below it.
        text_y = max(y, 0) + text_height + 10
    top_left = (text_x - 5, text_y - text_height - 5)
    bottom_right = (text_x + text_width + 5, text_y + 5)
    cv2.rectangle(image, top_left, bottom_right, (0, 255, 0), -1)
    cv2.putText(image, text, (text_x, text_y), font, font_scale, (0, 0, 0), font_thickness)
+# Giao diện người dùng chính
 def main():
+    st.title("Phát hiện người và Ước tính độ sâu")
+    # Sidebar với tùy chọn
     st.sidebar.header("Tùy chọn")
+    confidence = st.sidebar.slider("Ngưỡng tin cậy", 0.0, 1.0, 0.5)
+    # Chọn chế độ: Ảnh hoặc Video
+    mode = st.sidebar.radio("Chế độ", ["Ảnh", "Video"])
+    # Chọn nguồn: Tải lên hoặc Mẫu
+    source = st.sidebar.radio("Nguồn", ["Tải lên", "Mẫu"])
+    if mode == "Ảnh":
+        if source == "Tải lên":
+            uploaded_file = st.file_uploader("Tải lên ảnh", type=['jpg', 'jpeg', 'png'])
+            if uploaded_file is not None:
+                image = Image.open(uploaded_file)
+                image = np.array(image)
+                # Chuyển đổi sang RGB nếu là RGBA
+                if image.shape[2] == 4:
+                    image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
+                st.image(image, caption="Ảnh đã tải lên", use_column_width=True)
+                if st.button("Xử lý Ảnh"):
+                    with st.spinner("Đang xử lý ảnh..."):
+                        result_image, depth_colormap, person_count = process_image(image, confidence)
+                    st.success(f"Phát hiện {person_count} người trong ảnh")
+                    col1, col2 = st.columns(2)
+                    col1.image(result_image, caption="Kết quả phát hiện", use_column_width=True)
+                    col2.image(depth_colormap, caption="Bản đồ độ sâu", use_column_width=True)
+        else:
+            # Sử dụng ảnh mẫu
+            st.info("Đang sử dụng ảnh mẫu...")
+            sample_img_url = "https://storage.googleapis.com/sfr-vision-language-research/DINO/ground_truth_images/000000014439.jpg"
+            try:
+                response = requests.get(sample_img_url)
+                image = Image.open(BytesIO(response.content))
+                image = np.array(image)
+                st.image(image, caption="Ảnh mẫu", use_column_width=True)
+                if st.button("Xử lý Ảnh"):
+                    with st.spinner("Đang xử lý ảnh..."):
+                        result_image, depth_colormap, person_count = process_image(image, confidence)
+                    st.success(f"Phát hiện {person_count} người trong ảnh")
+                    col1, col2 = st.columns(2)
+                    col1.image(result_image, caption="Kết quả phát hiện", use_column_width=True)
+                    col2.image(depth_colormap, caption="Bản đồ độ sâu", use_column_width=True)
+            except Exception as e:
+                st.error(f"Không thể tải ảnh mẫu: {e}")
     else:
+        # Chế độ Video
+        if source == "Tải lên":
+            uploaded_file = st.file_uploader("Tải lên video", type=['mp4', 'avi', 'mov'])
+            if uploaded_file is not None:
+                # Lưu tệp đã tải lên vào thư mục tạm thời
+                temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4')
+                temp_file.write(uploaded_file.read())
+                video_path = temp_file.name
+                temp_file.close()
+                st.video(video_path)
+                if st.button("Xử lý Video"):
+                    progress_bar = st.progress(0)
+                    progress_text = st.empty()
+                    with st.spinner("Đang xử lý video..."):
+                        output_path, avg_persons = process_video(video_path, confidence, progress_bar, progress_text)
+                    st.success(f"Xử lý video hoàn tất! Trung bình phát hiện {avg_persons:.1f} người/khung hình")
+                    st.video(output_path)
+                    # Nút tải xuống
+                    with open(output_path, 'rb') as file:
+                        st.download_button(
+                            label="Tải xuống video kết quả",
+                            data=file,
+                            file_name="detection_depth_result.mp4",
+                            mime="video/mp4"
+                        )
+                    # Xóa tệp tạm thời
+                    os.unlink(video_path)
+        else:
+            # Sử dụng video mẫu
+            st.info("Đang sử dụng video mẫu...")
+            sample_video_url = "https://huggingface.co/spaces/Nupoor/SampleVideoDataset/resolve/main/pexels-richard-de-souza-1635985.mp4"
+            try:
+                response = requests.get(sample_video_url)
+                temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4')
+                temp_file.write(response.content)
+                video_path = temp_file.name
+                temp_file.close()
+                st.video(video_path)
+                if st.button("Xử lý Video"):
+                    progress_bar = st.progress(0)
+                    progress_text = st.empty()
+                    with st.spinner("Đang xử lý video..."):
+                        output_path, avg_persons = process_video(video_path, confidence, progress_bar, progress_text)
+                    st.success(f"Xử lý video hoàn tất! Trung bình phát hiện {avg_persons:.1f} người/khung hình")
+                    st.video(output_path)
+                    # Nút tải xuống
+                    with open(output_path, 'rb') as file:
+                        st.download_button(
+                            label="Tải xuống video kết quả",
+                            data=file,
+                            file_name="detection_depth_result.mp4",
+                            mime="video/mp4"
+                        )
+                    # Xóa tệp tạm thời
+                    os.unlink(video_path)
+            except Exception as e:
+                st.error(f"Không thể tải video mẫu: {e}")
+    # Thông tin
+    st.sidebar.header("Thông tin")
     st.sidebar.markdown("""
+    **Mô hình sử dụng:**
     - Phát hiện người: YOLOv8n
+    - Ước tính độ sâu: GLPN-NYU
+    **Cách sử dụng:**
+    1. Chọn chế độ (Ảnh/Video)
+    2. Chọn nguồn (Tải lên/Mẫu)
+    3. Điều chỉnh ngưỡng tin cậy
+    4. Nhấn nút xử lý
+    """)
+# Thiết lập requirements.txt
 def create_requirements():
+    return """
+    streamlit==1.30.0
+    numpy==1.24.3
+    Pillow==10.0.0
+    opencv-python-headless==4.8.0.76
+    torch==2.0.1
+    torchvision==0.15.2
+    transformers==4.35.2
+    ultralytics==8.0.43
+    requests==2.31.0
     """
 if __name__ == "__main__":
     main()