File size: 7,924 Bytes
177bbfb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8fe7c93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177bbfb
 
 
a658e6a
177bbfb
 
 
 
a658e6a
 
 
 
177bbfb
 
 
 
 
 
 
 
 
6302caf
177bbfb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5f09c0c
177bbfb
 
 
 
 
 
 
 
 
61cf60c
 
177bbfb
 
 
 
 
 
fe6938d
177bbfb
 
6de1fe3
177bbfb
 
 
8fe7c93
 
a658e6a
8fe7c93
 
6de1fe3
8fe7c93
 
 
177bbfb
 
a658e6a
 
 
 
 
177bbfb
 
 
 
 
a658e6a
177bbfb
 
 
a658e6a
 
177bbfb
 
 
 
 
 
 
 
a658e6a
 
 
 
177bbfb
 
a658e6a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
import gradio as gr
import cv2
import mediapipe as mp
import numpy as np
import os

# Initialize MediaPipe Pose
# A single module-level Pose instance is shared by every processing call below.
# static_image_mode=False treats inputs as a video stream so landmarks are
# tracked between consecutive frames; detection/tracking thresholds are 0.5.
mp_pose = mp.solutions.pose
pose = mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5, min_tracking_confidence=0.5)

# Subset of landmarks to visualize
landmark_indices = [
    11, 13, 15,  # Left shoulder, elbow, wrist
    12, 14, 16,  # Right shoulder, elbow, wrist
    23, 25, 27,  # Left hip, knee, ankle
    24, 26, 28   # Right hip, knee, ankle
]

# Function to calculate frame similarity for the selected landmarks
def calculate_frame_similarity(ref_landmarks, inp_landmarks):
    ref = np.array([(ref_landmarks[idx].x, ref_landmarks[idx].y) for idx in landmark_indices])
    inp = np.array([(inp_landmarks[idx].x, inp_landmarks[idx].y) for idx in landmark_indices])

    ref_torso = np.linalg.norm(ref[0][:2] - ref[6][:2])
    inp_torso = np.linalg.norm(inp[0][:2] - inp[6][:2])
    ref_normalized = ref / ref_torso
    inp_normalized = inp / inp_torso

    distance = np.linalg.norm(ref_normalized - inp_normalized)
    similarity_score = max(0, 1 - distance)
    return similarity_score

# Function to draw selected landmarks
def draw_selected_landmarks(frame, landmarks):
    """Overlay the tracked joints and limb segments on ``frame`` in place.

    Joints from ``landmark_indices`` with visibility > 0.5 are drawn as
    filled green circles; limb segments whose two endpoints are both
    visible are drawn as blue lines. Landmark coordinates are normalized
    to [0, 1], so they are scaled by the frame width/height first.
    """
    height, width = frame.shape[0], frame.shape[1]

    def to_pixel(lm):
        # Normalized landmark -> integer pixel coordinates.
        return (int(lm.x * width), int(lm.y * height))

    # Joints: filled green circles.
    for idx in landmark_indices:
        joint = landmarks[idx]
        if joint.visibility > 0.5:
            cv2.circle(frame, to_pixel(joint), 5, (0, 255, 0), -1)

    # Limb segments between adjacent tracked joints: blue lines.
    limb_pairs = (
        (11, 13), (13, 15),
        (12, 14), (14, 16),
        (23, 25), (25, 27),
        (24, 26), (26, 28),
    )
    for a, b in limb_pairs:
        start, end = landmarks[a], landmarks[b]
        if start.visibility > 0.5 and end.visibility > 0.5:
            cv2.line(frame, to_pixel(start), to_pixel(end), (255, 0, 0), 2)

# Function to get color based on similarity score
def get_similarity_color(similarity_score):
    """Map a similarity score to a BGR status color.

    >= 0.75 -> green, >= 0.25 -> yellow, otherwise red (OpenCV BGR order).
    """
    if similarity_score >= 0.75:
        return (0, 255, 0)
    if similarity_score >= 0.25:
        return (0, 255, 255)
    return (0, 0, 255)

def generate_feedback(ref_landmarks, inp_landmarks, diff_threshold=0.05):
    """Build a short textual correction for the worst-matching joint.

    Compares the tracked landmarks of the reference and input poses in
    normalized image coordinates. When the largest per-joint deviation
    exceeds ``diff_threshold``, returns advice naming that joint with
    vertical and/or horizontal adjustment hints; otherwise returns "".
    """
    # Human-readable names for the tracked landmark indices.
    landmark_names = {
        11: "left shoulder", 13: "left elbow", 15: "left wrist",
        12: "right shoulder", 14: "right elbow", 16: "right wrist",
        23: "left hip", 25: "left knee", 27: "left ankle",
        24: "right hip", 26: "right knee", 28: "right ankle"
    }

    def deviation(idx):
        # Euclidean distance between reference and input positions.
        ref_xy = np.array([ref_landmarks[idx].x, ref_landmarks[idx].y])
        inp_xy = np.array([inp_landmarks[idx].x, inp_landmarks[idx].y])
        return np.linalg.norm(ref_xy - inp_xy)

    # Joint with the largest positional error (first one wins on ties).
    worst_idx = max(landmark_indices, key=deviation)
    worst_diff = deviation(worst_idx)

    if worst_diff <= diff_threshold:
        return ""

    joint = landmark_names.get(worst_idx, f"landmark {worst_idx}")
    ref_x, ref_y = ref_landmarks[worst_idx].x, ref_landmarks[worst_idx].y
    inp_x, inp_y = inp_landmarks[worst_idx].x, inp_landmarks[worst_idx].y
    advice = f"Your {joint} seems misaligned. "
    # Vertical adjustment (image y grows downward, so larger y means lower).
    if inp_y > ref_y + diff_threshold:
        advice += f"Try raising your {joint}."
    elif inp_y < ref_y - diff_threshold:
        advice += f"Try lowering your {joint}."
    # Horizontal adjustment.
    if inp_x < ref_x - diff_threshold:
        advice += " Also, move it to the right."
    elif inp_x > ref_x + diff_threshold:
        advice += " Also, move it to the left."
    return advice

# Function to process videos and generate comparison
def process_videos(reference_video_path, input_video_path):
    """Compare two videos frame by frame and write a side-by-side result.

    For each synchronized frame pair, detects pose landmarks with the shared
    module-level ``pose`` instance, overlays the tracked joints, computes a
    similarity score, and annotates the combined frame with the score and
    (when similarity < 0.75) textual feedback. Frames whose similarity drops
    below 0.5 are also collected as RGB snapshots, throttled to roughly one
    per second of video.

    Args:
        reference_video_path: path to the reference (target) video.
        input_video_path: path to the video being evaluated.

    Returns:
        tuple: (path of the written comparison video,
                list of low-similarity RGB frames).
    """
    output_video_path = "output_comparison.mp4"
    low_similarity_frames = []

    cap_ref = cv2.VideoCapture(reference_video_path)
    cap_inp = cv2.VideoCapture(input_video_path)
    # Output timing follows the reference video's frame rate.
    # NOTE(review): fps is 0 when the container does not report a rate,
    # which would zero the save gap and break the writer — confirm inputs.
    fps = int(cap_ref.get(cv2.CAP_PROP_FPS))

    min_frame_gap = fps  # at least one second gap; adjust this as needed
    frame_index = 0
    # Negative start lets the very first low-similarity frame be saved.
    last_saved_frame_index = -min_frame_gap
    
    ref_width = int(cap_ref.get(cv2.CAP_PROP_FRAME_WIDTH))
    ref_height = int(cap_ref.get(cv2.CAP_PROP_FRAME_HEIGHT))
    inp_width = int(cap_inp.get(cv2.CAP_PROP_FRAME_WIDTH))
    inp_height = int(cap_inp.get(cv2.CAP_PROP_FRAME_HEIGHT))
    
    # Both streams are resized to a common size before concatenation.
    output_width = max(ref_width, inp_width)
    output_height = max(ref_height, inp_height)
    
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    # Output frame is the two resized frames side by side, hence 2x width.
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (2 * output_width, output_height))

    # Process until either stream runs out of frames.
    while cap_ref.isOpened() and cap_inp.isOpened():
        ret_ref, frame_ref = cap_ref.read()
        ret_inp, frame_inp = cap_inp.read()
        if not ret_ref or not ret_inp:
            break

        frame_ref_resized = cv2.resize(frame_ref, (output_width, output_height))
        frame_inp_resized = cv2.resize(frame_inp, (output_width, output_height))

        # MediaPipe expects RGB input; OpenCV frames are BGR.
        frame_ref_rgb = cv2.cvtColor(frame_ref_resized, cv2.COLOR_BGR2RGB)
        frame_inp_rgb = cv2.cvtColor(frame_inp_resized, cv2.COLOR_BGR2RGB)

        results_ref = pose.process(frame_ref_rgb)
        results_inp = pose.process(frame_inp_rgb)

        similarity_score = 0
        feedback = "";
        # Score and feedback only when both frames yielded pose detections.
        if results_ref.pose_landmarks and results_inp.pose_landmarks:
            draw_selected_landmarks(frame_ref_resized, results_ref.pose_landmarks.landmark)
            draw_selected_landmarks(frame_inp_resized, results_inp.pose_landmarks.landmark)

            # Scoring uses world landmarks; drawing uses image landmarks.
            similarity_score = calculate_frame_similarity(
                results_ref.pose_world_landmarks.landmark, 
                results_inp.pose_world_landmarks.landmark
            )

            if (similarity_score<0.75) :
                feedback = generate_feedback(results_ref.pose_landmarks.landmark, results_inp.pose_landmarks.landmark)
        similarity_color = get_similarity_color(similarity_score)

        combined_frame = cv2.hconcat([frame_ref_resized, frame_inp_resized])

        # Similarity readout, colored red/yellow/green by score band.
        cv2.putText(
            combined_frame, 
            f"Similarity: {similarity_score*100:.2f}", 
            (50, 50), 
            cv2.FONT_HERSHEY_SIMPLEX, 
            2, 
            similarity_color, 
            2
        )
        cv2.putText(
            combined_frame,
            "Feedback: " + feedback,
            (50,100),
            cv2.FONT_HERSHEY_SIMPLEX, 
            1, 
            (255,255,255), 
            2
        ) 

        out.write(combined_frame)
        # Snapshot poorly matching frames, at most one per min_frame_gap
        # frames, converted to RGB for the Gradio gallery.
        if similarity_score < 0.5 and (frame_index - last_saved_frame_index) >= min_frame_gap:
            low_similarity_frames.append(cv2.cvtColor(combined_frame.copy(), cv2.COLOR_BGR2RGB))
            last_saved_frame_index = frame_index

        frame_index += 1

    cap_ref.release()
    cap_inp.release()
    out.release()

    return output_video_path, low_similarity_frames

# Gradio interface
def compare_dance_videos(reference_video, input_video):
    """Gradio callback: run the comparison and pass its results through.

    Returns the (output video path, low-similarity frames) pair produced
    by process_videos, matching the two declared interface outputs.
    """
    return process_videos(reference_video, input_video)

# Gradio app setup
# Two video uploads in; one annotated side-by-side video plus a gallery of
# the worst-matching frames out. launch() blocks and serves the UI when the
# script is executed.
gr.Interface(
    fn=compare_dance_videos,
    inputs=[
        gr.Video(label="Reference Video"),
        gr.Video(label="Input Video")
    ],
    outputs=[
        gr.Video(label="Output Comparison Video"),
        gr.Gallery(label="Low Similarity Frames")
    ],
    title="Dance Comparison Tool",
    description="Upload two videos to compare the dance sequences and generate a similarity score visualization."
).launch()