import statistics

import torch
import numpy as np

from pdvc.dp.exact_dp import drop_dtw, double_drop_dtw
from pdvc.dp.dp_utils import compute_sim
from pdvc.util import find_center_value, compute_overlap


''' configs of the original file '''
config_eval_l2norm = True
config_eval_keep_percentile = 0.48
config_eval_fixed_drop_sim = -1


'''
Return value:
    frame features: [num_frames, feature_dim] -> optimal_assignment: [num_frames];
    -1 means no match, otherwise the index of the matched step/caption/query.
'''


def clip_array(arr, threshold):
    # Floor the array at `threshold`: values below it are raised to the threshold.
    clipped_arr = np.where(arr > threshold, arr, threshold)
    return clipped_arr
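

# Hedged sanity check (illustrative values, not part of the original file):
# clip_array floors entries at the threshold rather than capping them.
def _demo_clip_array():
    arr = np.array([0.2, 0.5, 0.9])
    assert np.allclose(clip_array(arr, 0.5), [0.5, 0.5, 0.9])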


def compute_filtered_indices(topk_indices, topk_values, threshold=0.5):
    # Callers pass plain lists; the arithmetic below needs arrays.
    topk_indices = np.asarray(topk_indices)
    topk_values = np.asarray(topk_values)
    center_index = find_center_value(topk_indices)
    # Similarity-weighted standard deviation of the top-k indices around the center.
    std_index = (np.sum((topk_indices - center_index) ** 2 * topk_values) / (np.sum(topk_values) + 1e-5)) ** 0.5
    boundary_width = std_index * threshold
    filtered_indices = [i for i in topk_indices.tolist() if abs(i - center_index) <= boundary_width]
    return filtered_indices


def compute_bbox_loss(index_list, box, similarity_values):
    # Similarity-weighted distance loss: indices inside the box contribute a
    # negative distance to the nearest edge (a reward), indices outside
    # contribute a positive distance to the box (a penalty).
    left, right = box
    distances = []
    for i, index in enumerate(index_list):
        if left <= index <= right:
            distance = -min(index - left, right - index)
        else:
            distance = max(left - index, index - right)
        weighted_distance = similarity_values[i] * distance
        distances.append(weighted_distance)
    return sum(distances)
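

# Hedged sanity check (illustrative values, not part of the original file):
# with box [3, 7], index 5 sits inside (distance -2), while 2 and 9 fall
# outside (distances 1 and 2), so the weighted sum is 1.
def _demo_compute_bbox_loss():
    loss = compute_bbox_loss([2, 5, 9], [3, 7], [1.0, 1.0, 1.0])
    assert loss == 1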


def remove_outliers(indices, threshold, mode, w):
    # Pick a center estimate: median, mean, or a windowed mode (the value
    # whose +/- w neighborhood covers the most indices; see get_mode below).
    if mode == 'median':
        center = statistics.median(indices)
    elif mode == 'mean':
        center = sum(indices) / len(indices)
    elif mode == 'mode':
        center = get_mode(indices, w)
    else:
        raise ValueError(f'unknown mode: {mode}')

    mean = sum(indices) / len(indices)
    std_dev = (sum((x - mean) ** 2 for x in indices) / len(indices)) ** 0.5
    threshold_value = threshold * std_dev

    # Keep only indices within `threshold` standard deviations of the center.
    filtered_indices = [i for i in indices if abs(i - center) <= threshold_value]
    return filtered_indices
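

# Hedged sanity check (illustrative values, not part of the original file):
# 50 lies well beyond one standard deviation (~16.9) of the median 11.5,
# so it is dropped.
def _demo_remove_outliers():
    kept = remove_outliers([10, 11, 12, 50], threshold=1, mode='median', w=2)
    assert kept == [10, 11, 12]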


def remove_outliers_v1(indices, threshold):
    # TODO: unimplemented variant, kept as a placeholder.
    pass


def get_mode(indices, w):
    '''Windowed mode: the value p (between min and max of `indices`) whose
    +/- w neighborhood contains the most indices; ties are broken by taking
    the median of the best candidates (trimmed to an odd count).'''
    count_dict = {}
    for p in range(min(indices), max(indices) + 1):
        count = sum(1 for c in indices if p - w <= c <= p + w)
        count_dict[p] = count

    max_count = max(count_dict.values())
    best_p_values = [p for p, count in count_dict.items() if count == max_count]
    if len(best_p_values) % 2 == 0:
        best_p_values.pop()

    mode_value = statistics.median(best_p_values)
    return mode_value
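

# Hedged sanity check (illustrative values, not part of the original file):
# with w=1, p=2 covers {1, 2, 2, 3}, more than any other window.
def _demo_get_mode():
    assert get_mode([1, 2, 2, 3, 9], w=1) == 2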


def get_mode_box(sim, topk, w, ratio):
    '''Note: the center is computed from the top-k indices because the top-k
    hits are the most trustworthy; once the center is fixed, the boundary
    search needs broader coverage, so it uses all of the top
    `avg_caption_length` indices.'''
    avg_caption_length = sim.shape[1] // sim.shape[0]
    sorted_idx = torch.argsort(-sim, dim=1)
    top_indices = sorted_idx[:, :topk]
    top_cap_indices = sorted_idx[:, :avg_caption_length]

    width = int(ratio * avg_caption_length / 2)

    bbox = []
    for i in range(top_indices.shape[0]):
        mode_value = get_mode(top_indices[i].tolist(), w)
        filtered_indices = [idx for idx in top_cap_indices[i].tolist() if abs(idx - mode_value) <= width]
        if len(filtered_indices) == 0:
            bbox.append([mode_value - width, mode_value + width])
        else:
            bbox.append([min(filtered_indices), max(filtered_indices)])
    return bbox


def compute_threshold(data, threshold):
    mean = sum(data) / len(data)
    std_dev = (sum((x - mean) ** 2 for x in data) / len(data)) ** 0.5
    threshold_value = threshold * std_dev
    return threshold_value
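

# Hedged sanity check (illustrative values, not part of the original file):
# [1, 2, 3, 4, 5] has a population standard deviation of sqrt(2).
def _demo_compute_threshold():
    assert abs(compute_threshold([1, 2, 3, 4, 5], 1) - 2 ** 0.5) < 1e-9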


''' Find the center globally, then find the boundary locally:
1. find center: use the windowed similarity values to locate each step's center frame;
2. find boundary: keep only the top-k frames close to that center and take their extent. '''
def step_retrieval_weight_sim(frame_features, step_features, topk=15, threshold=0.5, w=2):
    if step_features.shape[0] == 0:
        return -np.ones(frame_features.shape[0])

    similarity_matrix = compute_sim(step_features, frame_features, config_eval_l2norm).cpu()
    # Sliding-window sums of similarity over 2*w + 1 frames; output index j
    # corresponds to the window centered at frame j + w.
    window_sums = torch.nn.functional.conv1d(similarity_matrix.unsqueeze(1), torch.ones(1, 1, 2 * w + 1)).squeeze()
    if len(window_sums.shape) == 1:
        # A single step collapses to 1-D after squeeze; restore the step dimension.
        window_sums = window_sums.unsqueeze(0)

    top_values, top_indices = torch.topk(window_sums, topk, dim=1, largest=True, sorted=True)

    _, step_center_frames = window_sums.max(dim=1)
    step_center_frames = step_center_frames.tolist()

    bbox = []
    for i in range(top_indices.shape[0]):
        threshold_value = compute_threshold(top_indices[i].tolist(), threshold)
        filtered_indices = [frame for frame in top_indices[i].tolist() if abs(frame - step_center_frames[i]) <= threshold_value]
        if len(filtered_indices) == 0:
            # Fall back to a window of width 2*w around the center, mapped back
            # to frame coordinates (+w offset).
            bbox.append([step_center_frames[i], step_center_frames[i] + 2 * w])
        else:
            bbox.append([w + min(filtered_indices), w + max(filtered_indices)])
    return bbox


''' TODO: get the right weight using the index. '''
def step_retrieval_weight_index(frame_features, step_features, topk=15, threshold=0.5, w=2):
    if step_features.shape[0] == 0:
        return -np.ones(frame_features.shape[0])

    similarity_matrix = compute_sim(step_features, frame_features, config_eval_l2norm).cpu()
    # Sliding-window sums over 2*w + 1 frames; output index j corresponds to
    # the window centered at frame j + w.
    window_sums = torch.nn.functional.conv1d(similarity_matrix.unsqueeze(1), torch.ones(1, 1, 2 * w + 1)).squeeze()
    if len(window_sums.shape) == 1:
        window_sums = window_sums.unsqueeze(0)

    top_values, top_indices = torch.topk(window_sums, topk, dim=1, largest=True, sorted=True)

    _, step_center_frames = window_sums.max(dim=1)
    step_center_frames = step_center_frames.tolist()

    bbox = []
    for i in range(top_indices.shape[0]):
        threshold_value = compute_threshold(top_indices[i].tolist(), threshold)
        filtered_indices = [frame for frame in top_indices[i].tolist() if abs(frame - step_center_frames[i]) <= threshold_value]
        if len(filtered_indices) == 0:
            # Fall back to a window of width 2*w around the center, mapped back
            # to frame coordinates (+w offset).
            bbox.append([step_center_frames[i], step_center_frames[i] + 2 * w])
        else:
            bbox.append([w + min(filtered_indices), w + max(filtered_indices)])
    return bbox


def uniform_box(frame_features, step_features, topk=15, threshold=0.5, w=2, mode='median'):
    if step_features.shape[0] == 0:
        return -np.ones(frame_features.shape[0])

    uniform_boxes = uniform_window(frame_features.shape[0], step_features.shape[0])
    return uniform_boxes


def align_frame_into_steps(frame_features, step_features, topk=15, threshold=0.5, w=2, mode='median'):
    if step_features.shape[0] == 0:
        return -np.ones(frame_features.shape[0])

    sim = compute_sim(step_features, frame_features, config_eval_l2norm).cpu()
    average_width = int(sim.shape[1] // sim.shape[0] / 2)

    top_values, top_indices = torch.topk(sim, topk, dim=1, largest=True, sorted=True)
    bbox = []
    for i in range(top_indices.shape[0]):
        filtered_indices = remove_outliers(top_indices[i].tolist(), threshold, mode=mode, w=w)
        if len(filtered_indices) < 2:
            # Retry with a looser threshold before falling back.
            filtered_indices = remove_outliers(top_indices[i].tolist(), 2 * threshold, mode=mode, w=w)
        if len(filtered_indices) == 0:
            # Fall back to a window of average half-width around this step's top-1 frame.
            bbox.append([top_indices[i][0].item() - average_width, top_indices[i][0].item() + average_width])
            continue
        bbox.append([min(filtered_indices), max(filtered_indices)])
    return bbox


def align_frame_into_steps_op(frame_features, step_features, topk=15, num_iterations=4, beta=1, order=False, scale=1):
    # Candidate multipliers used to grow or shrink each step's boundary width.
    augment_ratio_list = np.arange(0.5, 2, 0.1)

    if step_features.shape[0] == 0:
        return -np.ones(frame_features.shape[0])

    similarity_matrix = compute_sim(step_features, frame_features, config_eval_l2norm).cpu().numpy()
    num_steps, num_frames = similarity_matrix.shape

    # Frame indices sorted by ascending similarity, per step.
    sorted_indices = np.argsort(similarity_matrix, axis=1)

    initial_boundary_width = num_frames / num_steps

    for i in range(num_iterations):
        if i == 0 and not order:
            # Unordered initialization: global top-k frames per step.
            boundary_width_last = np.full(num_steps, initial_boundary_width).reshape(-1, 1, 1)
            topk_indices = [index[-topk:] for index in sorted_indices]
            topk_values = [similarity_matrix[s][index] for s, index in enumerate(topk_indices)]
            center_indexes = np.array([find_center_value(index) for index in topk_indices])
            previous_index_center = None
        else:
            if i == 0:
                # Ordered initialization: a uniform partition of the video, expanded by `scale`.
                segment_boundary = np.linspace(0, num_frames, num_steps + 1).round().astype(int)
                start_indices, end_indices = segment_boundary[:-1], segment_boundary[1:]
                start_indices = np.clip(start_indices - initial_boundary_width * scale, 0, num_frames)
                end_indices = np.clip(end_indices + initial_boundary_width * scale, 0, num_frames)
                boundary_width_last = (end_indices - start_indices).reshape(-1, 1, 1)

                filtered_indices = [sorted_indices[s][(sorted_indices[s] >= start_indices[s]) & (sorted_indices[s] <= end_indices[s])] for s in range(num_steps)]
                if sum(len(index) for index in filtered_indices) < topk * num_steps * 0.4:
                    # Too few candidates survive the order prior; fall back to the global top-k.
                    boundary_width_last = np.full(num_steps, initial_boundary_width).reshape(-1, 1, 1)
                    topk_indices = [index[-topk:] for index in sorted_indices]
                else:
                    topk_indices = [index[-topk:] for index in filtered_indices]
                topk_values = [similarity_matrix[s][index] for s, index in enumerate(topk_indices)]
            else:
                # Later iterations: search around the current centers, padded by the scaled initial width.
                boundary_width_last = boundary_width.reshape(-1, 1, 1)
                start_indices = np.clip(center_indexes - boundary_width // 2 - initial_boundary_width * scale, 0, num_frames)
                end_indices = np.clip(center_indexes + boundary_width // 2 + initial_boundary_width * scale, 0, num_frames)

                topk_indices = []
                topk_values = []
                for j, (start, end) in enumerate(zip(start_indices, end_indices)):
                    filtered_indices = sorted_indices[j][(sorted_indices[j] >= start) & (sorted_indices[j] <= end)]
                    topk_index = filtered_indices[-topk:]
                    topk_indices.append(topk_index)
                    topk_values.append(similarity_matrix[j][topk_index])
            previous_index_center = center_indexes.copy() if i > 0 else None
            center_indexes = np.array([find_center_value(index) for index in topk_indices])

        # Candidate widths: previous width times each ratio; shape (num_steps, 1, num_ratios).
        boundary_width_candidates = augment_ratio_list * boundary_width_last

        # Distance of each top-k frame from its step center (epsilon avoids exact zeros).
        index_distance = [np.abs(index - center_indexes[s] + 1e-3)[:, np.newaxis] for s, index in enumerate(topk_indices)]

        # Similarity-weighted deviation between frame distances and each candidate half-width.
        loss_candidates_list = [value[:, np.newaxis] * (np.abs(index_distance[s] - 0.5 * boundary_width_candidates[s])) for s, value in enumerate(topk_values)]

        loss_sim = np.array([np.mean(candidate, axis=0) for candidate in loss_candidates_list])

        if i == 0:
            loss = loss_sim
        else:
            # Penalize candidates that drift or overlap relative to the previous iteration.
            overlap = compute_overlap(center_indexes, boundary_width_candidates, previous_index_center, boundary_width_last)
            loss = loss_sim + beta * overlap

        best_boundary_width_index = np.argmin(loss, axis=1)
        boundary_width = np.array([boundary_width_candidates[s, 0][best_boundary_width_index[s]] for s in range(num_steps)])

    left_bound = np.clip(center_indexes - boundary_width // 2, 0, num_frames)
    right_bound = np.clip(center_indexes + boundary_width // 2, 0, num_frames)
    bbox = np.stack([left_bound, right_bound], axis=1).round().astype(int)

    return bbox.tolist()


def align_frame_into_steps_op_v1(frame_features, step_features, topk=15, num_iterations=4, beta=1, order=False, scale=1):
    # Same iterative scheme as align_frame_into_steps_op, except the loss is
    # additionally down-weighted by the clipped distance to the center.
    augment_ratio_list = np.arange(0.5, 2, 0.1)

    if step_features.shape[0] == 0:
        return -np.ones(frame_features.shape[0])

    similarity_matrix = compute_sim(step_features, frame_features, config_eval_l2norm).cpu().numpy()
    num_steps, num_frames = similarity_matrix.shape

    # Frame indices sorted by ascending similarity, per step.
    sorted_indices = np.argsort(similarity_matrix, axis=1)

    initial_boundary_width = num_frames / num_steps

    for i in range(num_iterations):
        if i == 0 and not order:
            # Unordered initialization: global top-k frames per step.
            boundary_width_last = np.full(num_steps, initial_boundary_width).reshape(-1, 1, 1)
            topk_indices = [index[-topk:] for index in sorted_indices]
            topk_values = [similarity_matrix[s][index] for s, index in enumerate(topk_indices)]
            center_indexes = np.array([find_center_value(index) for index in topk_indices])
            previous_index_center = None
        else:
            if i == 0:
                # Ordered initialization: a uniform partition of the video, expanded by `scale`.
                segment_boundary = np.linspace(0, num_frames, num_steps + 1).round().astype(int)
                start_indices, end_indices = segment_boundary[:-1], segment_boundary[1:]
                start_indices = np.clip(start_indices - initial_boundary_width * scale, 0, num_frames)
                end_indices = np.clip(end_indices + initial_boundary_width * scale, 0, num_frames)
                boundary_width_last = (end_indices - start_indices).reshape(-1, 1, 1)

                filtered_indices = [sorted_indices[s][(sorted_indices[s] >= start_indices[s]) & (sorted_indices[s] <= end_indices[s])] for s in range(num_steps)]
                if sum(len(index) for index in filtered_indices) < topk * num_steps * 0.4:
                    # Too few candidates survive the order prior; fall back to the global top-k.
                    boundary_width_last = np.full(num_steps, initial_boundary_width).reshape(-1, 1, 1)
                    topk_indices = [index[-topk:] for index in sorted_indices]
                else:
                    topk_indices = [index[-topk:] for index in filtered_indices]
                topk_values = [similarity_matrix[s][index] for s, index in enumerate(topk_indices)]
            else:
                # Later iterations: search around the current centers, padded by the scaled initial width.
                boundary_width_last = boundary_width.reshape(-1, 1, 1)
                start_indices = np.clip(center_indexes - boundary_width // 2 - initial_boundary_width * scale, 0, num_frames)
                end_indices = np.clip(center_indexes + boundary_width // 2 + initial_boundary_width * scale, 0, num_frames)

                topk_indices = []
                topk_values = []
                for j, (start, end) in enumerate(zip(start_indices, end_indices)):
                    filtered_indices = sorted_indices[j][(sorted_indices[j] >= start) & (sorted_indices[j] <= end)]
                    topk_index = filtered_indices[-topk:]
                    topk_indices.append(topk_index)
                    topk_values.append(similarity_matrix[j][topk_index])
            previous_index_center = center_indexes.copy() if i > 0 else None
            center_indexes = np.array([find_center_value(index) for index in topk_indices])

        # Candidate widths: previous width times each ratio; shape (num_steps, 1, num_ratios).
        boundary_width_candidates = augment_ratio_list * boundary_width_last

        # Distance of each top-k frame from its step center (epsilon avoids exact zeros).
        index_distance = [np.abs(index - center_indexes[s] + 1e-3)[:, np.newaxis] for s, index in enumerate(topk_indices)]

        # Clip distances at the candidate half-width so near-center frames are not over-weighted.
        weight_distance = [clip_array(index_distance[s], 0.5 * boundary_width_candidates[s]) for s in range(len(topk_indices))]

        loss_candidates_list = [value[:, np.newaxis] / weight_distance[s] * (np.abs(index_distance[s] - 0.5 * boundary_width_candidates[s])) for s, value in enumerate(topk_values)]

        loss_sim = np.array([np.mean(candidate, axis=0) for candidate in loss_candidates_list])

        if i == 0:
            loss = loss_sim
        else:
            # Penalize candidates that drift or overlap relative to the previous iteration.
            overlap = compute_overlap(center_indexes, boundary_width_candidates, previous_index_center, boundary_width_last)
            loss = loss_sim + beta * overlap

        best_boundary_width_index = np.argmin(loss, axis=1)
        boundary_width = np.array([boundary_width_candidates[s, 0][best_boundary_width_index[s]] for s in range(num_steps)])

    left_bound = np.clip(center_indexes - boundary_width // 2, 0, num_frames)
    right_bound = np.clip(center_indexes + boundary_width // 2, 0, num_frames)
    bbox = np.stack([left_bound, right_bound], axis=1).round().astype(int)

    return bbox.tolist()


def align_frame_into_steps_op_order_v2(frame_features, step_features, topk=15, threshold=0.5, ratio=1, iteration=3):
    if step_features.shape[0] == 0:
        return -np.ones(frame_features.shape[0])

    sim = compute_sim(step_features, frame_features, config_eval_l2norm).cpu()
    sorted_index = torch.argsort(-sim, dim=1)
    top_indices_list_global = [sorted_index[i][:topk] for i in range(sim.shape[0])]
    top_values_list_global = [sim[i][top_indices_list_global[i]] for i in range(sim.shape[0])]

    uniform_boxes = uniform_window(frame_features.shape[0], step_features.shape[0])

    iter_bbox_loss = {}
    for it in range(iteration):
        # Expand the current boxes (uniform boxes on the first pass) to form
        # the local search window for each step.
        if it == 0:
            refined_uniform_boxes = expand_window(uniform_boxes, frame_features.shape[0], step_features.shape[0], ratio)
        else:
            refined_uniform_boxes = expand_window(bbox, frame_features.shape[0], step_features.shape[0], ratio)

        top_indices_list_local = [sorted_index[i][(sorted_index[i] >= refined_uniform_boxes[i][0]) & (sorted_index[i] <= refined_uniform_boxes[i][1])][:topk] for i in range(sim.shape[0])]
        top_values_list_local = [sim[i][top_indices_list_local[i]] for i in range(sim.shape[0])]

        # If too few candidates survive the local windows, fall back to the global top-k.
        size_local = [len(top_indices_list_local[i]) for i in range(sim.shape[0])]
        if sum(size_local) < (topk - 2) * len(size_local):
            top_indices_list = top_indices_list_global
            top_values_list = top_values_list_global
        else:
            top_indices_list = top_indices_list_local
            top_values_list = top_values_list_local

        bbox = []
        for i in range(len(top_indices_list)):
            filtered_indices = compute_filtered_indices(top_indices_list[i].tolist(), top_values_list[i].tolist(), threshold)
            if len(filtered_indices) == 0:
                filtered_indices = compute_filtered_indices(top_indices_list_global[i].tolist(), top_values_list_global[i].tolist(), threshold)
            if len(filtered_indices) == 0:
                bbox.append(uniform_boxes[i])
                continue
            bbox.append([min(filtered_indices), max(filtered_indices)])

        bbox_loss_list = [compute_bbox_loss(top_indices_list[i], bbox[i], top_values_list[i]) for i in range(len(top_indices_list))]
        bbox_loss = sum(bbox_loss_list)
        iter_bbox_loss[it] = {'loss': bbox_loss, 'bbox': bbox}

    # Keep the boxes from the iteration with the smallest total loss.
    min_loss_iter = min(iter_bbox_loss.keys(), key=lambda k: iter_bbox_loss[k]['loss'])
    min_loss = iter_bbox_loss[min_loss_iter]['loss']
    best_bbox = iter_bbox_loss[min_loss_iter]['bbox']

    return (best_bbox, min_loss)


def align_frame_into_steps_op_v2(frame_features, step_features, topk=15, threshold=0.5, ratio=1, iteration=3):
    if step_features.shape[0] == 0:
        return -np.ones(frame_features.shape[0])

    sim = compute_sim(step_features, frame_features, config_eval_l2norm).cpu()
    sorted_index = torch.argsort(-sim, dim=1)
    top_indices_list_global = [sorted_index[i][:topk] for i in range(sim.shape[0])]
    top_values_list_global = [sim[i][top_indices_list_global[i]] for i in range(sim.shape[0])]

    uniform_boxes = uniform_window(frame_features.shape[0], step_features.shape[0])

    # Unlike the ordered variant above, this one uses only the global top-k
    # in every iteration.
    iter_bbox_loss = {}
    for it in range(iteration):
        bbox = []
        for i in range(len(top_indices_list_global)):
            filtered_indices = compute_filtered_indices(top_indices_list_global[i].tolist(), top_values_list_global[i].tolist(), threshold)
            if len(filtered_indices) == 0:
                bbox.append(uniform_boxes[i])
                continue
            bbox.append([min(filtered_indices), max(filtered_indices)])

        bbox_loss_list = [compute_bbox_loss(top_indices_list_global[i], bbox[i], top_values_list_global[i]) for i in range(len(top_indices_list_global))]
        bbox_loss = sum(bbox_loss_list)
        iter_bbox_loss[it] = {'loss': bbox_loss, 'bbox': bbox}

    # Keep the boxes from the iteration with the smallest total loss.
    min_loss_iter = min(iter_bbox_loss.keys(), key=lambda k: iter_bbox_loss[k]['loss'])
    min_loss = iter_bbox_loss[min_loss_iter]['loss']
    best_bbox = iter_bbox_loss[min_loss_iter]['bbox']

    return (best_bbox, min_loss)


def align_frame_into_steps_mode(frame_features, step_features, topk=15, w=2, ratio=1):
    if step_features.shape[0] == 0:
        return -np.ones(frame_features.shape[0])

    sim = compute_sim(step_features, frame_features, config_eval_l2norm).cpu()
    bbox = get_mode_box(sim, topk, w, ratio)
    return bbox


def uniform_window(frame_num, step_num):
    uniform_timestamps = torch.linspace(0, frame_num, step_num + 1)
    uniform_timestamps = torch.round(uniform_timestamps).int().tolist()
    bbox = []
    for i in range(step_num):
        bbox.append([uniform_timestamps[i], uniform_timestamps[i + 1] - 1])
    return bbox
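

# Hedged sanity check (illustrative values, not part of the original file):
# ten frames split into three steps gives rounded, non-overlapping boxes.
def _demo_uniform_window():
    assert uniform_window(10, 3) == [[0, 2], [3, 6], [7, 9]]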


def expand_window(uniform_bbox, frame_num, step_num, ratio=1):
    '''ratio: how far a ground-truth box may drift from its uniform box; anything
    beyond this range is treated as impossible. The unit of ratio is the average
    length of one caption.'''
    window_size = frame_num // step_num
    refined_bbox = []
    for bbox in uniform_bbox:
        start = max(0, bbox[0] - ratio * window_size)
        end = min(frame_num - 1, bbox[1] + ratio * window_size)
        refined_bbox.append([start, end])
    return refined_bbox
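

# Hedged sanity check (illustrative values, not part of the original file):
# with ratio=1 each box grows by one average window (10 // 3 = 3 frames)
# on both sides, clamped to the video extent.
def _demo_expand_window():
    expanded = expand_window([[0, 2], [3, 6], [7, 9]], 10, 3, ratio=1)
    assert expanded == [[0, 5], [0, 9], [4, 9]]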


def align_frame_into_steps_order(frame_features, step_features, unordered=False, topk=15, threshold=2, w=2, mode='median', ratio=1):
    if step_features.shape[0] == 0:
        return -np.ones(frame_features.shape[0])

    sim = compute_sim(step_features, frame_features, config_eval_l2norm).cpu()

    uniform_boxes = uniform_window(frame_features.shape[0], step_features.shape[0])
    refined_uniform_boxes = expand_window(uniform_boxes, frame_features.shape[0], step_features.shape[0], ratio)

    sorted_index = torch.argsort(-sim, dim=1)
    top_indices_list_global = [sorted_index[i][:topk] for i in range(sim.shape[0])]
    top_indices_list_local = [sorted_index[i][(sorted_index[i] >= refined_uniform_boxes[i][0]) & (sorted_index[i] <= refined_uniform_boxes[i][1])][:topk] for i in range(sim.shape[0])]

    # If too few candidates survive the local windows, fall back to the global top-k.
    size_local = [len(top_indices_list_local[i]) for i in range(sim.shape[0])]
    if sum(size_local) < (topk - 2) * len(size_local):
        top_indices_list = top_indices_list_global
    else:
        top_indices_list = top_indices_list_local

    bbox = []
    for i in range(len(top_indices_list)):
        filtered_indices = remove_outliers(top_indices_list[i].tolist(), threshold, mode=mode, w=w)
        if len(filtered_indices) == 0:
            filtered_indices = remove_outliers(top_indices_list_global[i].tolist(), 0.5, mode=mode, w=w)
        if len(filtered_indices) == 0:
            bbox.append(uniform_boxes[i])
            continue
        bbox.append([min(filtered_indices), max(filtered_indices)])

    return bbox


def align_frame_into_steps_order_adapt(frame_features, step_features, unordered=False, topk=15, threshold=2, w=2, mode='median', ratio=1):
    if step_features.shape[0] == 0:
        return -np.ones(frame_features.shape[0])

    sim = compute_sim(step_features, frame_features, config_eval_l2norm).cpu()

    uniform_boxes = uniform_window(frame_features.shape[0], step_features.shape[0])
    refined_uniform_boxes = expand_window(uniform_boxes, frame_features.shape[0], step_features.shape[0], ratio)

    sorted_index = torch.argsort(-sim, dim=1)
    top_indices_list_global = [sorted_index[i][:topk] for i in range(sim.shape[0])]
    top_indices_list_local = [sorted_index[i][(sorted_index[i] >= refined_uniform_boxes[i][0]) & (sorted_index[i] <= refined_uniform_boxes[i][1])][:topk] for i in range(sim.shape[0])]

    # If too few candidates survive the local windows, progressively widen the
    # windows before giving up and falling back to the global top-k.
    size_local = [len(top_indices_list_local[i]) for i in range(sim.shape[0])]
    if sum(size_local) < (topk - 1) * len(size_local):
        flag = 0
        for attempt in range(4):
            refined_uniform_boxes = expand_window(uniform_boxes, frame_features.shape[0], step_features.shape[0], ratio + attempt * 0.5)
            top_indices_list_local = [sorted_index[i][(sorted_index[i] >= refined_uniform_boxes[i][0]) & (sorted_index[i] <= refined_uniform_boxes[i][1])][:topk] for i in range(sim.shape[0])]
            size_local = [len(top_indices_list_local[i]) for i in range(sim.shape[0])]
            if sum(size_local) >= (topk - 1) * len(size_local):
                flag = 1
                break
        if flag == 0:
            top_indices_list = top_indices_list_global
        else:
            top_indices_list = top_indices_list_local
    else:
        top_indices_list = top_indices_list_local

    bbox = []
    for i in range(len(top_indices_list)):
        filtered_indices = remove_outliers(top_indices_list[i].tolist(), threshold, mode=mode, w=w)
        if len(filtered_indices) == 0:
            filtered_indices = remove_outliers(top_indices_list_global[i].tolist(), 0.5, mode=mode, w=w)
        if len(filtered_indices) == 0:
            bbox.append(uniform_boxes[i])
            continue
        bbox.append([min(filtered_indices), max(filtered_indices)])

    return bbox


def step_retrieval_weight_sim_order(frame_features, step_features, unordered=False, topk=15, threshold=2, w=2, ratio=1):
    if step_features.shape[0] == 0:
        return -np.ones(frame_features.shape[0])

    sim = compute_sim(step_features, frame_features, config_eval_l2norm).cpu()

    # Sliding-window sums over 2*w + 1 frames.
    window_sums = torch.nn.functional.conv1d(sim.unsqueeze(1), torch.ones(1, 1, 2 * w + 1)).squeeze()
    if len(window_sums.shape) == 1:
        window_sums = window_sums.unsqueeze(0)

    # Adding w maps window indices back to frame coordinates.
    sorted_index = torch.argsort(-window_sums, dim=1) + w

    uniform_boxes = uniform_window(frame_features.shape[0], step_features.shape[0])
    refined_uniform_boxes = expand_window(uniform_boxes, frame_features.shape[0], step_features.shape[0], ratio)

    top_indices_list_global = [sorted_index[i][:topk] for i in range(sim.shape[0])]
    top_indices_list_local = [sorted_index[i][(sorted_index[i] >= refined_uniform_boxes[i][0]) & (sorted_index[i] <= refined_uniform_boxes[i][1])][:topk] for i in range(sim.shape[0])]

    # If too few candidates survive the local windows, fall back to the global top-k.
    size_local = [len(top_indices_list_local[i]) for i in range(sim.shape[0])]
    if sum(size_local) < (topk - 2) * len(size_local):
        top_indices_list = top_indices_list_global
    else:
        top_indices_list = top_indices_list_local

    bbox = []
    for i in range(len(top_indices_list)):
        threshold_value = compute_threshold(top_indices_list[i].tolist(), threshold)
        # The top-1 window (first entry) serves as the center for this step.
        filtered_indices = [frame for frame in top_indices_list[i].tolist() if abs(frame - top_indices_list[i][0]) <= threshold_value]
        if len(filtered_indices) == 0:
            bbox.append([top_indices_list[i][0].item() - w, top_indices_list[i][0].item() + w])
        else:
            bbox.append([min(filtered_indices), max(filtered_indices)])

    return bbox


def segment_video_into_steps(frame_features, step_features, unordered=False):
    if step_features.shape[0] == 0:
        return -np.ones(frame_features.shape[0])

    sim = compute_sim(step_features, frame_features, config_eval_l2norm).cpu()
    frame_features, step_features = frame_features.cpu(), step_features.cpu()

    # The drop cost is the similarity value at the keep-percentile cutoff.
    k = max([1, int(torch.numel(sim) * config_eval_keep_percentile)])
    baseline_logit = torch.topk(sim.reshape([-1]), k).values[-1].detach()
    baseline_logits = baseline_logit.repeat([1, sim.shape[1]])[0]
    zx_costs, drop_costs = -sim, -baseline_logits
    zx_costs, drop_costs = [t.detach().cpu().numpy() for t in [zx_costs, drop_costs]]
    sim = sim.detach().cpu().numpy()

    if unordered:
        max_vals, optimal_assignment = np.max(sim, axis=0), np.argmax(sim, axis=0)
        optimal_assignment[max_vals < baseline_logit.item()] = -1
    else:
        optimal_assignment = drop_dtw(zx_costs, drop_costs, return_labels=True) - 1
    return optimal_assignment


def align_query_into_steps(query_features, step_features, unordered=False):
    if step_features.shape[0] == 0:
        return -np.ones(query_features.shape[0])

    sim = compute_sim(step_features, query_features, config_eval_l2norm).cpu()
    query_features, step_features = query_features.cpu(), step_features.cpu()

    k = max([1, int(torch.numel(sim) * config_eval_keep_percentile)])
    baseline_logit = torch.topk(sim.reshape([-1]), k).values[-1].detach()
    baseline_logits = baseline_logit.repeat([1, sim.shape[1]])[0]
    zx_costs, drop_costs = -sim, -baseline_logits
    zx_costs, drop_costs = [t.detach().cpu().numpy() for t in [zx_costs, drop_costs]]
    sim = sim.detach().cpu().numpy()

    if unordered:
        max_vals, optimal_assignment = np.max(sim, axis=0), np.argmax(sim, axis=0)
        optimal_assignment[max_vals < baseline_logit.item()] = -1
    else:
        # one_to_one: each step may match at most one query.
        optimal_assignment = drop_dtw(zx_costs, drop_costs, one_to_one=True, return_labels=True) - 1
    return optimal_assignment


def segment_video_into_slots(video_features, pred_steps):
    sim = compute_sim(pred_steps, video_features, l2_norm=config_eval_l2norm).detach()
    if config_eval_fixed_drop_sim == -1:
        k = max([1, int(torch.numel(sim) * config_eval_keep_percentile)])
        baseline_logit = torch.topk(sim.reshape([-1]), k).values[-1].detach()
    else:
        baseline_logit = torch.tensor(config_eval_fixed_drop_sim)
    baseline_logits = baseline_logit.repeat([1, sim.shape[1]])
    x_drop_costs = -baseline_logits.squeeze()
    zx_costs = -sim

    # Shift all costs so that the first drop cost is zero.
    z_drop_costs = -baseline_logit.repeat([1, sim.shape[0]]).squeeze()
    zx_costs = zx_costs - z_drop_costs[0].reshape([1, 1])
    z_drop_costs = z_drop_costs - z_drop_costs[0]
    x_drop_costs = x_drop_costs - x_drop_costs[0]
    segmentation = double_drop_dtw(zx_costs.numpy(), x_drop_costs.numpy(), z_drop_costs.numpy(), return_labels=True) - 1
    return segmentation


def get_index(alignment):
    '''Extract the start and end frame indices of each contiguous aligned
    segment; frames labeled -1 (unmatched) separate segments.'''
    start_idx, end_idx = [], []
    for i in range(len(alignment)):
        if alignment[i] == -1:
            if i != 0 and alignment[i - 1] != -1:
                end_idx.append(i - 1)
            continue
        if i == 0:
            start_idx.append(i)
        elif alignment[i] != alignment[i - 1]:
            start_idx.append(i)
            if alignment[i - 1] != -1:
                end_idx.append(i - 1)
        if i == len(alignment) - 1:
            end_idx.append(i)
    assert len(start_idx) == len(end_idx)
    for s, e in zip(start_idx, end_idx):
        assert alignment[s] <= alignment[e]
    return start_idx, end_idx
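

# Hedged sanity check (illustrative values, not part of the original file):
# alignment [-1, 0, 0, 1, -1, 2] has segments spanning frames 1-2, 3-3, and 5-5.
def _demo_get_index():
    assert get_index([-1, 0, 0, 1, -1, 2]) == ([1, 3, 5], [2, 3, 5])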


def alignment_to_boundary(alignment, video_frame_num):
    start_idx, end_idx = get_index(alignment)
    # get_index returns plain lists; convert before the element-wise division.
    start_time = np.array(start_idx) / video_frame_num
    end_time = np.array(end_idx) / video_frame_num
    boundaries = list(zip(start_time, end_time))

    return np.float32(np.stack(boundaries, axis=0))


def to_center_duration(alignments):
    # Convert [start, end] boxes to [center, duration], mutating each array in place.
    new_alignments = []
    for alignment in alignments:
        start, end = alignment[:, 0], alignment[:, 1]
        center = (start + end) / 2
        duration = end - start
        alignment[:, 0], alignment[:, 1] = center, duration
        new_alignments.append(alignment)
    return new_alignments
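

# Hedged sanity check (illustrative values, not part of the original file):
# the box [2, 6] becomes center 4 with duration 4; note that the conversion
# mutates the input arrays in place.
def _demo_to_center_duration():
    out = to_center_duration([np.array([[2.0, 6.0]])])
    assert np.allclose(out[0], [[4.0, 4.0]])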