Spaces:
Configuration error
Configuration error
import ast
import json
import math
import os
import re

import torch
from PIL import Image
from tqdm import tqdm
from transformers import CLIPImageProcessor, CLIPVisionConfig, CLIPVisionModel
def divide_to_patches(image, patch_size):
    """
    Cut an image into square tiles of side ``patch_size``.

    Tiles are produced row by row (top to bottom), and left to right within
    each row. Every crop box is a full ``patch_size`` square, so when the
    image size is not a multiple of ``patch_size`` the last boxes in a row or
    column extend past the image edge; the content of those tiles is whatever
    ``image.crop`` returns for such a box.

    Args:
        image (PIL.Image.Image): The input image.
        patch_size (int): Side length of each tile, in pixels.

    Returns:
        list: The cropped tiles, in row-major order.
    """
    img_w, img_h = image.size
    return [
        image.crop((left, top, left + patch_size, top + patch_size))
        for top in range(0, img_h, patch_size)
        for left in range(0, img_w, patch_size)
    ]
def resize_and_pad_image(image, target_resolution):
    """
    Fit an image inside a target canvas, keeping its aspect ratio.

    The image is scaled so that one dimension fills the target exactly and
    the other is no larger than the target, then pasted centered onto a black
    RGB canvas of the target size.

    Args:
        image (PIL.Image.Image): The input image.
        target_resolution (tuple): The canvas size as (width, height).

    Returns:
        PIL.Image.Image: A new RGB image of size ``target_resolution``.
    """
    src_w, src_h = image.size
    dst_w, dst_h = target_resolution

    # The smaller of the two ratios is the one that lets the whole image fit.
    ratio_w = dst_w / src_w
    ratio_h = dst_h / src_h

    if ratio_w < ratio_h:
        # Width is the limiting side: fill it, derive the height.
        fit_w = dst_w
        fit_h = min(math.ceil(src_h * ratio_w), dst_h)
    else:
        # Height is the limiting side: fill it, derive the width.
        fit_h = dst_h
        fit_w = min(math.ceil(src_w * ratio_h), dst_w)

    scaled = image.resize((fit_w, fit_h))

    # Center the scaled image on a black canvas; the border is the padding.
    canvas = Image.new("RGB", (dst_w, dst_h), (0, 0, 0))
    offset = ((dst_w - fit_w) // 2, (dst_h - fit_h) // 2)
    canvas.paste(scaled, offset)
    return canvas
def select_best_resolution(original_size, possible_resolutions):
    """
    Pick the candidate resolution that best accommodates an image.

    For every candidate, the image is conceptually scaled (keeping aspect
    ratio) to fit inside it. The "effective" resolution is the pixel count of
    that fitted image, capped at the original pixel count; the "wasted"
    resolution is the candidate area not covered by it. The winner has the
    highest effective resolution; ties go to the candidate with less waste,
    and among exact ties the earliest candidate is kept.

    Args:
        original_size (tuple): Image size as (width, height).
        possible_resolutions (list): Candidates as
            [(width1, height1), (width2, height2), ...].

    Returns:
        tuple: The selected (width, height), or None when
        ``possible_resolutions`` is empty.
    """
    src_w, src_h = original_size
    src_area = src_w * src_h

    chosen = None
    top_effective = 0
    least_wasted = float("inf")

    for cand_w, cand_h in possible_resolutions:
        # Largest uniform scale at which the image still fits the candidate.
        ratio = min(cand_w / src_w, cand_h / src_h)
        fitted_area = int(src_w * ratio) * int(src_h * ratio)

        # Upscaling adds no information, so cap at the original pixel count.
        effective = min(fitted_area, src_area)
        wasted = cand_w * cand_h - effective

        strictly_better = effective > top_effective
        tie_with_less_waste = effective == top_effective and wasted < least_wasted
        if not (strictly_better or tie_with_less_waste):
            continue

        chosen = (cand_w, cand_h)
        top_effective = effective
        least_wasted = wasted

    return chosen
def process_anyres_image(image, processor, grid_pinpoints):
    """
    Process an image with variable resolutions.

    The image is resized and padded to the best-fitting resolution among
    ``grid_pinpoints`` and cut into square patches. A square-resized copy of
    the whole image is placed first, followed by the patches; each entry is
    run through ``processor.preprocess`` and the results are stacked.

    Args:
        image (PIL.Image.Image): The input image to be processed.
        processor: The image processor object. This function reads
            ``processor.size`` (a sequence, or a mapping with a
            ``"shortest_edge"`` key), ``processor.crop_size["height"]`` and
            calls ``processor.preprocess(..., return_tensors="pt")``.
        grid_pinpoints (str | list | tuple): The candidate resolutions, as one of:
            * a list/tuple of (width, height) pairs, used as-is;
            * a string containing ``"x"`` with ``(AxB)`` entries, where the
              first and last entries are the corners of an inclusive grid
              range that is expanded and multiplied by the patch size;
            * any other string, parsed as a Python literal list of pairs.

    Returns:
        torch.Tensor: The processed images stacked along dim 0
        (overview image first, then the patches).

    Raises:
        ValueError: If a range-style string contains no ``(AxB)`` entry.
        AssertionError: If the patch size derived from ``processor.size`` is
            not one of 224, 336, 384, 448, 512 (range-style strings only).
    """
    # Expand a range-style string such as "(1x1),...,(3x3)" into pixel sizes.
    if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
        # processor.size may be a sequence or a mapping; try positional first.
        try:
            patch_size = processor.size[0]
        except (KeyError, IndexError, TypeError):
            patch_size = processor.size["shortest_edge"]
        assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"

        # Pull every "(AxB)" pair out of the string; first/last bound the range.
        matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
        if not matches:
            raise ValueError(
                f"grid_pinpoints {grid_pinpoints!r} contains no '(AxB)' entry to derive a range from"
            )
        range_start = tuple(map(int, matches[0]))
        range_end = tuple(map(int, matches[-1]))

        # Every grid cell count in the inclusive range, converted to pixels.
        grid_pinpoints = [
            [cols * patch_size, rows * patch_size]
            for cols in range(range_start[0], range_end[0] + 1)
            for rows in range(range_start[1], range_end[1] + 1)
        ]

    # Sequences are used directly; anything else is a literal to parse.
    if isinstance(grid_pinpoints, (list, tuple)):
        possible_resolutions = grid_pinpoints
    else:
        possible_resolutions = ast.literal_eval(grid_pinpoints)

    best_resolution = select_best_resolution(image.size, possible_resolutions)
    image_padded = resize_and_pad_image(image, best_resolution)
    patches = divide_to_patches(image_padded, processor.crop_size["height"])

    # FIXME: the overview image is squashed to a square with a plain resize
    # instead of being padded, so its aspect ratio is not preserved. This is
    # kept as-is for consistency with previous behavior.
    if isinstance(processor.size, dict):
        shortest_edge = processor.size["shortest_edge"]
    else:
        shortest_edge = min(processor.size)
    image_original_resize = image.resize((shortest_edge, shortest_edge))

    image_patches = [image_original_resize] + patches
    image_patches = [
        processor.preprocess(image_patch, return_tensors="pt")["pixel_values"][0]
        for image_patch in image_patches
    ]
    return torch.stack(image_patches, dim=0)
# Filter the NExT-QA annotation file down to the entries that have a "video"
# key and write the surviving entries to a new JSON file.
json_path = "/share/junjie/shuyan/video_traindata/anno/nextqa.json"
# Read explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding (the output below is written as UTF-8 as well).
with open(json_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# tqdm only adds a progress bar; the filter keeps the original order.
result = [entry for entry in tqdm(data) if "video" in entry]
print(len(result))

output_file = "/share/junjie/shuyan/video_traindata/anno/nextqa_pure.json"
with open(output_file, "w", encoding="utf-8") as f_out:
    # ensure_ascii=False keeps non-ASCII text readable instead of \u-escaped.
    json.dump(result, f_out, indent=4, ensure_ascii=False)