# fcxfcx's picture
# Upload 2446 files
# 1327f34 verified
# Copyright 2025 The Scenic Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Implementation of data preprocessing ops for VTAB.
All preprocessing ops should return a data processing functors. A data
is represented as a dictionary of tensors, where field "image" is reserved
for 3D images (height x width x channels). The functors output dictionary with
field "image" being modified. Potentially, other fields can also be modified
or added.
"""
import numpy as np
from scenic.dataset_lib.big_transfer.registry import Registry
import tensorflow.compat.v1 as tf
@Registry.register("preprocess_ops.dsprites_pp", "function")
def get_dsprites_pp(predicted_attribute, num_classes=None):
  """Builds the data preprocess function for the dsprites dataset.

  The arguments are validated once, when the preprocess function is built, so
  that a misconfiguration is reported before any data is processed.

  Args:
    predicted_attribute: Name of the dsprites field used as the label. One of
      "label_shape", "label_scale", "label_orientation", "label_x_position" or
      "label_y_position".
    num_classes: Optional int. If set, nearby integer values of the attribute
      are grouped together to arrive at this number of classes (useful, for
      example, for grouping together different spatial positions). Must be in
      [2, ..., number of original classes of the attribute]. If None, the
      original number of classes is kept.

  Returns:
    A function that takes a data dictionary, multiplies `data["image"]` by 255,
    writes the (possibly grouped) attribute to `data["label"]` in place and
    returns the dictionary.

  Raises:
    KeyError: If `predicted_attribute` is not a known dsprites attribute.
    ValueError: If `num_classes` is neither None nor an int in
      [2, ..., number of original classes of the attribute].
  """
  attribute_to_classes = {
      "label_shape": 3,
      "label_scale": 6,
      "label_orientation": 40,
      "label_x_position": 32,
      "label_y_position": 32,
  }

  num_original_classes = attribute_to_classes[predicted_attribute]
  n_cls = num_original_classes if num_classes is None else num_classes
  if not isinstance(n_cls, int) or n_cls <= 1 or n_cls > num_original_classes:
    raise ValueError(
        "The number of classes should be None or in "
        f"[2, ..., {num_original_classes}], got {num_classes!r}.")

  # Width of one group of original classes; 1.0 when no grouping is requested.
  class_division_factor = float(num_original_classes) / n_cls

  def _dsprites_pp(data):
    """Rescales the image and derives the label from the chosen attribute."""
    # For consistency with other datasets the image is rescaled by 255
    # (presumably from [0, 1] inputs -- confirm against the dataset).
    # NOTE(review): tiling the image to three channels is not applied here, so
    # the number of channels is left unchanged.
    data["image"] = data["image"] * 255

    # Group nearby integer values: floor(value / factor), computed in float32
    # and cast back to the attribute's original dtype.
    attribute = data[predicted_attribute]
    grouped = tf.math.floordiv(
        tf.cast(attribute, tf.float32), class_division_factor)
    data["label"] = tf.cast(grouped, attribute.dtype)
    return data

  return _dsprites_pp
@Registry.register("preprocess_ops.clevr_pp", "function")
def get_clevr_pp(task, outkey="label"):
  """Returns the clevr preprocess function for the requested task.

  Args:
    task: Task name, either "count_all" or "closest_object_distance".
    outkey: Key of the data dictionary under which the label is stored.

  Returns:
    A function that takes a data dictionary, writes the task label to
    `data[outkey]` in place and returns the dictionary.

  Raises:
    KeyError: If `task` is not one of the supported task names.
  """

  def _count_preprocess_fn(data):
    """Labels an example with its number of objects, offset by 3."""
    num_objects = tf.size(data["objects"]["size"])
    # NOTE(review): the offset presumably maps the smallest scene size to
    # label 0 -- confirm against the dataset.
    data[outkey] = num_objects - 3
    return data

  def _closest_object_preprocess_fn(data):
    """Labels an example with the binned distance of its closest object."""
    # Column 2 of "pixel_coords" is used as the per-object distance.
    per_object_dist = data["objects"]["pixel_coords"][:, 2]
    closest = tf.reduce_min(per_object_dist)
    # These thresholds are uniformly spaced and result in more or less balanced
    # distribution of classes.
    bin_edges = np.array([0.0, 8.0, 8.5, 9.0, 9.5, 10.0, 100.0])
    # The label is the largest index of a threshold lying strictly below the
    # closest distance.
    edges_below = tf.where((bin_edges - closest) < 0)
    data[outkey] = tf.reduce_max(edges_below)
    return data

  return {
      "count_all": _count_preprocess_fn,
      "closest_object_distance": _closest_object_preprocess_fn,
  }[task]
@Registry.register("preprocess_ops.kitti_pp", "function")
def get_kitti_pp(task):
  """Returns the kitti preprocess function for the requested task.

  Args:
    task: Task name; "closest_vehicle_distance" is the only supported value.

  Returns:
    A function that takes a data dictionary and returns a new dictionary with
    the fields "image" and "label" only.

  Raises:
    KeyError: If `task` is not a supported task name.
  """

  def _closest_vehicle_distance_pp(data):
    """Predict the distance to the closest vehicle."""
    objects = data["objects"]
    # Object types below 3 count as vehicles: Car, Van, Truck.
    vehicle_indices = tf.where(objects["type"] < 3)
    # Location feature contains (x, y, z) in meters w.r.t. the camera; only z
    # is used as the distance.
    z_coords = objects["location"][:, 2]
    vehicle_z = tf.gather(params=z_coords, indices=vehicle_indices)
    # A far-away sentinel keeps the minimum defined when there is no vehicle
    # and pushes that case past the last threshold.
    sentinel = tf.constant([[1000.0]])
    closest = tf.reduce_min(tf.concat([vehicle_z, sentinel], axis=0))
    # Results in a uniform distribution over three distances, plus one class for
    # "no vehicle".
    thresholds = np.array([-100.0, 8.0, 20.0, 999.0])
    # The label is the largest index of a threshold lying strictly below the
    # closest distance.
    exceeded = tf.where((thresholds - closest) < 0)
    label = tf.reduce_max(exceeded)
    return {"image": data["image"], "label": label}

  return {
      "closest_vehicle_distance": _closest_vehicle_distance_pp,
  }[task]