// [page-extraction residue, not part of the original header]
// yuvalkirstain's picture -- add clip -- 17db41a
//
// Copyright (c) Meta Platforms, Inc. and affiliates.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#pragma once
#include "constant_folder-generated.h"
#include "model-generated.h"
#include "model_interface.h"
#include "raii_wrapper.h"
#include <condition_variable>
#include <cstring>
#include <future>
#include <mutex>
#include <numeric>
#include <shared_mutex>
#include <unordered_map>
#include <unordered_set>
namespace ait {
// Records whether SetConstants/FoldConstants has been called on the
// container (stored in ModelContainerBase::buffer_state_).
// Enumerators are numbered sequentially from zero.
// NOTE(review): the exact transitions between states live in the .cpp
// implementation, which is not visible from this header -- confirm there.
enum class BufferState {
  CLEAN, // == 0
  CONSTANTS_UPDATED, // == 1
  CONSTANTS_FOLDED, // == 2
};
// Base class of ModelContainer. Its implementation is generated at
// compilation time; only the declaration lives in this header. Most of the
// ModelContainer logic does not need codegen; anything that does should be
// put into this class instead.
//
// NOTE: non-static data members are initialized in declaration order, so
// reordering the members below can change what the out-of-line constructor
// observes. Keep the order stable unless the generated constructor is updated
// as well.
class ModelContainerBase {
public:
// Declared only; the definition is not part of this header.
// The first four arguments are the counts of inputs, outputs, bound constants
// and unbound constants.
// NOTE(review): params_size presumably ends up in constants_size_, and
// allocator presumably backs the GPUPtr buffers below -- confirm against the
// generated constructor.
ModelContainerBase(
size_t num_inputs,
size_t num_outputs,
size_t num_bound_constants,
size_t num_unbound_constants,
size_t params_size,
AITemplateAllocator& allocator);
protected:
// The set of bound constants/weights/parameters. These are constants which
// have a value at compile time. We maintain its size, and unlike unbound
// constants, we do not need to check whether they are set via SetConstant
// prior to inference.
std::unordered_map<std::string, size_t> bound_constant_name_to_idx_;
// The set of unbound constants/weights/parameters. These are constants which
// have no value at compile time and do not participate in constant folding.
// They must be set via SetConstant prior to inference.
std::unordered_map<std::string, size_t> unbound_constant_name_to_idx_;
// The names of all tensors that are required for constant folding, but are
// not necessarily in the final graph.
// constant_folding_optional_inputs_ are those that have an initial value at
// compile time.
std::unordered_set<std::string> constant_folding_inputs_;
std::unordered_set<std::string> constant_folding_optional_inputs_;
// Offsets here correspond to the offsets of constants that were the outputs
// of constant folding. The indices are guaranteed to map to the correct
// indices in constant_folder_.
std::vector<size_t> constant_folding_outputs_offsets_;
// Offsets here correspond to the offsets of the bound constants.
std::vector<size_t> bound_constant_offsets_;
// Size for the constants GPUPtr buffers below.
// NOTE(review): presumably in bytes -- confirm.
size_t constants_size_;
// Pieces of memory for holding all constants; which one is used is controlled
// by use_constants_primary_buffer_.
GPUPtr constants_primary_;
GPUPtr constants_secondary_;
bool use_constants_primary_buffer_;
// State of whether SetConstants/FoldConstants was called.
BufferState buffer_state_;
// Mapping from constant names to pointers.
std::unordered_map<std::string, const void*> model_constants_;
// Size of the containers below: # inputs + # outputs + # unbound constants.
size_t num_params_;
// These entries correspond to inputs/outputs/unbound constants in order;
// inputs first, then outputs, then constants.
std::vector<const char*> param_names_;
std::vector<std::vector<int64_t>> max_param_shapes_;
std::vector<AITemplateDtype> param_dtypes_;
// These are entries used for bound constants.
std::vector<size_t> bound_constant_size_;
std::vector<AITemplateDtype> bound_constant_dtypes_;
// NB: technically these could be derived from both the max shape and
// the dtype, but it's easier to just cache them.
std::vector<size_t> max_param_storage_bytes_;
std::vector<size_t> max_param_numel_;
};
// Factory for a new ModelContainer. Its implementation is also codegened,
// because the parameters passed to the ModelContainer constructor are
// determined at compilation time.
//
// num_runtimes: NOTE(review): presumably the number of Model instances the
//   container should hold (cf. ModelContainer::GetNumRuntimes) -- confirm in
//   the generated definition.
// allocator: taken by reference; see the lifetime requirement documented on
//   ModelContainer.
// Returns a raw pointer to the new container.
// NOTE(review): ownership of the returned pointer is not stated in this
// header -- presumably the caller is responsible for deleting it; confirm.
class ModelContainer;
ModelContainer* CreateModelContainer(
size_t num_runtimes,
AITemplateAllocator& allocator);
// Each ModelContainer contains num_models Models. Inference runs
// can be started by invoking Run() with lists of pre-allocated
// input/output tensors. GetOutputMaximumShape() can be used to
// determine how much memory is required for each output.
//
// If there are N tensors marked with is_output=True,
// the user will always be expected to pass N output pointers -
// extra copies will occur if the outputs are views of constants,
// inputs, or other outputs in this case to avoid surprises.
//
// Use stream = nullptr for default stream. ModelContainer/Model does not
// create or own any stream. The user is expected to create and manage streams.
//
// We can support at most num_models concurrent inferences.
// Run() takes a stream to run the inference on. For example,
// to start up two inferences on different streams concurrently,
// we can do this:
//
// model_container.Run(inputs0, num_inputs, outputs0, num_outputs, stream0, ...);
// model_container.Run(inputs1, num_inputs, outputs1, num_outputs, stream1, ...);
// StreamSynchronize(stream0);
// StreamSynchronize(stream1);
//
// Note that if there are no models available for inference, Run() will block
// until one becomes available.
//
// ModelContainer optionally takes an allocator argument, which it will use to
// allocate the space for the buffers used for intermediate tensors and
// constants. If it is nullptr, the default allocator will be used (e.g. just
// {cuda/hip}{Malloc/Free}).
// Important: we assume that the allocator lives until the ModelContainer is
// destroyed. The default allocator has a static lifetime.
// NOTE: the base class is listed without an access specifier, so
// ModelContainerBase is inherited privately (the default for `class`).
// All member functions except GetNumRuntimes() are only declared here; their
// definitions are not part of this header.
class ModelContainer : ModelContainerBase {
public:
// num_models is the number of Model instances this container holds. The
// remaining count/size arguments carry the same names as the parameters of
// ModelContainerBase's constructor.
// NOTE(review): presumably they are forwarded to it unchanged -- confirm in
// the definition.
// allocator is kept by reference (see allocator_), so it must outlive this
// object.
ModelContainer(
size_t num_models,
size_t num_inputs,
size_t num_outputs,
size_t num_bound_constants,
size_t num_unbound_constants,
size_t params_size,
AITemplateAllocator& allocator);
// Starts an inference on `stream` using caller-provided input and output
// tensors (num_inputs / num_outputs give the array lengths).
// NOTE(review): `sync` presumably synchronizes the stream before returning,
// `graph_mode` presumably selects graph-based execution, and
// `output_shapes_out` presumably receives the actual output shapes --
// confirm in the implementation.
void Run(
const AITData* inputs,
size_t num_inputs,
AITData* outputs,
size_t num_outputs,
StreamType stream,
bool sync,
bool graph_mode,
int64_t** output_shapes_out);
// Same parameter list as Run() minus `sync`.
// NOTE(review): judging by the name, `outputs` point to host memory here --
// confirm in the implementation.
void RunWithOutputsOnHost(
const AITData* inputs,
size_t num_inputs,
AITData* outputs,
size_t num_outputs,
StreamType stream,
bool graph_mode,
int64_t** output_shapes_out);
// NOTE(review): presumably runs the model num_iters times and writes a
// profiling report to `filename` -- confirm in the implementation.
void Profile(
const AITData* inputs,
size_t num_inputs,
AITData* outputs,
size_t num_outputs,
StreamType stream,
size_t num_iters,
const char* filename);
// NOTE(review): presumably runs `count` inferences on `num_threads` threads
// and returns a timing figure; the unit of the returned float is not visible
// from this header -- confirm in the implementation.
float Benchmark(
const AITData* inputs,
size_t num_inputs,
AITData* outputs,
size_t num_outputs,
StreamType stream,
bool graph_mode,
size_t count,
size_t num_threads,
bool use_unique_stream_per_thread,
int64_t** output_shapes_out);
// Set one constant by name, or num_tensors constants where names[i] pairs
// with tensors[i].
void SetConstant(const char* name, const AITData& tensor);
void SetManyConstants(
const char** names,
const AITData* tensors,
size_t num_tensors);
// NOTE(review): presumably returns whichever of constants_primary_ /
// constants_secondary_ is currently not in use -- confirm.
uint8_t* GetInactiveConstantsBuffer();
// Double-buffer variants of the setters above. `stream` defaults to 0.
// NOTE(review): presumably these write into the inactive constants buffer
// and 0 denotes the default stream -- confirm.
void SetDoubleBufferConstant(
const char* name,
const AITData& tensor,
StreamType stream = 0);
void SetManyDoubleBufferConstants(
const char** names,
const AITData* tensors,
size_t num_tensors,
StreamType stream = 0);
// Read-only metadata accessors, indexed separately for inputs and outputs.
size_t NumInputs() const;
size_t NumOutputs() const;
const char* InputName(size_t input_idx) const;
const char* OutputName(size_t output_idx) const;
AITemplateParamShape MaxInputShape(size_t input_idx) const;
AITemplateParamShape MaxOutputShape(size_t output_idx) const;
AITemplateDtype InputDtype(size_t input_idx) const;
AITemplateDtype OutputDtype(size_t output_idx) const;
size_t MaxOutputStorageBytes(size_t output_idx) const;
// Number of Model instances owned by this container (size of models_).
size_t GetNumRuntimes() const {
return models_.size();
}
// Constant folding entry point; double_buffer defaults to false.
// NOTE(review): `sync` presumably synchronizes `stream` before returning --
// confirm.
void FoldConstants(StreamType stream, bool sync, bool double_buffer = false);
// NOTE(review): presumably flips use_constants_primary_buffer_ -- confirm.
void SwapConstants();
// Both counters default to counting unbound constants only.
size_t GetNumConstants(bool unbound_constants_only = true) const;
size_t GetNumConstantFoldingInputs(bool unbound_constants_only = true) const;
// Write all constant names to the array pointed to by names_out.
// This function assumes that names_out has enough space to hold
// at least GetNumConstants() pointers. The strings written
// are guaranteed to live as long as their owning ModelContainer.
void WriteAllConstantNamesTo(
const char** names_out,
bool unbound_constants_only,
bool constant_folding_inputs_only) const;
private:
// Internal helpers; definitions are not in this header.
void WaitForAllModels(bool include_constant_folder = false);
void FoldConstantsImpl(StreamType stream, bool double_buffer = false);
void SetConstantImpl(
const char* name,
const AITData& tensor,
bool use_secondary_buffer = false,
StreamType stream = 0);
void SwapConstantFolderBuffer();
void PrepareForRun(
Model* model,
const AITData* inputs,
size_t num_inputs,
AITData* outputs,
size_t num_outputs);
Model* GetAvailableModel();
// Takes the caller's lock by reference.
// NOTE(review): presumably `lk` must already hold models_mutex_ -- confirm.
void ReclaimFinishedModels(std::unique_lock<std::mutex>& lk);
void ValidateParamDtype(AITemplateDtype dtype, size_t idx) const;
void ValidateBoundConstantDtype(AITemplateDtype dtype, size_t idx) const;
float BenchmarkImpl(
const AITData* inputs,
size_t num_inputs,
AITData* outputs,
size_t num_outputs,
StreamType stream,
bool graph_mode,
size_t count,
int64_t** output_shapes_out);
// Reference to the caller-supplied allocator; not owned by this object.
AITemplateAllocator& allocator_;
// Owning storage for the Model instances.
std::vector<std::unique_ptr<Model>> models_;
std::unique_ptr<ConstantFolder> constant_folder_;
// Non-owning pointers.
// NOTE(review): presumably each points at an element of models_ -- confirm.
std::vector<Model*> available_models_;
std::deque<Model*> pending_models_;
// Guards accesses to available/pending models.
std::mutex models_mutex_;
// Notified whenever a model is put into pending_models_.
std::condition_variable pending_models_available_;
// Prevents constant folding or SetConstants on main models from starting
// while there are ongoing inferences (and vice versa). FoldConstants() and
// SetConstants acquire in unique mode, Run()/Benchmark() acquire in shared
// mode.
//
// Since constants_sync_mutex_ is acquired in shared mode for the entire
// duration of Run()/Benchmark(), there is no need to acquire models_mutex_
// while constants_sync_mutex_ is acquired in unique mode.
// Why complicate things with two locks? The system is designed with the
// assumption that concurrent inferences are common. We don't want to acquire
// models_mutex_ uniquely for the entire duration of Run(), because that
// prevents concurrent inferences from happening while kernels are being
// queued.
std::shared_mutex constants_sync_mutex_;
// constants_double_buffer_mutex_ is separate from constants_sync_mutex_ since
// when we use double buffer, it won't affect the main model.
std::shared_mutex constants_double_buffer_mutex_;
size_t num_inputs_;
size_t num_outputs_;
// Starts out false.
// NOTE(review): presumably set once constant folding has run at least once
// -- confirm.
bool constant_folded_once_ = false;
};
} // namespace ait