//  Copyright (c) Meta Platforms, Inc. and affiliates.
//
//  Licensed under the Apache License, Version 2.0 (the "License");
//  you may not use this file except in compliance with the License.
//  You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
//  Unless required by applicable law or agreed to in writing, software
//  distributed under the License is distributed on an "AS IS" BASIS,
//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//  See the License for the specific language governing permissions and
//  limitations under the License.
//
#pragma once

#include "constant_folder-generated.h"
#include "model-generated.h"
#include "model_interface.h"
#include "raii_wrapper.h"

#include <condition_variable>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <deque>
#include <future>
#include <memory>
#include <mutex>
#include <numeric>
#include <shared_mutex>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

namespace ait {

// State of the constants buffers, as recorded in
// ModelContainerBase::buffer_state_ (whether SetConstants/FoldConstants was
// called). Enumerators rely on implicit numbering, so their values are
// 0, 1 and 2 in declaration order; do not reorder them.
enum class BufferState {
  // Value 0.
  CLEAN,
  // Value 1. NOTE(review): presumably entered after constants were set via
  // SetConstant and friends -- confirm against the implementation.
  CONSTANTS_UPDATED,
  // Value 2. NOTE(review): presumably entered after FoldConstants has run --
  // confirm against the implementation.
  CONSTANTS_FOLDED,
};

// ModelContainer inherits from this class; its implementation is
// generated at compilation time. Most of the ModelContainer
// logic does not need codegen; anything that does should be put
// into this class instead.
//
// This header only declares the constant/parameter bookkeeping that
// ModelContainer reads; the constructor is defined outside this header.
class ModelContainerBase {
 public:
  // NOTE(review): the definition is not visible here. The count arguments
  // presumably size the per-parameter containers declared below, params_size
  // is presumably the byte size of the constants buffers (see
  // constants_size_), and allocator is presumably used to allocate them --
  // confirm against the generated implementation.
  ModelContainerBase(
      size_t num_inputs,
      size_t num_outputs,
      size_t num_bound_constants,
      size_t num_unbound_constants,
      size_t params_size,
      AITemplateAllocator& allocator);

 protected:
  // The set of bound constants/weights/parameters. These are constants which
  // have a value at compile time. We maintain its size, and unlike unbound
  // constants, we do not need to check whether they are set via SetConstant
  // prior to inference.
  std::unordered_map<std::string, size_t> bound_constant_name_to_idx_;

  // The set of unbound constants/weights/parameters. These are constants which
  // have no value at compile time and do not participate in constant folding.
  // They must be set via SetConstant prior to inference.
  std::unordered_map<std::string, size_t> unbound_constant_name_to_idx_;

  // The names of all tensors that are required for constant folding, but are
  // not necessarily in the final graph.
  // constant_folding_optional_inputs_ are those that have an initial value at
  // compile time.
  std::unordered_set<std::string> constant_folding_inputs_;
  std::unordered_set<std::string> constant_folding_optional_inputs_;

  // Offsets here correspond to the offsets of constants that were the outputs
  // of constant folding. The indices are guaranteed to map to the correct
  // indices in constant_folder_.
  std::vector<size_t> constant_folding_outputs_offsets_;
  // Offsets here correspond to the offsets of the bound constants.
  std::vector<size_t> bound_constant_offsets_;

  // Size of each of the constants_* GPUPtr buffers below.
  size_t constants_size_;
  // Pieces of memory for holding all constants; which one is used is
  // controlled by use_constants_primary_buffer_.
  GPUPtr constants_primary_;
  GPUPtr constants_secondary_;
  bool use_constants_primary_buffer_;
  // State of whether SetConstants/FoldConstants was called.
  BufferState buffer_state_;
  // Mapping from constant names to their data pointers.
  std::unordered_map<std::string, const void*> model_constants_;

  // Size of the containers below: # inputs + # outputs + # unbound constants.
  size_t num_params_;

  // These entries correspond to inputs/outputs/unbound constants in order;
  // inputs first, then outputs, then constants.
  std::vector<const char*> param_names_;
  std::vector<std::vector<int64_t>> max_param_shapes_;
  std::vector<AITemplateDtype> param_dtypes_;

  // These are entries used for bound constants.
  std::vector<size_t> bound_constant_size_;
  std::vector<AITemplateDtype> bound_constant_dtypes_;

  // NB: technically these could be derived from both the max shape and
  // the dtype, but it's easier to just cache them.
  std::vector<size_t> max_param_storage_bytes_;
  std::vector<size_t> max_param_numel_;
};

// This creates a new ModelContainer; its implementation is also
// codegened (the parameters passed to the ctor are determined
// at compilation time).
// NOTE(review): num_runtimes is presumably forwarded as the container's
// num_models, and the returned raw pointer is presumably owned by the
// caller -- confirm against the generated definition.
class ModelContainer;
ModelContainer* CreateModelContainer(
    size_t num_runtimes,
    AITemplateAllocator& allocator);

// Each ModelContainer contains num_models Models. Inference runs
// can be started by invoking Run() with lists of pre-allocated
// input/output tensors. MaxOutputShape() and MaxOutputStorageBytes() can be
// used to determine how much memory is required for each output.
//
// If there are N tensors marked with is_output=True,
// the user will always be expected to pass N output pointers -
// extra copies will occur if the outputs are views of constants,
// inputs, or other outputs in this case to avoid surprises.
//
// Use stream = nullptr for default stream. ModelContainer/Model does not
// create or own any stream. The user is expected to create and manage streams.
//
// We can support at most num_models concurrent inferences.
// Run() takes a stream to run the inference on. For example,
// to start up two inferences on different streams concurrently,
// we can do this:
//
// model_container.Run(inputs0, num_inputs, outputs0, num_outputs, stream0, ...);
// model_container.Run(inputs1, num_inputs, outputs1, num_outputs, stream1, ...);
// StreamSynchronize(stream0);
// StreamSynchronize(stream1);
//
// Note that if there are no models available for inference, Run() will block
// until one becomes available.
//
// ModelContainer optionally takes an allocator argument, which it will use to
// allocate the space for the buffers used for intermediate tensors and
// constants. If it is nullptr, the default allocator will be used (e.g. just
// {cuda/hip}{Malloc/Free}).
// Important: we assume that the allocator lives until the ModelContainer is
// destroyed. The default allocator has a static lifetime.
// Owns a pool of Model instances plus a ConstantFolder and hands them out to
// callers of Run()/Benchmark(). Note that ModelContainerBase is inherited
// privately (the default for `class`), so the base's members are only
// reachable from inside this class.
class ModelContainer : ModelContainerBase {
 public:
  // num_models is the number of Models held by the container, i.e. the
  // maximum number of concurrent inferences.
  // NOTE(review): the remaining arguments mirror the ModelContainerBase
  // constructor and are presumably forwarded to it -- confirm against the
  // definition.
  ModelContainer(
      size_t num_models,
      size_t num_inputs,
      size_t num_outputs,
      size_t num_bound_constants,
      size_t num_unbound_constants,
      size_t params_size,
      AITemplateAllocator& allocator);

  // Starts an inference on `stream` using pre-allocated input/output tensors.
  // NOTE(review): `sync` presumably requests a stream synchronization before
  // returning, `graph_mode` presumably selects graph-based launch, and
  // `output_shapes_out` presumably receives the actual output shapes --
  // confirm against the definition.
  void Run(
      const AITData* inputs,
      size_t num_inputs,
      AITData* outputs,
      size_t num_outputs,
      StreamType stream,
      bool sync,
      bool graph_mode,
      int64_t** output_shapes_out);

  // NOTE(review): judging by the name, a variant of Run() whose output
  // buffers live in host memory -- confirm against the definition.
  void RunWithOutputsOnHost(
      const AITData* inputs,
      size_t num_inputs,
      AITData* outputs,
      size_t num_outputs,
      StreamType stream,
      bool graph_mode,
      int64_t** output_shapes_out);

  // NOTE(review): presumably runs the model num_iters times and writes
  // profiling results to `filename` -- confirm against the definition.
  void Profile(
      const AITData* inputs,
      size_t num_inputs,
      AITData* outputs,
      size_t num_outputs,
      StreamType stream,
      size_t num_iters,
      const char* filename);

  // NOTE(review): presumably runs `count` inferences on each of `num_threads`
  // threads and returns a timing figure; the unit of the returned float is
  // not visible from this header -- confirm against the definition.
  float Benchmark(
      const AITData* inputs,
      size_t num_inputs,
      AITData* outputs,
      size_t num_outputs,
      StreamType stream,
      bool graph_mode,
      size_t count,
      size_t num_threads,
      bool use_unique_stream_per_thread,
      int64_t** output_shapes_out);

  // Set one constant, or num_tensors constants given by parallel
  // names/tensors arrays.
  void SetConstant(const char* name, const AITData& tensor);
  void SetManyConstants(
      const char** names,
      const AITData* tensors,
      size_t num_tensors);

  // Double-buffer counterparts of the setters above.
  // NOTE(review): these presumably write into the constants buffer that is
  // not currently in use (see GetInactiveConstantsBuffer / SwapConstants) --
  // confirm against the definition.
  uint8_t* GetInactiveConstantsBuffer();
  void SetDoubleBufferConstant(
      const char* name,
      const AITData& tensor,
      StreamType stream = 0);
  void SetManyDoubleBufferConstants(
      const char** names,
      const AITData* tensors,
      size_t num_tensors,
      StreamType stream = 0);

  // Introspection of the model's inputs and outputs, addressed by index.
  size_t NumInputs() const;
  size_t NumOutputs() const;

  const char* InputName(size_t input_idx) const;
  const char* OutputName(size_t output_idx) const;

  AITemplateParamShape MaxInputShape(size_t input_idx) const;
  AITemplateParamShape MaxOutputShape(size_t output_idx) const;

  AITemplateDtype InputDtype(size_t input_idx) const;
  AITemplateDtype OutputDtype(size_t output_idx) const;

  size_t MaxOutputStorageBytes(size_t output_idx) const;

  // Number of Model instances owned by this container.
  size_t GetNumRuntimes() const {
    return models_.size();
  }

  // NOTE(review): FoldConstants presumably runs the constant folder on
  // `stream` (optionally into the inactive buffer when double_buffer is
  // true), and SwapConstants presumably switches the active constants
  // buffer -- confirm against the definition.
  void FoldConstants(StreamType stream, bool sync, bool double_buffer = false);
  void SwapConstants();

  size_t GetNumConstants(bool unbound_constants_only = true) const;
  size_t GetNumConstantFoldingInputs(bool unbound_constants_only = true) const;

  // Write all constant names to the array pointed to by names_out.
  // This function assumes that names_out has enough space to hold
  // at least GetNumConstants() pointers. The strings written
  // are guaranteed to live as long as their owning ModelContainer.
  void WriteAllConstantNamesTo(
      const char** names_out,
      bool unbound_constants_only,
      bool constant_folding_inputs_only) const;

 private:
  // Internal helpers; their definitions are outside this header.
  void WaitForAllModels(bool include_constant_folder = false);
  void FoldConstantsImpl(StreamType stream, bool double_buffer = false);
  void SetConstantImpl(
      const char* name,
      const AITData& tensor,
      bool use_secondary_buffer = false,
      StreamType stream = 0);
  void SwapConstantFolderBuffer();

  void PrepareForRun(
      Model* model,
      const AITData* inputs,
      size_t num_inputs,
      AITData* outputs,
      size_t num_outputs);

  // NOTE(review): presumably blocks until a model is available, matching the
  // documented behavior of Run() -- confirm against the definition.
  Model* GetAvailableModel();
  // Takes the caller's lock by reference.
  // NOTE(review): presumably the lock on models_mutex_ -- confirm.
  void ReclaimFinishedModels(std::unique_lock<std::mutex>& lk);
  void ValidateParamDtype(AITemplateDtype dtype, size_t idx) const;
  void ValidateBoundConstantDtype(AITemplateDtype dtype, size_t idx) const;

  float BenchmarkImpl(
      const AITData* inputs,
      size_t num_inputs,
      AITData* outputs,
      size_t num_outputs,
      StreamType stream,
      bool graph_mode,
      size_t count,
      int64_t** output_shapes_out);

  // Held by reference: the allocator must outlive this container.
  AITemplateAllocator& allocator_;

  // models_ owns the Model instances; available_models_ and pending_models_
  // hold non-owning pointers.
  std::vector<std::unique_ptr<Model>> models_;
  std::unique_ptr<ConstantFolder> constant_folder_;
  std::vector<Model*> available_models_;
  std::deque<Model*> pending_models_;

  // Guards accesses to available/pending models.
  std::mutex models_mutex_;
  // Notified whenever a model is put into pending_models_.
  std::condition_variable pending_models_available_;
  // Prevents constant folding or SetConstants on main models from starting
  // while there are ongoing inferences (and vice versa). FoldConstants() and
  // SetConstants acquire in unique mode, Run()/Benchmark() acquire in shared
  // mode.
  //
  // Since constants_sync_mutex_ is acquired in shared mode for the entire
  // duration of Run()/Benchmark(), there is no need to acquire models_mutex_
  // while constants_sync_mutex_ is acquired in unique mode.
  // Why complicate things with two locks? The system is designed with the
  // assumption that concurrent inferences are common. We don't want to acquire
  // models_mutex_ uniquely for the entire duration of Run(), because that
  // prevents concurrent inferences from happening while kernels are being
  // queued.
  std::shared_mutex constants_sync_mutex_;
  // constants_double_buffer_mutex_ is separate from constants_sync_mutex_
  // since when we use double buffer, it won't affect the main model.
  std::shared_mutex constants_double_buffer_mutex_;

  size_t num_inputs_;
  size_t num_outputs_;

  // NOTE(review): presumably set once constant folding has run at least
  // once -- confirm against the definition.
  bool constant_folded_once_ = false;
};

} // namespace ait