jstzwjr commited on
Commit
11481cd
·
1 Parent(s): 3cdcd5e
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. Genie/Genie/GenieSymbols.default +31 -0
  2. Genie/Genie/Makefile +57 -0
  3. Genie/Genie/README +16 -0
  4. Genie/Genie/make/Android.mk +56 -0
  5. Genie/Genie/make/Application.mk +14 -0
  6. Genie/Genie/make/Makefile.linux-x86_64 +192 -0
  7. Genie/Genie/src/Dialog.cpp +1804 -0
  8. Genie/Genie/src/Dialog.hpp +95 -0
  9. Genie/Genie/src/Exception.hpp +27 -0
  10. Genie/Genie/src/GenieCommon.cpp +15 -0
  11. Genie/Genie/src/GenieDialog.cpp +249 -0
  12. Genie/Genie/src/GenieDialogEmbedding.cpp +41 -0
  13. Genie/Genie/src/Macro.hpp +101 -0
  14. Genie/Genie/src/Util/HandleGenerator.hpp +62 -0
  15. Genie/Genie/src/Util/HandleManager.hpp +84 -0
  16. Genie/Genie/src/qualla/context.cpp +118 -0
  17. Genie/Genie/src/qualla/dialog.cpp +590 -0
  18. Genie/Genie/src/qualla/dialogs/basic.cpp +421 -0
  19. Genie/Genie/src/qualla/dialogs/kv-share.cpp +359 -0
  20. Genie/Genie/src/qualla/dialogs/lhd-dec.cpp +481 -0
  21. Genie/Genie/src/qualla/dialogs/multistream.cpp +300 -0
  22. Genie/Genie/src/qualla/dialogs/spec-dec.cpp +458 -0
  23. Genie/Genie/src/qualla/dialogs/ssd-q1.cpp +1046 -0
  24. Genie/Genie/src/qualla/embedding.cpp +190 -0
  25. Genie/Genie/src/qualla/engine.cpp +198 -0
  26. Genie/Genie/src/qualla/engines/lib.cpp +9 -0
  27. Genie/Genie/src/qualla/engines/qnn-api/BackendExtensions.cpp +158 -0
  28. Genie/Genie/src/qualla/engines/qnn-api/BackendExtensions.hpp +62 -0
  29. Genie/Genie/src/qualla/engines/qnn-api/ClientBuffer.cpp +122 -0
  30. Genie/Genie/src/qualla/engines/qnn-api/ClientBuffer.hpp +85 -0
  31. Genie/Genie/src/qualla/engines/qnn-api/IBackend.hpp +156 -0
  32. Genie/Genie/src/qualla/engines/qnn-api/IBufferAlloc.hpp +56 -0
  33. Genie/Genie/src/qualla/engines/qnn-api/ICommandLineManager.hpp +95 -0
  34. Genie/Genie/src/qualla/engines/qnn-api/IOTensor.cpp +382 -0
  35. Genie/Genie/src/qualla/engines/qnn-api/IOTensor.hpp +170 -0
  36. Genie/Genie/src/qualla/engines/qnn-api/Log.hpp +24 -0
  37. Genie/Genie/src/qualla/engines/qnn-api/NetRunBackend.hpp +173 -0
  38. Genie/Genie/src/qualla/engines/qnn-api/QnnApi.cpp +0 -0
  39. Genie/Genie/src/qualla/engines/qnn-api/QnnApi.hpp +429 -0
  40. Genie/Genie/src/qualla/engines/qnn-api/QnnApiUtils.cpp +636 -0
  41. Genie/Genie/src/qualla/engines/qnn-api/QnnApiUtils.hpp +94 -0
  42. Genie/Genie/src/qualla/engines/qnn-api/QnnConfig.hpp +44 -0
  43. Genie/Genie/src/qualla/engines/qnn-api/QnnTypeDef.hpp +52 -0
  44. Genie/Genie/src/qualla/engines/qnn-api/QnnTypeMacros.hpp +702 -0
  45. Genie/Genie/src/qualla/engines/qnn-api/RpcMem.cpp +481 -0
  46. Genie/Genie/src/qualla/engines/qnn-api/RpcMem.hpp +115 -0
  47. Genie/Genie/src/qualla/engines/qnn-api/dlwrap.cpp +66 -0
  48. Genie/Genie/src/qualla/engines/qnn-api/dlwrap.hpp +33 -0
  49. Genie/Genie/src/qualla/engines/qnn-api/qnn-utils.cpp +104 -0
  50. Genie/Genie/src/qualla/engines/qnn-api/qnn-utils.hpp +157 -0
Genie/Genie/GenieSymbols.default ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #=============================================================================
2
+ #
3
+ # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ # All Rights Reserved.
5
+ # Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ #
7
+ #=============================================================================
8
+ {
9
+ global:
10
+ Genie_getApiMajorVersion*;
11
+ Genie_getApiMinorVersion*;
12
+ Genie_getApiPatchVersion*;
13
+ GenieDialogConfig_createFromJson*;
14
+ GenieDialogConfig_free*;
15
+ GenieDialog_create*;
16
+ GenieDialog_query*;
17
+ GenieDialog_tokenQuery*;
18
+ GenieDialog_embeddingQuery*;
19
+ GenieDialog_save*;
20
+ GenieDialog_restore*;
21
+ GenieDialog_reset*;
22
+ GenieDialog_setLoraStrength*;
23
+ GenieDialog_applyLora*;
24
+ GenieDialog_free*;
25
+ GenieEmbeddingConfig_createFromJson*;
26
+ GenieEmbeddingConfig_free*;
27
+ GenieEmbedding_create*;
28
+ GenieEmbedding_generate*;
29
+ GenieEmbedding_free*;
30
+ local: *;
31
+ };
Genie/Genie/Makefile ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #=============================================================================
2
+ #
3
+ # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ # All Rights Reserved.
5
+ # Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ #
7
+ #=============================================================================
8
+
9
+ RUST_TARGET := aarch64-linux-android
10
+ RUST_SOURCE_DIR := ./src/qualla/tokenizers/rust
11
+ # specify compiler
12
+ export CXX := clang++-14
13
+ export PATH := $(ANDROID_NDK_ROOT)/toolchains/llvm/prebuilt/linux-x86_64/bin:$(PATH)
14
+ .PHONY: all x86 android clean clean_x86 clean_android
15
+ .DEFAULT: x86
16
+
17
+ all: x86 android
18
+
19
+ x86: build_x86_tokenizer
20
+ @echo "-------------------- Building genie for x86 -------------------- "
21
+ @$(MAKE) -f make/Makefile.linux-x86_64 CPATH="/usr/include/x86_64-linux-gnu" || (echo "-------------------- genie x86 build failed --------------------"; exit 1; )
22
+ @echo "-------------------- genie x86 build succeeded -------------------- "
23
+
24
+ android: check_ndk build_android_tokenizer
25
+ @echo "-------------------- Building genie for android -------------------- "
26
+ @$(ANDROID_NDK_ROOT)/ndk-build APP_ALLOW_MISSING_DEPS=true APP_ABI="arm64-v8a" NDK_PROJECT_PATH=./ NDK_APPLICATION_MK=make/Application.mk APP_BUILD_SCRIPT=make/Android.mk || (echo "-------------------- genie android build failed --------------------"; exit 1; )
27
+ @$(rename_target_dirs)
28
+ @echo "-------------------- genie android build succeeded -------------------- "
29
+
30
+ clean: clean_x86 clean_android
31
+
32
+ clean_x86:
33
+ @$(MAKE) -f make/Makefile.linux-x86_64 clean
34
+
35
+ clean_android:
36
+ if [ -d "lib/aarch64-android" ]; then rm -rf lib/aarch64-android; fi
37
+ if [ -d "obj/local" ]; then rm -rf obj/local; fi
38
+
39
+ # utilities
40
+ rename_target_dirs = \
41
+ @if [ -d ./lib/aarch64-android ]; then rm -rf ./lib/aarch64-android; fi; \
42
+ find ./obj/local -type d -execdir rename 's/arm64-v8a/aarch64-android/' '{}' \+ \
43
+ && mkdir -p lib \
44
+ && mv ./obj/local/aarch64-android lib/ \
45
+ && mv ./libs/arm64-v8a/libc++_shared.so lib/aarch64-android/ \
46
+ && rm -rf ./libs \
47
+
48
+ check_ndk:
49
+ ifeq ($(ANDROID_NDK_ROOT),)
50
+ $(error ERROR: ANDROID_NDK_ROOT not set, skipping compilation for Android platform(s).)
51
+ endif
52
+
53
+ build_x86_tokenizer: $(RUST_SOURCE_DIR)/Cargo.toml
54
+ cargo build --release --manifest-path=$<
55
+
56
+ build_android_tokenizer: $(RUST_SOURCE_DIR)/Cargo.toml
57
+ cargo build --release --manifest-path=$< --target=$(RUST_TARGET)
Genie/Genie/README ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #=============================================================================
2
+ #
3
+ # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ # All Rights Reserved.
5
+ # Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ #
7
+ #=============================================================================
8
+
9
+ Genie library source code example
10
+ ---------------------------------
11
+
12
+ The Genie library (libGenie.so / Genie.dll) source code example provides users with an ability to recreate the Genie
13
+ library from source. Note that the Genie library source may be refactored, rewritten, or otherwise modified without
14
+ notice. The Genie C API is the commercially controlled and versioned interface that users should expect to be stable.
15
+ Please refer to the Genie SDK documentation tutorials at ${SDK_ROOT}/doc/Genie/ for more information on how to build the
16
+ sample code.
Genie/Genie/make/Android.mk ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #=============================================================================
2
+ #
3
+ # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ # All Rights Reserved.
5
+ # Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ #
7
+ #=============================================================================
8
+
9
+ LOCAL_PATH := $(call my-dir)
10
+ SUPPORTED_TARGET_ABI := arm64-v8a x86 x86_64
11
+
12
+ #============================ Verify Target Info and Application Variables =========================================
13
+ ifneq ($(filter $(TARGET_ARCH_ABI),$(SUPPORTED_TARGET_ABI)),)
14
+ ifneq ($(APP_STL), c++_shared)
15
+ $(error Unsupported APP_STL: "$(APP_STL)")
16
+ endif
17
+ else
18
+ $(error Unsupported TARGET_ARCH_ABI: '$(TARGET_ARCH_ABI)')
19
+ endif
20
+
21
+ #============================ Define Common Variables ===============================================================
22
+ # PACKAGE_C_INCLUDES += -I $(LOCAL_PATH)/../../../../../include/QNN
23
+ # Include paths
24
+ PACKAGE_C_INCLUDES += -I $(LOCAL_PATH)/../include
25
+ PACKAGE_C_INCLUDES += -I $(LOCAL_PATH)/../../../../include/Genie
26
+ PACKAGE_C_INCLUDES += -I $(LOCAL_PATH)/../src/qualla/include
27
+ PACKAGE_C_INCLUDES += -I $(LOCAL_PATH)/../../../../include/QNN
28
+ PACKAGE_C_INCLUDES += -I $(LOCAL_PATH)/../../../../include/QNN/HTP
29
+ PACKAGE_C_INCLUDES += -I $(LOCAL_PATH)/../src/qualla/tokenizers
30
+ PACKAGE_C_INCLUDES += -I $(LOCAL_PATH)/../src/qualla/engines/qnn-api
31
+ PACKAGE_C_INCLUDES += -I $(LOCAL_PATH)/../src/qualla/engines/qnn-cpu
32
+ PACKAGE_C_INCLUDES += -I $(LOCAL_PATH)/../src/qualla/engines/qnn-htp
33
+
34
+ #========================== Define T2T Lib variables =============================================
35
+ include $(CLEAR_VARS)
36
+ LOCAL_MODULE := tokenizers_capi
37
+ LOCAL_SRC_FILES := ../src/qualla/tokenizers/rust/target/aarch64-linux-android/release/libtokenizers_capi.a
38
+ include $(PREBUILT_STATIC_LIBRARY)
39
+
40
+ include $(CLEAR_VARS)
41
+ LOCAL_C_INCLUDES := $(PACKAGE_C_INCLUDES)
42
+ MY_SRC_FILES := $(wildcard $(LOCAL_PATH)/../src/*.cpp)
43
+ MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/*.cpp)
44
+ MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/dialogs/*.cpp)
45
+ MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/engines/*.cpp)
46
+ MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/engines/qnn-api/*.cpp)
47
+ MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/engines/qnn-cpu/*.cpp)
48
+ MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/engines/qnn-htp/*.cpp)
49
+ MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/utils/*.cpp)
50
+ MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/loggers/*.cpp)
51
+ MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/samplers/*.cpp)
52
+
53
+ LOCAL_MODULE := libGenie
54
+ LOCAL_SRC_FILES := $(subst make/,,$(MY_SRC_FILES))
55
+ LOCAL_STATIC_LIBRARIES := tokenizers_capi
56
+ include $(BUILD_SHARED_LIBRARY)
Genie/Genie/make/Application.mk ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #=============================================================================
2
+ #
3
+ # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ # All Rights Reserved.
5
+ # Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ #
7
+ #=============================================================================
8
+
9
+ APP_ABI := arm64-v8a
10
+ APP_STL := c++_shared
11
+ APP_PLATFORM := android-21
12
+ APP_MODULES := Genie
13
+ APP_CPPFLAGS += -std=c++2a -O3 -Wall -frtti -fexceptions -fvisibility=hidden -DGENIE_API="__attribute__((visibility(\"default\")))" -DSPILLFILL -DQUALLA_ENGINE_QNN_HTP=TRUE -DQUALLA_ENGINE_QNN_CPU=TRUE -DQUALLA_APPS=OFF -DFMT_HEADER_ONLY -DGENIE_SAMPLE -DQUALLA_INTERNAL_QNN_SDK -DGENIE_SSD_FEATURE -DGENIE_SPD_FEATURE -DGENIE_LADE_FEATURE -DGENIE_MULTISTREAM_FEATURE -DGENIE_LORA_FEATURE -DGENIE_E2T_FEATURE
14
+ APP_LDFLAGS += -lc -lm -ldl -Wl,--version-script=GenieSymbols.default -Wl,--strip-all
Genie/Genie/make/Makefile.linux-x86_64 ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #=============================================================================
2
+ #
3
+ # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ # All Rights Reserved.
5
+ # Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ #
7
+ #=============================================================================
8
+
9
+ # define relevant directories
10
+ SRC_DIR := src/qualla
11
+ #
12
+ SRC_DIR_GENIE_TOKENIZERS := src/qualla/tokenizers
13
+ #
14
+ SRC_DIR_SAMPLE_DIALOGS := src/qualla/dialogs
15
+
16
+ # All engines
17
+ SRC_DIR_GENIE_ENGINES := src/qualla/engines
18
+ SRC_DIR_GENIE_QNN_API := src/qualla/engines/qnn-api
19
+ SRC_DIR_GENIE_ENGINES_CPU := src/qualla/engines/qnn-cpu
20
+ SRC_DIR_GENIE_UTILS := src/qualla/utils
21
+ #
22
+ SRC_DIR_GENIE_LOGGERS := src/qualla/loggers
23
+
24
+ #
25
+ SRC_DIR_GENIE_SAMPLERS := src/qualla/samplers
26
+
27
+ #
28
+ SRC_DIR_GENIE := src
29
+
30
+ # Includes
31
+ GENIE_ENGINES_CPU_INCLUDE := src/qualla/engines/qnn-cpu
32
+ GENIE_ENGINES_API_INCLUDE := src/qualla/engines/qnn-api
33
+ GENIE_ENGINES_HTP_INCLUDE := src/qualla/engines/qnn-htp
34
+ GENIE_TOKENIZER_INCLUDE := src/qualla/tokenizers
35
+
36
+ GENIE_INCLUDE := include
37
+ GENIE_C_API_HEADERS_INCLUDE := ../../../include/Genie
38
+ QUALLA_INCLUDE := src/qualla/include
39
+ QNN_API_INCLUDE := ../../../include/QNN/
40
+ QNN_API_HTP_INCLUDE := $(QNN_API_INCLUDE)/HTP
41
+
42
+ AR := /usr/bin/ar
43
+ ARFLAGS := rcs
44
+ # Check whether $(CXX) is already clang; if it is not, fall back to clang++.
45
+ ifeq ($(shell $(CXX) -v 2>&1 | grep -c "clang version"), 0)
46
+ CXX := clang++
47
+ endif
48
+
49
+ QNN_TARGET ?= x86_64-linux-clang
50
+ export TARGET_DIR := ./lib/$(QNN_TARGET)
51
+
52
+ libGenie := $(TARGET_DIR)/libGenie.so
53
+ libtokenizers := src/qualla/tokenizers/rust/target/release/libtokenizers_capi.a
54
+
55
+ # define target architecture if not previously defined, default is x86
56
+ ifndef TARGET_AARCH_VARS
57
+ TARGET_AARCH_VARS:= -march=x86-64
58
+ endif
59
+
60
+ .PHONY: linux_x86_64
61
+ .DEFAULT: linux_x86_64
62
+ GENIE_all: $(libGenie)
63
+
64
+ # Include paths
65
+ INCLUDES += -I$(GENIE_INCLUDE) -I$(QUALLA_INCLUDE) -I$(SRC_DIR_GENIE_TOKENIZERS) -I$(QNN_API_INCLUDE) -I$(GENIE_ENGINES_CPU_INCLUDE) -I$(QNN_API_HTP_INCLUDE) -I$(GENIE_ENGINES_API_INCLUDE) -I$(GENIE_TOKENIZER_INCLUDE) -I$(GENIE_C_API_HEADERS_INCLUDE)
66
+
67
+ # set compiler flags
68
+ COMMON_CXXFLAGS = -std=c++2a -frtti -fPIC -Wall -pg -pthread -nostdinc++ -stdlib=libc++ -idirafter /usr/lib/llvm-14/include/c++/v1 -nostdinc -idirafter /usr/lib/llvm-14/lib/clang/14.0.0/include/ -idirafter /usr/include $(INCLUDES)
69
+ COMMON_LDFLAGS = -shared -s -fPIC -pthread -L/usr/lib/x86_64-linux-gnu -L./src/qualla/tokenizers/rust/target/release
70
+
71
+ COMMON_CFLAGS = -nostdinc -idirafter /usr/lib/llvm-14/lib/clang/14.0.0/include/ -idirafter /usr/include
72
+
73
+ ifdef QNN_DEBUG_ENABLE
74
+ CXXFLAGS += $(COMMON_CXXFLAGS) -march=x86-64 -O0 -g -DQNN_API="" -DSPILLFILL -DQUALLA_ENGINE_QNN_CPU=TRUE -DQUALLA_APPS=OFF -DFMT_HEADER_ONLY -DGENIE_SAMPLE -DQUALLA_INTERNAL_QNN_SDK -DGENIE_SSD_FEATURE -DGENIE_SPD_FEATURE -DGENIE_LADE_FEATURE -DGENIE_MULTISTREAM_FEATURE -DGENIE_LORA_FEATURE -DGENIE_E2T_FEATURE
75
+ CFLAGS += $(COMMON_CFLAGS)
76
+ LDFLAGS += $(COMMON_LDFLAGS)
77
+ else
78
+ CXXFLAGS += $(COMMON_CXXFLAGS) -march=x86-64 -O3 -Wno-write-strings -fvisibility=hidden -DGENIE_API="__attribute__((visibility(\"default\")))" -DSPILLFILL -DQUALLA_ENGINE_QNN_CPU=TRUE -DQUALLA_APPS=OFF -DFMT_HEADER_ONLY -DGENIE_SAMPLE -DQUALLA_INTERNAL_QNN_SDK -DGENIE_SSD_FEATURE -DGENIE_SPD_FEATURE -DGENIE_LADE_FEATURE -DGENIE_MULTISTREAM_FEATURE -DGENIE_LORA_FEATURE -DGENIE_E2T_FEATURE
79
+ CFLAGS += $(COMMON_CFLAGS)
80
+ LDFLAGS += $(COMMON_LDFLAGS) -fvisibility=hidden -flto
81
+ endif
82
+
83
+ # define library sources
84
+ SOURCES_GENIE_CPP := $(wildcard $(SRC_DIR_GENIE)/*.cpp)
85
+ SOURCES := $(wildcard $(SRC_DIR)/*.cpp)
86
+ SOURCES_GENIE_TOKENIZERS := $(wildcard $(SRC_DIR_GENIE_TOKENIZERS)/*.cpp)
87
+ SOURCES_GENIE_QNN_API_CPP := $(wildcard $(SRC_DIR_GENIE_QNN_API)/*.cpp)
88
+
89
+ SOURCES_GENIE_ENGINES_CPP := $(filter-out $(SRC_DIR_GENIE_ENGINES)/qnn-htp.cpp, $(wildcard $(SRC_DIR_GENIE_ENGINES)/*.cpp))
90
+ SOURCES_GENIE_DIALOGS_CPP := $(wildcard $(SRC_DIR_SAMPLE_DIALOGS)/*.cpp)
91
+ SOURCES_GENIE_ENGINES_CPU_CPP := $(wildcard $(SRC_DIR_GENIE_ENGINES_CPU)/*.cpp)
92
+ SOURCES_GENIE_UTILS_CPP := $(wildcard $(SRC_DIR_GENIE_UTILS)/*.cpp)
93
+
94
+
95
+ SOURCES_GENIE_LOGGERS_CPP := $(wildcard $(SRC_DIR_GENIE_LOGGERS)/*.cpp)
96
+ SOURCES_GENIE_SAMPLERS_CPP := $(wildcard $(SRC_DIR_GENIE_SAMPLERS)/*.cpp)
97
+
98
+
99
+ # define object directory
100
+ OBJ_ROOT := obj
101
+ OBJ_DIR_QUALLA := obj/$(QNN_TARGET)/qualla
102
+ OBJ_DIR_GENIE := obj/$(QNN_TARGET)/genie
103
+ OBJ_DIR_GENIE_TOKENIZERS := $(OBJ_DIR_QUALLA)/tokenizers
104
+ OBJ_DIR_GENIE_QNN_API := $(OBJ_DIR_QUALLA)/qnn-api
105
+
106
+ OBJ_DIR_GENIE_DIALOGS := $(OBJ_DIR_QUALLA)/dialogs
107
+ OBJ_DIR_GENIE_ENGINES := $(OBJ_DIR_QUALLA)/engines
108
+ OBJ_DIR_GENIE_UTILS := $(OBJ_DIR_QUALLA)/utils
109
+ OBJ_DIR_GENIE_ENGINES_CPU := $(OBJ_DIR_QUALLA)/engines/qnn-cpu
110
+ $(shell mkdir -p $(OBJ_DIR_GENIE_ENGINES_CPU))
111
+
112
+ OBJ_DIR_GENIE_LOGGERS := obj/$(QNN_TARGET)/qualla/loggers
113
+ OBJ_DIR_GENIE_SAMPLERS := obj/$(QNN_TARGET)/qualla/samplers
114
+
115
+ $(shell mkdir -p $(OBJ_DIR_GENIE))
116
+ $(shell mkdir -p $(OBJ_DIR_GENIE_LOGGERS))
117
+ $(shell mkdir -p $(OBJ_DIR_GENIE_SAMPLERS))
118
+
119
+ # setup object files in object directory
120
+ OBJECTS_GENIE := $(patsubst %.cpp,$(OBJ_DIR_GENIE)/%.o,$(foreach x,$(SOURCES_GENIE_CPP),$(notdir $(x))))
121
+ OBJECTS_QUALLA := $(patsubst %.cpp,$(OBJ_DIR_QUALLA)/%.o,$(foreach x,$(SOURCES),$(notdir $(x))))
122
+ OBJECTS_GENIE_TOKENIZERS := $(patsubst %.cpp,$(OBJ_DIR_GENIE_TOKENIZERS)/%.o,$(foreach x,$(SOURCES_GENIE_TOKENIZERS),$(notdir $(x))))
123
+ OBJECTS_GENIE_QNN_API := $(patsubst %.cpp,$(OBJ_DIR_GENIE_QNN_API)/%.o,$(foreach x,$(SOURCES_GENIE_QNN_API_CPP),$(notdir $(x))))
124
+ OBJECTS_GENIE_ENGINES := $(patsubst %.cpp,$(OBJ_DIR_GENIE_ENGINES)/%.o,$(foreach x,$(SOURCES_GENIE_ENGINES_CPP),$(notdir $(x))))
125
+ OBJECTS_GENIE_DIALOGS := $(patsubst %.cpp,$(OBJ_DIR_GENIE_DIALOGS)/%.o,$(foreach x,$(SOURCES_GENIE_DIALOGS_CPP),$(notdir $(x))))
126
+ OBJECTS_GENIE_UTILS := $(patsubst %.cpp,$(OBJ_DIR_GENIE_UTILS)/%.o,$(foreach x,$(SOURCES_GENIE_UTILS_CPP),$(notdir $(x))))
127
+ OBJECTS_GENIE_ENGINES_CPU := $(patsubst %.cpp,$(OBJ_DIR_GENIE_ENGINES_CPU)/%.o,$(foreach x,$(SOURCES_GENIE_ENGINES_CPU_CPP),$(notdir $(x))))
128
+
129
+ OBJECTS_GENIE_LOGGERS := $(patsubst %.cpp,$(OBJ_DIR_GENIE_LOGGERS)/%.o,$(foreach x,$(SOURCES_GENIE_LOGGERS_CPP),$(notdir $(x))))
130
+ OBJECTS_GENIE_SAMPLERS := $(patsubst %.cpp,$(OBJ_DIR_GENIE_SAMPLERS)/%.o,$(foreach x,$(SOURCES_GENIE_SAMPLERS_CPP),$(notdir $(x))))
131
+
132
+ LIBS=-ldl
133
+
134
+
135
+ # Rule to make shared lib
136
+ .PHONY: libGenie
137
+ libGenie: $(libGenie)
138
+
139
+ # Implicit rule to compile and link object files
140
+ $(OBJ_DIR_GENIE)/%.o: $(SRC_DIR_GENIE)/%.cpp
141
+ $(CXX) $(CXXFLAGS) -c $^ -o $@
142
+
143
+ $(OBJ_DIR_QUALLA)/%.o: $(SRC_DIR)/%.cpp
144
+ $(CXX) $(CXXFLAGS) -c $^ -o $@
145
+
146
+ $(OBJ_DIR_GENIE_TOKENIZERS)/%.o: $(SRC_DIR_GENIE_TOKENIZERS)/%.cpp
147
+ $(CXX) $(CXXFLAGS) -c $^ -o $@
148
+
149
+ $(OBJ_DIR_GENIE_QNN_API)/%.o: $(SRC_DIR_GENIE_QNN_API)/%.cpp
150
+ $(CXX) $(CXXFLAGS) -c $^ -o $@
151
+
152
+ $(OBJ_DIR_GENIE_ENGINES)/%.o: $(SRC_DIR_GENIE_ENGINES)/%.cpp
+ 	$(CXX) $(CXXFLAGS) -c $^ -o $@
153
+
154
+ $(OBJ_DIR_GENIE_DIALOGS)/%.o: $(SRC_DIR_SAMPLE_DIALOGS)/%.cpp
+ 	$(CXX) $(CXXFLAGS) -c $^ -o $@
155
+
156
+ $(OBJ_DIR_GENIE_UTILS)/%.o: $(SRC_DIR_GENIE_UTILS)/%.cpp
+ 	$(CXX) $(CXXFLAGS) -c $^ -o $@
157
+
158
+ $(OBJ_DIR_GENIE_ENGINES_CPU)/%.o: $(SRC_DIR_GENIE_ENGINES_CPU)/%.cpp
+ 	$(CXX) $(CXXFLAGS) -c $^ -o $@
159
+
160
+ $(OBJ_DIR_GENIE_LOGGERS)/%.o: $(SRC_DIR_GENIE_LOGGERS)/%.cpp
+ 	$(CXX) $(CXXFLAGS) -c $^ -o $@
161
+
162
+ $(OBJ_DIR_GENIE_SAMPLERS)/%.o: $(SRC_DIR_GENIE_SAMPLERS)/%.cpp
+ 	$(CXX) $(CXXFLAGS) -c $^ -o $@
163
+
164
+
165
+ # set up resources
166
+ directories := $(TARGET_DIR) $(OBJ_DIR_GENIE) $(OBJ_DIR_GENIE_QNN_API) $(OBJ_DIR_QUALLA) $(OBJ_DIR_GENIE_TOKENIZERS) $(OBJ_DIR_GENIE_ENGINES) $(OBJ_DIR_GENIE_DIALOGS) $(OBJ_DIR_GENIE_UTILS) $(OBJ_DIR_GENIE_ENGINES_CPU) $(OBJ_DIR_GENIE_LOGGERS) $(OBJ_DIR_GENIE_SAMPLERS)
167
+
168
+ # Compile
169
+ $(libGenie): $(OBJECTS_GENIE) $(OBJECTS_QUALLA) $(OBJECTS_GENIE_QNN_API) $(OBJECTS_GENIE_TOKENIZERS) $(OBJECTS_GENIE_ENGINES) $(OBJECTS_GENIE_DIALOGS) $(OBJECTS_GENIE_UTILS) $(OBJECTS_GENIE_ENGINES_CPU) $(OBJECTS_GENIE_LOGGERS) $(OBJECTS_GENIE_SAMPLERS) | $(directories)
170
+ $(CXX) $(CXXFLAGS) -shared -o $@ $^ $(LIBS) $(libtokenizers)
171
+
172
+
173
+ # rule for object directory resource
174
+ $(OBJECTS_GENIE): | $(OBJ_DIR_GENIE)
175
+ $(OBJECTS_QUALLA): | $(OBJ_DIR_QUALLA)
176
+ $(OBJECTS_GENIE_TOKENIZERS): | $(OBJ_DIR_GENIE_TOKENIZERS)
177
+ $(OBJECTS_GENIE_QNN_API): | $(OBJ_DIR_GENIE_QNN_API)
178
+ $(OBJECTS_GENIE_ENGINES): | $(OBJ_DIR_GENIE_ENGINES)
179
+ $(OBJECTS_GENIE_DIALOGS): | $(OBJ_DIR_GENIE_DIALOGS)
180
+ $(OBJECTS_GENIE_UTILS): | $(OBJ_DIR_GENIE_UTILS)
181
+ $(OBJECTS_GENIE_ENGINES_CPU): | $(OBJ_DIR_GENIE_ENGINES_CPU)
182
+ $(OBJECTS_GENIE_LOGGERS): | $(OBJ_DIR_GENIE_LOGGERS)
183
+ $(OBJECTS_GENIE_SAMPLERS): | $(OBJ_DIR_GENIE_SAMPLERS)
184
+
185
+
186
+ # rule to create directories
187
+ $(directories):
188
+ mkdir -p $@
189
+
190
+ .PHONY: clean
191
+ clean:
192
+ rm -rf $(OBJ_ROOT) $(TARGET_DIR)
Genie/Genie/src/Dialog.cpp ADDED
@@ -0,0 +1,1804 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #include <exception>
10
+ #include <set>
11
+ #include <sstream>
12
+
13
+ #include "Dialog.hpp"
14
+ #include "Exception.hpp"
15
+ #include "Macro.hpp"
16
+ #include "qualla/detail/json.hpp"
17
+ #include "qualla/env.hpp"
18
+
19
+ using namespace genie;
20
+
21
// Platform-dependent shared-library naming: "<prefix><base><suffix>".
#ifdef _WIN32
inline std::string libPrefix = "";
inline std::string libSuffix = ".dll";
#else
inline std::string libPrefix = "lib";
inline std::string libSuffix = ".so";
#endif

// Builds the platform-specific file name for a library base name,
// e.g. "Genie" -> "libGenie.so" on Linux/Android, "Genie.dll" on Windows.
inline std::string getLibName(std::string base) {
  std::string name = libPrefix;
  name += base;
  name += libSuffix;
  return name;
}
30
+
31
+ //=============================================================================
32
+ // Context::Config functions
33
+ //=============================================================================
34
+
35
// Validates the "context" section of a Genie dialog configuration.
// Throws Exception(GENIE_STATUS_ERROR_JSON_SCHEMA) when the section is not a
// JSON object, a mandatory field is absent, or an unrecognized key appears,
// and Exception(GENIE_STATUS_ERROR_JSON_VALUE) for an unsupported "version".
static void validateContextConfig(const qualla::json& config) {
  if (!config.is_object()) {
    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "context config is not an object");
  }

  // "eot-token" and "pad-token" are accepted below but are optional here.
  std::set<std::string> mandatoryFields{"version", "bos-token", "eos-token", "size", "n-vocab"};
  for (const auto& field : mandatoryFields) {
    if (!config.contains(field)) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing context field: " + field);
    }
  }

  // component is used in the "ENFORCE" macros
  // NOTE: the JSON_ENFORCE_* macros appear to reference the loop variable
  // `item` and this `component` string by name; keep both names unchanged.
  std::string component = "context";

  for (auto& item : config.items()) {
    if (item.key() == "version") {
      JSON_ENFORCE_NUMERIC();
      // Only schema version 1 is supported.
      if (item.value().get<int>() != 1) {
        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
                        "Invalid context config: unsupported version: " + item.value().dump());
      }
    } else if (item.key() == "bos-token") {
      JSON_ENFORCE_NUMERIC();
    } else if (item.key() == "eos-token") {
      // A model may declare one EOS token or a list of them.
      JSON_ENFORCE_ARRAY_OR_NUMERIC();
    } else if (item.key() == "eot-token") {
      JSON_ENFORCE_NUMERIC();
    } else if (item.key() == "size") {
      JSON_ENFORCE_NUMERIC();
    } else if (item.key() == "n-vocab") {
      JSON_ENFORCE_NUMERIC();
    } else if (item.key() == "pad-token") {
      JSON_ENFORCE_NUMERIC();
    } else {
      // Unknown keys are rejected rather than ignored.
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown context config key: " + item.key());
    }
  }
}
74
+
75
+ static void translateContextConfig(const qualla::json& genieConfig, qualla::json& quallaConfig) {
76
+ if (genieConfig["dialog"].contains("context")) {
77
+ if (genieConfig["dialog"]["context"].contains("bos-token")) {
78
+ quallaConfig["context"]["bos-token"] = genieConfig["dialog"]["context"]["bos-token"];
79
+ }
80
+ if (genieConfig["dialog"]["context"].contains("eos-token")) {
81
+ quallaConfig["context"]["eos-token"] = genieConfig["dialog"]["context"]["eos-token"];
82
+ }
83
+ if (genieConfig["dialog"]["context"].contains("eot-token")) {
84
+ quallaConfig["context"]["eot-token"] = genieConfig["dialog"]["context"]["eot-token"];
85
+ }
86
+ if (genieConfig["dialog"]["context"].contains("size")) {
87
+ quallaConfig["context"]["size"] = genieConfig["dialog"]["context"]["size"];
88
+ }
89
+ if (genieConfig["dialog"]["context"].contains("n-vocab")) {
90
+ quallaConfig["context"]["n-vocab"] = genieConfig["dialog"]["context"]["n-vocab"];
91
+ }
92
+ if (genieConfig["dialog"]["context"].contains("pad-token")) {
93
+ quallaConfig["context"]["pad-token"] = genieConfig["dialog"]["context"]["pad-token"];
94
+ }
95
+ }
96
+ }
97
+
98
+ //=============================================================================
99
+ // Sampler::Config functions
100
+ //=============================================================================
101
+
102
// Validates the "sampler" section of a Genie dialog configuration.
// Throws Exception(GENIE_STATUS_ERROR_JSON_SCHEMA) when the section is not a
// JSON object, "version" is absent, or an unrecognized key appears, and
// Exception(GENIE_STATUS_ERROR_JSON_VALUE) for an unsupported "version".
static void validateSamplerConfig(const qualla::json& config) {
  if (!config.is_object()) {
    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "sampler config is not an object");
  }

  // Only "version" is mandatory; all sampling knobs are optional.
  std::set<std::string> mandatoryFields{"version"};
  for (const auto& field : mandatoryFields) {
    if (!config.contains(field)) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing sampler field: " + field);
    }
  }

  // component is used in the "ENFORCE" macros
  // NOTE: the JSON_ENFORCE_* macros appear to reference the loop variable
  // `item` and this `component` string by name; keep both names unchanged.
  std::string component = "sampler";

  for (auto& item : config.items()) {
    if (item.key() == "version") {
      JSON_ENFORCE_NUMERIC();
      // Only schema version 1 is supported.
      if (item.value().get<int>() != 1) {
        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
                        "Invalid sampler config: unsupported version: " + item.value().dump());
      }
    } else if (item.key() == "seed") {
      JSON_ENFORCE_NUMERIC();
    } else if (item.key() == "temp") {
      JSON_ENFORCE_NUMERIC();
    } else if (item.key() == "top-k") {
      JSON_ENFORCE_NUMERIC();
    } else if (item.key() == "top-p") {
      JSON_ENFORCE_NUMERIC();
    } else if (item.key() == "greedy") {
      JSON_ENFORCE_BOOLEAN();
    } else {
      // Unknown keys are rejected rather than ignored.
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown sampler config key: " + item.key());
    }
  }
}
139
+
140
+ static void translateSamplerConfig(const qualla::json& genieConfig, qualla::json& quallaConfig) {
141
+ if (genieConfig["dialog"].contains("sampler")) {
142
+ quallaConfig["sampler"]["type"] = "basic";
143
+
144
+ if (genieConfig["dialog"]["sampler"].contains("seed")) {
145
+ quallaConfig["sampler"]["seed"] = genieConfig["dialog"]["sampler"]["seed"];
146
+ }
147
+ if (genieConfig["dialog"]["sampler"].contains("temp")) {
148
+ quallaConfig["sampler"]["temp"] = genieConfig["dialog"]["sampler"]["temp"];
149
+ }
150
+
151
+ quallaConfig["sampler"]["role"] = "primary";
152
+ #if defined(GENIE_SPD_FEATURE)
153
+ if (genieConfig["dialog"]["type"] == "spd") {
154
+ quallaConfig["sampler"]["role"] = "target";
155
+ }
156
+ #endif
157
+
158
+ if (genieConfig["dialog"]["sampler"].contains("top-k")) {
159
+ quallaConfig["sampler"]["top-k"] = genieConfig["dialog"]["sampler"]["top-k"];
160
+ }
161
+ if (genieConfig["dialog"]["sampler"].contains("top-p")) {
162
+ quallaConfig["sampler"]["top-p"] = genieConfig["dialog"]["sampler"]["top-p"];
163
+ }
164
+ if (genieConfig["dialog"]["sampler"].contains("greedy")) {
165
+ quallaConfig["sampler"]["greedy"] = genieConfig["dialog"]["sampler"]["greedy"];
166
+ }
167
+ if (genieConfig["dialog"]["sampler"].contains("seed")) {
168
+ quallaConfig["sampler"]["seed"] = genieConfig["dialog"]["sampler"]["seed"];
169
+ }
170
+ }
171
+ }
172
+
173
+ //=============================================================================
174
+ // Tokenizer::Config functions
175
+ //=============================================================================
176
+
177
+ static void validateTokenizerConfig(const qualla::json& config) {
178
+ if (!config.is_object()) {
179
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "tokenizer config is not an object");
180
+ }
181
+
182
+ std::set<std::string> mandatoryFields{"version", "path"};
183
+ for (const auto& field : mandatoryFields) {
184
+ if (!config.contains(field)) {
185
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing tokenizer field: " + field);
186
+ }
187
+ }
188
+
189
+ // component is used in the "ENFORCE" macros
190
+ std::string component = "tokenizer";
191
+
192
+ for (auto& item : config.items()) {
193
+ if (item.key() == "version") {
194
+ JSON_ENFORCE_NUMERIC();
195
+ if (item.value().get<int>() != 1) {
196
+ throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
197
+ "Invalid tokenizer config: unsupported version: " + item.value().dump());
198
+ }
199
+ } else if (item.key() == "path") {
200
+ JSON_ENFORCE_STRING();
201
+ // Note: the existence of this file is checked by qualla
202
+ } else {
203
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
204
+ "Unknown tokenizer config key: " + item.key());
205
+ }
206
+ }
207
+ }
208
+
209
+ static void translateTokenizerConfig(const qualla::json& genieConfig, qualla::json& quallaConfig) {
210
+ quallaConfig["tokenizer"] = genieConfig["dialog"]["tokenizer"]["path"];
211
+ }
212
+
213
+ //=============================================================================
214
+ // Embedding::Config functions
215
+ //=============================================================================
216
+
217
+ static void validateEmbeddingConfig(const qualla::json& config) {
218
+ if (!config.is_object()) {
219
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "embedding config is not an object");
220
+ }
221
+
222
+ std::set<std::string> mandatoryFields{"version", "size"};
223
+ for (const auto& field : mandatoryFields) {
224
+ if (!config.contains(field)) {
225
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing embedding field: " + field);
226
+ }
227
+ }
228
+
229
+ // component is used in the "ENFORCE" macros
230
+ std::string component = "embedding";
231
+
232
+ for (auto& item : config.items()) {
233
+ if (item.key() == "version") {
234
+ JSON_ENFORCE_NUMERIC();
235
+ if (item.value().get<int>() != 1) {
236
+ throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
237
+ "Invalid embedding config: unsupported version: " + item.value().dump());
238
+ }
239
+ } else if (item.key() == "size") {
240
+ JSON_ENFORCE_NUMERIC();
241
+ } else if (item.key() == "datatype") {
242
+ JSON_ENFORCE_STRING();
243
+ const std::set<std::string> supportedTypes = {"float32", "native"};
244
+ if (std::find(supportedTypes.begin(), supportedTypes.end(), item.value()) ==
245
+ supportedTypes.end()) {
246
+ throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
247
+ "Unknown embedding datatype: " + std::string(item.value()));
248
+ }
249
+ } else {
250
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
251
+ "Unknown embedding config key: " + item.key());
252
+ }
253
+ }
254
+ }
255
+
256
+ static void translateEmbeddingConfig(const qualla::json& genieConfig, qualla::json& quallaConfig) {
257
+ if (genieConfig["dialog"].contains("embedding")) {
258
+ quallaConfig["context"]["n-embd"] = genieConfig["dialog"]["embedding"]["size"];
259
+
260
+ if (genieConfig["dialog"]["embedding"].contains("datatype")) {
261
+ quallaConfig["context"]["embedding-datatype"] =
262
+ genieConfig["dialog"]["embedding"]["datatype"];
263
+ }
264
+ }
265
+ }
266
+
267
// File-scope flags set by validateBackendHtpConfig() when the QnnHtp section
// contains "pos-id-dim" / "rope-theta". Read later by
// validatePositionalEncodingConfig() to reject configs that specify the same
// information both in the backend section and in "positional-encoding".
// NOTE(review): these are mutable globals, so validation is not reentrant
// across dialogs — confirm single-threaded use by callers.
bool position_dim_set = false;
bool rope_theta_set = false;
269
+
270
+ //=============================================================================
271
+ // Backend::Config functions
272
+ //=============================================================================
273
+
274
// Validate the "QnnHtp" backend section of an engine config.
// Mandatory: version (== 1), spill-fill-bufsize, mmap-budget, use-mmap,
// cpu-mask, poll. Side effect: sets the file-scope flags position_dim_set /
// rope_theta_set when "pos-id-dim" / "rope-theta" appear, so a later check
// can reject configs that also carry a "positional-encoding" model section.
// Throws Exception on any schema or value violation.
static void validateBackendHtpConfig(const qualla::json& config) {
  if (!config.is_object()) {
    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "QnnHtp config is not an object");
  }

  std::set<std::string> mandatoryFields{
      "version", "spill-fill-bufsize", "mmap-budget", "use-mmap", "cpu-mask", "poll"};
  for (const auto& field : mandatoryFields) {
    if (!config.contains(field)) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing QnnHtp field: " + field);
    }
  }

  // component is used in the "ENFORCE" macros
  std::string component = "QnnHtp";

  for (auto& item : config.items()) {
    if (item.key() == "version") {
      JSON_ENFORCE_NUMERIC();
      // Only schema version 1 is understood.
      if (item.value().get<int>() != 1) {
        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
                        "Invalid QnnHtp config: unsupported version: " + item.value().dump());
      }
    } else if (item.key() == "spill-fill-bufsize") {
      JSON_ENFORCE_NUMERIC();
    } else if (item.key() == "mmap-budget") {
      JSON_ENFORCE_NUMERIC();
    } else if (item.key() == "use-mmap") {
      JSON_ENFORCE_BOOLEAN();
#ifdef _WIN32
      // Memory-mapped model loading is rejected on Windows targets.
      if (item.value() == true) {
        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
                        "Invalid QnnHtp config. use-mmap not supported on target");
      }
#endif
    } else if (item.key() == "pos-id-dim") {
      // Record that the position dimension was given here; conflicts with a
      // model-level "positional-encoding" section (checked later).
      position_dim_set = true;
      JSON_ENFORCE_NUMERIC();
    } else if (item.key() == "cpu-mask") {
      JSON_ENFORCE_STRING();
    } else if (item.key() == "poll") {
      JSON_ENFORCE_BOOLEAN();
    } else if (item.key() == "kv-dim") {
      JSON_ENFORCE_NUMERIC();
    } else if (item.key() == "kv-update-method") {
      JSON_ENFORCE_STRING();
    } else if (item.key() == "allow-async-init") {
      JSON_ENFORCE_BOOLEAN();
    } else if (item.key() == "rope-theta") {
      // Same duplicate-specification guard as pos-id-dim above.
      rope_theta_set = true;
      JSON_ENFORCE_NUMERIC();
    } else {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown QnnHtp config key: " + item.key());
    }
  }
}
330
+
331
+ static void validateBackendGenaiConfig(const qualla::json& config) {
332
+ if (!config.is_object()) {
333
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "QnnGenAiTransformer config is not an object");
334
+ }
335
+
336
+ std::set<std::string> mandatoryFields{"version"};
337
+ for (const auto& field : mandatoryFields) {
338
+ if (!config.contains(field)) {
339
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
340
+ "Missing QnnGenAiTransformer field: " + field);
341
+ }
342
+ }
343
+
344
+ // component is used in the "ENFORCE" macros
345
+ std::string component = "QnnGenAiTransformer";
346
+
347
+ for (auto& item : config.items()) {
348
+ if (item.key() == "version") {
349
+ JSON_ENFORCE_NUMERIC();
350
+ if (item.value().get<int>() != 1) {
351
+ throw Exception(
352
+ GENIE_STATUS_ERROR_JSON_VALUE,
353
+ "Invalid QnnGenAiTransformer config: unsupported version: " + item.value().dump());
354
+ }
355
+ } else if (item.key() == "use-mmap") {
356
+ JSON_ENFORCE_BOOLEAN();
357
+ #ifdef _WIN32
358
+ if (item.value() == true) {
359
+ throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
360
+ "Invalid QnnGenAiTransformer config. use-mmap not supported on target");
361
+ }
362
+ #endif
363
+ } else if (item.key() == "n-logits") {
364
+ JSON_ENFORCE_NUMERIC();
365
+ } else if (item.key() == "n-layer") {
366
+ JSON_ENFORCE_NUMERIC();
367
+ } else if (item.key() == "n-embd") {
368
+ JSON_ENFORCE_NUMERIC();
369
+ } else if (item.key() == "n-heads") {
370
+ JSON_ENFORCE_NUMERIC();
371
+ } else {
372
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
373
+ "Unknown QnnGenAiTransformer config key: " + item.key());
374
+ }
375
+ }
376
+ }
377
+
378
// Validate the "backend" section of an engine config.
// "type" selects either "QnnHtp" or "QnnGenAiTransformer". The sub-object
// matching the selected type must be present and valid, and the sub-object
// for the other type must be absent. Throws Exception on any violation.
static void validateBackendConfig(const qualla::json& config) {
  if (!config.is_object()) {
    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "backend config is not an object");
  }

  std::set<std::string> mandatoryFields{"version", "type"};
  for (const auto& field : mandatoryFields) {
    if (!config.contains(field)) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing backend field: " + field);
    }
  }

  // component is used in the "ENFORCE" macros
  std::string component = "backend";

  // Collected while scanning: which backend type was selected, and the
  // backend-specific sub-objects (validated after the scan).
  std::string type;
  bool htp = false;
  qualla::json htpConfig;
  bool genai = false;
  qualla::json genaiConfig;

  for (auto& item : config.items()) {
    if (item.key() == "version") {
      JSON_ENFORCE_NUMERIC();
      // Only schema version 1 is understood.
      if (item.value().get<int>() != 1) {
        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
                        "Invalid backend config: unsupported version: " + item.value().dump());
      }
    } else if (item.key() == "type") {
      JSON_ENFORCE_STRING();
      type = item.value().get<std::string>();
      if (type == "QnnHtp") {
        htp = true;
      } else if (type == "QnnGenAiTransformer") {
        genai = true;
      } else {
        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
                        "Invalid backend config: unsupported type: " + item.value().dump());
      }
    } else if (item.key() == "extensions") {
      // Path to a backend-extensions config; content checked elsewhere.
      JSON_ENFORCE_STRING();
    } else if (item.key() == "QnnHtp") {
      JSON_ENFORCE_OBJECT();
      htpConfig = item.value();
    } else if (item.key() == "QnnGenAiTransformer") {
      JSON_ENFORCE_OBJECT();
      genaiConfig = item.value();
    } else {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown backend config key: " + item.key());
    }
  }

  // Cross-check: the sub-object must match the selected type exactly.
  if (htp) {
    if (!htpConfig.is_object()) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing QnnHtp dialog config");
    }
    validateBackendHtpConfig(htpConfig);
  } else {
    if (htpConfig.is_object()) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
                      "QnnHtp backend config for incorrect backend type: " + type);
    }
  }

  if (genai) {
    if (!genaiConfig.is_object()) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing QnnGenAiTransformer dialog config");
    }
    validateBackendGenaiConfig(genaiConfig);
  } else {
    if (genaiConfig.is_object()) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
                      "QnnGenAiTransformer backend config for incorrect backend type: " + type);
    }
  }
}
454
+
455
+ //=============================================================================
456
+ // Model::Config functions
457
+ //=============================================================================
458
+
459
// Validate one entry of the lora "adapters" array.
// specifiedLoraVersion is the version declared (or defaulted) by the
// enclosing lora config; the adapter's own keys imply a version
// ("bin-sections" => V2, "path" => V1) which must agree with it.
// Throws Exception on schema violations or version mismatch.
static void validateLoraAdapterConfig(const qualla::json& config,
                                      LORA_VERSION& specifiedLoraVersion) {
  if (!config.is_object()) {
    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "lora adapter config is not an object");
  }
  const std::set<std::string> mandatoryFields{"version", "name"};
  for (const auto& field : mandatoryFields) {
    if (!config.contains(field)) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing lora adapter field: " + field);
    }
  }

  // component is used in the "ENFORCE" macros
  const std::string component = "lora adapter";
  // Version implied by this adapter's keys; must end up defined and
  // consistent with specifiedLoraVersion.
  LORA_VERSION configuredLoraVersion = LORA_VERSION::GENIE_LORA_VERSION_UNDEFINED;
  for (auto& item : config.items()) {
    if (item.key() == "version") {
      JSON_ENFORCE_NUMERIC();
      // Only schema version 1 is understood.
      if (item.value().get<int>() != 1) {
        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
                        "Invalid lora config: unsupported version: " + item.value().dump());
      }
    } else if (item.key() == "name") {
      JSON_ENFORCE_STRING();
    } else if (item.key() == "bin-sections") {
      JSON_ENFORCE_ARRAY();
      configuredLoraVersion = LORA_VERSION::GENIE_LORA_VERSION_V2;  // Adapter occurs with V2
      for (auto& elem : item.value()) {
        if (!elem.is_string()) {
          throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
                          "bin-sections must be an array of strings");
        }
      }
    } else if (item.key() == "path") {
      configuredLoraVersion = LORA_VERSION::GENIE_LORA_VERSION_V1;  // Weights are V1
      JSON_ENFORCE_STRING();
      // Note:all directory validations will done by NSP engine
    } else {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
                      "Unknown lora adapter config key: " + item.key());
    }
  }

  // Enforce agreement between the declared lora version and the one implied
  // by the adapter's keys; an adapter with neither key is invalid.
  if (specifiedLoraVersion == LORA_VERSION::GENIE_LORA_VERSION_V1 &&
      configuredLoraVersion == LORA_VERSION::GENIE_LORA_VERSION_V2) {
    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
                    "LoRA Adapters must be used with lora version: 2");
  } else if (specifiedLoraVersion == LORA_VERSION::GENIE_LORA_VERSION_V2 &&
             configuredLoraVersion == LORA_VERSION::GENIE_LORA_VERSION_V1) {
    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
                    "LoRA Weights must be used with lora version: 1");
  } else if (configuredLoraVersion == LORA_VERSION::GENIE_LORA_VERSION_UNDEFINED) {
    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Invalid lora config.");
  }
}
514
+
515
// Validate the "lora" object of a binary model config.
// Mandatory: "version" (== 1) and "adapters". The optional "lora-version"
// (1 or 2, default 2) selects the adapter flavor; every adapter entry is
// checked against it by validateLoraAdapterConfig().
// Throws Exception on schema or value violations.
static void validateLoraConfig(const qualla::json& config) {
  if (!config.is_object()) {
    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "lora config is not an object");
  }

  const std::set<std::string> mandatoryFields{"version", "adapters"};
  for (const auto& field : mandatoryFields) {
    if (!config.contains(field)) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing lora field: " + field);
    }
  }

  // component is used in the "ENFORCE" macros
  const std::string component = "lora";
  // Resolve the declared lora version before scanning adapters, since each
  // adapter must agree with it. Any value other than 1 or 2 is UNDEFINED and
  // rejected after the scan.
  LORA_VERSION specifiedLoraVersion = LORA_VERSION::GENIE_LORA_VERSION_V2;  // Default is loraV2
  if (config.find("lora-version") != config.end()) {
    switch (static_cast<uint8_t>(config["lora-version"])) {
      case 1:
        specifiedLoraVersion = LORA_VERSION::GENIE_LORA_VERSION_V1;
        break;
      case 2:
        specifiedLoraVersion = LORA_VERSION::GENIE_LORA_VERSION_V2;
        break;
      default:
        specifiedLoraVersion = LORA_VERSION::GENIE_LORA_VERSION_UNDEFINED;
        break;
    }
  }

  for (auto& item : config.items()) {
    if (item.key() == "version") {
      JSON_ENFORCE_NUMERIC();
      // Only schema version 1 is understood.
      if (item.value().get<int>() != 1) {
        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
                        "Invalid lora config: unsupported version: " + item.value().dump());
      }
    } else if (item.key() == "alpha-tensor-name") {
      JSON_ENFORCE_STRING();
    } else if (item.key() == "adapters") {
      JSON_ENFORCE_ARRAY();
      for (auto& elem : item.value()) {
        validateLoraAdapterConfig(elem, specifiedLoraVersion);
      }
    } else if (item.key() == "lora-version") {  // Optional
      JSON_ENFORCE_NUMERIC();
    } else {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown lora config key: " + item.key());
    }
  }
  // Only reachable when "lora-version" was present with a value other than
  // 1 or 2 (the default path never yields UNDEFINED).
  if (specifiedLoraVersion == LORA_VERSION::GENIE_LORA_VERSION_UNDEFINED) {
    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
                    "Unsupported lora version: " + to_string(config["lora-version"]));
  }
}
569
+
570
+ static void validateModelBinaryConfig(const qualla::json& config) {
571
+ if (!config.is_object()) {
572
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "binary config is not an object");
573
+ }
574
+
575
+ std::set<std::string> mandatoryFields{"version", "ctx-bins"};
576
+ for (const auto& field : mandatoryFields) {
577
+ if (!config.contains(field)) {
578
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing binary field: " + field);
579
+ }
580
+ }
581
+
582
+ // component is used in the "ENFORCE" macros
583
+ std::string component = "binary";
584
+
585
+ for (auto& item : config.items()) {
586
+ if (item.key() == "version") {
587
+ JSON_ENFORCE_NUMERIC();
588
+ if (item.value().get<int>() != 1) {
589
+ throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
590
+ "Invalid binary config: unsupported version: " + item.value().dump());
591
+ }
592
+ } else if (item.key() == "ctx-bins") {
593
+ JSON_ENFORCE_ARRAY();
594
+ for (auto& elem : item.value()) {
595
+ if (!elem.is_string()) {
596
+ throw Exception(GENIE_STATUS_ERROR_JSON_VALUE, "ctx-bins must be an array of strings");
597
+ }
598
+ }
599
+ } else if (item.key() == "lora") {
600
+ JSON_ENFORCE_OBJECT();
601
+ validateLoraConfig(item.value());
602
+ } else {
603
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown binary config key: " + item.key());
604
+ }
605
+ }
606
+ }
607
+
608
+ static void validateModelLibraryConfig(const qualla::json& config) {
609
+ if (!config.is_object()) {
610
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "library config is not an object");
611
+ }
612
+
613
+ std::set<std::string> mandatoryFields{"version", "model-bin"};
614
+ for (const auto& field : mandatoryFields) {
615
+ if (!config.contains(field)) {
616
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing library field: " + field);
617
+ }
618
+ }
619
+
620
+ // component is used in the "ENFORCE" macros
621
+ std::string component = "library";
622
+
623
+ for (auto& item : config.items()) {
624
+ if (item.key() == "version") {
625
+ JSON_ENFORCE_NUMERIC();
626
+ if (item.value().get<int>() != 1) {
627
+ throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
628
+ "Invalid library config: unsupported version: " + item.value().dump());
629
+ }
630
+ } else if (item.key() == "model-bin") {
631
+ JSON_ENFORCE_STRING();
632
+ } else {
633
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown library config key: " + item.key());
634
+ }
635
+ }
636
+ }
637
+
638
+ static void validateRopeScalingConfig(const qualla::json& config) {
639
+ // component is used in the "ENFORCE" macros
640
+ std::string component = "rope-scaling";
641
+ if (config.is_object()) {
642
+ std::string ropeType;
643
+ for (auto& item : config.items()) {
644
+ if (item.key() == "rope-type") {
645
+ JSON_ENFORCE_STRING();
646
+ ropeType = item.value().get<std::string>();
647
+ if (ropeType != "llama3" && ropeType != "default" && ropeType != "longrope") {
648
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Rope type not supported" + ropeType);
649
+ }
650
+ } else if (item.key() == "factor" || item.key() == "low-freq-factor" ||
651
+ item.key() == "high-freq-factor" ||
652
+ item.key() == "original-max-position-embeddings") {
653
+ JSON_ENFORCE_NUMERIC();
654
+ } else if (item.key() == "short-factor" || item.key() == "long-factor") {
655
+ JSON_ENFORCE_ARRAY();
656
+ } else {
657
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
658
+ "Rope scaling parameter not supported " + item.key());
659
+ }
660
+ }
661
+ }
662
+ }
663
+
664
+ static void validatePositionalEncodingConfig(const qualla::json& config) {
665
+ // component is used in the "ENFORCE" macros
666
+ std::string component = "positional-encoding";
667
+ qualla::json ropeScalingConfig;
668
+ if (config.is_object()) {
669
+ for (auto& item : config.items()) {
670
+ if (item.key() == "type") {
671
+ std::string positionEncodingType = item.value().get<std::string>();
672
+ if (positionEncodingType != "rope" && positionEncodingType != "absolute" &&
673
+ positionEncodingType != "alibi") {
674
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "positional-encoding type not supported");
675
+ }
676
+ } else if (item.key() == "rope-dim") {
677
+ JSON_ENFORCE_NUMERIC();
678
+ } else if (item.key() == "rope-theta") {
679
+ JSON_ENFORCE_NUMERIC();
680
+ } else if (item.key() == "rope-scaling") {
681
+ JSON_ENFORCE_OBJECT();
682
+ ropeScalingConfig = item.value();
683
+ } else {
684
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
685
+ "Unknown positional encoding config key: " + item.key());
686
+ }
687
+ }
688
+ }
689
+ if (position_dim_set) {
690
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
691
+ "Specify one config from pos-id-dim and positional-encoding");
692
+ }
693
+ if (rope_theta_set) {
694
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
695
+ "Specify one config from rope-theta and positional-encoding");
696
+ }
697
+ if (ropeScalingConfig.is_object()) {
698
+ validateRopeScalingConfig(ropeScalingConfig);
699
+ }
700
+ }
701
+
702
// Validate the "model" section of an engine config.
// "type" selects "binary" or "library"; the matching sub-object must be
// present and valid and the other must be absent. An optional
// "positional-encoding" object is validated when present.
// Throws Exception on any violation.
static void validateModelConfig(const qualla::json& config) {
  if (!config.is_object()) {
    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "model config is not an object");
  }

  std::set<std::string> mandatoryFields{"version", "type"};
  for (const auto& field : mandatoryFields) {
    if (!config.contains(field)) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing model field: " + field);
    }
  }

  // component is used in the "ENFORCE" macros
  std::string component = "model";

  // Collected while scanning; cross-checked after the loop.
  std::string type;
  bool binary = false;
  qualla::json binaryConfig;
  bool library = false;
  qualla::json libraryConfig;
  qualla::json positionalEncodingConfig;
  bool positionalEncoding = false;

  for (auto& item : config.items()) {
    if (item.key() == "version") {
      JSON_ENFORCE_NUMERIC();
      // Only schema version 1 is understood.
      if (item.value().get<int>() != 1) {
        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
                        "Invalid model config: unsupported version: " + item.value().dump());
      }
    } else if (item.key() == "type") {
      JSON_ENFORCE_STRING();
      type = item.value().get<std::string>();
      if (type == "binary") {
        binary = true;
      } else if (type == "library") {
        library = true;
      } else {
        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
                        "Invalid model config: unsupported type: " + item.value().dump());
      }
    } else if (item.key() == "binary") {
      JSON_ENFORCE_OBJECT();
      binaryConfig = item.value();
    } else if (item.key() == "library") {
      JSON_ENFORCE_OBJECT();
      libraryConfig = item.value();
    } else if (item.key() == "positional-encoding") {
      JSON_ENFORCE_OBJECT();
      positionalEncodingConfig = item.value();
      positionalEncoding = true;
    } else {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown model config key: " + item.key());
    }
  }

  // The sub-object must match the selected model type exactly.
  if (binary) {
    if (!binaryConfig.is_object()) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing binary model config");
    }
    validateModelBinaryConfig(binaryConfig);
  } else {
    if (binaryConfig.is_object()) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
                      "binary model config for incorrect model type: " + type);
    }
  }

  if (library) {
    if (!libraryConfig.is_object()) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing library model config");
    }
    validateModelLibraryConfig(libraryConfig);
  } else {
    if (libraryConfig.is_object()) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
                      "library model config for incorrect model type: " + type);
    }
  }

  if (positionalEncoding) {
    if (!positionalEncodingConfig.is_object()) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing Positional encoding config");
    }
    validatePositionalEncodingConfig(positionalEncodingConfig);
  } else {
    if (positionalEncodingConfig.is_object()) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
                      "Positional encoding config for incorrect model type: " + type);
    }
  }
}
794
+
795
+ //=============================================================================
796
+ // Engine::Config functions
797
+ //=============================================================================
798
+
799
// Validate one "engine" object of a dialog config.
// Mandatory: version (== 1), backend, model, n-threads. For dialog types
// "spd" (when the feature is compiled in) and "kv-share", "role" is also
// mandatory, with type-specific allowed values (draft/target resp.
// primary/secondary). Throws Exception on any violation.
static void validateEngineConfig(const qualla::json& config, std::string dialogType) {
  if (!config.is_object()) {
    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "engine config is not an object");
  }

  std::set<std::string> mandatoryFields{"version", "backend", "model", "n-threads"};
#if defined(GENIE_SPD_FEATURE)
  if (dialogType == "spd") {
    mandatoryFields.insert("role");
  }
#endif
  if (dialogType == "kv-share") {
    mandatoryFields.insert("role");
  }

  for (const auto& field : mandatoryFields) {
    if (!config.contains(field)) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing engine field: " + field);
    }
  }

  // component is used in the "ENFORCE" macros
  std::string component = "engine";

  for (auto& item : config.items()) {
    if (item.key() == "version") {
      JSON_ENFORCE_NUMERIC();
      // Only schema version 1 is understood.
      if (item.value().get<int>() != 1) {
        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
                        "Invalid engine config: unsupported version: " + item.value().dump());
      }
    } else if (item.key() == "backend") {
      JSON_ENFORCE_OBJECT();
      validateBackendConfig(item.value());
    } else if (item.key() == "model") {
      JSON_ENFORCE_OBJECT();
      validateModelConfig(item.value());
    } else if (item.key() == "n-threads") {
      JSON_ENFORCE_NUMERIC();
#if defined(GENIE_SPD_FEATURE)
    } else if (item.key() == "role" && dialogType == "spd") {
      // Speculative decoding engines are either the draft or the target.
      JSON_ENFORCE_STRING();
      if (item.value() != "draft" && item.value() != "target") {
        throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
                        "Unknown value: for engine config key: " + item.key());
      }
#endif
    } else if (item.key() == "role" && dialogType == "kv-share") {
      // KV-share engines are either primary or secondary.
      JSON_ENFORCE_STRING();
      if (item.value() != "primary" && item.value() != "secondary") {
        throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
                        "Unknown value: for engine config key: " + item.key());
      }
    } else {
      // NOTE(review): "role" on any other dialog type falls through to this
      // unknown-key rejection — confirm that is intended.
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown engine config key: " + item.key());
    }
  }
}
857
+
858
// Validate the "engine" entry of a dialog config, which may be a single
// engine object or (for "spd" when compiled in, and for "kv-share") an array
// of exactly two engines covering both required roles.
// Throws Exception on any violation.
static void validateMultiEngineConfig(const qualla::json& configs, std::string dialogType) {
  if (configs.is_object()) {
    // Single-engine form; invalid for the two-engine dialog types.
    validateEngineConfig(configs, dialogType);
#if defined(GENIE_SPD_FEATURE)
    if (dialogType == "spd") {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "engine config for spd is not an array");
    }
#endif
    if (dialogType == "kv-share") {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "engine config for kv-share is not an array");
    }
#if defined(GENIE_SPD_FEATURE)
  } else if (configs.is_array() && dialogType == "spd") {
    if (configs.size() != 2) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
                      "engine config for spd contain invalid number of engines");
    }
    // [0] = draft seen, [1] = target seen; both roles must be present.
    bool engineRoleMask[2] = {false, false};
    for (auto& item : configs) {
      validateEngineConfig(item, dialogType);
      if (item["role"] == "draft") {
        engineRoleMask[0] = true;
      } else if (item["role"] == "target") {
        engineRoleMask[1] = true;
      }
    }
    if (!engineRoleMask[0]) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
                      "engine config for spd does not contain draft engine");
    }
    if (!engineRoleMask[1]) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
                      "engine config for spd does not contain target engine");
    }
#endif
  } else if (configs.is_array() && dialogType == "kv-share") {
    if (configs.size() != 2) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
                      "engine config for kv-share contain invalid number of engines");
    }
    // [0] = primary seen, [1] = secondary seen; both roles must be present.
    bool engineRoleMask[2] = {false, false};
    for (auto& item : configs) {
      validateEngineConfig(item, dialogType);
      if (item["role"] == "primary") {
        engineRoleMask[0] = true;
      } else if (item["role"] == "secondary") {
        engineRoleMask[1] = true;
      }
    }
    if (!engineRoleMask[0]) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
                      "engine config for kv-share does not contain primary");
    }
    if (!engineRoleMask[1]) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
                      "engine config for kv-share does not contain secondary");
    }
  } else {
    // Neither a lone object nor a recognized array form.
    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "engine config is not an object or an array");
  }
}
919
+
920
+ static void translateEngineConfig(const qualla::json& genieEngineConfig,
921
+ qualla::json& quallaEngineConfig) {
922
+ if (genieEngineConfig["version"] == 1) {
923
+ if (genieEngineConfig.contains("role")) {
924
+ quallaEngineConfig["role"] = genieEngineConfig["role"];
925
+ } else {
926
+ quallaEngineConfig["role"] = "primary";
927
+ }
928
+
929
+ quallaEngineConfig["n-threads"] = genieEngineConfig["n-threads"];
930
+
931
+ if (genieEngineConfig["backend"]["type"] == "QnnHtp") {
932
+ quallaEngineConfig["type"] = "qnn-htp";
933
+ quallaEngineConfig["backend-lib"] = getLibName("QnnHtp");
934
+ quallaEngineConfig["mmap-budget"] = genieEngineConfig["backend"]["QnnHtp"]["mmap-budget"];
935
+ quallaEngineConfig["use-mmap"] = genieEngineConfig["backend"]["QnnHtp"]["use-mmap"];
936
+ quallaEngineConfig["spill-fill-bufsize"] =
937
+ genieEngineConfig["backend"]["QnnHtp"]["spill-fill-bufsize"];
938
+ if (genieEngineConfig["backend"]["QnnHtp"].contains("pos-id-dim")) {
939
+ quallaEngineConfig["pos-id-dim"] = genieEngineConfig["backend"]["QnnHtp"]["pos-id-dim"];
940
+ }
941
+ quallaEngineConfig["cpumask"] = genieEngineConfig["backend"]["QnnHtp"]["cpu-mask"];
942
+ quallaEngineConfig["poll"] = genieEngineConfig["backend"]["QnnHtp"]["poll"];
943
+ quallaEngineConfig["kv-dim"] = genieEngineConfig["backend"]["QnnHtp"]["kv-dim"];
944
+ if (genieEngineConfig["backend"]["QnnHtp"].contains("rope-theta")) {
945
+ quallaEngineConfig["rope-theta"] = genieEngineConfig["backend"]["QnnHtp"]["rope-theta"];
946
+ }
947
+ if (genieEngineConfig["backend"]["QnnHtp"].contains("kv-update-method")) {
948
+ quallaEngineConfig["kv-update-method"] =
949
+ genieEngineConfig["backend"]["QnnHtp"]["kv-update-method"];
950
+ }
951
+ // By default, Qualla will default to the async init path.
952
+ // For now, we are forcing async init off unless explicitly
953
+ // specified in the Genie config. It is HTP specific feature only.
954
+ quallaEngineConfig["use-async-Init"] = false;
955
+ if (genieEngineConfig["backend"]["QnnHtp"].contains("allow-async-init")) {
956
+ quallaEngineConfig["use-async-Init"] =
957
+ genieEngineConfig["backend"]["QnnHtp"]["allow-async-init"];
958
+ }
959
+ } else if (genieEngineConfig["backend"]["type"] == "QnnGenAiTransformer") {
960
+ quallaEngineConfig["type"] = "qnn-cpu";
961
+ quallaEngineConfig["backend-lib"] = getLibName("QnnGenAiTransformer");
962
+ if (genieEngineConfig["backend"]["QnnGenAiTransformer"].contains("n-logits")) {
963
+ quallaEngineConfig["n_logits"] =
964
+ genieEngineConfig["backend"]["QnnGenAiTransformer"]["n-logits"];
965
+ }
966
+ if (genieEngineConfig["backend"]["QnnGenAiTransformer"].contains("use-mmap")) {
967
+ quallaEngineConfig["use-mmap"] =
968
+ genieEngineConfig["backend"]["QnnGenAiTransformer"]["use-mmap"];
969
+ }
970
+ if (genieEngineConfig["backend"]["QnnGenAiTransformer"].contains("n-layer")) {
971
+ quallaEngineConfig["n_layer"] =
972
+ genieEngineConfig["backend"]["QnnGenAiTransformer"]["n-layer"];
973
+ }
974
+ if (genieEngineConfig["backend"]["QnnGenAiTransformer"].contains("n-embd")) {
975
+ quallaEngineConfig["n_embd"] =
976
+ genieEngineConfig["backend"]["QnnGenAiTransformer"]["n-embd"];
977
+ }
978
+ if (genieEngineConfig["backend"]["QnnGenAiTransformer"].contains("n-heads")) {
979
+ quallaEngineConfig["n_heads"] =
980
+ genieEngineConfig["backend"]["QnnGenAiTransformer"]["n-heads"];
981
+ }
982
+ }
983
+
984
+ if (genieEngineConfig["backend"].contains("extensions")) {
985
+ quallaEngineConfig["backend-ext-conf"] = genieEngineConfig["backend"]["extensions"];
986
+ }
987
+
988
+ if (genieEngineConfig["model"]["type"] == "binary") {
989
+ quallaEngineConfig["model-list"] = genieEngineConfig["model"]["binary"]["ctx-bins"];
990
+ if (genieEngineConfig["model"]["binary"].contains("lora")) {
991
+ quallaEngineConfig["lora-version"] =
992
+ static_cast<uint8_t>(LORA_VERSION::GENIE_LORA_VERSION_V2);
993
+ if (genieEngineConfig["model"]["binary"]["lora"].contains("lora-version") &&
994
+ genieEngineConfig["model"]["binary"]["lora"]["lora-version"] == 1) {
995
+ quallaEngineConfig["lora-version"] =
996
+ genieEngineConfig["model"]["binary"]["lora"]["lora-version"];
997
+ }
998
+ for (int i = 0; i < genieEngineConfig["model"]["binary"]["lora"]["adapters"].size(); i++) {
999
+ quallaEngineConfig["lora"][i]["adapter-name"] =
1000
+ genieEngineConfig["model"]["binary"]["lora"]["adapters"][i]["name"];
1001
+ quallaEngineConfig["lora"][i]["alpha-tensor-name"] = "";
1002
+ if (genieEngineConfig["model"]["binary"]["lora"].contains("alpha-tensor-name")) {
1003
+ quallaEngineConfig["lora"][i]["alpha-tensor-name"] =
1004
+ genieEngineConfig["model"]["binary"]["lora"]["alpha-tensor-name"];
1005
+ }
1006
+ quallaEngineConfig["lora"][i]["alpha-tensor-value"] = 1.0f;
1007
+ quallaEngineConfig["lora"][i]["binsection-basedir"] = "";
1008
+ if (genieEngineConfig["model"]["binary"]["lora"].contains("lora-version") &&
1009
+ genieEngineConfig["model"]["binary"]["lora"]["lora-version"] == 1) {
1010
+ quallaEngineConfig["lora"][i]["path"] =
1011
+ genieEngineConfig["model"]["binary"]["lora"]["adapters"][i]["path"];
1012
+ } else {
1013
+ quallaEngineConfig["lora"][i]["bin-sections"] =
1014
+ genieEngineConfig["model"]["binary"]["lora"]["adapters"][i]["bin-sections"];
1015
+ }
1016
+ }
1017
+ }
1018
+ } else if (genieEngineConfig["model"]["type"] == "library") {
1019
+ quallaEngineConfig["model"] = getLibName("QnnGenAiTransformerModel");
1020
+ quallaEngineConfig["model-bin-path"] = genieEngineConfig["model"]["library"]["model-bin"];
1021
+ quallaEngineConfig["op-package"] =
1022
+ getLibName("QnnGenAiTransformerCpuOpPkg") + ":QnnOpPackage_interfaceProvider";
1023
+ }
1024
+ if (genieEngineConfig["model"].contains("positional-encoding")) {
1025
+ quallaEngineConfig["positional-encoding"]["type"] =
1026
+ genieEngineConfig["model"]["positional-encoding"]["type"];
1027
+ if (genieEngineConfig["model"]["positional-encoding"]["type"] == "rope") {
1028
+ quallaEngineConfig["positional-encoding"]["rope-dim"] =
1029
+ genieEngineConfig["model"]["positional-encoding"]["rope-dim"];
1030
+ if (genieEngineConfig["model"]["positional-encoding"].contains("rope-theta")) {
1031
+ quallaEngineConfig["positional-encoding"]["rope-theta"] =
1032
+ genieEngineConfig["model"]["positional-encoding"]["rope-theta"];
1033
+ }
1034
+ if (genieEngineConfig["model"]["positional-encoding"].contains("rope-scaling")) {
1035
+ if (genieEngineConfig["model"]["positional-encoding"]["rope-scaling"].contains(
1036
+ "rope-type")) {
1037
+ quallaEngineConfig["positional-encoding"]["rope-scaling"]["rope-type"] =
1038
+ genieEngineConfig["model"]["positional-encoding"]["rope-scaling"]["rope-type"];
1039
+ if (genieEngineConfig["model"]["positional-encoding"]["rope-scaling"]["rope-type"] ==
1040
+ "llama3") {
1041
+ if (genieEngineConfig["model"]["positional-encoding"]["rope-scaling"].contains(
1042
+ "factor")) {
1043
+ quallaEngineConfig["positional-encoding"]["rope-scaling"]["factor"] =
1044
+ genieEngineConfig["model"]["positional-encoding"]["rope-scaling"]["factor"];
1045
+ }
1046
+ if (genieEngineConfig["model"]["positional-encoding"]["rope-scaling"].contains(
1047
+ "low-freq-factor")) {
1048
+ quallaEngineConfig["positional-encoding"]["rope-scaling"]["low-freq-factor"] =
1049
+ genieEngineConfig["model"]["positional-encoding"]["rope-scaling"]
1050
+ ["low-freq-factor"];
1051
+ }
1052
+ if (genieEngineConfig["model"]["positional-encoding"]["rope-scaling"].contains(
1053
+ "high-freq-factor")) {
1054
+ quallaEngineConfig["positional-encoding"]["rope-scaling"]["high-freq-factor"] =
1055
+ genieEngineConfig["model"]["positional-encoding"]["rope-scaling"]
1056
+ ["high-freq-factor"];
1057
+ }
1058
+ if (genieEngineConfig["model"]["positional-encoding"]["rope-scaling"].contains(
1059
+ "original-max-position-embeddings")) {
1060
+ quallaEngineConfig["positional-encoding"]["rope-scaling"]
1061
+ ["original-max-position-embeddings"] =
1062
+ genieEngineConfig["model"]["positional-encoding"]
1063
+ ["rope-scaling"]
1064
+ ["original-max-position-embeddings"];
1065
+ }
1066
+ }
1067
+ if (genieEngineConfig["model"]["positional-encoding"]["rope-scaling"]["rope-type"] ==
1068
+ "longrope") {
1069
+ if (genieEngineConfig["model"]["positional-encoding"]["rope-scaling"].contains(
1070
+ "factor")) {
1071
+ quallaEngineConfig["positional-encoding"]["rope-scaling"]["factor"] =
1072
+ genieEngineConfig["model"]["positional-encoding"]["rope-scaling"]["factor"];
1073
+ }
1074
+ if (genieEngineConfig["model"]["positional-encoding"]["rope-scaling"].contains(
1075
+ "short-factor")) {
1076
+ quallaEngineConfig["positional-encoding"]["rope-scaling"]["short-factor"] =
1077
+ genieEngineConfig["model"]["positional-encoding"]["rope-scaling"]
1078
+ ["short-factor"];
1079
+ }
1080
+ if (genieEngineConfig["model"]["positional-encoding"]["rope-scaling"].contains(
1081
+ "long-factor")) {
1082
+ quallaEngineConfig["positional-encoding"]["rope-scaling"]["long-factor"] =
1083
+ genieEngineConfig["model"]["positional-encoding"]["rope-scaling"]
1084
+ ["long-factor"];
1085
+ }
1086
+ if (genieEngineConfig["model"]["positional-encoding"]["rope-scaling"].contains(
1087
+ "original-max-position-embeddings")) {
1088
+ quallaEngineConfig["positional-encoding"]["rope-scaling"]
1089
+ ["original-max-position-embeddings"] =
1090
+ genieEngineConfig["model"]["positional-encoding"]
1091
+ ["rope-scaling"]
1092
+ ["original-max-position-embeddings"];
1093
+ }
1094
+ }
1095
+ }
1096
+ }
1097
+ }
1098
+ }
1099
+ }
1100
+ }
1101
+
1102
+ static void translateMultiEngineConfig(const qualla::json& genieConfig,
1103
+ qualla::json& quallaConfig) {
1104
+ if (genieConfig["dialog"]["engine"].is_array()) {
1105
+ quallaConfig["engine"] = qualla::json::array();
1106
+ for (auto& item : genieConfig["dialog"]["engine"]) {
1107
+ qualla::json quallaEngineConfig;
1108
+ translateEngineConfig(item, quallaEngineConfig);
1109
+ quallaConfig["engine"].push_back(quallaEngineConfig);
1110
+ }
1111
+ } else {
1112
+ translateEngineConfig(genieConfig["dialog"]["engine"], quallaConfig["engine"]);
1113
+ }
1114
+ }
1115
+
1116
+ //=============================================================================
1117
+ // Dialog::Config functions
1118
+ //=============================================================================
1119
+
1120
// Process-wide registry mapping opaque GenieDialogConfig_Handle_t values to
// shared Config instances used by the C API surface.
qnn::util::HandleManager<Dialog::Config> Dialog::Config::s_manager;

// Register a Config and return an opaque handle for it.
GenieDialogConfig_Handle_t Dialog::Config::add(std::shared_ptr<Dialog::Config> config) {
  return (GenieDialogConfig_Handle_t)s_manager.add(config);
}

// Look up a previously registered Config; behavior for an unknown handle is
// defined by HandleManager::get.
std::shared_ptr<Dialog::Config> Dialog::Config::get(GenieDialogConfig_Handle_t handle) {
  return s_manager.get((qnn::util::Handle_t)handle);
}

// Drop the registry's reference for this handle.
void Dialog::Config::remove(GenieDialogConfig_Handle_t handle) {
  s_manager.remove((qnn::util::Handle_t)handle);
}
1133
+
1134
#if defined(GENIE_SSD_FEATURE)
// Validate the "ssd-q1" dialog sub-config: enforce mandatory keys, per-key
// types/values, and cross-field constraints. Throws Exception with a
// GENIE_STATUS_ERROR_JSON_* code on the first violation found.
static void validateDialogSsdConfig(const qualla::json& config) {
  if (!config.is_object()) {
    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "ssd-q1 config is not an object");
  }

  std::set<std::string> mandatoryFields{"version",
                                        "ssd-version",
                                        "forecast-token-count",
                                        "branches",
                                        "forecast-prefix",
                                        "forecast-prefix-name"};
  for (const auto& field : mandatoryFields) {
    if (!config.contains(field)) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing ssd-q1 field: " + field);
    }
  }

  // component is used in the "ENFORCE" macros (they reference `item` and
  // `component` by name, so these identifiers must not be renamed).
  std::string component = "ssd-q1";

  // Captured during the key loop for the cross-field checks at the end.
  int branchesSize = 0;
  int forecastTokenCount = 0;

  int nStreams = 1;
  float pThreshold = 0.0;

  for (auto& item : config.items()) {
    if (item.key() == "version") {
      JSON_ENFORCE_NUMERIC();
      // Only config schema version 1 is supported.
      if (item.value().get<int>() != 1) {
        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
                        "Invalid ssd-q1 config: unsupported version: " + item.value().dump());
      }
    } else if (item.key() == "ssd-version") {
      JSON_ENFORCE_NUMERIC();
    } else if (item.key() == "forecast-token-count") {
      JSON_ENFORCE_NUMERIC();
      forecastTokenCount = item.value();
    } else if (item.key() == "branches") {
      JSON_ENFORCE_ARRAY();
      for (auto& elem : item.value()) {
        if (!elem.is_number_integer()) {
          throw Exception(GENIE_STATUS_ERROR_JSON_VALUE, "branches must be an array of integers");
        }
      }
      branchesSize = item.value().size();
    } else if (item.key() == "forecast-prefix") {
      JSON_ENFORCE_NUMERIC();
    } else if (item.key() == "forecast-prefix-name") {
      JSON_ENFORCE_STRING();
    } else if (item.key() == "n-streams") {
      JSON_ENFORCE_NUMERIC();
      nStreams = item.value();
    } else if (item.key() == "p-threshold") {
      JSON_ENFORCE_NUMERIC();
      pThreshold = item.value();
    } else {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown ssd-q1 config key: " + item.key());
    }
  }

  // p-threshold is only meaningful when multiple streams are configured.
  if ((pThreshold > 0.0) && (nStreams <= 1)) {
    throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
                    "p-threshold can only be used with multistream (n-streams > 1)");
  }

  // NOTE(review): the condition rejects only strictly-greater sizes (i.e. it
  // permits branchesSize == forecastTokenCount), while the message says
  // "less than" — confirm which of the two is intended.
  if (branchesSize > forecastTokenCount) {
    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
                    "Size of branches array must be less than forecast-token-count");
  }
}
#endif
1207
+
1208
#if defined(GENIE_LADE_FEATURE)
// Validate the "lade" (lookahead-decoding) dialog sub-config: mandatory keys,
// per-key types, and the allowed update-mode values. Throws Exception with a
// GENIE_STATUS_ERROR_JSON_* code on the first violation found.
static void validateDialogLadeConfig(const qualla::json& config) {
  if (!config.is_object()) {
    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "lade config is not an object");
  }

  std::set<std::string> mandatoryFields{"version", "update-mode", "window", "ngram", "gcap"};
  for (const auto& field : mandatoryFields) {
    if (!config.contains(field)) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing lade field: " + field);
    }
  }

  // component is used in the "ENFORCE" macros (they reference `item` and
  // `component` by name, so these identifiers must not be renamed).
  std::string component = "lade";

  for (auto& item : config.items()) {
    if (item.key() == "version") {
      JSON_ENFORCE_NUMERIC();
      // Only config schema version 1 is supported.
      if (item.value().get<int>() != 1) {
        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
                        "Invalid lade config: unsupported version: " + item.value().dump());
      }
    } else if (item.key() == "update-mode") {
      JSON_ENFORCE_STRING();
      // update-mode is restricted to the three supported strategies.
      std::string mode = item.value().get<std::string>();
      if ((mode != "FWD_MAX_HIT") && (mode != "FWD_LEVEL") && (mode != "ALWAYS_FWD_ONE")) {
        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
                        "Invalid lade config: unsupported update-mode: " + item.value().dump());
      }
    } else if (item.key() == "window") {
      JSON_ENFORCE_NUMERIC();
    } else if (item.key() == "ngram") {
      JSON_ENFORCE_NUMERIC();
    } else if (item.key() == "gcap") {
      JSON_ENFORCE_NUMERIC();
    } else {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown lade config key: " + item.key());
    }
  }
}
#endif
1250
+
1251
#if defined(GENIE_SPD_FEATURE)
// Validate the "spd" (speculative decoding) dialog sub-config: mandatory keys
// and per-key types. Throws Exception with a GENIE_STATUS_ERROR_JSON_* code
// on the first violation found.
static void validateDialogSpdConfig(const qualla::json& config) {
  if (!config.is_object()) {
    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "spd config is not an object");
  }

  std::set<std::string> mandatoryFields{"version", "draft-len"};
  for (const auto& field : mandatoryFields) {
    if (!config.contains(field)) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing spd field: " + field);
    }
  }

  // component is used in the "ENFORCE" macros (they reference `item` and
  // `component` by name, so these identifiers must not be renamed).
  std::string component = "spd";
  for (auto& item : config.items()) {
    if (item.key() == "version") {
      JSON_ENFORCE_NUMERIC();
      // Only config schema version 1 is supported.
      if (item.value().get<int>() != 1) {
        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
                        "Invalid spd config: unsupported version: " + item.value().dump());
      }
    } else if (item.key() == "draft-len") {
      JSON_ENFORCE_NUMERIC();
    } else {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown spd config key: " + item.key());
    }
  }
}
#endif
1281
+
1282
#if defined(GENIE_MULTISTREAM_FEATURE)
// Validate the "multistream" dialog sub-config: mandatory keys and per-key
// types. Throws Exception with a GENIE_STATUS_ERROR_JSON_* code on the first
// violation found.
static void validateDialogMultistreamConfig(const qualla::json& config) {
  if (!config.is_object()) {
    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "multistream config is not an object");
  }

  std::set<std::string> mandatoryFields{"version", "n-streams"};
  for (const auto& field : mandatoryFields) {
    if (!config.contains(field)) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing multistream field: " + field);
    }
  }

  // component is used in the "ENFORCE" macros (they reference `item` and
  // `component` by name, so these identifiers must not be renamed).
  std::string component = "multistream";

  for (auto& item : config.items()) {
    if (item.key() == "version") {
      JSON_ENFORCE_NUMERIC();
      // Only config schema version 1 is supported.
      if (item.value().get<int>() != 1) {
        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
                        "Invalid multistream config: unsupported version: " + item.value().dump());
      }
    } else if (item.key() == "n-streams") {
      JSON_ENFORCE_NUMERIC();
    } else if (item.key() == "p-threshold") {
      JSON_ENFORCE_NUMERIC();
    } else {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
                      "Unknown multistream config key: " + item.key());
    }
  }
}
#endif
1316
+
1317
// Validate the top-level "dialog" config object: mandatory keys, per-key
// types, the dialog "type" value, and — when a speculative/lookahead feature
// is compiled in — that the matching sub-config is present exactly when the
// corresponding dialog type is selected. Throws Exception with a
// GENIE_STATUS_ERROR_JSON_* code on the first violation found.
static void validateDialogConfig(const qualla::json& config) {
  if (!config.is_object()) {
    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Dialog config is not an object");
  }

  std::set<std::string> mandatoryFields{"version", "type", "context", "tokenizer", "engine"};
  for (const auto& field : mandatoryFields) {
    if (!config.contains(field)) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing dialog field: " + field);
    }
  }

  // component is used in the "ENFORCE" macros (they reference `item` and
  // `component` by name, so these identifiers must not be renamed).
  std::string component = "dialog";

  std::string dialogType = "basic";
#if defined(GENIE_SSD_FEATURE)
  // Set when type == "ssd-q1"; the captured sub-config is validated after the loop.
  bool ssdq1 = false;
  qualla::json ssdq1Config;
#endif
#if defined(GENIE_LADE_FEATURE)
  bool lade = false;
  qualla::json ladeConfig;
#endif
#if defined(GENIE_SPD_FEATURE)
  bool spd = false;
  qualla::json spdConfig;
#endif
#if defined(GENIE_MULTISTREAM_FEATURE)
  bool multistream = false;
  qualla::json multistreamConfig;
#endif

  for (auto& item : config.items()) {
    if (item.key() == "version") {
      JSON_ENFORCE_NUMERIC();
      // Only config schema version 1 is supported.
      if (item.value().get<int>() != 1) {
        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
                        "Invalid dialog config: unsupported version: " + item.value().dump());
      }
    } else if (item.key() == "type") {
      JSON_ENFORCE_STRING();
      dialogType = item.value();
      if (dialogType == "basic" || dialogType == "kv-share") {
        // Do nothing
#if defined(GENIE_SSD_FEATURE)
      } else if (dialogType == "ssd-q1") {
        ssdq1 = true;
#endif
#if defined(GENIE_LADE_FEATURE)
      } else if (dialogType == "lade") {
        lade = true;
#endif
#if defined(GENIE_SPD_FEATURE)
      } else if (dialogType == "spd") {
        spd = true;
#endif
#if defined(GENIE_MULTISTREAM_FEATURE)
      } else if (dialogType == "multistream") {
        multistream = true;
#endif
      } else {
        // Also reached for feature-gated types when that feature is compiled out.
        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE, "Invalid dialog type: " + dialogType);
      }
#if defined(GENIE_SSD_FEATURE)
    } else if (item.key() == "ssd-q1") {
      JSON_ENFORCE_OBJECT();
      ssdq1Config = item.value();
      // ssd-q1 validation is done below
#endif
#if defined(GENIE_LADE_FEATURE)
    } else if (item.key() == "lade") {
      JSON_ENFORCE_OBJECT();
      ladeConfig = item.value();
      // lade validation is done below
#endif
#if defined(GENIE_SPD_FEATURE)
    } else if (item.key() == "spd") {
      JSON_ENFORCE_OBJECT();
      spdConfig = item.value();
      // spd validation is done below
#endif
#if defined(GENIE_MULTISTREAM_FEATURE)
    } else if (item.key() == "multistream") {
      JSON_ENFORCE_OBJECT();
      multistreamConfig = item.value();
      // multistream validation is done below
#endif
    } else if (item.key() == "stop-sequence") {
      JSON_ENFORCE_ARRAY();
      for (auto& elem : item.value()) {
        if (!elem.is_string()) {
          throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
                          "stop-sequence must be an array of strings");
        }
      }
    } else if (item.key() == "max-num-tokens") {
      JSON_ENFORCE_NUMERIC();
      // NOTE(review): the check rejects only negative values (0 passes) while
      // the message says "must be > 0" — confirm whether 0 is a valid limit.
      if (item.value().get<int>() < 0) {
        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
                        "number of tokens must be > 0. provided: " + item.value().dump());
      }
    } else if (item.key() == "context") {
      JSON_ENFORCE_OBJECT();
      validateContextConfig(item.value());
    } else if (item.key() == "tokenizer") {
      JSON_ENFORCE_OBJECT();
      validateTokenizerConfig(item.value());
    } else if (item.key() == "sampler") {
      JSON_ENFORCE_OBJECT();
      validateSamplerConfig(item.value());
    } else if (item.key() == "engine") {
      JSON_ENFORCE_ARRAY_OR_OBJECT();
    } else if (item.key() == "embedding") {
      JSON_ENFORCE_OBJECT();
      validateEmbeddingConfig(item.value());
    } else {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown dialog config key: " + item.key());
    }
  }

  // Engine verification requires dialogType for engine roles. Since "type" may
  // be encountered later than "engine" in the loop, engine validation is
  // performed out of the loop, after dialogType is known.
  validateMultiEngineConfig(config["engine"], dialogType);

#if defined(GENIE_SSD_FEATURE)
  // A feature sub-config must be present iff the matching dialog type was selected.
  if (ssdq1) {
    if (!ssdq1Config.is_object()) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing ssd-q1 dialog config");
    }
    validateDialogSsdConfig(ssdq1Config);
  } else {
    if (ssdq1Config.is_object()) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
                      "ssd-q1 dialog config for incorrect dialog type: " + dialogType);
    }
  }
#endif
#if defined(GENIE_LADE_FEATURE)
  if (lade) {
    if (!ladeConfig.is_object()) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing lade dialog config");
    }
    validateDialogLadeConfig(ladeConfig);
  } else {
    if (ladeConfig.is_object()) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
                      "lade dialog config for incorrect dialog type: " + dialogType);
    }
  }
#endif
#if defined(GENIE_SPD_FEATURE)
  if (spd) {
    if (!spdConfig.is_object()) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing spd dialog config");
    }
    validateDialogSpdConfig(spdConfig);
  } else {
    if (spdConfig.is_object()) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
                      "spd dialog config for incorrect dialog type: " + dialogType);
    }
  }
#endif
#if defined(GENIE_MULTISTREAM_FEATURE)
  if (multistream) {
    if (!multistreamConfig.is_object()) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing multistream dialog config");
    }
    validateDialogMultistreamConfig(multistreamConfig);
  } else {
    if (multistreamConfig.is_object()) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
                      "multistream dialog config for incorrect dialog type: " + dialogType);
    }
  }
#endif
}
1495
+
1496
// Translate a validated Genie dialog config into the qualla dialog config
// schema: map the dialog type (lade -> lhd-dec, spd -> spec-dec), copy the
// feature-specific keys, then delegate the context/tokenizer/sampler/engine/
// embedding sections to their dedicated translators.
static void translateDialogConfig(const qualla::json& genieConfig, qualla::json& quallaConfig) {
  if (genieConfig["dialog"]["version"] == 1) {
    // Genie type names differ from qualla's for lade and spd.
    if (genieConfig["dialog"]["type"] == "lade") {
      quallaConfig["type"] = "lhd-dec";
    } else if (genieConfig["dialog"]["type"] == "spd") {
      quallaConfig["type"] = "spec-dec";
    } else if (genieConfig["dialog"]["type"] == "multistream") {
      quallaConfig["type"] = "multistream";
    } else {
      quallaConfig["type"] = genieConfig["dialog"]["type"];
    }
#if defined(GENIE_SSD_FEATURE)
    if (genieConfig["dialog"]["type"] == "ssd-q1") {
      quallaConfig["ssd-version"] = genieConfig["dialog"]["ssd-q1"]["ssd-version"];
      quallaConfig["forecast-token-count"] =
          genieConfig["dialog"]["ssd-q1"]["forecast-token-count"];
      quallaConfig["branches"] = genieConfig["dialog"]["ssd-q1"]["branches"];
      quallaConfig["forecast-prefix"] = genieConfig["dialog"]["ssd-q1"]["forecast-prefix"];
      quallaConfig["forecast-prefix-name"] =
          genieConfig["dialog"]["ssd-q1"]["forecast-prefix-name"];

      // Optional multistream tuning keys within ssd-q1.
      if (genieConfig["dialog"]["ssd-q1"].contains("n-streams")) {
        quallaConfig["n-streams"] = genieConfig["dialog"]["ssd-q1"]["n-streams"];
      }
      if (genieConfig["dialog"]["ssd-q1"].contains("p-threshold")) {
        quallaConfig["p-threshold"] = genieConfig["dialog"]["ssd-q1"]["p-threshold"];
      }
    }
#endif
#if defined(GENIE_LADE_FEATURE)
    if (genieConfig["dialog"]["type"] == "lade") {
      quallaConfig["lhd-update-mode"] = genieConfig["dialog"]["lade"]["update-mode"];
      quallaConfig["window"] = genieConfig["dialog"]["lade"]["window"];
      quallaConfig["ngram"] = genieConfig["dialog"]["lade"]["ngram"];
      quallaConfig["gcap"] = genieConfig["dialog"]["lade"]["gcap"];
    }
#endif
#if defined(GENIE_SPD_FEATURE)
    if (genieConfig["dialog"]["type"] == "spd") {
      quallaConfig["draft-len"] = genieConfig["dialog"]["spd"]["draft-len"];
    }
#endif
#if defined(GENIE_MULTISTREAM_FEATURE)
    if (genieConfig["dialog"]["type"] == "multistream") {
      quallaConfig["n-streams"] = genieConfig["dialog"]["multistream"]["n-streams"];
      if (genieConfig["dialog"]["multistream"].contains("p-threshold")) {
        quallaConfig["p-threshold"] = genieConfig["dialog"]["multistream"]["p-threshold"];
      }
    }
#endif
  }
  // stop-sequence lives under "prompt" in the qualla schema.
  if (genieConfig["dialog"].contains("stop-sequence")) {
    quallaConfig["prompt"]["stop-sequence"] = genieConfig["dialog"]["stop-sequence"];
  }

  translateContextConfig(genieConfig, quallaConfig);
  translateTokenizerConfig(genieConfig, quallaConfig);
  translateSamplerConfig(genieConfig, quallaConfig);
  translateMultiEngineConfig(genieConfig, quallaConfig);
  translateEmbeddingConfig(genieConfig, quallaConfig);
}
1557
+
1558
+ uint32_t getMaxNumTokens(const qualla::json& genieConfig) {
1559
+ uint32_t tokenLimit{UINT32_MAX};
1560
+ if (genieConfig["dialog"]["version"] == 1) {
1561
+ if (genieConfig["dialog"].contains("max-num-tokens")) {
1562
+ tokenLimit = genieConfig["dialog"]["max-num-tokens"];
1563
+ }
1564
+ }
1565
+ return tokenLimit;
1566
+ }
1567
+
1568
// Parse and validate a dialog configuration JSON string.
// A parse callback rejects duplicate depth-1 keys (which the parser would
// otherwise silently collapse). The only permitted — and required — top-level
// key is "dialog". Throws Exception with a GENIE_STATUS_ERROR_JSON_* code on
// any schema or value violation.
Dialog::Config::Config(const char* configStr) {
  qualla::json config;
  rope_theta_set = false;
  position_dim_set = false;
  {
    // Keys seen at depth 1 so far, used to detect duplicates during parsing.
    std::set<qualla::json> keys;

    auto callback = [&keys](int depth, qualla::json::parse_event_t event, qualla::json& parsed) {
      if ((depth == 1) && (event == qualla::json::parse_event_t::key)) {
        if (keys.count(parsed) > 0) {
          throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
                          "Multiple dialog config key: " + parsed.dump());
        }
        keys.insert(parsed);
      }
      return true;  // keep every parsed element
    };

    config = qualla::json::parse(configStr, callback);
  }

  if (!config.is_object()) {
    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Dialog config is not an object");
  }

  std::set<std::string> mandatoryFields{"dialog"};
  for (const auto& field : mandatoryFields) {
    if (!config.contains(field)) {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing dialog field: " + field);
    }
  }

  // component is used in the "ENFORCE" macros (they reference `item` and
  // `component` by name, so these identifiers must not be renamed).
  std::string component = "dialog";

  for (auto& item : config.items()) {
    if (item.key() == "dialog") {
      JSON_ENFORCE_OBJECT();
      validateDialogConfig(item.value());
    } else {
      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown dialog config key: " + item.key());
    }
  }
  // Only store the config once it has fully validated.
  m_config = config;
}
1613
+
1614
// Return a copy of the validated configuration JSON.
qualla::json Dialog::Config::getJson() const { return m_config; }
1615
+
1616
+ //=============================================================================
1617
+ // Dialog functions
1618
+ //=============================================================================
1619
+
1620
// Process-wide registry mapping opaque GenieDialog_Handle_t values to shared
// Dialog instances used by the C API surface.
qnn::util::HandleManager<Dialog> Dialog::s_manager;
// Monotonic counter used to give each created qualla dialog a unique name.
std::atomic<std::uint32_t> Dialog::s_nameCounter{0u};

// Register a Dialog and return an opaque handle for it.
GenieDialog_Handle_t Dialog::add(std::shared_ptr<Dialog> dialog) {
  return (GenieDialog_Handle_t)s_manager.add(dialog);
}

// Look up a previously registered Dialog; behavior for an unknown handle is
// defined by HandleManager::get.
std::shared_ptr<Dialog> Dialog::get(GenieDialog_Handle_t handle) {
  return s_manager.get((qnn::util::Handle_t)handle);
}

// Drop the registry's reference for this handle.
void Dialog::remove(GenieDialog_Handle_t handle) { s_manager.remove((qnn::util::Handle_t)handle); }
1632
+
1633
// Construct a Dialog from a validated Config: translate the Genie JSON into
// the qualla schema, record the optional max-num-tokens limit, and create the
// underlying qualla::Dialog with a process-unique name ("dialog0", "dialog1", ...).
// Throws Exception on creation failure.
Dialog::Dialog(std::shared_ptr<Config> config) {
  auto env = qualla::Env::create(qualla::json{});
  qualla::json quallaConfig;
  translateDialogConfig(config->getJson(), quallaConfig);
  m_tokenLimit = getMaxNumTokens(config->getJson());
  // fetch_add keeps the name unique even with concurrent constructions.
  m_quallaDialog = qualla::Dialog::create(
      env, "dialog" + std::to_string(s_nameCounter.fetch_add(1u)), quallaConfig);
  if (!m_quallaDialog) {
    throw Exception(GENIE_STATUS_ERROR_MEM_ALLOC, "Could not create a dialog object");
  }
}
1644
+
1645
// Compile-time guarantee that the public GenieDialog_SentenceCode_t constants
// and the internal qualla::Sentence::Code enumerators share numeric values,
// so the static_casts between the two in the query paths are value-preserving.
static_assert(qualla::Sentence::Code::COMPLETE ==
              static_cast<qualla::Sentence::Code>(GENIE_DIALOG_SENTENCE_COMPLETE));
static_assert(qualla::Sentence::Code::BEGIN ==
              static_cast<qualla::Sentence::Code>(GENIE_DIALOG_SENTENCE_BEGIN));
static_assert(qualla::Sentence::Code::CONTINUE ==
              static_cast<qualla::Sentence::Code>(GENIE_DIALOG_SENTENCE_CONTINUE));
static_assert(qualla::Sentence::Code::END ==
              static_cast<qualla::Sentence::Code>(GENIE_DIALOG_SENTENCE_END));
static_assert(qualla::Sentence::Code::ABORT ==
              static_cast<qualla::Sentence::Code>(GENIE_DIALOG_SENTENCE_ABORT));
1655
+
1656
// Run a text query against the underlying qualla dialog, streaming each
// response fragment to the user callback. Generation stops after m_tokenLimit
// callback invocations; if the limit is hit while the sentence is still open
// (BEGIN/CONTINUE), a final synthetic SENTENCE_END callback is emitted so the
// caller always observes a terminated sentence.
int32_t Dialog::query(const char* queryStr,
                      GenieDialog_SentenceCode_t sentenceCode,
                      GenieDialog_QueryCallback_t callback,
                      const void* userData) {
  std::string query(queryStr);
  uint32_t genTokenCount = 0u;
  bool status = m_quallaDialog->query(
      query,
      static_cast<qualla::Sentence::Code>(sentenceCode),
      [&](const std::string& response, qualla::Sentence::Code code) {
        callback(response.c_str(), static_cast<GenieDialog_SentenceCode_t>(code), userData);
        // Returning false tells qualla to stop generating.
        bool keepGoing = ++genTokenCount < m_tokenLimit;
        if (!keepGoing && ((code == qualla::Sentence::Code::BEGIN) ||
                           (code == qualla::Sentence::Code::CONTINUE))) {
          callback("", GENIE_DIALOG_SENTENCE_END, userData);
        }
        return keepGoing;
      });
  // NOTE(review): KPI reporting via printf from a library entry point is
  // presumably for development/bring-up — consider routing through a logger.
  qualla::Dialog::KPIs kpis = m_quallaDialog->kpis();
  printf(
      "\n\n[KPIS]:\nInit Time: %zu us\nPrompt Processing Time: %zu us, Prompt Processing Rate : "
      "%f toks/sec\n"
      "Token Generation Time: %zu us, Token Generation Rate: %f toks/sec\n",
      kpis.init.total_usec,
      kpis.prompt.last_usec,
      kpis.tps.prompt,
      kpis.generate.last_usec,
      kpis.tps.generate);
  return (status) ? (GENIE_STATUS_SUCCESS) : (GENIE_STATUS_ERROR_QUERY_FAILED);
}
1686
+
1687
+ int32_t Dialog::save(const std::string& name) {
1688
+ return m_quallaDialog->save(name) ? (GENIE_STATUS_SUCCESS) : (GENIE_STATUS_ERROR_QUERY_FAILED);
1689
+ }
1690
+
1691
+ int32_t Dialog::restore(const std::string& name) {
1692
+ return m_quallaDialog->restore(name) ? (GENIE_STATUS_SUCCESS) : (GENIE_STATUS_ERROR_QUERY_FAILED);
1693
+ }
1694
+
1695
#if defined(GENIE_E2T_FEATURE)
// Run an embedding-to-text query: the caller supplies a buffer of one or more
// embedding vectors (its size must be a whole multiple of the dialog's
// embedding vector size in bytes). Streamed response fragments go to
// `callback`; when provided, `t2eCallback` lets the caller map generated
// tokens back to embeddings. The same m_tokenLimit cutoff and synthetic
// SENTENCE_END behavior as Dialog::query() apply.
int32_t Dialog::embeddingQuery(const void* embeddings,
                               const uint32_t embeddingsSize,
                               GenieDialog_SentenceCode_t sentenceCode,
                               GenieDialog_TokenToEmbeddingCallback_t t2eCallback,
                               GenieDialog_QueryCallback_t callback,
                               const void* userData) {
  uint32_t genTokenCount = 0u;

  if (embeddingsSize % m_quallaDialog->getEmbeddingBufferSize() != 0) {
    throw std::runtime_error(
        "The embeddings buffer size must be an integer multiple of the embedding vector size in "
        "bytes.");
  }

  // Copy the caller's raw buffer; qualla consumes a byte vector.
  const uint8_t* embeddingsSrc = static_cast<const uint8_t*>(embeddings);
  std::vector<uint8_t> embeddingVector(embeddingsSrc, embeddingsSrc + embeddingsSize);

  // Only wrap the token-to-embedding callback when the caller supplied one.
  qualla::Dialog::T2ECallback t2eQuallaCallback{nullptr};
  if (t2eCallback) {
    t2eQuallaCallback = [&](const int32_t token, void* embedding, const uint32_t embd_size) {
      t2eCallback(token, embedding, embd_size, userData);
    };
  }

  bool status = m_quallaDialog->query(
      embeddingVector,
      static_cast<qualla::Sentence::Code>(sentenceCode),
      t2eQuallaCallback,
      [&](const std::string& response, qualla::Sentence::Code code) {
        callback(response.c_str(), static_cast<GenieDialog_SentenceCode_t>(code), userData);
        // Returning false tells qualla to stop generating.
        bool keepGoing = ++genTokenCount < m_tokenLimit;
        if (!keepGoing && ((code == qualla::Sentence::Code::BEGIN) ||
                           (code == qualla::Sentence::Code::CONTINUE))) {
          callback("", GENIE_DIALOG_SENTENCE_END, userData);
        }
        return keepGoing;
      });
  // NOTE(review): KPI printf duplicated across the query entry points —
  // candidate for a shared helper.
  qualla::Dialog::KPIs kpis = m_quallaDialog->kpis();
  printf(
      "\n\n[KPIS]:\nInit Time: %zu us\nPrompt Processing Time: %zu us, Prompt Processing Rate : "
      "%f toks/sec\n"
      "Token Generation Time: %zu us, Token Generation Rate: %f toks/sec\n",
      kpis.init.total_usec,
      kpis.prompt.last_usec,
      kpis.tps.prompt,
      kpis.generate.last_usec,
      kpis.tps.generate);
  return (status) ? (GENIE_STATUS_SUCCESS) : (GENIE_STATUS_ERROR_QUERY_FAILED);
}
#endif
1746
+
1747
// Reset the underlying qualla dialog state.
void Dialog::reset() { m_quallaDialog->reset(); }
1748
+
1749
+ #if defined(GENIE_LORA_FEATURE)
1750
+
1751
+ int32_t Dialog::applyLora(std::string loraAdapterName, std::string engine) {
1752
+ bool status = m_quallaDialog->applyLoraAdapter(loraAdapterName, engine);
1753
+ return (status) ? (GENIE_STATUS_SUCCESS) : (GENIE_STATUS_ERROR_GENERAL);
1754
+ }
1755
+
1756
+ int32_t Dialog::applyLoraStrength(std::string tensorName, std::string engine, float alpha) {
1757
+ bool status = m_quallaDialog->applyLoraStrength(tensorName, alpha, engine);
1758
+ return (status) ? (GENIE_STATUS_SUCCESS) : (GENIE_STATUS_ERROR_GENERAL);
1759
+ }
1760
+
1761
+ #endif
1762
+
1763
// Run a token-level query: the caller supplies raw input token IDs and
// receives generated token IDs through `callback`. The response lambda is
// installed on the member dialogCallback (token-callback mode) before
// invoking qualla. The same m_tokenLimit cutoff and synthetic SENTENCE_END
// behavior as Dialog::query() apply.
int32_t Dialog::tokenQuery(const uint32_t* tokens,
                           const uint32_t sizeInputTokens,
                           GenieDialog_SentenceCode_t sentenceCode,
                           GenieDialog_TokenQueryCallback_t callback,
                           const void* userData) {
  std::vector<uint32_t> inputTokens;
  for (size_t i = 0; i < sizeInputTokens; i++) {
    inputTokens.push_back(tokens[i]);
  }
  uint32_t genTokenCount = 0u;
  dialogCallback.setCallBackType(qualla::QUALLA_CALLBACK_TYPE_TOKEN);
  dialogCallback.getTokenCbFunc() = std::make_shared<
      std::function<bool(const int32_t*, const uint32_t, qualla::Sentence::Code)>>();
  // NOTE(review): this lambda captures locals (genTokenCount, callback,
  // userData) by reference yet is stored in the member dialogCallback — safe
  // only if qualla consumes it synchronously within the query() call below
  // and never retains it afterwards; confirm.
  *(dialogCallback.getTokenCbFunc()) = [&](const int32_t* responseTokens,
                                           const uint32_t sizeResponseTokens,
                                           qualla::Sentence::Code code) {
    callback((const uint32_t*)responseTokens,
             sizeResponseTokens,
             static_cast<GenieDialog_SentenceCode_t>(code),
             userData);
    // Returning false tells qualla to stop generating.
    bool keepGoing = ++genTokenCount < m_tokenLimit;
    if (!keepGoing &&
        ((code == qualla::Sentence::Code::BEGIN) || (code == qualla::Sentence::Code::CONTINUE))) {
      callback(nullptr, 0, GENIE_DIALOG_SENTENCE_END, userData);
    }
    return keepGoing;
  };
  bool status = m_quallaDialog->query((const std::vector<uint32_t>)inputTokens,
                                      static_cast<qualla::Sentence::Code>(sentenceCode),
                                      dialogCallback);
  qualla::Dialog::KPIs kpis = m_quallaDialog->kpis();
  printf(
      "\n\n[KPIS]:\nInit Time: %zu us\nPrompt Processing Time: %zu us, Prompt Processing Rate : "
      "%f toks/sec\n"
      "Token Generation Time: %zu us, Token Generation Rate: %f toks/sec\n",
      kpis.init.total_usec,
      kpis.prompt.last_usec,
      kpis.tps.prompt,
      kpis.generate.last_usec,
      kpis.tps.generate);
  return (status) ? (GENIE_STATUS_SUCCESS) : (GENIE_STATUS_ERROR_QUERY_FAILED);
}
Genie/Genie/src/Dialog.hpp ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
//==============================================================================
//
// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
// All Rights Reserved.
// Confidential and Proprietary - Qualcomm Technologies, Inc.
//
//==============================================================================

#pragma once

#include <atomic>
#include <memory>

#include "GenieDialog.h"
#include "Util/HandleManager.hpp"
#include "qualla/dialog.hpp"
#include "qualla/DialogCallback.hpp"

namespace genie {

// Version tag for LoRA adapter handling. 0xFF marks "not configured".
enum LORA_VERSION : uint8_t {
  GENIE_LORA_VERSION_V1        = 0x1,
  GENIE_LORA_VERSION_V2        = 0x2,
  GENIE_LORA_VERSION_UNDEFINED = 0xFF
};

// C++ implementation behind the GenieDialog C API. Instances are registered
// in a process-wide HandleManager and addressed by opaque handles.
class Dialog {
 public:
  // Parsed JSON configuration for a Dialog, also handle-managed so the C API
  // can create/free configs independently of dialogs.
  class Config {
   public:
    // Registers `config` and returns an opaque handle for the C API.
    static GenieDialogConfig_Handle_t add(std::shared_ptr<Config> config);
    // Looks up a previously added config; null shared_ptr if unknown.
    static std::shared_ptr<Config> get(GenieDialogConfig_Handle_t handle);
    // Drops the registry's reference for `handle`.
    static void remove(GenieDialogConfig_Handle_t handle);

    // Parses `configStr` as JSON (may throw on malformed input).
    Config(const char* configStr);
    qualla::json getJson() const;

   private:
    static qnn::util::HandleManager<Config> s_manager;
    qualla::json m_config;  // parsed configuration document
  };

  // Handle registry for Dialog instances (mirrors Config's add/get/remove).
  static GenieDialog_Handle_t add(std::shared_ptr<Dialog> dialog);
  static std::shared_ptr<Dialog> get(GenieDialog_Handle_t handle);
  static void remove(GenieDialog_Handle_t handle);

  // Callback plumbing shared with the qualla layer; configured per query.
  qualla::DialogCallback dialogCallback;

  Dialog(std::shared_ptr<Config> config);

  // Non-copyable, non-movable: owns engine/handle state.
  Dialog(const Dialog&) = delete;
  Dialog& operator=(const Dialog&) = delete;
  Dialog(Dialog&&) = delete;
  Dialog& operator=(Dialog&&) = delete;

  // Text query; streams responses through `callback`.
  int32_t query(const char* queryStr,
                GenieDialog_SentenceCode_t sentenceCode,
                GenieDialog_QueryCallback_t callback,
                const void* userData);

  // Persist dialog state to the given path.
  int32_t save(const std::string&);

  // Restore dialog state from the given path.
  int32_t restore(const std::string&);

#if defined(GENIE_E2T_FEATURE)
  // Embedding-input query (embedding-to-text feature builds only).
  int32_t embeddingQuery(const void* embeddings,
                         const uint32_t embeddingsSize,
                         GenieDialog_SentenceCode_t sentenceCode,
                         GenieDialog_TokenToEmbeddingCallback_t t2eCallback,
                         GenieDialog_QueryCallback_t callback,
                         const void* userData);
#endif

  // Token-input query; streams generated tokens through `callback`.
  int32_t tokenQuery(const uint32_t* tokens,
                     const uint32_t sizeInputTokens,
                     GenieDialog_SentenceCode_t sentenceCode,
                     GenieDialog_TokenQueryCallback_t callback,
                     const void* userData);

  // Clears conversation/KV state so the dialog can be reused.
  void reset();

#if defined(GENIE_LORA_FEATURE)
  int32_t applyLora(std::string loraAdapterName, std::string engine);
  int32_t applyLoraStrength(std::string tensorName, std::string engine, float alpha);
#endif

 private:
  std::unique_ptr<qualla::Dialog> m_quallaDialog;  // backing qualla dialog
  uint32_t m_tokenLimit{UINT32_MAX};               // max generated tokens per query
  static qnn::util::HandleManager<Dialog> s_manager;
  static std::atomic<std::uint32_t> s_nameCounter;  // unique-name generation
};
}  // namespace genie
Genie/Genie/src/Exception.hpp ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #pragma once
10
+
11
+ #include <exception>
12
+ #include <string>
13
+
14
+ #include "GenieCommon.h"
15
+
16
+ namespace genie {
17
+
18
+ class Exception : public std::runtime_error {
19
+ public:
20
+ Exception(Genie_Status_t status, std::string what) : std::runtime_error(what), m_status(status) {}
21
+ Genie_Status_t status() const { return m_status; }
22
+
23
+ private:
24
+ Genie_Status_t m_status;
25
+ };
26
+
27
+ } // namespace genie
Genie/Genie/src/GenieCommon.cpp ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
//=============================================================================
//
// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
// All Rights Reserved.
// Confidential and Proprietary - Qualcomm Technologies, Inc.
//
//=============================================================================

#include "GenieCommon.h"

// C API version accessors. Each simply exposes the compile-time version
// macro from GenieCommon.h so callers can check the library they linked.

// Major API version (incompatible API changes).
uint32_t Genie_getApiMajorVersion(void) { return GENIE_API_VERSION_MAJOR; }

// Minor API version (backward-compatible additions).
uint32_t Genie_getApiMinorVersion(void) { return GENIE_API_VERSION_MINOR; }

// Patch API version (backward-compatible fixes).
uint32_t Genie_getApiPatchVersion(void) { return GENIE_API_VERSION_PATCH; }
Genie/Genie/src/GenieDialog.cpp ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //=============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //=============================================================================
8
+
9
+ #include "Dialog.hpp"
10
+ #include "Exception.hpp"
11
+ #include "GenieDialog.h"
12
+ #include "Macro.hpp"
13
+ #include "Util/HandleManager.hpp"
14
+ #include "qualla/detail/json.hpp"
15
+
16
+ using namespace genie;
17
+
18
+ GENIE_API
19
+ Genie_Status_t GenieDialogConfig_createFromJson(const char* str,
20
+ GenieDialogConfig_Handle_t* configHandle) {
21
+ try {
22
+ GENIE_ENSURE(str, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
23
+ GENIE_ENSURE(configHandle, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
24
+ auto config = std::make_shared<Dialog::Config>(str);
25
+ GENIE_ENSURE(config, GENIE_STATUS_ERROR_MEM_ALLOC);
26
+ *configHandle = genie::Dialog::Config::add(config);
27
+ } catch (const qualla::json::parse_error& e) {
28
+ std::cerr << e.what() << std::endl;
29
+ return GENIE_STATUS_ERROR_JSON_FORMAT;
30
+ } catch (const Exception& e) {
31
+ std::cerr << e.what() << std::endl;
32
+ return e.status();
33
+ } catch (const std::exception& e) {
34
+ std::cerr << e.what() << std::endl;
35
+ return GENIE_STATUS_ERROR_GENERAL;
36
+ }
37
+ return GENIE_STATUS_SUCCESS;
38
+ }
39
+
40
+ GENIE_API
41
+ Genie_Status_t GenieDialogConfig_free(const GenieDialogConfig_Handle_t configHandle) {
42
+ try {
43
+ GENIE_ENSURE(configHandle, GENIE_STATUS_ERROR_INVALID_HANDLE);
44
+ {
45
+ // Check if the dialog actually exists
46
+ auto configObj = genie::Dialog::Config::get(configHandle);
47
+ GENIE_ENSURE(configObj, GENIE_STATUS_ERROR_INVALID_HANDLE);
48
+ }
49
+ genie::Dialog::Config::remove(configHandle);
50
+ } catch (const std::exception& e) {
51
+ return GENIE_STATUS_ERROR_GENERAL;
52
+ }
53
+ return GENIE_STATUS_SUCCESS;
54
+ }
55
+
56
+ GENIE_API
57
+ Genie_Status_t GenieDialog_create(const GenieDialogConfig_Handle_t configHandle,
58
+ GenieDialog_Handle_t* dialogHandle) {
59
+ try {
60
+ GENIE_ENSURE(dialogHandle, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
61
+
62
+ // Get config object
63
+ auto configObj = genie::Dialog::Config::get(configHandle);
64
+ GENIE_ENSURE(configObj, GENIE_STATUS_ERROR_INVALID_HANDLE);
65
+
66
+ // Create dialog
67
+ auto dialog = std::make_shared<genie::Dialog>(configObj);
68
+ GENIE_ENSURE(dialog, GENIE_STATUS_ERROR_MEM_ALLOC);
69
+
70
+ // Create Handle
71
+ *dialogHandle = genie::Dialog::add(dialog);
72
+ } catch (const std::exception& e) {
73
+ std::cerr << e.what() << std::endl;
74
+ return GENIE_STATUS_ERROR_GENERAL;
75
+ }
76
+
77
+ // Return SUCCESS
78
+ return GENIE_STATUS_SUCCESS;
79
+ }
80
+
81
+ GENIE_API
82
+ Genie_Status_t GenieDialog_query(const GenieDialog_Handle_t dialogHandle,
83
+ const char* queryStr,
84
+ const GenieDialog_SentenceCode_t sentenceCode,
85
+ const GenieDialog_QueryCallback_t callback,
86
+ const void* userData) {
87
+ int32_t status;
88
+
89
+ try {
90
+ GENIE_ENSURE(dialogHandle, GENIE_STATUS_ERROR_INVALID_HANDLE);
91
+ auto dialog = genie::Dialog::get(dialogHandle);
92
+ GENIE_ENSURE(dialog, GENIE_STATUS_ERROR_INVALID_HANDLE);
93
+ GENIE_ENSURE(queryStr, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
94
+ GENIE_ENSURE(callback, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
95
+
96
+ switch (sentenceCode) {
97
+ case GENIE_DIALOG_SENTENCE_COMPLETE:
98
+ case GENIE_DIALOG_SENTENCE_BEGIN:
99
+ case GENIE_DIALOG_SENTENCE_CONTINUE:
100
+ case GENIE_DIALOG_SENTENCE_END:
101
+ case GENIE_DIALOG_SENTENCE_ABORT:
102
+ // Do nothing
103
+ break;
104
+ default:
105
+ return GENIE_STATUS_ERROR_INVALID_ARGUMENT;
106
+ }
107
+
108
+ status = dialog->query(queryStr, sentenceCode, callback, userData);
109
+ } catch (const std::exception& e) {
110
+ std::cerr << e.what() << std::endl;
111
+ return GENIE_STATUS_ERROR_GENERAL;
112
+ }
113
+
114
+ return status;
115
+ }
116
+
117
+ GENIE_API
118
+ Genie_Status_t GenieDialog_save(const GenieDialog_Handle_t dialogHandle, const char* path) {
119
+ int32_t status;
120
+
121
+ try {
122
+ GENIE_ENSURE(dialogHandle, GENIE_STATUS_ERROR_INVALID_HANDLE);
123
+ auto dialog = genie::Dialog::get(dialogHandle);
124
+ GENIE_ENSURE(dialog, GENIE_STATUS_ERROR_INVALID_HANDLE);
125
+ GENIE_ENSURE(path, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
126
+ status = dialog->save(path);
127
+ } catch (const std::exception& e) {
128
+ std::cerr << e.what() << std::endl;
129
+ return GENIE_STATUS_ERROR_GENERAL;
130
+ }
131
+
132
+ return status;
133
+ }
134
+
135
+ GENIE_API
136
+ Genie_Status_t GenieDialog_restore(const GenieDialog_Handle_t dialogHandle, const char* path) {
137
+ int32_t status;
138
+
139
+ try {
140
+ GENIE_ENSURE(dialogHandle, GENIE_STATUS_ERROR_INVALID_HANDLE);
141
+ auto dialog = genie::Dialog::get(dialogHandle);
142
+ GENIE_ENSURE(dialog, GENIE_STATUS_ERROR_INVALID_HANDLE);
143
+ GENIE_ENSURE(path, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
144
+ status = dialog->restore(path);
145
+ } catch (const std::exception& e) {
146
+ std::cerr << e.what() << std::endl;
147
+ return GENIE_STATUS_ERROR_GENERAL;
148
+ }
149
+
150
+ return status;
151
+ }
152
+
153
+ GENIE_API
154
+ Genie_Status_t GenieDialog_reset(const GenieDialog_Handle_t dialogHandle) {
155
+ try {
156
+ GENIE_ENSURE(dialogHandle, GENIE_STATUS_ERROR_INVALID_HANDLE);
157
+ auto dialog = genie::Dialog::get(dialogHandle);
158
+ GENIE_ENSURE(dialog, GENIE_STATUS_ERROR_INVALID_HANDLE);
159
+ dialog->reset();
160
+ } catch (const std::exception& e) {
161
+ return GENIE_STATUS_ERROR_GENERAL;
162
+ }
163
+ return GENIE_STATUS_SUCCESS;
164
+ }
165
+
166
+ #if defined(GENIE_LORA_FEATURE)
167
+
168
+ GENIE_API
169
+ Genie_Status_t GenieDialog_applyLora(const GenieDialog_Handle_t dialogHandle,
170
+ const char* engine,
171
+ const char* loraAdapterName) {
172
+ int32_t status;
173
+ try {
174
+ GENIE_ENSURE(dialogHandle, GENIE_STATUS_ERROR_INVALID_HANDLE);
175
+ auto dialog = genie::Dialog::get(dialogHandle);
176
+ GENIE_ENSURE(dialog, GENIE_STATUS_ERROR_INVALID_HANDLE);
177
+ GENIE_ENSURE(engine, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
178
+ std::string eng(engine);
179
+ GENIE_ENSURE(loraAdapterName, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
180
+ std::string loraName(loraAdapterName);
181
+ status = dialog->applyLora(loraName, eng);
182
+ } catch (const std::exception& e) {
183
+ return GENIE_STATUS_ERROR_GENERAL;
184
+ }
185
+ return status;
186
+ }
187
+
188
+ GENIE_API
189
+ Genie_Status_t GenieDialog_setLoraStrength(const GenieDialog_Handle_t dialogHandle,
190
+ const char* engine,
191
+ const char* tensorName,
192
+ const float alpha) {
193
+ int32_t status;
194
+ try {
195
+ GENIE_ENSURE(dialogHandle, GENIE_STATUS_ERROR_INVALID_HANDLE);
196
+ auto dialog = genie::Dialog::get(dialogHandle);
197
+ GENIE_ENSURE(dialog, GENIE_STATUS_ERROR_INVALID_HANDLE);
198
+ GENIE_ENSURE(engine, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
199
+ std::string eng(engine);
200
+ GENIE_ENSURE(tensorName, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
201
+ std::string alphaTensorName(tensorName);
202
+ GENIE_ENSURE_NOT_EMPTY(alphaTensorName, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
203
+ status = dialog->applyLoraStrength(tensorName, eng, alpha);
204
+ } catch (const std::exception& e) {
205
+ return GENIE_STATUS_ERROR_GENERAL;
206
+ }
207
+ return status;
208
+ }
209
+
210
+ #endif
211
+
212
+ GENIE_API
213
+ Genie_Status_t GenieDialog_tokenQuery(const GenieDialog_Handle_t dialogHandle,
214
+ const uint32_t* inputTokens,
215
+ const uint32_t numTokens,
216
+ const GenieDialog_SentenceCode_t sentenceCode,
217
+ const GenieDialog_TokenQueryCallback_t callback,
218
+ const void* userData) {
219
+ bool status;
220
+ try {
221
+ GENIE_ENSURE(dialogHandle, GENIE_STATUS_ERROR_INVALID_HANDLE);
222
+ auto dialog = genie::Dialog::get(dialogHandle);
223
+ GENIE_ENSURE(dialog, GENIE_STATUS_ERROR_INVALID_HANDLE);
224
+ GENIE_ENSURE(inputTokens, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
225
+ GENIE_ENSURE(callback, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
226
+ status = dialog->tokenQuery(inputTokens, numTokens, sentenceCode, callback, userData);
227
+ } catch (const std::exception& e) {
228
+ std::cerr << e.what() << std::endl;
229
+ return GENIE_STATUS_ERROR_GENERAL;
230
+ }
231
+
232
+ return status;
233
+ }
234
+
235
+ GENIE_API
236
+ Genie_Status_t GenieDialog_free(const GenieDialog_Handle_t dialogHandle) {
237
+ try {
238
+ GENIE_ENSURE(dialogHandle, GENIE_STATUS_ERROR_INVALID_HANDLE);
239
+ {
240
+ // Check if the dialog actually exists
241
+ auto dialog = genie::Dialog::get(dialogHandle);
242
+ GENIE_ENSURE(dialog, GENIE_STATUS_ERROR_INVALID_HANDLE);
243
+ }
244
+ genie::Dialog::remove(dialogHandle);
245
+ } catch (const std::exception& e) {
246
+ return GENIE_STATUS_ERROR_GENERAL;
247
+ }
248
+ return GENIE_STATUS_SUCCESS;
249
+ }
Genie/Genie/src/GenieDialogEmbedding.cpp ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //=============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //=============================================================================
8
+
9
+ #include "Dialog.hpp"
10
+ #include "Exception.hpp"
11
+ #include "GenieDialog.h"
12
+ #include "Macro.hpp"
13
+ #include "Util/HandleManager.hpp"
14
+ #include "qualla/detail/json.hpp"
15
+
16
+ using namespace genie;
17
+
18
GENIE_API
// Embedding-input query entry point: validates arguments, then forwards to
// Dialog::embeddingQuery, streaming text responses through `callback`.
// NOTE(review): Dialog::embeddingQuery is declared in Dialog.hpp only under
// GENIE_E2T_FEATURE; presumably this translation unit is compiled only in
// feature-enabled builds — verify against the build configuration.
Genie_Status_t GenieDialog_embeddingQuery(const GenieDialog_Handle_t dialogHandle,
                                          const void* embeddings,
                                          const uint32_t embeddingsSize,
                                          const GenieDialog_SentenceCode_t sentenceCode,
                                          const GenieDialog_TokenToEmbeddingCallback_t t2eCallback,
                                          const GenieDialog_QueryCallback_t callback,
                                          const void* userData) {
  Genie_Status_t status;
  try {
    GENIE_ENSURE(dialogHandle, GENIE_STATUS_ERROR_INVALID_HANDLE);
    auto dialog = genie::Dialog::get(dialogHandle);
    GENIE_ENSURE(dialog, GENIE_STATUS_ERROR_INVALID_HANDLE);
    GENIE_ENSURE(embeddings, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
    GENIE_ENSURE(callback, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
    // t2eCallback is intentionally not null-checked; presumably optional —
    // confirm with Dialog::embeddingQuery's contract.
    status = dialog->embeddingQuery(
        embeddings, embeddingsSize, sentenceCode, t2eCallback, callback, userData);
  } catch (const std::exception& e) {
    std::cerr << e.what() << std::endl;
    return GENIE_STATUS_ERROR_GENERAL;
  }

  return status;
}
Genie/Genie/src/Macro.hpp ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
//============================================================================
//
// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
// All Rights Reserved.
// Confidential and Proprietary - Qualcomm Technologies, Inc.
//
//=============================================================================

#pragma once

//======================================================================================================================
// Error generation macros
//======================================================================================================================

// Logging sink for the GENIE_ENSURE* family. Currently compiled out — the
// ENSURE macros validate and return but emit no diagnostics.
#define GENIE_LOG_ERROR(fmt, ...)

// Returns `return_error` from the enclosing function if `value` is falsy,
// logging `msg`. Usable only in functions returning a status.
#define GENIE_ENSURE_MSG(value, return_error, msg) \
  do {                                             \
    if (!(value)) {                                \
      GENIE_LOG_ERROR(" " msg);                    \
      return return_error;                         \
    }                                              \
  } while (0)

// Returns `return_error` from the enclosing function if `value` is falsy.
#define GENIE_ENSURE(value, return_error)          \
  do {                                             \
    if (!(value)) {                                \
      GENIE_LOG_ERROR("%s was not true.", #value); \
      return return_error;                         \
    }                                              \
  } while (0)

// Returns `return_error` if `status` is not GENIE_SUCCESS.
#define GENIE_ENSURE_STATUS(status, return_error) \
  do {                                            \
    if ((status) != GENIE_SUCCESS) {              \
      return return_error;                        \
    }                                             \
  } while (0)

// Returns `return_error` if a != b.
#define GENIE_ENSURE_EQ(a, b, return_error)                   \
  do {                                                        \
    if ((a) != (b)) {                                         \
      GENIE_LOG_ERROR("%s != %s (%d != %d)", #a, #b, (a), (b)); \
    return return_error;                                      \
    }                                                         \
  } while (0)

// Returns `return_error` if the container/string `value` is empty.
// NOTE(review): `value` is expanded unparenthesized before `.empty()`; fine
// for simple identifiers, but complex expressions as arguments would need
// parentheses — confirm all call sites pass plain variables.
#define GENIE_ENSURE_NOT_EMPTY(value, return_error) \
  do {                                              \
    if (value.empty()) {                            \
      GENIE_LOG_ERROR("%s was not true.", #value);  \
      return return_error;                          \
    }                                               \
  } while (0)
//======================================================================================================================
// JSON config macros
//======================================================================================================================
// Each JSON_ENFORCE_* macro expects `item` (a nlohmann-style key/value
// iterator entry) and `component` (a std::string naming the config section)
// in the enclosing scope, and throws genie::Exception with
// GENIE_STATUS_ERROR_JSON_SCHEMA when the value has the wrong JSON type.

#define JSON_ENFORCE_OBJECT()                                                         \
  if (!item.value().is_object()) {                                                    \
    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,                                   \
                    "Invalid " + component + " config: " + item.key() + " is not an object"); \
  }

#define JSON_ENFORCE_ARRAY()                                                          \
  if (!item.value().is_array()) {                                                     \
    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,                                   \
                    "Invalid " + component + " config: " + item.key() + " is not an array"); \
  }

#define JSON_ENFORCE_ARRAY_OR_OBJECT()                                                \
  if (!item.value().is_array() && !item.value().is_object()) {                        \
    throw Exception(                                                                  \
        GENIE_STATUS_ERROR_JSON_SCHEMA,                                               \
        "Invalid " + component + " config: " + item.key() + " is not an array or object"); \
  }

#define JSON_ENFORCE_NUMERIC()                                                        \
  if (!item.value().is_number()) {                                                    \
    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,                                   \
                    "Invalid " + component + " config: " + item.key() + " is not numeric"); \
  }

#define JSON_ENFORCE_ARRAY_OR_NUMERIC()                                               \
  if (!item.value().is_number() && !item.value().is_array()) {                        \
    throw Exception(                                                                  \
        GENIE_STATUS_ERROR_JSON_SCHEMA,                                               \
        "Invalid " + component + " config: " + item.key() + " is not an array or numeric"); \
  }

#define JSON_ENFORCE_BOOLEAN()                                                        \
  if (!item.value().is_boolean()) {                                                   \
    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,                                   \
                    "Invalid " + component + " config: " + item.key() + " is not boolean"); \
  }

#define JSON_ENFORCE_STRING()                                                         \
  if (!item.value().is_string()) {                                                    \
    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,                                   \
                    "Invalid " + component + " config: " + item.key() + " is not a string"); \
  }
Genie/Genie/src/Util/HandleGenerator.hpp ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
//==============================================================================
//
// Copyright (c) 2019-2020,2023 Qualcomm Technologies, Inc.
// All Rights Reserved.
// Confidential and Proprietary - Qualcomm Technologies, Inc.
//
//==============================================================================

#pragma once

#include <cstddef>      // std::size_t (Handle_t)
#include <cstdint>      // uint32_t / uint64_t
#include <mutex>
#include <type_traits>  // std::is_integral

namespace qnn {
namespace util {

typedef std::size_t Handle_t;

/// Obfuscates object addresses into opaque handles (and back) by byte-swapping
/// and XOR-ing with a fixed magic constant. This is obfuscation only, not
/// security: reverse() recovers the original pointer exactly.
class HandleGenerator final {
  static_assert(std::is_integral<Handle_t>::value, "Handle must be an integral type");
  static_assert((sizeof(Handle_t) == 8) || (sizeof(Handle_t) == 4),
                "Implementation of HandleGenerator::bswap() for sizeof(std::size_t) is required");

 public:
  HandleGenerator(const HandleGenerator&) = delete;
  HandleGenerator& operator=(const HandleGenerator&) = delete;
  HandleGenerator(HandleGenerator&&) = delete;
  HandleGenerator& operator=(HandleGenerator&&) = delete;

  /// Maps an object address to an opaque handle. generate(nullptr) yields
  /// invalid(), since bswap(0) ^ s_operand == s_operand.
  static Handle_t generate(const void* const addr) {
    return (bswap((Handle_t)addr) ^ (Handle_t)s_operand);
  }

  /// Inverse of generate(): recovers the original address from a handle.
  static const void* reverse(const Handle_t handle) {
    return (void*)bswap(handle ^ (Handle_t)s_operand);
  }

  /// Sentinel value no valid (non-null) address can map to.
  static constexpr Handle_t invalid() { return s_operand; }

 private:
  HandleGenerator() {}

  // Byte-reverse a 32-bit value.
  static uint32_t bswap32(const uint32_t val) {
    return (val >> 24U) | ((val >> 8U) & 0xff00U) | ((val << 8U) & 0xff0000U) | (val << 24U);
  }

  // Byte-reverse a 64-bit value by swapping each 32-bit half.
  // The `+ 0ULL` promotes to 64-bit before the shift.
  static uint64_t bswap64(const uint64_t val) {
    return ((bswap32(static_cast<uint32_t>(val)) + 0ULL) << 32U) | bswap32(val >> 32U);
  }

  // Dispatch to the correct width for Handle_t.
  template <typename T>
  static size_t bswap(T val) {
    if (sizeof(T) == 4) {
      return bswap32(static_cast<uint32_t>(val));
    } else {
      return bswap64(val);
    }
  }

  // Magic number generated via "openssl rand -hex 8"
  static constexpr Handle_t s_operand = (Handle_t)0xd4c2416534bcdc9b;
};

}  // namespace util
}  // namespace qnn
Genie/Genie/src/Util/HandleManager.hpp ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) 2019-2020 Qualcomm Technologies, Inc.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #pragma once
10
+
11
+ #include <algorithm>
12
+ #include <functional>
13
+ #include <memory>
14
+ #include <mutex>
15
+ #include <unordered_map>
16
+
17
+ #include "HandleGenerator.hpp"
18
+
19
+ namespace qnn {
20
+ namespace util {
21
+
22
+ template <typename T>
23
+ class HandleManager {
24
+ public:
25
+ HandleManager() = default;
26
+ HandleManager(const HandleManager&) = delete;
27
+ HandleManager& operator=(const HandleManager&) = delete;
28
+ HandleManager(HandleManager&&) = delete;
29
+ HandleManager& operator=(HandleManager&&) = delete;
30
+
31
+ Handle_t add(std::shared_ptr<T> item) {
32
+ std::lock_guard<std::mutex> locker(m_itemsMtx);
33
+
34
+ if (!item) {
35
+ return HandleGenerator::invalid();
36
+ }
37
+
38
+ auto handle = HandleGenerator::generate(item.get());
39
+ m_items[handle] = item;
40
+ return handle;
41
+ }
42
+
43
+ Handle_t add(T* item) { return add(std::shared_ptr<T>(item)); }
44
+
45
+ Handle_t add(std::weak_ptr<T> item) { return add(item.lock()); }
46
+
47
+ std::shared_ptr<T> get(Handle_t handle) {
48
+ std::lock_guard<std::mutex> locker(m_itemsMtx);
49
+
50
+ auto it = m_items.find(handle);
51
+ if (it == m_items.end()) {
52
+ return std::shared_ptr<T>(nullptr);
53
+ }
54
+
55
+ return it->second;
56
+ }
57
+
58
+ typedef std::function<bool(const std::pair<Handle_t, std::shared_ptr<T>>&)> UnaryPredicate_t;
59
+
60
+ Handle_t findIf(UnaryPredicate_t pred) const {
61
+ auto it = std::find_if(m_items.begin(), m_items.end(), pred);
62
+ if (it == m_items.end()) {
63
+ return HandleGenerator::invalid();
64
+ }
65
+
66
+ return it->first;
67
+ }
68
+
69
+ size_t remove(Handle_t handle) {
70
+ std::lock_guard<std::mutex> locker(m_itemsMtx);
71
+ return m_items.erase(handle);
72
+ }
73
+
74
+ void clear() { m_items.clear(); }
75
+
76
+ const std::unordered_map<Handle_t, std::shared_ptr<T>>& getItems() const { return m_items; }
77
+
78
+ private:
79
+ std::unordered_map<Handle_t, std::shared_ptr<T>> m_items;
80
+ std::mutex m_itemsMtx;
81
+ };
82
+
83
+ } // namespace util
84
+ } // namespace qnn
Genie/Genie/src/qualla/context.cpp ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #include <qualla/logger.hpp>
10
+ #include <qualla/context.hpp>
11
+ #include <qualla/detail/config.hpp>
12
+ #include <qualla/detail/onload.hpp>
13
+
14
+ #include <fmt/format.h>
15
+ #include <fmt/ranges.h>
16
+
17
+ namespace qualla {
18
+
19
// Builds a Context from the "context" section of the dialog JSON config.
// Reads sizing parameters and the EOS/EOT/pad token configuration, applying
// backward-compatibility rules for the deprecated "eot-token" key.
Context::Context(Env& env, const std::string& name, const qualla::json& json)
    : _name(name), _env(env), _conf(json) {
  _env.logger().debug(fmt::format("ctx-new: {} config {}", _name, _conf.dump()));

  qualla::Config conf(json, "context:");
  // "n-ctx" is an alternative spelling of "size"; when present it overrides.
  _size = conf.optional<size_t>("size", 1024);
  _size = conf.optional<size_t>("n-ctx", _size);  // alternative name
  _n_vocab = conf.optional<size_t>("n-vocab", 32000);
  _n_embd = conf.optional<size_t>("n-embd", 1024);
  _embedding_length = conf.optional<int32_t>("embedding-length", -1);
  _embedding_datatype = conf.optional<std::string>("embedding-datatype", "float32");
  // For backward compatibility. When eot-token is removed, this logic can be simplified
  // Currently, EOT is marked as default truncating token if available
  int32_t eot_tok = conf.optional<int32_t>("eot-token", -1);
  if (eot_tok >= 0) _eos_tok_list.insert(eot_tok);

  // "eos-token" may be a single integer or an array of token ids.
  // NOTE(review): the default here is the member _eos_tok as initialized in
  // the class definition (not visible in this file) — confirm its default.
  const qualla::json eos_conf = conf.optional<qualla::json>("eos-token", _eos_tok);
  if (eos_conf.is_array() && eos_conf.size() > 0) {
    // Array form: first entry becomes the primary EOS; all entries are
    // accepted as stop tokens.
    const std::vector<int32_t>& eos_tokens = eos_conf.get<std::vector<int32_t>>();
    _eos_tok = eos_tokens[0];
    for (const int32_t& eos_tok : eos_tokens)
      _eos_tok_list.insert(eos_tok);
  } else if (eos_conf.is_number_integer()) {
    // Scalar form: a configured EOT token (legacy key) takes precedence as
    // the primary EOS, but the scalar still joins the stop list.
    int32_t eos_tok = eos_conf.get<int32_t>();
    _eos_tok = (eot_tok >= 0) ? eot_tok : eos_tok;
    _eos_tok_list.insert(eos_tok);
  }

  // Padding token defaults to the resolved EOS token.
  _pad_tok = conf.optional<qualla::json>("pad-token", _eos_tok);
}
49
+
50
+ std::unique_ptr<Context> Context::create(
51
+ Env& env,
52
+ const std::string& name,
53
+ const qualla::json& conf
54
+ ) {
55
+ return std::make_unique<Context>(env, name, conf);
56
+ }
57
+
58
+ std::unique_ptr<Context> Context::create(
59
+ Env& env,
60
+ const std::string& name,
61
+ std::istream& json_stream
62
+ ) {
63
+ return create(env, name, json::parse(json_stream));
64
+ }
65
+
66
+ std::unique_ptr<Context> Context::create(
67
+ Env& env,
68
+ const std::string& name,
69
+ const std::string& json_str
70
+ ) {
71
+ return create(env, name, json::parse(json_str));
72
+ }
73
+
74
#ifdef QUALLA_STATIC

// This is a hack to make sure all core bits are linked in for the static build
// Each need*() function is defined in the corresponding component's
// translation unit; referencing them here forces the linker to keep those
// objects (and their self-registration side effects) in a static binary.

extern void needFileLogger();
extern void needStdoutLogger();
extern void needBasicSampler();
extern void needBasicDialog();
extern void needKvShareDialog();
extern void needSpdDialog();
extern void needSsdDialog();
extern void needLadeDialog();
extern void needMultistreamDialog();

#ifdef QUALLA_ENGINE_QNN_HTP
extern void needQnnHtpEngine();
#endif

#ifdef QUALLA_ENGINE_QNN_CPU
extern void needQnnCpuEngine();
#endif

// Runs at load time (static-init) and touches every component hook above.
static OnLoad needs([]() {
  needStdoutLogger();
  needFileLogger();
  needBasicDialog();
  needBasicSampler();
  needKvShareDialog();
  needSpdDialog();
  needSsdDialog();
  needLadeDialog();
  needMultistreamDialog();

#ifdef QUALLA_ENGINE_QNN_HTP
  needQnnHtpEngine();
#endif

#ifdef QUALLA_ENGINE_QNN_CPU
  needQnnCpuEngine();
#endif
});

#endif
117
+
118
+ } // namespace qualla
Genie/Genie/src/qualla/dialog.cpp ADDED
@@ -0,0 +1,590 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All rights reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #include <qualla/dialog.hpp>
10
+ #include <qualla/logger.hpp>
11
+ #include <qualla/detail/config.hpp>
12
+ #include <qualla/detail/timer.hpp>
13
+ #include <qualla/detail/sampler-utils.hpp>
14
+
15
+ #include <algorithm>
16
+ #include <functional>
17
+ #include <fstream>
18
+ #include <string>
19
+ #include <unordered_map>
20
+ #include <filesystem>
21
+ #include <iostream>
22
+
23
+ #include <fmt/format.h>
24
+ #include <fmt/ranges.h>
25
+
26
// Logging shorthands for this translation unit. INFO/WARN/ERROR format
// eagerly; KPIS/DEBUG/TRACE wrap the fmt::format call in a lambda so the
// string is only built if that log level is actually enabled.
#define __INFO(__fmt, ...) _env->logger().post(Logger::INFO, fmt::format(__fmt, ##__VA_ARGS__))
#define __WARN(__fmt, ...) _env->logger().post(Logger::WARN, fmt::format(__fmt, ##__VA_ARGS__))
#define __ERROR(__fmt, ...) _env->logger().post(Logger::ERROR, fmt::format(__fmt, ##__VA_ARGS__))
#define __KPIS(__fmt, ...) \
  _env->logger().post(Logger::KPIS, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); })
#define __DEBUG(__fmt, ...) \
  _env->logger().post(Logger::DEBUG, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); })
#define __TRACE(__fmt, ...) \
  _env->logger().post(Logger::TRACE, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); })

namespace fs = std::filesystem;
37
+
38
+ namespace qualla {
39
+
40
+ Dialog::Dialog(std::shared_ptr<Env> env, const std::string& name, const qualla::json& json)
41
+ : _env(env) {
42
+ Timer start;
43
+
44
+
45
+
46
+ __DEBUG("dialog-new: {} config {}", name, json.dump());
47
+
48
+ using qc = qualla::Config;
49
+
50
+ // Create Gpiomarker and reset the gpio status to low
51
+ const qualla::json& gpio_conf = qc::optional<qualla::json>(json, "gpio", {});
52
+ _gpio_marker = GpioMarker::create(gpio_conf);
53
+
54
+ _gpio_marker->set();
55
+
56
+ // Create the context first
57
+ _ctx = Context::create(*_env, name, qc::mandatory<qualla::json>(json, "context"));
58
+
59
+ // Parse prompt config
60
+ const qualla::json& pmt_conf = qc::optional<qualla::json>(json, "prompt", {});
61
+ _prompt_type = qc::optional<std::string>(pmt_conf, "type", "llama2");
62
+ _sys_tags = qc::optional<std::vector<std::string>>(pmt_conf, "sys-tags", {"", ""});
63
+ _inst_tags = qc::optional<std::vector<std::string>>(pmt_conf, "inst-tags", {"", ""});
64
+ _role_tags = qc::optional<std::vector<std::string>>(pmt_conf, "role-tags", {"", ""});
65
+ _sys_prompt = qc::optional<std::string>(pmt_conf, "sys-prompt", "");
66
+
67
+ const std::vector<std::string>& stop_sequence =
68
+ qc::optional<std::vector<std::string>>(pmt_conf, "stop-sequence", {});
69
+ _stop_sequence = SequenceMatchTrie(stop_sequence);
70
+
71
+ // Create Tokenizer
72
+ // TODO: auto-detect / validate n_vocab with tokenizer vocab
73
+ fs::path tok_path = _env->path().models / qc::mandatory<std::string>(json, "tokenizer");
74
+ _tokenizer = Tokenizer::create(*_ctx, tok_path);
75
+
76
+ // Create Sampler(s)
77
+ auto add_sampler = [&](const qualla::json& j) {
78
+ std::string role = qc::optional<std::string>(j, "role", "primary");
79
+ _sampler[role] = Sampler::create(*_ctx, j);
80
+ };
81
+
82
+ const qualla::json& sam_conf = qc::mandatory<qualla::json>(json, "sampler");
83
+ if (sam_conf.is_array()) {
84
+ for (auto sc : sam_conf) {
85
+ add_sampler(sc);
86
+ }
87
+ } else
88
+ add_sampler(sam_conf);
89
+
90
+
91
+
92
+
93
+ // Create Engine(s)
94
+ auto add_engine = [&](const qualla::json& j) {
95
+ std::string role = qc::optional<std::string>(j, "role", "primary");
96
+
97
+ _engine[role] = Engine::create(*_ctx, j);
98
+
99
+ using FF = Engine::Feature::Flags;
100
+
101
+
102
+ if (!_engine[role]->supports(FF::OUTPUT_LOGITS))
103
+ throw std::runtime_error("the engine must output Logits");
104
+ };
105
+
106
+
107
+
108
+ const qualla::json& eng_conf = qc::mandatory<qualla::json>(json, "engine");
109
+
110
+
111
+ if (eng_conf.is_array()) {
112
+
113
+ for (auto ec : eng_conf) {
114
+ add_engine(ec);
115
+ }
116
+ } else{
117
+ add_engine(eng_conf);
118
+
119
+ }
120
+
121
+ // Store input type (token, embedding, etc) from the engine.
122
+ // This assumes multi-engine usecases use matching input types.
123
+ m_inputType = _engine.begin()->second->getInputType();
124
+
125
+ _kpis.init.update(start.elapsed_usec());
126
+ }
127
+
128
+ Dialog::~Dialog() {}
129
+
130
+ static bool __no_response_query(const std::string&, Sentence::Code) {
131
+ return false;
132
+ }
133
+
134
+ static bool __no_response_token(const int32_t*, const uint32_t, Sentence::Code) {
135
+ return false;
136
+ }
137
+
138
+ static bool __no_response(const std::string&, Sentence::Code) {
139
+ return false;
140
+ }
141
+
142
+ void Dialog::getTopK(std::vector<float>& logits, std::vector<std::vector<int32_t>>& tokens, size_t topK, float pThreshold, Dialog::Callback callback) {
143
+
144
+ auto& sampler = *_sampler["primary"];
145
+
146
+ // Sample top-k logits but with a minimum probability threshold
147
+ #if defined(__GNUC__) && !defined(__clang__)
148
+ std::span<float> indexed_logits_span(logits);
149
+ IndexedLogits indexed_logits(indexed_logits_span, sampler.rng());
150
+ #else
151
+ IndexedLogits indexed_logits(std::span{logits.data(),logits.size()}, sampler.rng());
152
+ #endif
153
+ indexed_logits.softmax();
154
+ indexed_logits.topK(topK);
155
+
156
+ for (int i = 0; i < topK; i++) {
157
+
158
+ _last_tok = indexed_logits.indices[i];
159
+
160
+ // Only sample tokens above some probability threshold
161
+ // TODO: Modify sampling algorithm as necessary
162
+ if (indexed_logits.probs[i] < pThreshold) {
163
+ break;
164
+ } else if (_ctx->is_eos(_last_tok)) {
165
+ callback("", Sentence::CONTINUE);
166
+ } else {
167
+ tokens.push_back({_last_tok});
168
+ }
169
+ }
170
+ }
171
+
172
// Run one text query turn.
//
// Builds the prompt for this turn — carrying over the last sampled token,
// prepending the system prompt on the first query, and wrapping the text in
// the configured prompt-format tags — then encodes it and hands the tokens
// to process().
//
// str      : user text (may be a fragment when streaming a long prompt).
// scode    : sentence framing. COMPLETE is a whole turn; BEGIN/CONTINUE/END
//            feed the prompt in pieces. Generation only starts on
//            COMPLETE or END.
// callback : receives decoded output; returning false stops generation.
// Returns the result of process(); false indicates a processing failure.
bool Dialog::query(const std::string& str, Sentence::Code scode, Dialog::Callback callback) {
    std::vector<int32_t> p_vec; // prompt tokens
    std::string p_str;          // prompt string

    p_vec.reserve(1024);

    if (scode == Sentence::COMPLETE || scode == Sentence::BEGIN) {
        // Reset prompt/gen counts for new query
        _n_prompt = 0;
        _n_generated = 0;
        _n_previous_prompt = 0;
        _n_previous_generated = 0;

        // Carry the previously sampled token into this prompt (it was never
        // fed back to the engine) — presumably to keep the KV cache in sync
        // with what was actually generated; confirm against engine contract.
        if (_last_tok >= 0 && !_ctx->is_eos(_last_tok)) p_vec.push_back(_last_tok);

        p_str = _inst_tags[0];

        if (!_n_queries) {
            // First query. Prepend sys-prompt.
            p_str += _sys_tags[0] + _sys_prompt + _sys_tags[1];
        } else {
            // Add EOS explicitly if the last query was aborted prematurely.
            if (_ctx->eos_tok() >= 0) p_vec.push_back(_ctx->eos_tok());
        }

        // Add BOS
        if (_ctx->bos_tok() >= 0) {
            p_vec.push_back(_ctx->bos_tok());
        }
    }

    // FIXME: make this more generic
    if (_prompt_type == "llama3") {
        p_str += _sys_tags[0] + _role_tags[1] + _sys_tags[1] + str + _inst_tags[2];
    } else {
        p_str += str;
    }

    // Close the turn with the trailing tags on the final (or only) fragment.
    if (scode == Sentence::COMPLETE || scode == Sentence::END) {
        if (_prompt_type == "llama3") {
            p_str += _sys_tags[0] + _role_tags[2] + _sys_tags[1];
        } else {
            p_str += _inst_tags[1];
        }
    }

    _env->logger().post(Logger::DEBUG, [&]() {
        qualla::json j{{"string", str}, {"prompt", p_str}};
        return fmt::format("dialog-query: {} {}", _ctx->name(), j.dump());
    });

    _n_queries++;

    // Encoded prompt text is appended after the control tokens added above.
    _tokenizer->encode(p_str, p_vec);

    __DEBUG("dialog-tokens: {} {}", _ctx->name(), p_vec);
    __DEBUG("dialog-text: \"{}\"", p_str);

    if (scode == Sentence::COMPLETE || scode == Sentence::END) {
        // Detect stop sequences here
        if (!_stop_sequence.empty()) {
            _stop_sequence.reset();
            return process(p_vec, [&](const std::string& str, Sentence::Code c) {
                // Check for stop sequence and end inference when stop sequence is found
                if (_stop_sequence.process_next_string(str)) {
                    callback(str, c); // Emit sequences until match is complete
                    return false;
                }

                // Else, return normal callback function
                return callback(str, c);
            });
        }

        return process(p_vec, callback);
    }

    // Mid-prompt fragment: ingest tokens without emitting any output.
    return process(p_vec, __no_response);
}
252
+
253
// Run one pre-tokenized query turn (caller supplies encoded tokens).
//
// On COMPLETE/BEGIN the per-query counters are reset and control tokens are
// prepended: the carried-over last sampled token, an explicit EOS when the
// previous turn ended without one, and BOS. Generation happens only on
// COMPLETE/END; otherwise the tokens are ingested via a no-output callback.
//
// NOTE(review): asymmetries vs the string overload — _last_tok is pushed even
// when it is EOS, and the EOS push is not guarded by eos_tok() >= 0 (a model
// without EOS would push -1). Confirm whether these differences are intended.
bool Dialog::query(const std::vector<uint32_t>& input, Sentence::Code scode, qualla::DialogCallback& callback) {
    std::vector<int32_t> p_vec; // prompt tokens
    p_vec.reserve(1024);

    if (scode == Sentence::COMPLETE || scode == Sentence::BEGIN) {
        // Reset prompt/gen counts for new query
        _n_prompt = 0;
        _n_generated = 0;
        _n_previous_prompt = 0;
        _n_previous_generated = 0;

        // Carry the previously sampled token into this prompt
        if (_last_tok >= 0)
            p_vec.push_back(_last_tok);

        // Add EOS explicitly if the last query was aborted prematurely.
        if (_n_queries && _last_tok != _ctx->eos_tok()) {
            p_vec.push_back(_ctx->eos_tok());
        }
        // Add BOS
        if (_ctx->bos_tok() >= 0) {
            p_vec.push_back(_ctx->bos_tok());
        }
    }

    // Append the caller-provided tokens after the control tokens.
    p_vec.insert(p_vec.end(), input.begin(), input.end());
    __DEBUG("dialog-tokens: {} {}", _ctx->name(), p_vec);

    _n_queries++;

    if (scode == Sentence::COMPLETE || scode == Sentence::END) {
        return process(p_vec, callback);
    }

    // Mid-prompt fragment: ingest tokens, emit nothing.
    DialogCallback callback_return_token(QUALLA_CALLBACK_TYPE_TOKEN);
    *(callback_return_token.getTokenCbFunc()) = __no_response_token;
    return process(p_vec, callback_return_token);
}
290
+
291
+ bool Dialog::query(
292
+ std::vector<uint8_t>& embedding_vectors,
293
+ Sentence::Code scode,
294
+ T2ECallback t2eCallback,
295
+ Dialog::Callback callback
296
+ ) {
297
+ _n_queries++;
298
+ if (scode == Sentence::COMPLETE || scode == Sentence::END) {
299
+ return process(embedding_vectors, t2eCallback, callback);
300
+ }
301
+ // Only process, no output
302
+ return process(embedding_vectors, t2eCallback, [&](const std::string&, Sentence::Code) {
303
+ return false;
304
+ });
305
+ }
306
+
307
+ bool Dialog::prime(const std::string& str) {
308
+ bool r = query(str, Sentence::COMPLETE, __no_response);
309
+
310
+ // End with EOS as we want the primer to be self-contained
311
+ _last_tok = _ctx->eos_tok();
312
+
313
+ return r;
314
+ }
315
+
316
// Persist the dialog under `o_name` (or under the context name when empty):
// internal counters as dialog.json plus each engine's and sampler's state.
// Engine saves are mandatory (failure aborts); sampler saves are best-effort.
// Returns false when there is nothing to save yet, the directory cannot be
// created, or an engine fails to save.
bool Dialog::save(const std::string& o_name) {
    Timer start;

    // Save using session name unless override is provided
    std::string name = o_name.empty() ? _ctx->name() : o_name;
    fs::path save_path = name;

    if (!_n_past) {
        __ERROR("dialog-save: {} : nothing to save yet", name);
        return false;
    }

    __INFO("dialog-save: saving as {} {}", name, save_path.string());

    if (!fs::exists(save_path) && !fs::create_directories(save_path)) {
        __ERROR("dialog-save: {} : failed to create cache directory", name);
        return false;
    }

    // Save Dialog state
    qualla::json j{
        {"n-past", _n_past},
        {"n-prompt", _n_prompt},
        {"n-generated", _n_generated},
        {"n-queries", _n_queries},
        {"last-tok", _last_tok}
    };
    {
        // Scoped so the file is flushed/closed before engines save.
        fs::path p = save_path / "dialog.json";
        std::ofstream f(p);
        f << j;
    }

    // Save Engines (mandatory)
    for (auto& e : _engine) {
        if (!e.second->save(name)) {
            __ERROR("dialog-save: {} : unable to save {} engine", name, e.first);
            return false;
        }
    }

    // Save Samplers (optional)
    for (auto& s : _sampler) {
        if (!s.second->save(name)) {
            __WARN("dialog-save: {} : unable to save {} sampler", name, s.first);
        }
    }

    _kpis.save.update(start.elapsed_usec());

    return true;
}
368
+
369
// Restore a previously saved dialog from `o_name` (or from the context name
// when empty). The dialog.json counters are optional — if missing, defaults
// are used and the engine state is restored anyway. Engine restores are
// mandatory; sampler restores are best-effort. On an n-past mismatch between
// the saved counters and an engine, the smaller value wins (conservative:
// never claim more cached context than the engine actually has).
// Returns false if any engine fails to restore.
bool Dialog::restore(const std::string& o_name) {
    Timer start;

    // Restore using session name unless override is provided
    std::string name = o_name.empty() ? _ctx->name() : o_name;
    fs::path restore_path = name;

    __INFO("dialog-restore: restoring from {} {}", name, restore_path.string());

    // Try to restore the Dialog state (optional)
    // If this fails we reset everything and try to restore the engine.
    qualla::json j{};
    {
        fs::path p = restore_path / "dialog.json";
        if (fs::exists(p)) {
            std::ifstream f(p);
            j = qualla::json::parse(f);
        } else {
            __DEBUG("dialog-restore: {} : internal state not restored", name);
        }
    }

    using qc = qualla::Config;
    _n_past = qc::optional<uint32_t>(j, "n-past", 0);
    _n_prompt = qc::optional<uint32_t>(j, "n-prompt", 0);
    _n_generated = qc::optional<uint32_t>(j, "n-generated", 0);
    _n_queries = qc::optional<uint32_t>(j, "n-queries", 1);
    _last_tok = qc::optional<int32_t>(j, "last-tok", _ctx->eos_tok());

    // Restore Engines (mandatory)
    for (auto& e : _engine) {
        // Engine reports how many positions of KV cache it restored.
        uint32_t n = e.second->restore(name);
        if (!n) {
            __ERROR("dialog-restore: {} : unable to restore {} engine", name, e.first);
            return false;
        }

        // Restore n_past from the engine state
        if (_n_past && n != _n_past) {
            __WARN("dialog-restore: {} : n-past mismatch : {} engine {} intern {}",
                   name,
                   e.first,
                   _n_past,
                   n);
            // Keep the smaller number
            _n_past = std::min(n, _n_past);
        } else
            _n_past = n;
    }

    // Restore Samplers (optional)
    for (auto& s : _sampler) {
        if (!s.second->restore(name)) {
            __WARN("dialog-restore: {} : unable to restore {} sampler", name, s.first);
        }
    }

    _kpis.reset();
    _kpis.restore.update(start.elapsed_usec());

    return true;
}
431
+
432
+ void Dialog::reset() {
433
+ __INFO("dialog-reset: {}", _ctx->name());
434
+
435
+ _n_past = 0;
436
+ _n_prompt = 0;
437
+ _n_generated = 0;
438
+ _n_queries = 0;
439
+ _last_tok = -1;
440
+ _n_previous_prompt = 0;
441
+ _n_previous_generated = 0;
442
+
443
+ _kpis.reset();
444
+
445
+ // Reset Engines and Samplers
446
+ for (auto& e : _engine)
447
+ e.second->reset();
448
+ for (auto& s : _sampler)
449
+ s.second->reset();
450
+
451
+ State::clear();
452
+ }
453
+
454
+ // Dialog KPIs helpers
455
+
456
+ // Get latest KPIs
457
+ Dialog::KPIs& Dialog::kpis() {
458
+ // Update TPS
459
+ if (_n_prompt) {
460
+ float t = _kpis.prompt.last_usec / _n_prompt;
461
+ _kpis.tps.n_prompt = _n_prompt;
462
+ _kpis.tps.prompt = 1000000.0 / (t ? t : 1000000.0);
463
+ }
464
+
465
+ if (_n_generated) {
466
+ float t = _kpis.generate.last_usec / _n_generated;
467
+ _kpis.tps.n_generate = _n_generated;
468
+ _kpis.tps.generate = 1000000.0 / (t ? t : 1000000.0);
469
+ }
470
+
471
+ // We could synthesize more KPIs from from other layers (engine, sampler, etc)
472
+ return _kpis;
473
+ }
474
+
475
// Render all KPI sections as one line: per-stage timing blocks separated by
// `sep`, followed by derived tokens-per-second figures. NOTE(review): KPI
// lines are emitted to the logger and may be parsed downstream — keep the
// format string stable.
std::string Dialog::KPIs::dump(std::string_view sep) const {
    return fmt::format(
        "init:[{}]{}prompt:[{}]{}generate:[{}]{}save:[{}]{}restore:[{}]{} tps-prompt:{:.2f} tps-generate:{:.2f}",
        init.dump(),
        sep,
        prompt.dump(),
        sep,
        generate.dump(),
        sep,
        save.dump(),
        sep,
        restore.dump(),
        sep,
        tps.prompt,
        tps.generate
    );
}
492
+
493
+ void Dialog::KPIs::reset() {
494
+ init.reset();
495
+ prompt.reset();
496
+ generate.reset();
497
+ save.reset();
498
+ restore.reset();
499
+ tps.prompt = 0.0;
500
+ tps.generate = 0.0;
501
+ }
502
+
503
+ // Create API
504
+
505
+ // Dialog registry : type string + creator function
506
+ using Registry = std::unordered_map<std::string, Dialog::Creator>;
507
+ static std::unique_ptr<Registry> registry;
508
+
509
+ void Dialog::__register(const std::string& type, Creator func) {
510
+ if (!registry) registry = std::make_unique<Registry>();
511
+
512
+ Registry& r = *registry;
513
+
514
+
515
+ r[type] = func;
516
+ }
517
+
518
+ std::unique_ptr<Dialog> Dialog::create(
519
+ std::shared_ptr<Env> env,
520
+ const std::string& name,
521
+ const qualla::json& conf
522
+ ) {
523
+
524
+ using qc = qualla::Config;
525
+ std::string type = qc::optional<std::string>(conf, "type", "basic");
526
+
527
+ if (!registry) throw std::runtime_error(type + ": dialog not found");
528
+
529
+ Registry& r = *registry;
530
+
531
+ if (!r.contains(type)) throw std::runtime_error(type + ": dialog not found");
532
+
533
+ if (!r.contains(type)) {
534
+ throw std::runtime_error(type + ": dialog not found");
535
+ }
536
+
537
+ return std::unique_ptr<Dialog>(r[type](env, name, conf));
538
+ }
539
+
540
+ std::unique_ptr<Dialog> Dialog::create(
541
+ std::shared_ptr<Env> env,
542
+ const std::string& name,
543
+ std::istream& json_stream
544
+ ) {
545
+
546
+ return create(env, name, json::parse(json_stream));
547
+ }
548
+
549
+ std::unique_ptr<Dialog> Dialog::create(
550
+ std::shared_ptr<Env> env,
551
+ const std::string& name,
552
+ const fs::path& json_path
553
+ ) {
554
+
555
+ if (!fs::exists(json_path))
556
+ throw std::runtime_error(json_path.string() + ": file does not exist");
557
+ std::ifstream ifs(json_path);
558
+ return create(env, name, ifs);
559
+ }
560
+
561
+ std::vector<std::string> Dialog::list() {
562
+ std::vector<std::string> v;
563
+ if (!registry) return v;
564
+
565
+ Registry& r = *registry;
566
+
567
+ for (auto k : r)
568
+ v.push_back(k.first);
569
+ v.push_back("basic"); // default type, always registered
570
+ return v;
571
+ }
572
+
573
+ bool Dialog::applyLoraAdapter(std::string lora_adapter_name, std::string engine_role) {
574
+ auto& engine = *_engine[engine_role];
575
+ if (!engine.applyLoraAdapter(lora_adapter_name)) {
576
+ __WARN("dialog-applyLoraAdapter: failed for {}", lora_adapter_name);
577
+ return false;
578
+ }
579
+ return true;
580
+ }
581
+ bool Dialog::applyLoraStrength(std::string tensor_name, float tensor_val, std::string engine_role) {
582
+ auto& engine = *_engine[engine_role];
583
+ if (!engine.applyLoraStrength(tensor_name, tensor_val)) {
584
+ __WARN("dialog-applyLoraStrength: failed for {}", tensor_name);
585
+ return false;
586
+ }
587
+ return true;
588
+ }
589
+
590
+ } // namespace qualla
Genie/Genie/src/qualla/dialogs/basic.cpp ADDED
@@ -0,0 +1,421 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All rights reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #include <qualla/dialog.hpp>
10
+ #include <qualla/logger.hpp>
11
+ #include <qualla/detail/config.hpp>
12
+ #include <qualla/detail/timer.hpp>
13
+ #include <qualla/detail/onload.hpp>
14
+ #include <qualla/detail/basic-dialog.hpp>
15
+
16
+ #include <functional>
17
+ #include <filesystem>
18
+ #include <string>
19
+
20
+ #include <fmt/format.h>
21
+ #include <fmt/ranges.h>
22
+
23
+ namespace fs = std::filesystem;
24
+
25
+ #define __INFO(__fmt, ...) _env->logger().post(Logger::INFO, fmt::format(__fmt, ##__VA_ARGS__))
26
+ #define __WARN(__fmt, ...) _env->logger().post(Logger::WARN, fmt::format(__fmt, ##__VA_ARGS__))
27
+ #define __ERROR(__fmt, ...) _env->logger().post(Logger::ERROR, fmt::format(__fmt, ##__VA_ARGS__))
28
+ #define __KPIS(__fmt, ...) \
29
+ _env->logger().post(Logger::KPIS, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); })
30
+ #define __DEBUG(__fmt, ...) \
31
+ _env->logger().post(Logger::DEBUG, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); })
32
+ #define __TRACE(__fmt, ...) \
33
+ _env->logger().post(Logger::TRACE, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); })
34
+
35
+ namespace qualla {
36
+
37
// Construct a BasicDialog. The base Dialog ctor builds the context,
// tokenizer, sampler(s) and engine(s); this subclass additionally requires
// an engine registered under the "primary" role. A missing primary engine is
// recorded as a fatal State (checked via State::failed() in process())
// rather than thrown.
BasicDialog::BasicDialog(std::shared_ptr<Env> env, const std::string& name, const json& conf) : Dialog(env, name, conf) {
    if (!_engine.contains("primary")) {
        State::fatal("\"primary\" engine not present in config!");
        return;
    }
}
43
+
44
// Autoregressive generation loop (string-callback variant), shared by the
// token and embedding input paths. Loops: feed the last sampled token (or
// its embedding via m_t2eCallback), sample the next token, advance the KV
// cache — until EOS, cancellation, context exhaustion, or the callback
// returns false.
//
// tokens : 1-slot buffer holding the token sampled from the prompt pass;
//          reused to carry each newly sampled token.
// logits : engine output buffer, overwritten every step.
// Returns false only on an engine/processing failure (via Dialog::abort).
bool BasicDialog::processFollowOnGeneration(std::vector<int32_t>& tokens, std::vector<float>& logits, Dialog::Callback callback){

    auto& sampler = *_sampler["primary"];
    auto& engine = *_engine["primary"];

    while (true) {
        if (State::canceled()) {
            callback("", Sentence::END);
            break;
        }
        // This condition is valid for both tokens and embedding
        if (_n_past + 1 > _ctx->size()) {
            __WARN("Context limit exceeded ({} + 1 > {})", _n_past, _ctx->size());
            callback("", Sentence::END);
            break;
        }
        if (m_inputType == InputType::TOKENS) {
            if (!engine.process(tokens, logits))
                return Dialog::abort("engine processing failed", callback);
        } else if(m_inputType == InputType::EMBEDDINGS) {
            // Convert tokens to embedding for the processing in the engine.
            auto embedBufSize = engine.getEmbeddingBufferSize();
            std::vector<uint8_t> embedding;
            for(auto &token: tokens){
                std::vector<uint8_t> curTokenEmbedding(embedBufSize,0);
                m_t2eCallback(token, curTokenEmbedding.data(), embedBufSize);
                embedding.insert(embedding.end(), curTokenEmbedding.begin(), curTokenEmbedding.end());
            }
            if (!engine.process(embedding, {}, logits))
                return Dialog::abort("engine processing failed", callback);
        }
        else{
            return Dialog::abort("No valid Input Type is used", callback);
        }
        // Sample the next token; _last_tok also carries across queries.
        tokens[0] = _last_tok = sampler.process(logits);

        _n_past++;
        _n_generated++;

        if (!engine.updateKV(_n_past)) return Dialog::abort("context size exceeded", callback);

        if (_ctx->is_eos(_last_tok)) {
            callback("", Sentence::END);
            break;
        }

        // Stream the decoded token; a false return means the caller wants
        // generation to stop early.
        if (!callback(_tokenizer->decode(tokens), Sentence::CONTINUE)) break;
    }

    return true;
}
95
+
96
// Process a full token prompt, sample the first output token (emitted with
// Sentence::BEGIN), then hand off to processFollowOnGeneration() for the
// rest. Context-limit overflow ends the turn gracefully (callback END,
// return true); engine failures abort and return false.
bool BasicDialog::process(std::vector<int32_t>& tokens, Dialog::Callback callback) {
    // Check for prev failures and bail out early
    if (State::failed()) return false;

    Timer start;

    if(m_inputType != InputType::TOKENS) {
        __ERROR("Input type for model is not tokens.");
        return false;
    }

    _gpio_marker->set();

    // Vector for storing logits.
    // Allocated & filled by the engine.
    std::vector<float> logits;

    State::clear();

    auto& sampler = *_sampler["primary"];
    auto& engine = *_engine["primary"];

    using FF = Engine::Feature::Flags;
    if (engine.supports(FF::DYNAMIC_LOAD)) engine.load();

    if (_n_past + tokens.size() > _ctx->size()) {
        __WARN("Context limit exceeded ({} + {} > {})", _n_past, tokens.size(), _ctx->size());
        callback("", Sentence::END);
        return true;
    }

    if (!engine.process(tokens, logits, false))
        return Dialog::abort("engine prompt processing failed", callback);

    _n_prompt += tokens.size();
    _n_past += tokens.size();

    if (!engine.updateKV(_n_past)) return Dialog::abort("context size exceeded", callback);

    // Sample the first generated token; shrink `tokens` into the 1-slot
    // buffer reused by the generation loop.
    tokens[0] = _last_tok = sampler.process(logits);
    tokens.resize(1);

    _n_generated++;

    _gpio_marker->set();

    _kpis.prompt.update(start.elapsed_usec());

    // Log latest KPIs
    _env->logger().post(Logger::KPIS, kpis().dump(" "));

    start.reset();

    if (_ctx->is_eos(_last_tok)) {
        callback("", Sentence::END);
        return true;
    }

    if (!callback(_tokenizer->decode(tokens), Sentence::BEGIN)) return true;

    State::busy(true);

    processFollowOnGeneration(tokens, logits, callback);

    State::busy(false);

    _gpio_marker->set();
    _gpio_marker->reset();

    _kpis.generate.update(start.elapsed_usec());

    // Log latest KPIs in a single line
    _env->logger().post(Logger::KPIS, kpis().dump(" "));

    return !State::failed();
}
172
+
173
// Autoregressive generation loop, DialogCallback variant: raw token ids are
// delivered via callback.callBack(tokens, count, code, tokenizer) instead of
// decoded text; END is signaled with a null/zero-length token buffer.
// Otherwise mirrors the string-callback overload above.
// NOTE(review): `callback` is taken by value here (by reference elsewhere) —
// signature must match the header; confirm the copy is intended.
bool BasicDialog::processFollowOnGeneration(std::vector<int32_t>& tokens, std::vector<float>& logits, qualla::DialogCallback callback){

    auto& sampler = *_sampler["primary"];
    auto& engine = *_engine["primary"];

    while (true) {
        if (State::canceled()) {
            callback.callBack(nullptr, 0, Sentence::END, tokenizer());
            break;
        }
        // This condition is valid for both tokens and embedding
        if (_n_past + 1 > _ctx->size()) {
            __WARN("Context limit exceeded ({} + 1 > {})", _n_past, _ctx->size());
            callback.callBack(nullptr, 0, Sentence::END, tokenizer());
            break;
        }
        if (m_inputType == InputType::TOKENS) {
            if (!engine.process(tokens, logits))
                return Dialog::abort("engine processing failed", callback);
        } else if(m_inputType == InputType::EMBEDDINGS) {
            // Convert tokens to embedding for the processing in the engine.
            auto embedBufSize = engine.getEmbeddingBufferSize();
            std::vector<uint8_t> embedding;
            for(auto &token: tokens){
                std::vector<uint8_t> curTokenEmbedding(embedBufSize,0);
                m_t2eCallback(token, curTokenEmbedding.data(), embedBufSize);
                embedding.insert(embedding.end(), curTokenEmbedding.begin(), curTokenEmbedding.end());
            }
            if (!engine.process(embedding, {}, logits))
                return Dialog::abort("engine processing failed", callback);
        }
        else{
            return Dialog::abort("No valid Input Type is used", callback);
        }
        // Sample the next token; _last_tok also carries across queries.
        tokens[0] = _last_tok = sampler.process(logits);

        _n_past++;
        _n_generated++;

        if (!engine.updateKV(_n_past)) return Dialog::abort("context size exceeded", callback);

        if (_ctx->is_eos(_last_tok)) {
            callback.callBack(nullptr, 0, Sentence::END, tokenizer());
            break;
        }

        // Stream the raw token id; false from the callback stops generation.
        if (!callback.callBack(tokens.data(), tokens.size(), Sentence::CONTINUE, tokenizer())) break;
    }

    return true;
}
224
+
225
// Process a full token prompt and generate, DialogCallback variant: token
// ids (not decoded text) are delivered through callback.callBack(); END is
// signaled with a null/zero-length buffer. Mirrors the string-callback
// overload above.
bool BasicDialog::process(std::vector<int32_t>& tokens, qualla::DialogCallback callback) {
    // Check for prev failures and bail out early
    if (State::failed()) return false;

    Timer start;

    if(m_inputType != InputType::TOKENS) {
        __ERROR("Input type for model is not tokens.");
        return false;
    }

    _gpio_marker->set();

    // Vector for storing logits.
    // Allocated & filled by the engine.
    std::vector<float> logits;

    State::clear();

    auto& sampler = *_sampler["primary"];
    auto& engine = *_engine["primary"];

    using FF = Engine::Feature::Flags;
    if (engine.supports(FF::DYNAMIC_LOAD)) engine.load();

    if (_n_past + tokens.size() > _ctx->size()) {
        __WARN("Context limit exceeded ({} + {} > {})", _n_past, tokens.size(), _ctx->size());
        callback.callBack(nullptr, 0, Sentence::END, tokenizer());
        return true;
    }

    if (!engine.process(tokens, logits, false)) {
        return Dialog::abort("engine prompt processing failed", callback);
    }

    _n_prompt += tokens.size();
    _n_past += tokens.size();

    if (!engine.updateKV(_n_past)) {
        return Dialog::abort("context size exceeded", callback);
    }

    // Sample the first generated token; shrink `tokens` into the 1-slot
    // buffer reused by the generation loop.
    tokens[0] = _last_tok = sampler.process(logits);
    tokens.resize(1);

    _n_generated++;

    _gpio_marker->set();

    _kpis.prompt.update(start.elapsed_usec());

    // Log latest KPIs
    _env->logger().post(Logger::KPIS, kpis().dump(" "));

    start.reset();

    if (_ctx->is_eos(_last_tok)) {
        callback.callBack(nullptr, 0, Sentence::END, tokenizer());
        return true;
    }

    if (!callback.callBack(tokens.data(), tokens.size(), Sentence::BEGIN, tokenizer()))
        return true;

    State::busy(true);
    processFollowOnGeneration(tokens, logits, callback);
    State::busy(false);

    _gpio_marker->set();
    _gpio_marker->reset();

    _kpis.generate.update(start.elapsed_usec());

    // Log latest KPIs in a single line
    _env->logger().post(Logger::KPIS, kpis().dump(" "));

    return !State::failed();
}
303
+
304
// Process a prompt supplied as raw embedding bytes, then generate.
// The t2e callback (token -> embedding) is stored for the generation loop,
// which must convert each sampled token back into an embedding. Without a
// t2e callback the dialog is single-shot: one token is emitted, then END.
//
// embedding_vectors : concatenated per-token embeddings; the token count is
//                     derived as size / engine embedding buffer size.
// Returns false on engine failure or input-type mismatch.
bool BasicDialog::process(
    std::vector<uint8_t>& embedding_vectors,
    T2ECallback t2eCallback,
    Dialog::Callback callback
) {
    Timer start;

    if(m_inputType != InputType::EMBEDDINGS) {
        __ERROR("Input type for model is not embeddings.");
        return false;
    }

    // Vector for storing logits.
    // Allocated & filled by the engine.
    std::vector<float> logits;

    State::clear();

    _gpio_marker->set();

    auto& sampler = *_sampler["primary"];
    auto& engine = *_engine["primary"];

    // Store the t2e callback for reference during follow-on generation.
    m_t2eCallback = t2eCallback;

    size_t embedBufSize = engine.getEmbeddingBufferSize();

    {
        std::vector<uint8_t> eosEmbedding(embedBufSize, 0.0);
        if (m_t2eCallback) {
            m_t2eCallback(_ctx->eos(), eosEmbedding.data(), embedBufSize);
        }
        // For non-autogenerative usecases (where t2eCallback is not supplied),
        // the EOS vector is all zero. This is fine for models with proper
        // attention masking support, but may degrade accuracy otherwise.
        if (!engine.cacheEosEmbedding(eosEmbedding)) {
            __DEBUG("Failed to set the eos token embedding.");
            return false;
        }
    }

    using FF = Engine::Feature::Flags;
    if (engine.supports(FF::DYNAMIC_LOAD)) engine.load();

    size_t curTokenCount = embedding_vectors.size() / embedBufSize;
    _env->logger().post(Logger::KPIS, kpis().dump(" "));
    start.reset(); // Don't include preprocessing time

    if (_n_past + curTokenCount > _ctx->size()) {
        __WARN("Context limit exceeded ({} + {} > {})", _n_past, curTokenCount, _ctx->size());
        callback("", Sentence::END);
        return true;
    }

    if (!engine.process(embedding_vectors, {}, logits))
        return Dialog::abort("engine prompt processing failed", callback);
    _n_prompt += curTokenCount;
    _n_past += curTokenCount;

    // 1-slot token buffer reused by the generation loop.
    std::vector<int32_t> tokens(1, 0);

    if (!engine.updateKV(_n_past)) return Dialog::abort("context size exceeded", callback);

    tokens[0] = _last_tok = sampler.process(logits);

    _n_generated++;

    _gpio_marker->set();

    _kpis.prompt.update(start.elapsed_usec());

    // Log latest KPIs
    _env->logger().post(Logger::KPIS, kpis().dump(" "));

    start.reset();

    if (_ctx->is_eos(_last_tok)) {
        callback("", Sentence::END);
        return true;
    }

    if (!callback(_tokenizer->decode(tokens), Sentence::BEGIN)) {
        return true;
    }

    // No token-to-embedding converter: cannot feed sampled tokens back, so
    // generation ends after the first token.
    if (!m_t2eCallback) {
        callback("", Sentence::END);
        return true;
    }

    State::busy(true);
    processFollowOnGeneration(tokens, logits, callback);
    State::busy(false);

    _gpio_marker->set();
    _gpio_marker->reset();

    _kpis.generate.update(start.elapsed_usec());
    // Log latest KPIs in a single line
    _env->logger().post(Logger::KPIS, kpis().dump(" "));

    return !State::failed();
}
408
+
409
// Registrator instance: registers the "basic" dialog type with the Dialog
// factory when this translation unit is loaded.
static OnLoad regy([]() {
    Dialog::__register(
        "basic",
        [](std::shared_ptr<Env> env, const std::string& name, const json& conf) {
            return (Dialog*)new BasicDialog(env, name, conf);
        }
    );
});

// Presumably referenced from elsewhere to force this object file to be
// linked in (so the static registrator above actually runs) — confirm.
void needBasicDialog() {}
420
+
421
+ } // namespace qualla
Genie/Genie/src/qualla/dialogs/kv-share.cpp ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All rights reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #include <qualla/dialog.hpp>
10
+ #include <qualla/sampler.hpp>
11
+ #include <qualla/logger.hpp>
12
+ #include <qualla/detail/config.hpp>
13
+ #include <qualla/detail/timer.hpp>
14
+ #include <qualla/detail/onload.hpp>
15
+ #include <qualla/detail/sampler-utils.hpp>
16
+ #include <qualla/detail/basic-sampler.hpp>
17
+ #include <qualla/detail/cache-file.hpp>
18
+
19
+ #include <functional>
20
+ #include <fstream>
21
+ #include <string>
22
+ #include <unordered_map>
23
+ #include <filesystem>
24
+ #include <random>
25
+
26
+ #include <fmt/format.h>
27
+ #include <fmt/ranges.h>
28
+
29
+ namespace fs = std::filesystem;
30
+
31
+ #define __INFO(__fmt, ...) _env->logger().post(Logger::INFO, fmt::format(__fmt, ##__VA_ARGS__))
32
+ #define __WARN(__fmt, ...) _env->logger().post(Logger::WARN, fmt::format(__fmt, ##__VA_ARGS__))
33
+ #define __ERROR(__fmt, ...) _env->logger().post(Logger::ERROR, fmt::format(__fmt, ##__VA_ARGS__))
34
+ #define __KPIS(__fmt, ...) \
35
+ _env->logger().post(Logger::KPIS, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); })
36
+ #define __DEBUG(__fmt, ...) \
37
+ _env->logger().post(Logger::DEBUG, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); })
38
+ #define __TRACE(__fmt, ...) \
39
+ _env->logger().post(Logger::TRACE, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); })
40
+
41
+ namespace qualla {
42
+
43
+ using qc = qualla::Config;
44
+
45
+ class KvShareDialog : public Dialog {
46
+ public:
47
+ KvShareDialog(std::shared_ptr<Env> env, const std::string& name, const json& conf)
48
+ : Dialog(env, name, conf) {}
49
+
50
+ virtual bool process(std::vector<int32_t>& tokens, Dialog::Callback callback) override;
51
+
52
+ virtual bool process(std::vector<int32_t>& tokens, DialogCallback callback) override {
53
+ return false;
54
+ }
55
+
56
+ virtual void reset() override;
57
+
58
+ bool convertKV(const fs::path& cache_dir);
59
+
60
+ };
61
+
62
+ void KvShareDialog::reset() {
63
+ __INFO("dialog-reset: {}", _ctx->name());
64
+
65
+ _n_past = 0;
66
+ _n_prompt = 0;
67
+ _n_generated = 0;
68
+ _n_queries = 0;
69
+ _last_tok = -1;
70
+
71
+ _kpis.reset();
72
+
73
+ // Reset Samplers
74
+ for (auto& s : _sampler)
75
+ s.second->reset();
76
+
77
+ // Reset Engines
78
+ for (auto& e : _engine) {
79
+ e.second->reset();
80
+ e.second->unload();
81
+ }
82
+
83
+ State::clear();
84
+ }
85
+
86
+ bool KvShareDialog::process(std::vector<int32_t>& tokens, Dialog::Callback callback) {
87
+
88
+ // Check for prev failures and bail out early
89
+ if (State::failed()) return false;
90
+
91
+ Timer start;
92
+
93
+ // Vector for storing logits.
94
+ // Allocated & filled by the engine.
95
+ std::vector<float> logits;
96
+
97
+ State::clear();
98
+
99
+ auto& sampler = *_sampler["primary"];
100
+
101
+ auto& p_engine = *_engine["primary"]; // prompt
102
+ auto& s_engine = *_engine["secondary"]; // generation
103
+
104
+ if (_n_past + tokens.size() > _ctx->size()) {
105
+ __WARN("Context limit exceeded ({} + {} > {})", _n_past, tokens.size(), _ctx->size());
106
+ callback("", Sentence::END);
107
+ return true;
108
+ }
109
+
110
+ if (!p_engine.process(tokens, logits))
111
+ return Dialog::abort("engine prompt processing failed", callback);
112
+
113
+ _n_prompt += tokens.size();
114
+ _n_past += tokens.size();
115
+
116
+ if (!p_engine.updateKV(_n_past)) return Dialog::abort("context size exceeded", callback);
117
+
118
+ tokens[0] = _last_tok = sampler.process(logits);
119
+ tokens.resize(1);
120
+
121
+ _n_generated++;
122
+
123
+ _kpis.prompt.update(start.elapsed_usec());
124
+ // Log latest KPIs
125
+ _env->logger().post(Logger::KPIS, kpis().dump(" "));
126
+
127
+ if (_ctx->is_eos(_last_tok)) {
128
+ callback("", Sentence::END);
129
+ return true;
130
+ }
131
+
132
+ if (!callback(_tokenizer->decode(tokens), Sentence::BEGIN)) return true;
133
+
134
+ __DEBUG("dialog: {} : switching engines", _ctx->name());
135
+ {
136
+ // Setup cache dir for saving the engine state
137
+ std::string cache_name = _ctx->name() + "-kv-share";
138
+ fs::path cache_dir = _env->path().cache / cache_name;
139
+
140
+ if (!fs::exists(cache_dir) && !fs::create_directories(cache_dir)) {
141
+ __ERROR("dialog: {} : failed to create cache directory {}",
142
+ _ctx->name(),
143
+ cache_dir.string());
144
+ return Dialog::abort("engine switch failed", callback);
145
+ }
146
+
147
+ // Save and unload the primary engine
148
+ p_engine.save(cache_name);
149
+ p_engine.unload();
150
+
151
+ // The purpose is to save the hyperparams
152
+ s_engine.save(cache_name);
153
+
154
+ convertKV(cache_dir);
155
+
156
+ size_t n = s_engine.restore(cache_name);
157
+
158
+ if(!fs::remove_all(cache_dir)) {
159
+ __WARN("dialog: {} : cache files not closed/dir not found", _ctx->name());
160
+ }
161
+
162
+ if (n != _n_past) {
163
+ __WARN("dialog: {} : kv size mismatch {} expected {}", _ctx->name(), n, _n_past);
164
+ _n_past = n;
165
+ }
166
+
167
+ s_engine.updateKV(_n_past);
168
+ }
169
+
170
+ start.reset();
171
+
172
+ State::busy(true);
173
+
174
+ while (true) {
175
+ if (State::canceled()) {
176
+ callback("", Sentence::END);
177
+ break;
178
+ }
179
+
180
+ if (_n_past + tokens.size() > _ctx->size()) {
181
+ __WARN("Context limit exceeded ({} + {} > {})", _n_past, tokens.size(), _ctx->size());
182
+ callback("", Sentence::END);
183
+ break;
184
+ }
185
+ if (!s_engine.process(tokens, logits))
186
+ return Dialog::abort("secondary engine processing failed", callback);
187
+
188
+ tokens[0] = _last_tok = sampler.process(logits);
189
+
190
+ _n_past++;
191
+ _n_generated++;
192
+
193
+ if (!s_engine.updateKV(_n_past)) return Dialog::abort("context size exceeded", callback);
194
+
195
+ if (_ctx->is_eos(_last_tok)) {
196
+ callback("", Sentence::END);
197
+ break;
198
+ }
199
+
200
+ if (!callback(_tokenizer->decode(tokens), Sentence::CONTINUE)) break;
201
+ }
202
+
203
+ State::busy(false);
204
+
205
+ _kpis.generate.update(start.elapsed_usec());
206
+
207
+ // Log latest KPIs in a single line
208
+ _env->logger().post(Logger::KPIS, kpis().dump(" "));
209
+
210
+ return true;
211
+ }
212
+
213
+ bool KvShareDialog::convertKV(const fs::path& cache_dir) {
214
+ Timer start;
215
+
216
+ fs::path nsp_cache_path = cache_dir / "kv-cache.primary.qnn-htp";
217
+ fs::path cpu_cache_path = cache_dir / "kv-cache.secondary.qnn-cpu";
218
+
219
+ __DEBUG("kv-convert: begin converting {} to {}", nsp_cache_path.string(), cpu_cache_path.string());
220
+
221
+ std::ifstream nsp_fs(nsp_cache_path, std::ios::in | std::ios::binary);
222
+
223
+ if (nsp_fs.fail()) {
224
+ __ERROR("kv-convert: error reading file {}", nsp_cache_path.string());
225
+ State::error("failed to read primary kv-cache");
226
+ return false;
227
+ }
228
+
229
+ // Read spec from nsp file
230
+ CacheFileSpec nsp_spec;
231
+ nsp_fs.read((char*)&nsp_spec, sizeof(nsp_spec));
232
+ if (nsp_spec.magic != 0xC0DE) {
233
+ __ERROR("kv-convert: expected 0xC0DE found {:#x}", nsp_spec.magic);
234
+ State::error("invalid format of primary kv-cache");
235
+ return false;
236
+ }
237
+
238
+ // clang-format off
239
+ __DEBUG("kv-convert: load {{ num_tensors {}, magic {}, dtype {}, n_heads {}, embed_dim {} update_size {} }}",
240
+ nsp_spec.num_tensors, nsp_spec.magic, int(nsp_spec.dtype), nsp_spec.n_heads, nsp_spec.embed_dim, nsp_spec.update_size);
241
+ // clang-format on
242
+
243
+ std::fstream cpu_fs(cpu_cache_path, std::ios::in | std::ios::out | std::ios::binary);
244
+
245
+ if (cpu_fs.fail()) {
246
+ // TODO: replace with proper error handling
247
+ __ERROR("kv-convert: failed to write {}", cpu_cache_path.string());
248
+ State::error("failed to save secondary kv-cache");
249
+ return false;
250
+ }
251
+
252
+ CacheFileSpec cpu_spec;
253
+ cpu_fs.read((char*)&cpu_spec, sizeof(cpu_spec));
254
+ if (cpu_spec.magic != 0xC0DE) {
255
+ __ERROR("kv-convert: expected 0xC0DE found {:#x}", cpu_spec.magic);
256
+ State::error("invalid format of secondary kv-cache");
257
+ return false;
258
+ }
259
+
260
+ // Set the n_tokens processed during prompt processing and write the spec to the file
261
+ cpu_spec.update_size = nsp_spec.update_size;
262
+ cpu_fs.seekp(std::ios::beg);
263
+ cpu_fs.write((char*)&cpu_spec, sizeof(cpu_spec));
264
+
265
+ const uint32_t n_layer = nsp_spec.num_tensors / 2;
266
+ const uint32_t n_head = nsp_spec.n_heads;
267
+ const uint32_t kv_dim = nsp_spec.embed_dim;
268
+ const uint32_t n_tok = nsp_spec.update_size;
269
+
270
+ const size_t cache_size = n_layer * n_head * kv_dim * n_tok;
271
+
272
+ // Read Key/Value Cache
273
+ std::vector<uint8_t> key_cache(cache_size);
274
+ std::vector<uint8_t> value_cache(cache_size);
275
+ nsp_fs.read((char*)key_cache.data(), cache_size);
276
+ nsp_fs.read((char*)value_cache.data(), cache_size);
277
+
278
+ // Read Quantization parameters
279
+ std::vector<double> key_scales(n_layer);
280
+ std::vector<double> value_scales(n_layer);
281
+ nsp_fs.read((char*)key_scales.data(), n_layer * sizeof(double));
282
+ nsp_fs.read((char*)value_scales.data(), n_layer * sizeof(double));
283
+
284
+ nsp_fs.close();
285
+
286
+ // Convert and write on cpu_file
287
+ // Dequant and transpose caches
288
+ const uint32_t layer_size = n_head * kv_dim * n_tok;
289
+ const uint32_t head_size = kv_dim * n_tok;
290
+
291
+ // Transpose kvdim * n_tok (QNN-HTP K$) -> n_tok * kvdim (QNN-CPU K$)
292
+ // For ScopGPT KV$ Format
293
+ __DEBUG("kv-convert: dequantizing keys");
294
+ std::vector<float> dequant_keys(cache_size);
295
+ for (uint32_t i = 0; i < n_layer; i++) {
296
+ for (uint32_t j = 0; j < n_head; j++) {
297
+ for (uint32_t k = 0; k < kv_dim; k++) {
298
+ for (uint32_t l = 0; l < n_tok; l++) {
299
+ // Interleave K$
300
+ // QNN HTP: [0 2 4 ... 126 1 3 5 ... 127]
301
+ // QNN CPU: [0 1 2 ... 63 64 65 ... 127]
302
+ const uint32_t interleaved_k =
303
+ (2 * k < kv_dim) ? 2 * k : 2 * (k - kv_dim / 2) + 1;
304
+
305
+ const uint32_t read_loc = i * layer_size + j * head_size + k * n_tok + l;
306
+ const uint32_t write_loc = i * layer_size + j * head_size + l * kv_dim + interleaved_k;
307
+
308
+ dequant_keys[write_loc] =
309
+ (static_cast<float>(key_cache[read_loc]) - 128) * key_scales[i];
310
+ }
311
+ }
312
+ }
313
+ }
314
+
315
+ __DEBUG("kv-convert: dequantizing values");
316
+ std::vector<float> dequant_values(cache_size);
317
+ for (uint32_t i = 0; i < n_layer; i++) {
318
+ for (uint32_t j = 0; j < n_head; j++) {
319
+ for (uint32_t l = 0; l < n_tok; l++) {
320
+ for (uint32_t k = 0; k < kv_dim; k++) {
321
+ const uint32_t read_loc = i * layer_size + j * head_size + l * kv_dim + k;
322
+ const uint32_t write_loc = read_loc;
323
+
324
+ dequant_values[write_loc] =
325
+ (static_cast<float>(value_cache[read_loc]) - 128) * value_scales[i];
326
+ }
327
+ }
328
+ }
329
+ }
330
+
331
+ __DEBUG("kv-convert: storing converted KV to file");
332
+ cpu_fs.write((char *)dequant_keys.data(), dequant_keys.size() * sizeof(float));
333
+ cpu_fs.write((char *)dequant_values.data(), dequant_values.size() * sizeof(float));
334
+
335
+ cpu_fs.flush();
336
+ cpu_fs.close();
337
+
338
+ __DEBUG("kv-convert: done converting {} to {} in {} usec",
339
+ nsp_cache_path.string(),
340
+ cpu_cache_path.string(),
341
+ start.elapsed_usec());
342
+
343
+ return true;
344
+
345
+ }
346
+
347
+ // Registrator instance
348
+ static OnLoad regy([]() {
349
+ Dialog::__register(
350
+ "kv-share",
351
+ [](std::shared_ptr<Env> env, const std::string& name, const json& conf) {
352
+ return (Dialog*)new KvShareDialog(env, name, conf);
353
+ }
354
+ );
355
+ });
356
+
357
+ void needKvShareDialog() {}
358
+
359
+ } // namespace qualla
Genie/Genie/src/qualla/dialogs/lhd-dec.cpp ADDED
@@ -0,0 +1,481 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #include <qualla/dialog.hpp>
10
+ #include <qualla/logger.hpp>
11
+ #include <qualla/detail/config.hpp>
12
+ #include <qualla/detail/timer.hpp>
13
+ #include <qualla/detail/onload.hpp>
14
+ #include <qualla/detail/lhd-dialog.hpp>
15
+
16
+ #include <functional>
17
+ #include <filesystem>
18
+ #include <string>
19
+ #include <cmath>
20
+ #include <cstdio>
21
+ #include <random>
22
+
23
+ #include <fmt/format.h>
24
+ #include <fmt/ranges.h>
25
+
26
+ namespace fs = std::filesystem;
27
+
28
+ #define __INFO(__fmt, ...) _env->logger().post(Logger::INFO, fmt::format(__fmt, ##__VA_ARGS__))
29
+ #define __WARN(__fmt, ...) _env->logger().post(Logger::WARN, fmt::format(__fmt, ##__VA_ARGS__))
30
+ #define __ERROR(__fmt, ...) _env->logger().post(Logger::ERROR, fmt::format(__fmt, ##__VA_ARGS__))
31
+ #define __KPIS(__fmt, ...) \
32
+ _env->logger().post(Logger::KPIS, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); })
33
+ #define __DEBUG(__fmt, ...) \
34
+ _env->logger().post(Logger::DEBUG, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); })
35
+ #define __TRACE(__fmt, ...) \
36
+ _env->logger().post(Logger::TRACE, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); })
37
+
38
+ namespace qualla {
39
+
40
+ using qc = qualla::Config;
41
+
42
+ LhdDecDialog::LhdDecDialog(std::shared_ptr<Env> env, const std::string& name, const json& conf)
43
+ : Dialog(env, name, conf) {
44
+
45
+ _window = qc::optional<size_t>(conf, "window", 8);
46
+ _ngram = qc::optional<size_t>(conf, "ngram", 3);
47
+ _gcap = qc::optional<size_t>(conf, "gcap", 8);
48
+
49
+ _lhd_mode_str = qc::optional<std::string>(conf, "lhd-update-mode", "ALWAYS_FWD_ONE");
50
+ }
51
+
52
+ bool LhdDecDialog::process(std::vector<int32_t>& tokens, Dialog::Callback callback) {
53
+ // Check for prev failures and bail out early
54
+ if (State::failed()) return false;
55
+
56
+ Timer start;
57
+
58
+ // Vector for storing logits.
59
+ // Allocated & filled by the engine.
60
+ std::vector<float> logits;
61
+ std::vector<int32_t> resultTokens;
62
+
63
+ State::clear();
64
+
65
+ auto& sampler = *_sampler["primary"];
66
+ auto& engine = *_engine["primary"];
67
+
68
+ using FF = Engine::Feature::Flags;
69
+ if (engine.supports(FF::DYNAMIC_LOAD)) engine.load();
70
+
71
+ if (_n_past + tokens.size() > _ctx->size()) {
72
+ __WARN("Context limit exceeded ({} + {} > {})", _n_past, tokens.size(), _ctx->size());
73
+ callback("", Sentence::END);
74
+ return true;
75
+ }
76
+
77
+ if (!engine.process(tokens, logits, false))
78
+ return Dialog::abort("engine prompt processing failed", callback);
79
+
80
+ _n_prompt += tokens.size();
81
+ _n_past += tokens.size();
82
+
83
+ if (!engine.updateKV(_n_past)) return Dialog::abort("context size exceeded", callback);
84
+
85
+ std::vector<int32_t> tokens_tmp(1);
86
+ tokens_tmp[0] = _last_tok = sampler.process(logits);
87
+ resultTokens.push_back(_last_tok);
88
+
89
+ _n_generated++;
90
+
91
+ _kpis.prompt.update(start.elapsed_usec());
92
+
93
+ // Log latest KPIs
94
+ _env->logger().post(Logger::KPIS, kpis().dump(" "));
95
+
96
+ if (_ctx->is_eos(_last_tok)) {
97
+ callback("", Sentence::END);
98
+ return true;
99
+ }
100
+
101
+ // Exit condition : Prediction limit reached OR ctx size limit reached
102
+ if (!callback(_tokenizer->decode(tokens_tmp), Sentence::BEGIN)) return true;
103
+
104
+ State::busy(true);
105
+
106
+ // verification branch init
107
+ v_branch.resize(_gcap);
108
+
109
+ // n-gram pools
110
+ const size_t n_vocab = _ctx->n_vocab();
111
+ ngram_container ngrams_pool(n_vocab, _ngram, _gcap);
112
+
113
+ // lookahead branch first level init
114
+ lhd_branch.resize(_ngram - 1);
115
+ lhd_branch_prev.resize(_window);
116
+
117
+ for (int j = 0; j < _ngram - 1; j++) {
118
+ lhd_branch[j].resize(_window);
119
+
120
+ for (int i = 0; i < _window; i++) {
121
+ if (j == 0) {
122
+ // initialize with the random token from prompt
123
+ lhd_branch[j][i] = tokens[1 + rand() % (tokens.size() - 1)];
124
+ } else {
125
+ // initialize with a sequence of increasing numbers
126
+ lhd_branch[j][i] = 1000 + i;
127
+ }
128
+ }
129
+ }
130
+
131
+ // lookahead branch other level init
132
+ while (_level_idx < _ngram - 1) {
133
+
134
+ batch.clear();
135
+ attention_map.clear();
136
+
137
+ // fill the first token of the first level
138
+ batch.push_back(_last_tok);
139
+ attention_map.push_back(-1);
140
+ lhd_branch[0][0] = _last_tok;
141
+
142
+ // fill the remaining WINDOW - 1 tokens for the first level
143
+ for (int i = 1; i < _window; i++) {
144
+ batch.push_back(lhd_branch[0][i]);
145
+ attention_map.push_back(i - 1);
146
+ }
147
+
148
+ // fill the rest of the levels
149
+ for (int j = 1; j < _ngram - 1; j++) {
150
+ for (int i = 0; i < _window; i++) {
151
+ batch.push_back(lhd_branch[j][i]);
152
+ attention_map.push_back((j - 1) * _window + i);
153
+ }
154
+ }
155
+
156
+ // re-init tokens batch
157
+ tokens.resize(_window * (_ngram - 1));
158
+ tokens = batch;
159
+
160
+ if (_n_past + tokens.size() > _ctx->size()) {
161
+ __WARN("Context limit exceeded ({} + {} > {})", _n_past, tokens.size(), _ctx->size());
162
+ callback("", Sentence::END);
163
+ break;
164
+ }
165
+
166
+ size_t n_tok = engine.process(tokens, attention_map, logits, true);
167
+ if (n_tok != tokens.size())
168
+ return Dialog::abort("engine lookahead branch processing failed", callback);
169
+
170
+ for (int i = 0; i < _window; i++) {
171
+ size_t sample_tmp_idx = (_level_idx - 1) * _window + i;
172
+ // sampler from logits all
173
+ std::span<float> span_logits{logits.data(),logits.size()};
174
+ std::span<float> span_tmp = span_logits.subspan(sample_tmp_idx * n_vocab, n_vocab);
175
+ int32_t sampled_tmp_token = sampler.process(span_tmp);
176
+ lhd_branch[_level_idx][i] = sampled_tmp_token;
177
+ }
178
+
179
+ _level_idx++;
180
+ }
181
+
182
+ if (_lhd_mode_str == "FWD_MAX_HIT")
183
+ _lhd_update_mode = FWD_MAX_HIT;
184
+ else if (_lhd_mode_str == "FWD_LEVEL")
185
+ _lhd_update_mode = FWD_LEVEL;
186
+ else
187
+ _lhd_update_mode = ALWAYS_FWD_ONE;
188
+
189
+ start.reset();
190
+
191
+ while (true) {
192
+ if (State::canceled()) {
193
+ callback("", Sentence::END);
194
+ break;
195
+ }
196
+ // input batch init
197
+ {
198
+ batch.clear();
199
+ attention_map.clear();
200
+
201
+ // fill the first token of the first level
202
+ batch.push_back(_last_tok);
203
+ attention_map.push_back(-1);
204
+ // lhd_branch[0][0] = _last_tok;
205
+
206
+ // fill the remaining WINDOW - 1 tokens for the first level
207
+ for (int i = 1; i < _window; i++) {
208
+ batch.push_back(lhd_branch[0][i]);
209
+ attention_map.push_back(i - 1);
210
+ }
211
+
212
+ // fill the rest of the levels
213
+ for (int j = 1; j < _ngram - 1; j++) {
214
+ for (int i = 0; i < _window; i++) {
215
+ batch.push_back(lhd_branch[j][i]);
216
+ attention_map.push_back((j - 1) * _window + i);
217
+ }
218
+ }
219
+
220
+ // build verification n-grams(branch)
221
+ {
222
+ const int g_cur = ngrams_pool.cnt[_last_tok];
223
+
224
+ v_branch.resize(g_cur);
225
+ // input_token_batch.size = (_window + g_cur) * (_ngram - 1);
226
+ tokens.resize((_window + g_cur) * (_ngram - 1));
227
+ for (int g = 0; g < g_cur; g++) {
228
+ v_branch[g].active = true;
229
+ v_branch[g].tokens.resize(_ngram);
230
+ v_branch[g].i_batch.resize(_ngram);
231
+ v_branch[g].seq_id = _window + 1 + g;
232
+ v_branch[g].i_batch[0] = 0;
233
+ v_branch[g].tokens[0] = _last_tok;
234
+ }
235
+
236
+ for (int j = 0; j < _ngram - 1; j++) {
237
+ for (int g = 0; g < g_cur; g++) {
238
+ const int idx = _last_tok * (_ngram - 1) * _gcap + g * (_ngram - 1);
239
+ const int32_t t = ngrams_pool.tokens[idx + j];
240
+ v_branch[g].tokens[j + 1] = t;
241
+ v_branch[g].i_batch[j + 1] = j + 1;
242
+ }
243
+ }
244
+
245
+ for (int g = 0; g < g_cur; g++) {
246
+ for (int j = 0; j < _ngram - 1; j++) {
247
+ batch.push_back(v_branch[g].tokens[j + 1]);
248
+ if (j == 0)
249
+ attention_map.push_back(0);
250
+ else
251
+ attention_map.push_back(batch.size() - 2);
252
+ }
253
+ }
254
+ }
255
+ }
256
+
257
+ // re-init tokens batch
258
+ std::vector<bool> selected(attention_map.size(), false);
259
+ tokens = batch;
260
+
261
+ if (_n_past + tokens.size() > _ctx->size()) {
262
+ __WARN("Context limit exceeded ({} + {} > {})", _n_past, tokens.size(), _ctx->size());
263
+ callback("", Sentence::END);
264
+ break;
265
+ }
266
+
267
+ size_t n_tok = engine.process(tokens, attention_map, logits, true);
268
+ if (n_tok != tokens.size()) return Dialog::abort("engine gen processing failed", callback);
269
+
270
+ // verification branch seq-id
271
+ size_t seq_id_best = 0;
272
+ // max hit pos
273
+ size_t i_batch_best = 0;
274
+
275
+ // Lookahead decoding and verification
276
+ for (int v = 0; v < _ngram; ++v) {
277
+ int i_batch = 0;
278
+
279
+ if (v > 0) {
280
+ for (int g = 0; g < (int)v_branch.size(); g++) {
281
+ // record the best matched seq and pos
282
+ if (v_branch[g].active) {
283
+ i_batch = v_branch[g].i_batch[v];
284
+ i_batch_best = i_batch;
285
+ seq_id_best = v_branch[g].seq_id;
286
+ ++_n_accept;
287
+ break;
288
+ }
289
+ }
290
+
291
+ if (i_batch == 0) {
292
+ break;
293
+ }
294
+ }
295
+
296
+ size_t sample_idx;
297
+ if (seq_id_best == 0)
298
+ sample_idx = 0;
299
+ else
300
+ sample_idx = _window * (_ngram - 1) + (seq_id_best - (_window + 1)) * (_ngram - 1) +
301
+ i_batch - 1;
302
+
303
+ //vector selected set
304
+ selected[sample_idx] = true;
305
+
306
+ // sampler from logits all
307
+ std::span<float> span_logits{logits.data(),logits.size()};
308
+ std::span<float> sample_logit = span_logits.subspan(sample_idx * n_vocab, n_vocab);
309
+ _last_tok = sampler.process(sample_logit);
310
+
311
+ std::vector<int32_t> tokens_tmp(1);
312
+ tokens_tmp[0] = _last_tok;
313
+
314
+ resultTokens.push_back(_last_tok);
315
+ _n_generated++;
316
+ _n_past++;
317
+
318
+ if (_ctx->is_eos(_last_tok)) break;
319
+
320
+ if (!callback(_tokenizer->decode(tokens_tmp), Sentence::CONTINUE)) return true;
321
+
322
+ // if verification passes, check the next sampled token until verification fails
323
+ for (int g = 0; g < (int)v_branch.size(); g++) {
324
+ // update the n-gram active status
325
+ if (v_branch[g].active) {
326
+ if (v == _ngram - 1) {
327
+ v_branch[g].active = false;
328
+ } else {
329
+ if (_last_tok != v_branch[g].tokens[v + 1]) {
330
+ v_branch[g].active = false;
331
+ }
332
+ }
333
+ }
334
+ }
335
+
336
+ // update lookahead tokens when v=0 OR verify match
337
+ {
338
+ for (int i = 0; i < _window; i++) {
339
+ lhd_branch_prev[i] = lhd_branch[0][i];
340
+ }
341
+
342
+ if (v == 0) {
343
+ for (int j = 0; j < _ngram - 2; j++) {
344
+ lhd_branch[j] = lhd_branch[j + 1];
345
+ }
346
+
347
+ // sample from the last level
348
+ for (int i = 0; i < _window; i++) {
349
+ size_t sample_idx = (_ngram - 2) * _window + i;
350
+ std::span<float> sample_logit =
351
+ span_logits.subspan(sample_idx * n_vocab, n_vocab);
352
+ lhd_branch[_ngram - 2][i] = sampler.process(sample_logit);
353
+ }
354
+ } else {
355
+ if (_lhd_update_mode == FWD_MAX_HIT) {
356
+ // update lookahead branch by forwarding
357
+ for (int j = 0; j < _ngram - 1; j++) {
358
+ for (int i = 0; i < _window - v; i++) {
359
+ lhd_branch[j][i] = lhd_branch[j][i + 1];
360
+ }
361
+ }
362
+ } else if (_lhd_update_mode == FWD_LEVEL) {
363
+ // update lookahead branch by shifting level
364
+ for (int j = 0; j < _ngram - 2; j++) {
365
+ lhd_branch[j] = lhd_branch[j + 1];
366
+ }
367
+
368
+ for (int i = 0; i < _window; i++) {
369
+ // init from the previous level
370
+ lhd_branch[_ngram - 2][i] = lhd_branch[0][i];
371
+ }
372
+ }
373
+ }
374
+ }
375
+
376
+ // update n-grams pool
377
+ // only update n-grams pools when v=0
378
+ if (v == 0) {
379
+ std::vector<int32_t> ngram(_ngram - 1);
380
+ // n-gram pool generation
381
+ for (int f = 0; f < _window; ++f) {
382
+ const int ft = lhd_branch_prev[f]; // first token of the n-gram
383
+
384
+ for (int j = 0; j < _ngram - 1; ++j) {
385
+ ngram[j] = lhd_branch[j][f];
386
+ }
387
+
388
+ // filter-out repeating n-grams
389
+ {
390
+ bool is_unique = true;
391
+
392
+ for (int k = 0; k < ngrams_pool.cnt[ft]; ++k) {
393
+ // calculate the related idx by the first n-gram token
394
+ const int idx = ft * (_ngram - 1) * _gcap + k * (_ngram - 1);
395
+
396
+ bool is_match = true;
397
+ for (int j = 0; j < _ngram - 1; ++j) {
398
+ if (ngrams_pool.tokens[idx + j] != ngram[j]) {
399
+ is_match = false;
400
+ break;
401
+ }
402
+ }
403
+
404
+ // if n-gram match all, discard one of them
405
+ if (is_match) {
406
+ is_unique = false;
407
+ break;
408
+ }
409
+ }
410
+
411
+ if (!is_unique) {
412
+ continue;
413
+ }
414
+ }
415
+
416
+ const int head = ngrams_pool.head[ft];
417
+ const int idx = ft * (_ngram - 1) * _gcap + head * (_ngram - 1);
418
+
419
+ for (int i = 0; i < _ngram - 1; i++) {
420
+ // update the n-gram pool with new n-gram
421
+ ngrams_pool.tokens[idx + i] = ngram[i];
422
+ }
423
+
424
+ ngrams_pool.cnt[ft] = std::min(_gcap, ngrams_pool.cnt[ft] + 1);
425
+ ngrams_pool.head[ft] = (head + 1) % _gcap;
426
+
427
+ ngrams_pool.n_total++;
428
+ }
429
+ }
430
+ }
431
+
432
+ if (_lhd_update_mode == FWD_MAX_HIT) {
433
+ // std::random_device rd;
434
+ // std::mt19937 gen(rd());
435
+ // std::uniform_int_distribution<> dis(0, resultTokens.size() - 1);
436
+
437
+ // fill lookahead branch
438
+ for (int i = 0; i < _ngram - 1; i++) {
439
+ for (int j = _window - i_batch_best; j < _window; j++) {
440
+ lhd_branch[i][j] = resultTokens[1 + rand() % (resultTokens.size() - 1)];
441
+ // lhd_branch[i][j] = resultTokens[dis(gen)];
442
+ // std::cout << "Fill token = " << lhd_branch[i][j] << std::endl;
443
+ }
444
+ }
445
+ }
446
+
447
+ // KV cache management
448
+ if (!engine.updateKV(_n_past, selected))
449
+ return Dialog::abort("context size exceeded", callback);
450
+
451
+ if (_ctx->is_eos(_last_tok)) {
452
+ callback("", Sentence::END);
453
+ break;
454
+ }
455
+ }
456
+
457
+ State::busy(false);
458
+
459
+ _kpis.generate.update(start.elapsed_usec());
460
+
461
+ // Log latest KPIs in a single line
462
+ _env->logger().post(Logger::KPIS, kpis().dump(" "));
463
+ std::cout << std::endl << std::endl << std::flush;
464
+ __DEBUG("lhd-dec: n_generated = {} ---------- n_accept = {}", _n_generated, _n_accept);
465
+
466
+ return !State::failed();
467
+ }
468
+
469
+ // Registrator instance
470
+ static OnLoad regy([]() {
471
+ Dialog::__register(
472
+ "lhd-dec",
473
+ [](std::shared_ptr<Env> env, const std::string& name, const json& conf) {
474
+ return (Dialog*)new LhdDecDialog(env, name, conf);
475
+ }
476
+ );
477
+ });
478
+
479
+ void needLadeDialog() {}
480
+
481
+ } // namespace qualla
Genie/Genie/src/qualla/dialogs/multistream.cpp ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #include <qualla/dialog.hpp>
10
+ #include <qualla/logger.hpp>
11
+ #include <qualla/detail/config.hpp>
12
+ #include <qualla/detail/timer.hpp>
13
+ #include <qualla/detail/onload.hpp>
14
+ #include <qualla/detail/multistream-dialog.hpp>
15
+ #include <qualla/detail/sampler-utils.hpp>
16
+
17
+ #include <functional>
18
+ #include <filesystem>
19
+ #include <string>
20
+
21
+ #include <fmt/format.h>
22
+ #include <fmt/ranges.h>
23
+
24
+ namespace fs = std::filesystem;
25
+
26
+ #define __INFO(__fmt, ...) _env->logger().post(Logger::INFO, fmt::format(__fmt, ##__VA_ARGS__))
27
+ #define __WARN(__fmt, ...) _env->logger().post(Logger::WARN, fmt::format(__fmt, ##__VA_ARGS__))
28
+ #define __ERROR(__fmt, ...) _env->logger().post(Logger::ERROR, fmt::format(__fmt, ##__VA_ARGS__))
29
+ #define __KPIS(__fmt, ...) \
30
+ _env->logger().post(Logger::KPIS, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); })
31
+ #define __DEBUG(__fmt, ...) \
32
+ _env->logger().post(Logger::DEBUG, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); })
33
+ #define __TRACE(__fmt, ...) \
34
+ _env->logger().post(Logger::TRACE, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); })
35
+
36
+ namespace qualla {
37
+
38
+ bool MultiStreamDialog::processFollowOnGeneration(std::vector<std::vector<int32_t>>& streams, std::vector<float>& logits, Dialog::Callback callback) {
39
+
40
+ auto& sampler = *_sampler["primary"];
41
+ auto& engine = *_engine["primary"];
42
+
43
+ std::vector<std::vector<int32_t>> attention_mask(_n_streams);
44
+ std::vector<int32_t> streamIndices;
45
+
46
+ if (streams.size() == 0) {
47
+ callback("\n", Sentence::END);
48
+ return true;
49
+ }
50
+
51
+ for (int i = 0; i < streams.size(); i++) {
52
+ // Initialize all attention_masks to attend to all previous tokens
53
+ attention_mask[i].resize(_n_past, 1);
54
+ streamIndices.push_back(i);
55
+ }
56
+
57
+ State::busy(true);
58
+
59
+ while (true) {
60
+ if (State::canceled()) break;
61
+
62
+ // If this exceeds context length, truncate all streams and return
63
+ if (_n_past + streamIndices.size() > _ctx->size()) {
64
+ for (auto stream : streamIndices)
65
+ callback(_tokenizer->decode(streams[stream]) + "\n", Sentence::CONTINUE);
66
+ break;
67
+ }
68
+
69
+ // Accumulate input tokens from all streams
70
+ std::vector<int32_t> multi_tokens(streamIndices.size());
71
+
72
+ for (int i = 0; i < streamIndices.size(); i++) {
73
+ multi_tokens[i] = streams[streamIndices[i]].back();
74
+
75
+ // Also add current iteration to the attention_mask
76
+ for (auto _mask_row : streamIndices)
77
+ // Set to true iff on diagonal, i.e. attend to itself
78
+ attention_mask[streamIndices[i]].push_back((streamIndices[i] == _mask_row) ? 1 : 0);
79
+ }
80
+
81
+ // Concatenate attention_mask for all active streams
82
+ std::vector<int32_t> multi_attn_mask;
83
+ multi_attn_mask.reserve((_n_past + streamIndices.size()) * streamIndices.size());
84
+ for (auto i : streamIndices)
85
+ multi_attn_mask.insert(
86
+ multi_attn_mask.end(),
87
+ attention_mask[i].begin(),
88
+ attention_mask[i].end()
89
+ );
90
+
91
+ // __DEBUG("Multi attention mask = {}", multi_attn_mask);
92
+
93
+ if (m_inputType == InputType::TOKENS) {
94
+ // Process input tokens for all streams in one batch
95
+ if (!engine.process(multi_tokens, multi_attn_mask, logits, true))
96
+ return Dialog::abort("engine gen processing failed", callback);
97
+ } else if (m_inputType == InputType::EMBEDDINGS) {
98
+ // Accumulate input embeddings from all streams
99
+ auto embedBufSize = engine.getEmbeddingBufferSize();
100
+ std::vector<uint8_t> multi_embeddings;
101
+
102
+ for (auto token : multi_tokens) {
103
+ // Convert tokens to embedding for the processing in the engine.
104
+ std::vector<uint8_t> curTokenEmbedding(embedBufSize, 0);
105
+ m_t2eCallback(token, curTokenEmbedding.data(), embedBufSize);
106
+ multi_embeddings.insert(multi_embeddings.end(), curTokenEmbedding.begin(), curTokenEmbedding.end());
107
+ }
108
+
109
+ // Process input tokens for all streams in one batch
110
+ if (!engine.process(multi_embeddings, multi_attn_mask, logits, true))
111
+ return Dialog::abort("engine gen processing failed", callback);
112
+ }
113
+
114
+ // Process all logits independently
115
+ std::span<float> logit_span = std::span{logits.data(),logits.size()};
116
+ for (int i = 0; i < streamIndices.size(); i++) {
117
+ _last_tok = sampler.process(logit_span.subspan(i * _vocab, _vocab));
118
+ streams[streamIndices[i]].push_back(_last_tok);
119
+ }
120
+
121
+ _n_past += streamIndices.size();
122
+ _n_generated += streamIndices.size();
123
+
124
+ if (!engine.updateKV(_n_past)) return Dialog::abort("context size exceeded", callback);
125
+
126
+ for (auto it = streamIndices.begin(); it != streamIndices.end();) {
127
+ int32_t stream = *it;
128
+ if (_ctx->is_eos(streams[stream].back())) {
129
+ callback(_tokenizer->decode(streams[stream]) + "\n", Sentence::CONTINUE);
130
+ it = streamIndices.erase(it);
131
+ } else {
132
+ ++it;
133
+ }
134
+ }
135
+
136
+ if (streamIndices.size() == 0) break;
137
+ }
138
+ callback("\n", Sentence::END);
139
+
140
+ State::busy(false);
141
+
142
+ return true;
143
+ }
144
+
145
+ bool MultiStreamDialog::process(std::vector<int32_t>& tokens, Dialog::Callback callback) {
146
+ // Check for prev failures and bail out early
147
+ if (State::failed()) return false;
148
+
149
+ Timer start;
150
+
151
+ if(m_inputType != InputType::TOKENS) {
152
+ __ERROR("Input type for model is not tokens.");
153
+ return false;
154
+ }
155
+
156
+ // Vector for storing logits.
157
+ // Allocated & filled by the engine.
158
+ std::vector<float> logits;
159
+
160
+ State::clear();
161
+
162
+ auto& engine = *_engine["primary"];
163
+
164
+ using FF = Engine::Feature::Flags;
165
+ if (engine.supports(FF::DYNAMIC_LOAD)) engine.load();
166
+
167
+ if (_n_past + tokens.size() > _ctx->size()) {
168
+ __WARN("Context limit exceeded ({} + {} > {})", _n_past, tokens.size(), _ctx->size());
169
+ callback("", Sentence::END);
170
+ return true;
171
+ }
172
+
173
+ if (!engine.process(tokens, logits, false))
174
+ return Dialog::abort("engine prompt processing failed", callback);
175
+
176
+ _n_prompt += tokens.size();
177
+ _n_past += tokens.size();
178
+
179
+ _prompt_len = _n_past;
180
+
181
+ if (!engine.updateKV(_n_past)) return Dialog::abort("context size exceeded", callback);
182
+
183
+ std::vector<std::vector<int32_t>> streams;
184
+ getTopK(logits, streams, _n_streams, _p_threshold, callback);
185
+
186
+ _n_generated += streams.size();
187
+ _kpis.prompt.update(start.elapsed_usec());
188
+
189
+ // Log latest KPIs
190
+ _env->logger().post(Logger::KPIS, kpis().dump(" "));
191
+
192
+ start.reset();
193
+
194
+ bool status = processFollowOnGeneration(streams, logits, callback);
195
+
196
+ _kpis.generate.update(start.elapsed_usec());
197
+
198
+ // Log latest KPIs in a single line
199
+ _env->logger().post(Logger::KPIS, kpis().dump(" "));
200
+
201
+ return status;
202
+ }
203
+
204
+ bool MultiStreamDialog::process(
205
+ std::vector<uint8_t>& embedding_vectors,
206
+ T2ECallback t2eCallback,
207
+ Dialog::Callback callback
208
+ ) {
209
+ // Check for prev failures and bail out early
210
+ if (State::failed()) return false;
211
+
212
+ Timer start;
213
+
214
+ if(m_inputType != InputType::EMBEDDINGS) {
215
+ __ERROR("Input type for model is not embeddings.");
216
+ return false;
217
+ }
218
+
219
+ // Vector for storing logits.
220
+ // Allocated & filled by the engine.
221
+ std::vector<float> logits;
222
+
223
+ State::clear();
224
+
225
+ auto& sampler = *_sampler["primary"];
226
+ auto& engine = *_engine["primary"];
227
+
228
+ // Store the t2e callback for reference during follow-on generation.
229
+ m_t2eCallback = t2eCallback;
230
+
231
+ size_t embedBufSize = engine.getEmbeddingBufferSize();
232
+
233
+ {
234
+ std::vector<uint8_t> eosEmbedding(embedBufSize, 0.0);
235
+ if (m_t2eCallback) {
236
+ m_t2eCallback(_ctx->eos(), eosEmbedding.data(), embedBufSize);
237
+ }
238
+ // For non-autogenerative usecases (where t2eCallback is not supplied),
239
+ // the EOS vector is all zero. This is fine for models with proper
240
+ // attention masking support, but may degrade accuracy otherwise.
241
+ if (!engine.cacheEosEmbedding(eosEmbedding)) {
242
+ __DEBUG("Failed to set the eos token embedding.");
243
+ return false;
244
+ }
245
+ }
246
+
247
+ using FF = Engine::Feature::Flags;
248
+ if (engine.supports(FF::DYNAMIC_LOAD)) engine.load();
249
+
250
+ size_t curTokenCount = embedding_vectors.size() / embedBufSize;
251
+ if (_n_past + curTokenCount > _ctx->size()) {
252
+ __WARN("Context limit exceeded ({} + {} > {})", _n_past, curTokenCount, _ctx->size());
253
+ callback("", Sentence::END);
254
+ return true;
255
+ }
256
+
257
+ if (!engine.process(embedding_vectors, {}, logits))
258
+ return Dialog::abort("engine prompt processing failed", callback);
259
+
260
+ _n_prompt += curTokenCount;
261
+ _n_past += curTokenCount;
262
+
263
+ _prompt_len = _n_past;
264
+
265
+ if (!engine.updateKV(_n_past)) return Dialog::abort("context size exceeded", callback);
266
+
267
+ std::vector<std::vector<int32_t>> streams;
268
+ getTopK(logits, streams, _n_streams, _p_threshold, callback);
269
+
270
+ _n_generated += streams.size();
271
+ _kpis.prompt.update(start.elapsed_usec());
272
+
273
+ // Log latest KPIs
274
+ _env->logger().post(Logger::KPIS, kpis().dump(" "));
275
+
276
+ start.reset();
277
+
278
+ bool status = processFollowOnGeneration(streams, logits, callback);
279
+
280
+ _kpis.generate.update(start.elapsed_usec());
281
+
282
+ // Log latest KPIs in a single line
283
+ _env->logger().post(Logger::KPIS, kpis().dump(" "));
284
+
285
+ return status;
286
+ }
287
+
288
+ // Registrator instance
289
+ static OnLoad regy([]() {
290
+ Dialog::__register(
291
+ "multistream",
292
+ [](std::shared_ptr<Env> env, const std::string& name, const json& conf) {
293
+ return (Dialog*)new MultiStreamDialog(env, name, conf);
294
+ }
295
+ );
296
+ });
297
+
298
+ void needMultistreamDialog() {}
299
+
300
+ } // namespace qualla
Genie/Genie/src/qualla/dialogs/spec-dec.cpp ADDED
@@ -0,0 +1,458 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All rights reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #include <qualla/dialog.hpp>
10
+ #include <qualla/sampler.hpp>
11
+ #include <qualla/logger.hpp>
12
+ #include <qualla/detail/config.hpp>
13
+ #include <qualla/detail/timer.hpp>
14
+ #include <qualla/detail/onload.hpp>
15
+ #include <qualla/detail/sampler-utils.hpp>
16
+ #include <qualla/detail/basic-sampler.hpp>
17
+
18
+ #include <functional>
19
+ #include <fstream>
20
+ #include <string>
21
+ #include <unordered_map>
22
+ #include <filesystem>
23
+ #include <random>
24
+ #include <thread>
25
+
26
+ #include <fmt/format.h>
27
+ #include <fmt/ranges.h>
28
+
29
+ namespace fs = std::filesystem;
30
+
31
+ #define __INFO(__fmt, ...) _env->logger().post(Logger::INFO, fmt::format(__fmt, ##__VA_ARGS__))
32
+ #define __WARN(__fmt, ...) _env->logger().post(Logger::WARN, fmt::format(__fmt, ##__VA_ARGS__))
33
+ #define __ERROR(__fmt, ...) _env->logger().post(Logger::ERROR, fmt::format(__fmt, ##__VA_ARGS__))
34
+ #define __KPIS(__fmt, ...) \
35
+ _env->logger().post(Logger::KPIS, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); })
36
+ #define __DEBUG(__fmt, ...) \
37
+ _env->logger().post(Logger::DEBUG, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); })
38
+ #define __TRACE(__fmt, ...) \
39
+ _env->logger().post(Logger::TRACE, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); })
40
+
41
+ namespace qualla {
42
+
43
+ using qc = qualla::Config;
44
+
45
+ class SpecDecDialog : public Dialog {
46
+ public:
47
+ SpecDecDialog(std::shared_ptr<Env> env, const std::string& name, const json& conf);
48
+
49
+ virtual bool process(std::vector<int32_t>& tokens, Dialog::Callback callback) override;
50
+
51
+ virtual bool process(std::vector<int32_t>& tokens, DialogCallback callback) override {
52
+ return false;
53
+ }
54
+
55
+ private:
56
+ int32_t _draft_len; // Number of draft tokens
57
+ bool _parallel; // Enable parallel processing (where possible)
58
+
59
+ Sampler& _d_sampler; // Draft sampler
60
+ Sampler& _t_sampler; // Target sampler
61
+
62
+ // Token acceptor, called for each accepted token.
63
+ // Returns true to continue, false to stop
64
+ using Acceptor = std::function<bool(int32_t token)>;
65
+
66
+ // Rejection sampling.
67
+ // Returns number of accepted tokens
68
+ size_t rejectionSampling(
69
+ std::span<int32_t> tokens,
70
+ std::span<float> target_logits,
71
+ std::span<float> draft_probs,
72
+ Acceptor accept
73
+ );
74
+
75
+ int32_t sampleFromModifiedDist(std::span<float> src0_dst, std::span<float> src1);
76
+ };
77
+
78
+ SpecDecDialog::SpecDecDialog(std::shared_ptr<Env> env, const std::string& name, const json& conf)
79
+ : Dialog(env, name, conf),
80
+ _d_sampler(_sampler.contains("draft") ? *_sampler["draft"] : *_sampler["target"]),
81
+ _t_sampler(*_sampler["target"]) {
82
+
83
+ _draft_len = qc::optional<int32_t>(conf, "draft-len", 3);
84
+ _parallel = qc::optional<bool>(conf, "parallel", false);
85
+
86
+ // Check all underlying components for correct types an config
87
+ // If something is not right we set our error state that can be checked later
88
+
89
+ if (!_sampler.contains("target")) {
90
+ State::fatal("\"target\" sampler not present in config!");
91
+ return;
92
+ }
93
+
94
+ if (!_engine.contains("target")) {
95
+ State::fatal("\"target\" engine not present in config!");
96
+ return;
97
+ }
98
+ if (!_engine.contains("draft")) {
99
+ State::fatal("\"draft\" engine not present in config!");
100
+ return;
101
+ }
102
+ }
103
+
104
+ int32_t SpecDecDialog::sampleFromModifiedDist(std::span<float> src0_dst, std::span<float> src1) {
105
+ // [max(prob_target[x] - prob_draft[x], 0.f) for all x in vocab]
106
+ size_t size = src0_dst.size();
107
+
108
+ if (_t_sampler.gumbel()) {
109
+ // Avoid going in the denormal zone.
110
+ float tiny = 1.1754943508222875e-38;
111
+
112
+ #pragma clang loop vectorize(enable) unroll_count(4)
113
+ for (size_t i = 0U; i < size; i++) {
114
+ float p_src0 = std::exp(src0_dst[i]);
115
+ float p_src1 = std::exp(src1[i]);
116
+ src0_dst[i] = std::log(std::max(tiny, p_src0 - p_src1));
117
+ }
118
+
119
+ // NOTE: The output logps_target is unnormalized since we use Gumbel trick.
120
+ // If we use standard multinomial sampling, normalization should be added.
121
+
122
+ } else {
123
+ float sum = 0.0; // Unlikely to overflow (?)
124
+ #pragma clang loop vectorize(enable) unroll_count(4)
125
+ for (size_t i = 0U; i < size; i++) {
126
+ float num = std::max(0.f, src0_dst[i] - src1[i]);
127
+ sum += num;
128
+ src0_dst[i] = num;
129
+ }
130
+ // Normalize
131
+ #pragma clang loop vectorize(enable) unroll_count(4)
132
+ for (size_t i = 0U; i < size; i++) {
133
+ src0_dst[i] /= sum;
134
+ }
135
+ }
136
+
137
+ if (_t_sampler.greedy()) return argmax(src0_dst);
138
+
139
+ if (_t_sampler.gumbel()) return sampleUsingGumbelMax(src0_dst, _t_sampler.rng());
140
+
141
+ // Skipping softmax since the probs are already normalized
142
+ return sampleFromProbs(src0_dst, _t_sampler.rng());
143
+ }
144
+
145
+ size_t SpecDecDialog::rejectionSampling(
146
+ std::span<int32_t> tokens,
147
+ std::span<float> target_logits,
148
+ std::span<float> draft_probs,
149
+ Acceptor accept
150
+ ) {
151
+ const size_t n_vocab = _ctx->n_vocab();
152
+ const size_t n_tok = tokens.size();
153
+
154
+ assert(tokens.size() == draft_probs.size() / n_vocab);
155
+ assert(target_logits.size() == draft_probs.size() + n_vocab);
156
+
157
+ // Rejection sampling:
158
+ // For each token in the n_tok tokens sampled from the draft model:
159
+ // 1. Determine the probability of that token being accepted by the target model
160
+ // 2. Accept the token with probability = prob_target[tok] / prob_draft[tok] (clamped to [0, 1])
161
+ // 3. If the token is rejected, resample a new token from the following distribution:
162
+ // [max(prob_target[x] - prob_draft[x], 0.f) for all x in vocab]
163
+ int32_t t_tok;
164
+ size_t n_accepted = 0;
165
+
166
+ std::vector<float> target_probs;
167
+
168
+ for (int32_t i = 0; i < n_tok; i++) {
169
+ int32_t d_tok = tokens[i];
170
+
171
+ std::span<float> t_span = target_logits.subspan(i * n_vocab, n_vocab);
172
+
173
+ if (_t_sampler.greedy()) {
174
+ t_tok = _t_sampler.process(t_span);
175
+ if (t_tok != d_tok) {
176
+ // Reject
177
+ break;
178
+ }
179
+ } else {
180
+ target_probs.clear();
181
+ t_tok = _t_sampler.process(t_span, target_probs, false); // only probs, no token
182
+
183
+ // Acceptance threshold
184
+ double threshold;
185
+ float prob_draft = draft_probs[i * n_vocab + d_tok];
186
+ float prob_target = target_probs[d_tok];
187
+
188
+ if (_t_sampler.gumbel()) {
189
+ threshold = std::exp(double(prob_target) - double(prob_draft));
190
+ } else {
191
+ threshold = double(prob_target) / double(prob_draft);
192
+ }
193
+
194
+ double r = sampleFromUniform(_t_sampler.rng());
195
+ if (r > threshold) {
196
+ // Reject
197
+ break;
198
+ }
199
+ }
200
+ // Accepted!
201
+ ++n_accepted;
202
+ if (!accept(d_tok)) return n_accepted;
203
+ }
204
+
205
+ // Sample an extra token either from the target distribution or the modified distribution
206
+ if (n_accepted == n_tok) {
207
+ t_tok = _t_sampler.process(target_logits.subspan(n_tok * n_vocab));
208
+ } else if (!_t_sampler.greedy()) {
209
+ // Resample from modified distribution.
210
+ t_tok = sampleFromModifiedDist(
211
+ std::span{target_probs.data(),target_probs.size()}, draft_probs.subspan(n_accepted * n_vocab, n_vocab)
212
+ );
213
+ } // for greedy, t_tok should be already valid from the loop above
214
+
215
+ ++n_accepted;
216
+ accept(t_tok);
217
+
218
+ return n_accepted;
219
+ }
220
+
221
+ bool SpecDecDialog::process(std::vector<int32_t>& tokens, Dialog::Callback callback) {
222
+
223
+ // Check for prev failures and bail out early
224
+ if (State::failed()) return false;
225
+
226
+ Timer start;
227
+
228
+ const size_t n_vocab = _ctx->n_vocab();
229
+
230
+ // Vector for storing logits.
231
+ // Allocated & filled by the engine.
232
+ std::vector<float> t_logits;
233
+ std::vector<float> d_logits;
234
+
235
+ bool keep_generating = true;
236
+
237
+ // A buffer for tokens to be decoded (one at a time, per the Middleware's request)
238
+ std::vector<int32_t> decode_buf(1, 0);
239
+
240
+ // Decode new token.
241
+ // Return true to continue generation, and false otherwise
242
+ auto decode_token = [&](int32_t t) {
243
+ decode_buf[0] = _last_tok = t;
244
+
245
+ if (_ctx->is_eos(t)) {
246
+ keep_generating = false;
247
+ callback("", Sentence::END);
248
+ } else {
249
+ keep_generating = callback(_tokenizer->decode(decode_buf), Sentence::CONTINUE);
250
+ }
251
+
252
+ return keep_generating;
253
+ };
254
+
255
+ State::clear();
256
+
257
+ auto& t_engine = *_engine["target"];
258
+ auto& d_engine = *_engine["draft"];
259
+
260
+ if (_n_past + tokens.size() > _ctx->size()) {
261
+ __WARN("Context limit exceeded ({} + {} > {})", _n_past, tokens.size(), _ctx->size());
262
+ callback("", Sentence::END);
263
+ return true;
264
+ }
265
+
266
+ // Step 0: Process the prompt both on the target and draft models.
267
+ bool d_pmpt, t_pmpt;
268
+ if (_parallel) {
269
+ std::thread dt([&]() { d_pmpt = d_engine.process(tokens, d_logits, false); });
270
+ std::thread tt([&]() { t_pmpt = t_engine.process(tokens, t_logits, false); });
271
+ dt.join();
272
+ tt.join();
273
+ } else {
274
+ d_pmpt = d_engine.process(tokens, d_logits, false);
275
+ t_pmpt = t_engine.process(tokens, t_logits, false);
276
+ }
277
+
278
+ if (!d_pmpt) return Dialog::abort("draft engine prompt processing failed", callback);
279
+ if (!t_pmpt) return Dialog::abort("target engine prompt processing failed", callback);
280
+
281
+ // KV state Update
282
+ _n_prompt += tokens.size();
283
+ _n_past += tokens.size();
284
+
285
+ if (!t_engine.updateKV(_n_past)) return Dialog::abort("target context size exceeded", callback);
286
+ if (!d_engine.updateKV(_n_past)) return Dialog::abort("draft context size exceeded", callback);
287
+
288
+ // Sample one token from the target.
289
+ _last_tok = _t_sampler.process(t_logits);
290
+
291
+ _kpis.prompt.update(start.elapsed_usec());
292
+
293
+ // Log latest KPIs
294
+ _env->logger().post(Logger::KPIS, kpis().dump(" "));
295
+
296
+ if (!decode_token(_last_tok)) return true;
297
+
298
+ // Done with the prompt, start generating
299
+ start.reset();
300
+ State::busy(true);
301
+
302
+ // Buffers for all the tokens that need to be considered for each iteration
303
+ std::vector<int32_t> toks_to_target(_draft_len + 1);
304
+ std::vector<int32_t> toks_to_draft(2);
305
+
306
+ // Buffer for all the probability distributions from the draft sampler
307
+ std::vector<float> d_probs(n_vocab * _draft_len);
308
+
309
+ toks_to_target.assign(1, _last_tok);
310
+ toks_to_draft.assign(1, _last_tok);
311
+
312
+ // For keeping track of the number of tokens that were accepted in each iteration.
313
+ std::vector<int32_t> n_accepted_counts(_draft_len + 1, 0);
314
+
315
+ // Draft n_past, either in sync with n_past or one token behind (accepted-all)
316
+ size_t d_n_past = _n_past;
317
+
318
+ while (!State::canceled() && keep_generating) {
319
+ // Step 1: Use draft model to decode draft_len (aka gamma) tokens, and accumulate probabilities
320
+ d_probs.clear();
321
+
322
+ for (int32_t i = 0; i < _draft_len; i++) {
323
+ if (d_n_past + toks_to_draft.size() > _ctx->size()) {
324
+ __WARN("Context limit exceeded ({} + {} > {})",
325
+ d_n_past,
326
+ toks_to_target.size(),
327
+ _ctx->size());
328
+ _kpis.generate.update(start.elapsed_usec());
329
+
330
+ // Log latest KPIs in a single line
331
+ _env->logger().post(Logger::KPIS, kpis().dump(" "));
332
+ callback("", Sentence::END);
333
+ return true;
334
+ }
335
+
336
+ if (!d_engine.process(toks_to_draft, d_logits))
337
+ return Dialog::abort("draft engine gen processing failed", callback);
338
+
339
+ d_n_past += toks_to_draft.size();
340
+
341
+ if (!d_engine.updateKV(d_n_past))
342
+ return Dialog::abort("draft context size exceeded", callback);
343
+
344
+ int32_t token = _d_sampler.process(d_logits, d_probs);
345
+ toks_to_draft.assign(1, token);
346
+ toks_to_target.push_back(token);
347
+
348
+ if (_ctx->is_eos(token)) break;
349
+ }
350
+
351
+ // Step 2: run the target model on the draft tokens
352
+ if (_n_past + toks_to_target.size() > _ctx->size()) {
353
+ __WARN("Context limit exceeded ({} + {} > {})",
354
+ _n_past,
355
+ toks_to_target.size(),
356
+ _ctx->size());
357
+ callback("", Sentence::END);
358
+ _kpis.generate.update(start.elapsed_usec());
359
+
360
+ // Log latest KPIs in a single line
361
+ _env->logger().post(Logger::KPIS, kpis().dump(" "));
362
+ return true;
363
+ }
364
+
365
+ std::vector<int32_t> attention_map(toks_to_target.size());
366
+ std::iota(attention_map.begin(), attention_map.end(), -1);
367
+ size_t n_tok_t =
368
+ t_engine.process(toks_to_target, attention_map, t_logits, true /* all logits */);
369
+ if (n_tok_t != toks_to_target.size())
370
+ return Dialog::abort("target engine gen processing failed", callback);
371
+
372
+ // Step 3: accept or reject draft tokens
373
+ size_t n_accepted = rejectionSampling(
374
+ std::span{toks_to_target.data(),toks_to_target.size()}.subspan(1),
375
+ std::span{t_logits.data(),t_logits.size()}, std::span{d_probs.data(),d_probs.size()}, decode_token
376
+ );
377
+
378
+ _n_generated += n_accepted;
379
+ _n_past += n_accepted;
380
+
381
+ // Update stats
382
+ n_accepted_counts[n_accepted - 1]++;
383
+
384
+ // Accepted all?
385
+ if (n_accepted == _draft_len + 1) {
386
+ // Grab the last 2 tokens
387
+ toks_to_draft.assign({toks_to_target[_draft_len], _last_tok});
388
+ d_n_past = _n_past - 1;
389
+ } else {
390
+ // Grab only the last token
391
+ toks_to_draft.assign(1, _last_tok);
392
+ d_n_past = _n_past;
393
+ }
394
+
395
+ toks_to_target.assign(1, _last_tok);
396
+
397
+ __DEBUG("spec-dec: draft_len {} n_generated {} n_accepted {} n_past {}",
398
+ _draft_len,
399
+ _n_generated,
400
+ n_accepted,
401
+ _n_past);
402
+
403
+ std::vector<bool> selected(attention_map.size(), false);
404
+ selected[0] = true; // first token is selected always
405
+ auto last_sel = 0;
406
+ for (int i = n_accepted - 1; i != 0; i = attention_map[i]) {
407
+ selected[i] = true;
408
+ last_sel = i > last_sel ? i : last_sel;
409
+ }
410
+ selected.resize(last_sel + 1); // trim away rejected tokens
411
+
412
+ // Step 4: commit accepted tokens to kv-caches
413
+ if (!t_engine.updateKV(_n_past, selected))
414
+ return Dialog::abort("target context size exceeded", callback);
415
+ if (!d_engine.updateKV(d_n_past))
416
+ return Dialog::abort("draft context size exceeded", callback);
417
+ }
418
+
419
+ if (d_n_past != _n_past) {
420
+ // The draft engine needs to process one last token to catch up
421
+ toks_to_draft.resize(1);
422
+ if (!d_engine.process(toks_to_draft))
423
+ return Dialog::abort("draft engine gen processing failed", callback);
424
+ if (!d_engine.updateKV(_n_past))
425
+ return Dialog::abort("draft context size exceeded", callback);
426
+ }
427
+
428
+ State::busy(false);
429
+
430
+ _kpis.generate.update(start.elapsed_usec());
431
+
432
+ // Log latest KPIs in a single line
433
+ _env->logger().post(Logger::KPIS, kpis().dump(" "));
434
+ __KPIS("spec-dec: accepted counts: {}", n_accepted_counts);
435
+
436
+ return true;
437
+ }
438
+
439
+ // Registrator instance
440
+ static OnLoad regy([]() {
441
+ Dialog::__register(
442
+ "spec-dec",
443
+ [](std::shared_ptr<Env> env, const std::string& name, const json& conf) {
444
+ return (Dialog*)new SpecDecDialog(env, name, conf);
445
+ }
446
+ );
447
+ });
448
+
449
+ // Register spec-dec sampler for compatibility
450
+ static OnLoad sampler_regy([]() {
451
+ Sampler::__register("spec-dec", [](Context& ctx, const json& conf) {
452
+ return (Sampler*)new BasicSampler(ctx, conf);
453
+ });
454
+ });
455
+
456
+ void needSpdDialog() {}
457
+
458
+ } // namespace qualla
Genie/Genie/src/qualla/dialogs/ssd-q1.cpp ADDED
@@ -0,0 +1,1046 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All rights reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #include <qualla/context.hpp>
10
+ #include <qualla/dialog.hpp>
11
+ #include <qualla/sampler.hpp>
12
+ #include <qualla/logger.hpp>
13
+ #include <qualla/detail/config.hpp>
14
+ #include <qualla/detail/json.hpp>
15
+ #include <qualla/detail/timer.hpp>
16
+ #include <qualla/detail/onload.hpp>
17
+ #include <qualla/detail/sampler-utils.hpp>
18
+ #include <qualla/detail/basic-sampler.hpp>
19
+
20
+ #include <functional>
21
+ #include <fstream>
22
+ #include <string>
23
+ #include <unordered_map>
24
+ #include <filesystem>
25
+ #include <random>
26
+
27
+ #include <fmt/format.h>
28
+ #include <fmt/ranges.h>
29
+
30
+ namespace fs = std::filesystem;
31
+
32
+ #define __INFO(__fmt, ...) _env->logger().post(Logger::INFO, fmt::format(__fmt, ##__VA_ARGS__))
33
+ #define __WARN(__fmt, ...) _env->logger().post(Logger::WARN, fmt::format(__fmt, ##__VA_ARGS__))
34
+ #define __ERROR(__fmt, ...) _env->logger().post(Logger::ERROR, fmt::format(__fmt, ##__VA_ARGS__))
35
+ #define __KPIS(__fmt, ...) \
36
+ _env->logger().post(Logger::KPIS, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); })
37
+ #define __DEBUG(__fmt, ...) \
38
+ _env->logger().post(Logger::DEBUG, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); })
39
+ #define __TRACE(__fmt, ...) \
40
+ _env->logger().post(Logger::TRACE, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); })
41
+
42
+ namespace qualla {
43
+
44
+ using qc = qualla::Config;
45
+ using Logits = std::span<float>;
46
+
47
// Self-speculative decoding (SSD) dialog: the model drafts its own candidate
// tokens via reserved "forecast" token IDs and a pre-restored KV prefix, then
// verifies them in the same inference. Supports single-stream and multistream
// (top-K parallel continuations) generation.
class SelfSpecDecDialog : public Dialog {
    // Highest "ssd-version" config value this implementation understands.
    enum { VERSION = 1 };

public:
    SelfSpecDecDialog(std::shared_ptr<Env> env, const std::string& name, const json& conf);

    // Token-input entry point (prompt already tokenized).
    virtual bool process(std::vector<int32_t>& tokens, Dialog::Callback callback) override;
    // Embedding-input entry point; t2eCallback converts a token id to its embedding.
    virtual bool process(std::vector<uint8_t>& embedding_vectors, Dialog::T2ECallback t2eCallback, Dialog::Callback callback) override;
    virtual void reset() override;

    // DialogCallback variant is intentionally unsupported for SSD.
    virtual bool process(std::vector<int32_t>& tokens, DialogCallback callback) override {
        return false;
    }

    virtual bool save(const std::string& name) override;
    virtual bool restore(const std::string& name) override;

private:
    Sampler& _t_sampler; // target sampler ("primary")

    int32_t _vocab; // vocabulary size; also the offset of the forecast token range

    std::string _kv_prefix_name{"forecast-prefix"}; // name of the saved KV prefix to restore

    // AR8
    size_t _draft{1};                // number of draft levels (== _branches.size())
    std::vector<size_t> _branches{3}; // branching factor per draft level

    size_t _forecast_prefix{16};       // length of the restored KV prefix
    size_t _forecast_token_offset{32000}; // overwritten with _vocab in the ctor
    size_t _forecast_token_count{4};

    // Multistream parameters
    int32_t _n_streams;  // >1 enables multistream generation
    float _p_threshold;  // probability threshold for stream selection

    InputType m_inputType{InputType::UNKNOWN}; // TOKENS or EMBEDDINGS, from the engine

    bool processFollowOnGeneration(std::vector<int32_t>& tokens, std::vector<float>& logits, Dialog::Callback callback);
    // Multistream
    bool processFollowOnGeneration(std::vector<std::vector<int32_t>>& streams, std::vector<float>& logits, Dialog::Callback callback);

    /*
       Helper function for combining masks for SSD multistream.

       @param mask The attention mask to be tiled
       @param streamIndices Indices of streams. The tiling count is equal to the size of this vector.
       @param pastMap A vector of stream indices for masking all past tokens after the prompt.
       @param prefixOffset Offset where KV prefix masking begins in each tile.
       @param finalMask A mask that combines all of the independent masks such that
              they can be executed in the same inference.
    */
    void tileAttentionMask(const std::vector<int32_t>& mask, const std::vector<size_t> streamIndices, const std::vector<size_t>& pastMap, const size_t prefixOffset, std::vector<int32_t>& finalMask);

    // Parent-index attention map for the flattened verify+forecast tree.
    std::vector<int32_t> gen_attention_map() const;
    // Node count of the flattened sample tree (root + all branch levels).
    auto get_len_flat_sample_tree() const;
    // `repeat` copies of the forecast-token id sequence [offset .. offset+_draft).
    auto gen_forecast_tokens(int repeat) const;

    // Sampling and verification
    std::vector<int32_t> build_sample_tree(
        int32_t last_token,
        Logits logits,
        const std::vector<int32_t>& indices
    );
    std::tuple<std::vector<int32_t>, std::vector<int32_t>> verify_and_select_longest(
        std::span<int32_t> sample_tree,
        Logits logits
    );
    // Top-`count` token ids from the logits row at `index` (draft candidates).
    std::vector<int32_t> sample_to_draft(Logits logits, size_t index, size_t count) {
        const auto thislogit = logits.subspan(index * _vocab, _vocab);
        IndexedLogits logit(thislogit, _t_sampler.rng());
        logit.topK(count);
        return logit.indices;
    }
    // Single verified token from the logits row at `index` (argmax when greedy,
    // otherwise the configured sampler).
    int32_t sample_to_verify(Logits logits, size_t index) {
        const auto thislogit = logits.subspan(index * _vocab, _vocab);
        if (_t_sampler.greedy()) {
            return argmax(thislogit);
        }
        auto token = _t_sampler.process(thislogit);
        return token;
    }
};
130
+
131
// Construct the SSD dialog: read tree/forecast/multistream parameters from the
// JSON config, then restore the pre-built forecast KV prefix into the primary
// engine. Throws if the restored prefix length does not match the config.
SelfSpecDecDialog::SelfSpecDecDialog(
    std::shared_ptr<Env> env,
    const std::string& name,
    const json& conf
)
    : Dialog(env, name, conf), _t_sampler(*_sampler["primary"]) {

    // Warn (but continue) if the config was written for a newer SSD revision.
    auto ssd_version = qc::optional<int>(conf, "ssd-version", 0);
    if (ssd_version > SelfSpecDecDialog::VERSION) __WARN("newer ssd-version in config!");

    _vocab = _ctx->n_vocab();

    // Draft depth is implied by the per-level branching factors.
    _branches = qc::optional(conf, "branches", _branches);
    _draft = _branches.size();

    _forecast_prefix = qc::optional(conf, "forecast-prefix", _forecast_prefix);
    _forecast_token_count = qc::optional(conf, "forecast-token-count", _forecast_token_count);
    // Forecast token ids live directly after the real vocabulary.
    _forecast_token_offset = _vocab;

    _kv_prefix_name = qc::optional(conf, "forecast-prefix-name", _kv_prefix_name);

    _n_streams = qc::optional<int32_t>(conf, "n-streams", 1);
    _p_threshold = qc::optional<float>(conf, "p-threshold", 0.0);

    if (!_engine.contains("primary")) {
        State::fatal("\"primary\" engine not present in config!");
        return;
    }

    // Get Input Type from the engine
    m_inputType = _engine["primary"]->getInputType();
    // Load KV prefix; a mismatch in restored length is unrecoverable.
    Timer timer;
    size_t n_restored_prefix = _engine["primary"]->restore(_kv_prefix_name);
    if (n_restored_prefix != _forecast_prefix) {
        // clang-format off
        throw std::runtime_error( fmt::format( "SSD : Loaded {} KV$ from {} but expected {} KV$",
                                               n_restored_prefix, _kv_prefix_name, _forecast_prefix ) );
        // clang-format on
    }
    // The restored prefix occupies the first _forecast_prefix cache slots.
    _n_past = _forecast_prefix;
    _kpis.restore.update(timer.elapsed_usec());
}
174
+
175
+ auto SelfSpecDecDialog::get_len_flat_sample_tree() const {
176
+ size_t len_flat_sample_tree = 1;
177
+ size_t last_tokens = 1;
178
+ for (int i = 0; i < _draft; ++i) {
179
+ len_flat_sample_tree += last_tokens * _branches[i];
180
+ last_tokens = last_tokens * _branches[i];
181
+ }
182
+ return len_flat_sample_tree;
183
+ }
184
+
185
+ auto SelfSpecDecDialog::gen_forecast_tokens(int repeat) const {
186
+ std::vector<int32_t> forecast_tokens(_draft, 0);
187
+ std::iota(forecast_tokens.begin(), forecast_tokens.end(), _forecast_token_offset);
188
+
189
+ std::vector<int32_t> ret;
190
+ for (auto i = 0; i < repeat; ++i)
191
+ ret.insert(ret.end(), forecast_tokens.begin(), forecast_tokens.end());
192
+ return ret;
193
+ }
194
+
195
// Build the parent-index attention map for one flattened tile:
// entry j = index of the token that token j attends back to (-1 = none, i.e.
// only the implicit self/past attention applies). Layout is the verify tree
// (len_flat_sample_tree entries) followed by one _draft-long forecast chain
// per tree node.
std::vector<int32_t> SelfSpecDecDialog::gen_attention_map() const {
    auto len_flat_sample_tree = get_len_flat_sample_tree();
    // Verify-tree nodes + one forecast chain of length _draft per node; -1 marks "no parent".
    std::vector<int32_t> attention_map(len_flat_sample_tree + len_flat_sample_tree * _draft, -1);

    // Breadth-first over draft levels: each node on [parent_begin, parent_end)
    // gets _branches[level] children laid out contiguously after parent_end.
    auto build_verify_tree = [&attention_map,
                              this](auto self, int parent_begin, int parent_end, int level) {
        if (level == _draft) return;
        auto current = parent_end;
        for (auto parent = parent_begin; parent < parent_end; parent += 1) {
            for (auto child = current; child < current + _branches[level]; child += 1)
                attention_map[child] = parent;
            current += _branches[level];
        }
        self(self, parent_end, current, level + 1);
    };

    // Each verify-tree node additionally roots a linear chain of _draft
    // forecast tokens; each chain element attends to its predecessor.
    auto build_forecast_tree = [&attention_map, this](int parent_begin, int parent_end) {
        auto current = parent_end;
        for (auto parent = parent_begin; parent < parent_end; parent += 1) {
            for (auto child = current, current_parent = parent; child < current + _draft;
                 child += 1) {
                attention_map[child] = current_parent;
                current_parent = child; // chain: next element hangs off this one
            }
            current += _draft;
        }
    };

    build_verify_tree(build_verify_tree, 0, 1, 0); // root is node 0
    build_forecast_tree(0, len_flat_sample_tree);
    return attention_map;
}
227
+
228
+ std::vector<int32_t> SelfSpecDecDialog::build_sample_tree(
229
+ int32_t last_token,
230
+ Logits logits,
231
+ const std::vector<int32_t>& indices
232
+ ) {
233
+ std::vector<int32_t> tree = {last_token};
234
+ for (auto draft = 0, repeat = 1; draft < _draft; ++draft) {
235
+ auto samples = sample_to_draft(logits, indices[draft], _branches[draft]);
236
+ for (auto i = 0; i < repeat; ++i) {
237
+ tree.insert(tree.end(), samples.begin(), samples.end());
238
+ }
239
+ repeat *= _branches[draft];
240
+ }
241
+ return tree;
242
+ }
243
+
244
// Walk the speculation tree against the freshly computed logits and return the
// longest accepted path: {accepted tokens (incl. one newly sampled token past
// the last match), node indices of the matched path within the flat tile}.
// A drafted node is accepted when it equals the token the verifier samples at
// its parent, and the parent is not EOS.
std::tuple<std::vector<int32_t>, std::vector<int32_t>> SelfSpecDecDialog::verify_and_select_longest(
    std::span<int32_t> sample_tree,
    Logits logits
) {
    // Seed with the root: node 0 always yields one verified token.
    std::vector<std::vector<int32_t>> accepted_all = {{sample_to_verify(logits, 0)}};
    std::vector<std::vector<int32_t>> node_ids_all = {{0}};

    // draft_offset[l] = flat index of the first node on draft level l.
    std::vector<int32_t> draft_offset(_draft, 0);
    draft_offset[0] = 1;
    for (int32_t i = 1, draft_count = _branches[0]; i < _draft; ++i) {
        draft_offset[i] = draft_offset[i - 1] + draft_count;
        draft_count = draft_count * _branches[i];
    }

    size_t longest = 0, longest_size = 1; // index/length of the best path so far
    // DFS: `accepted`/`node_ids` are passed by value on purpose — each branch
    // extends its own copy; results accumulate in accepted_all/node_ids_all.
    auto verify_recursive = [&](auto self,
                                std::vector<int32_t> accepted,
                                std::vector<int32_t> node_ids,
                                int draft,
                                int offset_in_draft) -> void {
        auto target = accepted.back(); // token the verifier produced at the parent
        auto branch_base = draft_offset[draft] + offset_in_draft;
        for (auto branch = 0; branch < _branches[draft]; ++branch) {
            auto ndx_node = branch_base + branch;
            // Accept only if the draft guessed the verifier's token (and not EOS).
            if (!_ctx->is_eos(target) && target == sample_tree[ndx_node]) {
                auto sample_accepted = sample_to_verify(logits, ndx_node);
                accepted_all.push_back(accepted);
                accepted_all.back().push_back(sample_accepted);
                node_ids_all.push_back(node_ids);
                node_ids_all.back().push_back(ndx_node);
                if (node_ids_all.back().size() > longest_size) {
                    longest = node_ids_all.size() - 1;
                    longest_size = node_ids_all.back().size();
                }
                if (draft + 1 < _draft)
                    self(self,
                         accepted_all.back(),
                         node_ids_all.back(),
                         draft + 1,
                         // children of (offset_in_draft + branch) on the next level
                         (offset_in_draft + branch) * _branches[draft + 1]);
            }
        }
    };
    verify_recursive(verify_recursive, accepted_all.back(), node_ids_all.back(), 0, 0);
    return {accepted_all[longest], node_ids_all[longest]};
}
290
+
291
// Tile the per-stream parent-index attention map into one dense 0/1 mask so
// all streams run in a single inference. Output layout: one row per input
// token, rowLength = _n_past + numTokens columns, row-major in `tiledMask`.
// Each row: [KV-prefix cols | prompt cols | past-token cols | per-tile tree cols].
void SelfSpecDecDialog::tileAttentionMask(const std::vector<int32_t>& mask, const std::vector<size_t> streamIndices, const std::vector<size_t>& pastMap, const size_t prefixOffset, std::vector<int32_t>& tiledMask) {

    // NOTE(review): sampleTreeLen is computed but never used below — confirm
    // whether it can be removed.
    const size_t sampleTreeLen = get_len_flat_sample_tree();
    const size_t pastMapLen = pastMap.size();
    const int posVal = 1, negVal = 0; // attend / don't attend

    const size_t maskSize = mask.size();
    // Total new tokens across all tiles (one tile per stream in streamIndices).
    const size_t numTokens = maskSize * streamIndices.size();

    const size_t rowLength = _n_past + numTokens;
    tiledMask.resize(numTokens * rowLength);

    for (int maskIdx = 0; maskIdx < streamIndices.size(); maskIdx++) {
        // Number of rows to skip to reach the current tile.
        const size_t tileOffset = maskIdx * maskSize;
        // Top-left corner of this tile's diagonal block (new-token columns).
        int32_t* const tileStart = &tiledMask[tileOffset*rowLength + tileOffset + _n_past];
        for (int i = 0; i < maskSize; i++) {
            // Pointer to the start of row i of the current mask
            int32_t* rowPtr = &tiledMask[(tileOffset + i)*rowLength];
            // Skip kv-prefix attention for rows without speculative tokens.
            const int prefixFillVal = (i < prefixOffset) ? negVal : posVal;
            std::fill_n(rowPtr, _forecast_prefix, prefixFillVal);
            rowPtr += _forecast_prefix;
            // Always attend to prompt.
            std::fill_n(rowPtr, _n_prompt, posVal);
            rowPtr += _n_prompt;

            // Fill in the past valid tokens for this stream: attend only to
            // past tokens that belong to the same stream.
            for (const size_t& pastIdx : pastMap) {
                *rowPtr = (pastIdx == streamIndices[maskIdx]) ? posVal : negVal;
                rowPtr++;
            }

            // Clear the rest of the row. It will mostly consist of 0's.
            std::fill_n(rowPtr, rowLength - _n_prompt - _forecast_prefix - pastMapLen, negVal);
            // Move to the correct tile.
            rowPtr += tileOffset;
            // Translate the mask: inherit the parent row's new-token columns
            // (tokenId + 1 entries) so a child attends to its whole ancestry.
            const auto tokenId = mask[i];
            if (tokenId > -1) {
                std::copy_n(tileStart + (tokenId * rowLength), tokenId + 1, rowPtr);
            }
            // Always attend to self.
            rowPtr[i] = posVal;
        }
    }
}
338
+
339
+ // Takes a vector of tokens and produces a vector of embeddings via the provided T2E callback.
340
+ static inline void convertTokensToEmbeddings(std::vector<int32_t>& tokens,
341
+ std::vector<uint8_t>& embeddings,
342
+ size_t embeddingBufferSize,
343
+ Dialog::T2ECallback t2eCallback) {
344
+ for(auto &token : tokens){
345
+ std::vector<uint8_t> embedding(embeddingBufferSize,0);
346
+ t2eCallback(token, embedding.data(), embeddingBufferSize);
347
+ embeddings.insert(embeddings.end(), embedding.begin(), embedding.end());
348
+ }
349
+ }
350
+
351
// Single-stream follow-on generation loop. On entry `logits` holds the output
// of the initial (prompt + forecast) inference; each iteration drafts a sample
// tree, runs one inference over tree + forecast tokens, verifies, commits the
// accepted tokens to the KV cache, and streams them to `callback`.
bool SelfSpecDecDialog::processFollowOnGeneration(std::vector<int32_t>& tokens, std::vector<float>& logits, Dialog::Callback callback){

    // Handles the printing of the subsequent generated tokens
    bool keep_generating = true;
    // NOTE(review): `context` is unused below — confirm whether it can be removed.
    const size_t context = _ctx->n_ctx();

    std::vector<int32_t> decode_buf(
        1, 0
    ); // A buffer for tokens to be decoded (one at a time, per the Middleware's request)
    auto decode_token = [&](int32_t t) {
        if (!keep_generating) return;
        // Decode new token.
        // Return true to continue generation, and false otherwise
        decode_buf[0] = _last_tok = t;
        ++_n_generated;
        if (_ctx->is_eos(t)) {
            keep_generating = false;
            callback("", Sentence::END);
        } else {
            keep_generating = callback(_tokenizer->decode(decode_buf), Sentence::CONTINUE);
        }
        return;
    };
    // set decode_buf from prompt processing
    decode_buf[0] = _last_tok;

    auto& engine = *_engine["primary"];

    auto update_kv = [&engine, &callback, this](size_t past, const std::vector<bool>& selected) {
        if (!engine.updateKV(past, selected))
            return Dialog::abort("context size exceeded", callback);
        return true;
    };


    // prepare the next inference: logits rows 1.._draft are the forecast-head
    // outputs of the initial inference.
    std::vector<int32_t> indices(_draft, 0);
    std::iota(indices.begin(), indices.end(), 1);
    tokens = build_sample_tree(sample_to_verify(std::span{logits.data(),logits.size()}, 0), std::span{logits.data(),logits.size()}, indices);
    decode_token(tokens[0]);

    // Prepare constant options for next inferences
    const auto len_flat_sample_tree = get_len_flat_sample_tree();
    const auto forecast_tokens = gen_forecast_tokens(len_flat_sample_tree);
    const auto attention_map = gen_attention_map();

    engine.set({{"kv-prefix-offset", len_flat_sample_tree}});

    // accepted_counts[k] counts iterations that accepted k+1 tokens (KPI only).
    std::vector<int32_t> accepted_counts(_draft + 1, 0);
    std::vector<bool> selected(attention_map.size(), false);

    while (!State::canceled() && keep_generating) {

        // Append forecast tokens
        tokens.insert(tokens.end(), forecast_tokens.begin(), forecast_tokens.end());

        if (_n_past + tokens.size() > _ctx->size()) {
            __WARN("Context limit exceeded ({} + {} > {})", _n_past, tokens.size(), _ctx->size());
            callback("", Sentence::END);
            break;
        }

        size_t n_tok_t = 0;

        // Bifurcate based on embedding as input or token as input
        if (m_inputType == InputType::TOKENS)
            n_tok_t = engine.process(tokens, attention_map, logits, true /* all logits */);
        else if (m_inputType == InputType::EMBEDDINGS) {
            // Convert tokens to embedding for the processing in the engine.
            auto embedBufSize = engine.getEmbeddingBufferSize();
            std::vector<uint8_t> embedding;
            for(auto &token: tokens){
                std::vector<uint8_t> curTokenEmbedding(embedBufSize,0);
                m_t2eCallback(token, curTokenEmbedding.data(), embedBufSize);
                embedding.insert(embedding.end(), curTokenEmbedding.begin(), curTokenEmbedding.end());
            }
            n_tok_t = engine.process(embedding, attention_map, logits, true /* all logits */);
        } else {
            return Dialog::abort("No valid Input Type is used", callback);
        }
        if (n_tok_t != tokens.size()) return Dialog::abort("engine processing failed", callback);

        // Accept tokens
        auto [accepted_tokens, accepted_ids] = verify_and_select_longest(std::span{tokens.data(),tokens.size()},
                                                                         std::span{logits.data(),logits.size()});

        // Commit accepted tokens to kv-caches
        selected.resize(accepted_ids.back() + 1); // trim away rejected tokens
        std::fill(selected.begin(), selected.end(), false);
        for (auto id : accepted_ids)
            selected[id] = true;
        accepted_counts[accepted_tokens.size() - 1] += 1;
        _n_past += accepted_tokens.size();
        update_kv(_n_past, selected);

        // Decode tokens
        std::for_each(accepted_tokens.begin(), accepted_tokens.end(), decode_token);

        // Prepare new tokens: the forecast chain rooted at the last accepted
        // node provides the draft logits for the next tree.
        auto next_draft_offset = len_flat_sample_tree + accepted_ids.back() * _draft;
        std::iota(indices.begin(), indices.end(), next_draft_offset);
        tokens = build_sample_tree(accepted_tokens.back(), std::span{logits.data(),logits.size()}, indices);
    }

    State::busy(false);

    auto total_iteration = std::accumulate(accepted_counts.begin(), accepted_counts.end(), 0);
    auto accept_rate =
        float(_n_generated - 1) / total_iteration; // -1: exclude first generated token
    __KPIS("SSD{{draft:{}, branch:{}, greedy:{}}}: accepted counts: {}, accept rate = {} tokens/iteration",
           _draft,
           _branches,
           _t_sampler.greedy(),
           accepted_counts,
           accept_rate);

    return true;
}
469
+
470
// Multistream AR generation: runs all live streams in one batched inference
// per iteration, using tileAttentionMask to keep their KV attention disjoint.
// Each stream is emitted via `callback` only once it finishes (EOS) or the
// context limit is hit; the loop ends when no streams remain.
bool SelfSpecDecDialog::processFollowOnGeneration(std::vector<std::vector<int32_t>>& streams, std::vector<float>& logits, Dialog::Callback callback) {

    // NOTE(review): `sampler`, `keep_generating` and `context` are unused in
    // this overload — confirm whether they can be removed.
    auto& sampler = *_sampler["primary"];
    auto& engine = *_engine["primary"];

    auto update_kv = [&engine, &callback, this](size_t past, const std::vector<bool>& selected) {
        if (!engine.updateKV(past, selected))
            return Dialog::abort("context size exceeded", callback);
        return true;
    };

    std::vector<size_t> streamIndices(streams.size()); // indices of still-live streams
    std::vector<size_t> past_map(streams.size());      // stream owner of each committed past token

    std::iota(streamIndices.begin(), streamIndices.end(), 0);
    // Since the first inference is done separately, it is
    // expected that each stream already has 1 valid AR token.
    std::iota(past_map.begin(), past_map.end(), 0);

    bool keep_generating = true;
    const size_t context = _ctx->n_ctx();

    if (streams.size() == 0) {
        callback("\n", Sentence::END);
        return true;
    }

    // Prepare constant options for next inferences
    const auto len_flat_sample_tree = get_len_flat_sample_tree();
    const auto forecast_tokens = gen_forecast_tokens(len_flat_sample_tree);
    const auto attention_map = gen_attention_map();

    // Per-stream flattened sample tree for the next inference.
    std::vector<std::vector<int32_t>> draftStreams(streams.size());

    for (int i = 0; i < streams.size(); i++) {
        // prepare the next inference; logits tile i starts at row i*(1+_draft).
        std::vector<int32_t> indices(_draft, 0);
        std::iota(indices.begin(), indices.end(), 1);
        draftStreams[i] = build_sample_tree(sample_to_verify(std::span{logits.data(),logits.size()}, i*(1+_draft)), std::span{logits.data(),logits.size()}, indices);
        streams[i].push_back(draftStreams[i][0]);

    }

    std::vector<int32_t> multi_attn_mask;

    std::vector<int32_t> accepted_counts(_draft + 1, 0);

    engine.set({{"kv-prefix-offset", len_flat_sample_tree}});

    State::busy(true);

    while (true) {
        if (State::canceled()) break;

        // If this exceeds context length, truncate all streams and return
        if (_n_past + streamIndices.size() > _ctx->size()) {
            for (auto stream : streamIndices)
                callback(_tokenizer->decode(streams[stream]) + "\n", Sentence::CONTINUE);
            break;
        }

        // Accumulate input tokens from all streams
        std::vector<int32_t> multi_tokens;
        for (auto streamIdx : streamIndices) {
            multi_tokens.insert(multi_tokens.end(), draftStreams[streamIdx].begin(), draftStreams[streamIdx].end());
            multi_tokens.insert(multi_tokens.end(), forecast_tokens.begin(), forecast_tokens.end());
        }

        if (_n_past + multi_tokens.size() > _ctx->size()) {
            __WARN("Context limit exceeded ({} + {} > {})", _n_past, multi_tokens.size(), _ctx->size());
            callback("", Sentence::END);
            break;
        }

        tileAttentionMask(attention_map, streamIndices, past_map, len_flat_sample_tree, multi_attn_mask);

        size_t n_tok_t = 0;

        if (m_inputType == InputType::TOKENS) {
            // Process input tokens for all streams in one batch
            n_tok_t = engine.process(multi_tokens, multi_attn_mask, logits, true);
        } else if (m_inputType == InputType::EMBEDDINGS) {
            // Accumulate input embeddings from all streams
            auto embedBufSize = engine.getEmbeddingBufferSize();
            std::vector<uint8_t> multi_embeddings;

            convertTokensToEmbeddings(multi_tokens, multi_embeddings, embedBufSize, m_t2eCallback);

            // Process input tokens for all streams in one batch
            n_tok_t = engine.process(multi_embeddings, multi_attn_mask, logits, true);
        }
        if (n_tok_t != multi_tokens.size()) return Dialog::abort("engine processing failed", callback);

        std::vector<bool> all_selected;

        // Process all logits independently
        std::span<float> logit_span = std::span{logits.data(),logits.size()};
        std::span<int32_t> token_span = std::span{multi_tokens.data(), multi_tokens.size()};
        for (int i = 0; i < streamIndices.size(); i++) {
            const size_t streamIdx = streamIndices[i];
            std::vector<int32_t>& stream = streams[streamIdx];

            // Width of one stream's tile: its sample tree + forecast tokens.
            const size_t tileStride = draftStreams[streamIdx].size() + forecast_tokens.size();

            std::span<float> tiled_logits = logit_span.subspan(i * tileStride * _vocab, _vocab);

            // Accept tokens
            auto [accepted_tokens, accepted_ids] = verify_and_select_longest(token_span.subspan(i * tileStride, tileStride),
                                                                             tiled_logits);

            // Commit accepted tokens to kv-caches
            std::vector<bool> selected(tileStride, false);
            for (auto id : accepted_ids) {
                selected[id] = true;
                past_map.push_back(streamIdx); // record this stream as owner of the new past tokens
            }
            all_selected.insert(all_selected.end(), selected.begin(), selected.end());
            accepted_counts[accepted_tokens.size() - 1] += 1;
            _n_past += accepted_tokens.size();

            // Decode tokens
            stream.insert(stream.end(), accepted_tokens.begin(), accepted_tokens.end());
            _n_generated += accepted_tokens.size();

            // Prepare new tokens
            std::vector<int32_t> indices(_draft, 0);
            auto next_draft_offset = len_flat_sample_tree + accepted_ids.back() * _draft;
            std::iota(indices.begin(), indices.end(), next_draft_offset);
            draftStreams[streamIdx] = build_sample_tree(accepted_tokens.back(), tiled_logits, indices);
        }

        update_kv(_n_past, all_selected);
        // Flush finished streams (EOS) to the callback and retire them.
        for (auto it = streamIndices.begin(); it != streamIndices.end();) {
            int32_t stream = *it;
            if (_ctx->is_eos(streams[stream].back())) {
                callback(_tokenizer->decode(streams[stream]) + "\n", Sentence::CONTINUE);
                it = streamIndices.erase(it);
            } else {
                ++it;
            }
        }

        if (streamIndices.size() == 0) break;
    }
    callback("\n", Sentence::END);

    State::busy(false);

    auto total_iteration = std::accumulate(accepted_counts.begin(), accepted_counts.end(), 0);
    auto accept_rate =
        float(_n_generated - 1) / total_iteration; // -1: exclude first generated token
    __KPIS("SSD{{draft:{}, branch:{}, greedy:{}}}: accepted counts: {}, accept rate = {} tokens/iteration",
           _draft,
           _branches,
           _t_sampler.greedy(),
           accepted_counts,
           accept_rate);

    return true;
}
631
+
632
// Embedding-input entry point: handles prompt processing here; generation is
// delegated to processFollowOnGeneration.
// TODO(review): pass the t2e callback via a setter (called from the base
// dialog's query path) instead of taking it as an argument here.
bool SelfSpecDecDialog::process(std::vector<uint8_t>& embedding,
                                T2ECallback t2eCallback,
                                Dialog::Callback callback ){

    // Check for prev failures and bail out early
    if (State::failed()) return false;

    if(m_inputType != InputType::EMBEDDINGS) {
        __ERROR("Input type for model is not embeddings.");
        return false;
    }

    Timer start;
    State::clear();

    std::vector<float> logits;
    auto& engine = *_engine["primary"];

    auto update_kv = [&engine, &callback, this](size_t past, const std::vector<bool>& selected) {
        if (!engine.updateKV(past, selected))
            return Dialog::abort("context size exceeded", callback);
        return true;
    };

    // Store the t2e callback for reference during follow-on generation.
    m_t2eCallback = t2eCallback;

    auto embedBufSize = engine.getEmbeddingBufferSize();

    // Cache the EOS-token embedding with the engine (zero-filled if no callback).
    {
        std::vector<uint8_t> eosEmbedding(embedBufSize, 0.0);
        if (m_t2eCallback) {
            m_t2eCallback(_ctx->eos(), eosEmbedding.data(), embedBufSize);
        }
        if (!engine.cacheEosEmbedding(eosEmbedding)) {
            __DEBUG("Failed to set the eos token embedding.");
            return false;
        }
    }

    using FF = Engine::Feature::Flags;
    if (engine.supports(FF::DYNAMIC_LOAD)) engine.load();

    _env->logger().post(Logger::KPIS, kpis().dump(" "));
    start.reset();

    engine.set({{"kv-prefix-skip", _forecast_prefix}});

    std::vector<int32_t> tokens(1,0);

    // Process prompt
    // get number of tokens in the input
    size_t curTokensCount = embedding.size()/embedBufSize;

    // Input must be a whole number of embedding vectors.
    if(curTokensCount * embedBufSize != embedding.size()){
        size_t expectedLength = (curTokensCount + (embedding.size()%embedBufSize != 0))*embedBufSize;
        __DEBUG("Input is wrong expected {} and found {}.", expectedLength, embedding.size());
        return Dialog::abort("Input is not an multiple for the embedding Length", callback);
    }

    _n_prompt += curTokensCount;

    // Causal map for the prompt: token i attends to token i-1 (root = -1).
    std::vector<int32_t> attention_map(curTokensCount);
    std::iota(attention_map.begin(), attention_map.end(), -1);

    engine.set({{"kv-prefix-offset", curTokensCount}}); // Do not attend prefix

    if (_n_past + curTokensCount > _ctx->size()) {
        __WARN("Context limit exceeded ({} + {} > {})", _n_past, curTokensCount, _ctx->size());
        callback("", Sentence::END);
        return true;
    }

    if (!engine.process(embedding, attention_map, logits, false))
        return Dialog::abort("engine prompt processing failed", callback); // Change this message also to some generic message.
    _n_past += curTokensCount;
    update_kv(_n_past, {});

    bool status = true;
    if (_n_streams <= 1) {
        // --- Single-stream path ---
        tokens[0] = sample_to_verify(std::span{logits.data(),logits.size()}, 0);

        // Decode the first token.
        _last_tok = tokens[0];
        if (_ctx->is_eos(_last_tok)) {
            callback("", Sentence::END);
            return true;
        }

        if (!callback(_tokenizer->decode(tokens), Sentence::BEGIN)) return true;
        //decode_token(tokens[0]);

        // Without a t2e callback, forecast tokens cannot be embedded — stop here.
        if (!m_t2eCallback) {
            callback("", Sentence::END);
            return true;
        }

        // Mark TTFT
        _kpis.prompt.update(start.elapsed_usec());
        start.reset();
        State::busy(true);

        // Initial inference for self-speculative decoding pipeline with forecast tokens and prefix
        // process separately because logits are required for these tokens
        for (int i = 0; i < _draft; ++i)
            tokens.push_back(_forecast_token_offset + i);

        attention_map.resize(tokens.size());
        std::iota(attention_map.begin(), attention_map.end(), -1);
        engine.set({{"kv-prefix-offset", 1}}); // Prevent the last token from attending

        if (_n_past + tokens.size() > _ctx->size()) {
            __WARN("Context limit exceeded ({} + {} > {})", _n_past, tokens.size(), _ctx->size());
            callback("", Sentence::END);
            return true;
        }

        // Convert tokens to embeddings
        // reset embedding vector to make space for the next runs
        embedding.clear();
        convertTokensToEmbeddings(tokens, embedding, embedBufSize, m_t2eCallback);

        if (!engine.process(embedding, attention_map, logits, true))
            return Dialog::abort("initial inference for SSD pipeline failed", callback);

        // Only the first (real) token is committed; forecast tokens are scratch.
        _n_past += 1;
        update_kv(_n_past, {});

        // Use existing as much as possible
        status = processFollowOnGeneration(tokens, logits, callback);
    } else {
        // --- Multistream path: fork the top-K first tokens into parallel streams ---
        std::vector<std::vector<int32_t>> streams;
        getTopK(logits, streams, _n_streams, _p_threshold, callback);

        if (!m_t2eCallback) {
            for (auto& stream : streams) {
                if (!callback(_tokenizer->decode(stream) + "\n", Sentence::BEGIN)) return true;
            }
            callback("", Sentence::END);
            return true;
        }

        // Mark TTFT
        _kpis.prompt.update(start.elapsed_usec());
        start.reset();
        State::busy(true);

        if (streams.size() == 0) {
            callback("\n", Sentence::END);
            return true;
        }

        // Initial inference for self-speculative decoding pipeline with forecast tokens and prefix
        // process separately because logits are required for these tokens
        attention_map.resize(1 + _draft);
        std::iota(attention_map.begin(), attention_map.end(), -1);

        std::vector<size_t> stream_indices(streams.size());
        std::iota(stream_indices.begin(), stream_indices.end(), 0);

        std::vector<int32_t> multi_attn_mask;
        std::vector<size_t> past_map; // no committed per-stream past tokens yet
        const size_t kvPrefixOffset = 1;

        tileAttentionMask(attention_map, stream_indices, past_map, kvPrefixOffset, multi_attn_mask);

        // Accumulate input tokens from all streams
        std::vector<int32_t> multi_tokens;

        multi_tokens.reserve(streams.size() * (1 + _draft));
        for (int i = 0; i < streams.size(); i++) {
            multi_tokens.insert(multi_tokens.end(), streams[i].begin(), streams[i].end());
            // NOTE(review): inner loop shadows the outer `i` — works, but worth renaming.
            for (int i = 0; i < _draft; ++i) {
                multi_tokens.push_back(_forecast_token_offset + i);
            }
        }

        // Convert tokens to embeddings
        // reset embedding vector to make space for the next runs
        embedding.clear();
        convertTokensToEmbeddings(multi_tokens, embedding, embedBufSize, m_t2eCallback);

        if (_n_past + multi_tokens.size() > _ctx->size()) {
            __WARN("Context limit exceeded ({} + {} > {})", _n_past, multi_tokens.size(), _ctx->size());
            callback("", Sentence::END);
            return true;
        }

        if (!engine.process(embedding, multi_attn_mask, logits, true))
            return Dialog::abort("initial inference for SSD pipeline failed", callback);

        // Keep only each stream's first (real) token in the KV cache.
        std::vector<bool> selected(multi_tokens.size(), false);
        for (int i = 0; i < multi_tokens.size(); i+=(_draft+1)) {
            selected[i] = true;
        }

        _n_past += streams.size();
        update_kv(_n_past, selected);

        status = processFollowOnGeneration(streams, logits, callback);
    }

    _kpis.generate.update(start.elapsed_usec());
    _env->logger().post(Logger::KPIS, kpis().dump(" "));
    start.reset();

    return status;
}
843
+
844
+ bool SelfSpecDecDialog::process(std::vector<int32_t>& tokens, Dialog::Callback callback) {
845
+
846
+ // Check for prev failures and bail out early
847
+ if (State::failed()) return false;
848
+
849
+ Timer start;
850
+
851
+ if(m_inputType != InputType::TOKENS) {
852
+ __ERROR("Input type for model is not tokens.");
853
+ return false;
854
+ }
855
+
856
+ State::clear();
857
+
858
+ std::vector<float> logits;
859
+ auto& engine = *_engine["primary"];
860
+
861
+ auto update_kv = [&engine, &callback, this](size_t past, const std::vector<bool>& selected) {
862
+ if (!engine.updateKV(past, selected))
863
+ return Dialog::abort("context size exceeded", callback);
864
+ return true;
865
+ };
866
+
867
+ using FF = Engine::Feature::Flags;
868
+ if (engine.supports(FF::DYNAMIC_LOAD)) engine.load();
869
+
870
+ _env->logger().post(Logger::KPIS, kpis().dump(" "));
871
+ start.reset();
872
+
873
+ engine.set({{"kv-prefix-skip", _forecast_prefix}});
874
+
875
+ std::vector<int32_t> attention_map(tokens.size());
876
+ std::iota(attention_map.begin(), attention_map.end(), -1);
877
+
878
+ // Process prompt
879
+ _n_prompt += tokens.size();
880
+ engine.set({{"kv-prefix-offset", tokens.size()}}); // Do not attend prefix
881
+
882
+ if (_n_past + tokens.size() > _ctx->size()) {
883
+ __WARN("Context limit exceeded ({} + {} > {})", _n_past, tokens.size(), _ctx->size());
884
+ callback("", Sentence::END);
885
+ return true;
886
+ }
887
+
888
+ if (!engine.process(tokens, attention_map, logits, false))
889
+ return Dialog::abort("engine prompt processing failed", callback);
890
+ _n_past += tokens.size();
891
+ update_kv(_n_past, {});
892
+
893
+ bool status = true;
894
+ if (_n_streams <= 1) {
895
+ tokens[0] = sample_to_verify(std::span{logits.data(),logits.size()}, 0);
896
+ tokens.resize(1);
897
+
898
+ // Decode the first token.
899
+ _last_tok = tokens[0];
900
+ if (_ctx->is_eos(_last_tok)) {
901
+ callback("", Sentence::END);
902
+ return true;
903
+ }
904
+
905
+ if (!callback(_tokenizer->decode(tokens), Sentence::BEGIN)) return true;
906
+ // decode_token(tokens[0]);
907
+
908
+ // Mark TTFT
909
+ _kpis.prompt.update(start.elapsed_usec());
910
+ start.reset();
911
+ State::busy(true);
912
+
913
+ // Initial inference for self-speculative decoding pipeline with forecast tokens and prefix
914
+ // process separately because logits are required for these tokens
915
+ for (int i = 0; i < _draft; ++i)
916
+ tokens.push_back(_forecast_token_offset + i);
917
+
918
+ attention_map.resize(tokens.size());
919
+ std::iota(attention_map.begin(), attention_map.end(), -1);
920
+ engine.set({{"kv-prefix-offset", 1}}); // Prevent the last token from attending
921
+
922
+ if (_n_past + tokens.size() > _ctx->size()) {
923
+ __WARN("Context limit exceeded ({} + {} > {})", _n_past, tokens.size(), _ctx->size());
924
+ callback("", Sentence::END);
925
+ return true;
926
+ }
927
+
928
+ if (!engine.process(tokens, attention_map, logits, true))
929
+ return Dialog::abort("initial inference for SSD pipeline failed", callback);
930
+
931
+ _n_past += 1;
932
+ update_kv(_n_past, {});
933
+
934
+ status = processFollowOnGeneration(tokens, logits, callback);
935
+ } else {
936
+ std::vector<std::vector<int32_t>> streams;
937
+ getTopK(logits, streams, _n_streams, _p_threshold, callback);
938
+
939
+ // Mark TTFT
940
+ _kpis.prompt.update(start.elapsed_usec());
941
+ start.reset();
942
+ State::busy(true);
943
+
944
+ if (streams.size() == 0) {
945
+ callback("\n", Sentence::END);
946
+ return true;
947
+ }
948
+
949
+ // Initial inference for self-speculative decoding pipeline with forecast tokens and prefix
950
+ // process separately because logits are required for these tokens
951
+ attention_map.resize(1 + _draft);
952
+ std::iota(attention_map.begin(), attention_map.end(), -1);
953
+
954
+ std::vector<size_t> stream_indices(streams.size());
955
+ std::iota(stream_indices.begin(), stream_indices.end(), 0);
956
+
957
+ std::vector<int32_t> multi_attn_mask;
958
+ std::vector<size_t> past_map;
959
+ const size_t kvPrefixOffset = 1;
960
+
961
+ tileAttentionMask(attention_map, stream_indices, past_map, kvPrefixOffset, multi_attn_mask);
962
+
963
+ // Accumulate input tokens from all streams
964
+ std::vector<int32_t> multi_tokens;
965
+
966
+ multi_tokens.reserve(streams.size() * (1 + _draft));
967
+ for (int i = 0; i < streams.size(); i++) {
968
+ multi_tokens.insert(multi_tokens.end(), streams[i].begin(), streams[i].end());
969
+ for (int i = 0; i < _draft; ++i) {
970
+ multi_tokens.push_back(_forecast_token_offset + i);
971
+ }
972
+ }
973
+
974
+ if (_n_past + multi_tokens.size() > _ctx->size()) {
975
+ __WARN("Context limit exceeded ({} + {} > {})", _n_past, multi_tokens.size(), _ctx->size());
976
+ callback("", Sentence::END);
977
+ return true;
978
+ }
979
+
980
+ if (!engine.process(multi_tokens, multi_attn_mask, logits, true))
981
+ return Dialog::abort("initial inference for SSD pipeline failed", callback);
982
+
983
+ std::vector<bool> selected(multi_tokens.size(), false);
984
+ for (int i = 0; i < multi_tokens.size(); i+=(_draft+1)) {
985
+ selected[i] = true;
986
+ }
987
+
988
+ _n_past += streams.size();
989
+ update_kv(_n_past, selected);
990
+
991
+ status = processFollowOnGeneration(streams, logits, callback);
992
+ }
993
+
994
+ _kpis.generate.update(start.elapsed_usec());
995
+ _env->logger().post(Logger::KPIS, kpis().dump(" "));
996
+ start.reset();
997
+
998
+ return status;
999
+ }
1000
+
1001
// Reset the dialog and re-seed the primary engine's KV cache with the
// pre-computed forecast prefix. Throws if the restored prefix length does not
// match the configured _forecast_prefix, since the engine-side cache would
// otherwise be inconsistent with _n_past.
void SelfSpecDecDialog::reset() {
    Dialog::reset();
    _n_past = _forecast_prefix;
    // Reload the forecast-prefix KV cache identified by _kv_prefix_name.
    size_t n_restored_prefix = _engine["primary"]->restore(_kv_prefix_name);
    if (n_restored_prefix != _forecast_prefix) {
        // clang-format off
        throw std::runtime_error( fmt::format( "SSD : Loaded {} KV$ from {} but expected {} KV$",
                                               n_restored_prefix, _kv_prefix_name, _forecast_prefix ) );
        // clang-format on
    }
}
1012
+
1013
+ bool SelfSpecDecDialog::save(const std::string& name) {
1014
+ if (_n_streams > 1) {
1015
+ throw std::runtime_error("Save is unsupported for multistream dialogs.");
1016
+ }
1017
+ return Dialog::save(name);
1018
+ }
1019
+
1020
+ bool SelfSpecDecDialog::restore(const std::string& name) {
1021
+ if (_n_streams > 1) {
1022
+ throw std::runtime_error("Restore is unsupported for multistream dialogs.");
1023
+ }
1024
+ return Dialog::restore(name);
1025
+ }
1026
+
1027
// Registrator instance: runs at static-initialization time and registers the
// "ssd-q1" dialog type with the Dialog factory so configs can instantiate it.
static OnLoad regy([]() {
    Dialog::__register(
        "ssd-q1",
        [](std::shared_ptr<Env> env, const std::string& name, const json& conf) {
            return (Dialog*)new SelfSpecDecDialog(env, name, conf);
        }
    );
});
1036
+
1037
// Register ssd sampler for compatibility
// NOTE(review): despite the comment above, this registers BasicSampler under
// the "basic" key — presumably so SSD configs referencing the basic sampler
// keep working when this translation unit is linked standalone; confirm intent.
static OnLoad sampler_regy([]() {
    Sampler::__register("basic", [](Context& ctx, const json& conf) {
        return (Sampler*)new BasicSampler(ctx, conf);
    });
});
1043
+
1044
// Linker anchor: referencing this no-op from another translation unit forces
// this object file (and its static OnLoad registrators) to be linked in.
void needSsdDialog() {}
1045
+
1046
+ } // namespace qualla
Genie/Genie/src/qualla/embedding.cpp ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #include <qualla/embedding.hpp>
10
+ #include <qualla/logger.hpp>
11
+ #include <qualla/detail/config.hpp>
12
+ #include <qualla/detail/timer.hpp>
13
+
14
+ #include <functional>
15
+ #include <fstream>
16
+ #include <string>
17
+ #include <unordered_map>
18
+ #include <filesystem>
19
+
20
+ #include <fmt/format.h>
21
+ #include <fmt/ranges.h>
22
+
23
+ namespace fs = std::filesystem;
24
+
25
+ namespace qualla {
26
+
27
// Construct an embedding pipeline from a JSON config:
//   - "prompt.tags": optional [prefix, suffix] wrapped around every query
//   - "context":     context configuration (created first; other parts need it)
//   - "tokenizer":   path (relative to the models dir) of the tokenizer model
//   - "engine":      mandatory engine configuration
//   - "truncate-input": if true, over-long inputs are truncated instead of rejected
// Throws if the engine cannot emit embeddings. Records init time in KPIs.
Embedding::Embedding(std::shared_ptr<Env> env, const std::string& name, const qualla::json& json)
    : _name(name), _env(env) {
    Timer start;

    _env->logger().debug(fmt::format("embedding-new: {} config {}", name, json.dump()));

    using qc = qualla::Config;

    // Parse prompt config
    const qualla::json& pmt_conf = qc::optional<qualla::json>(json, "prompt", {});
    _tags = qc::optional<std::vector<std::string>>(pmt_conf, "tags", {"", ""});

    // Create the context first
    _ctx = Context::create(*_env, name, qc::optional<qualla::json>(json, "context", {}));

    // Create Tokenizer
    fs::path tok_path = _env->path().models / qc::mandatory<std::string>(json, "tokenizer");
    _tokenizer = Tokenizer::create(*_ctx, tok_path);

    // Create Engine
    const qualla::json& eng_conf = qc::mandatory<qualla::json>(json, "engine");
    _engine = Engine::create(*_ctx, eng_conf);

    // Truncation of input to context
    _input_truncation = qc::optional<qualla::json>(json, "truncate-input", false);

    // An embedding pipeline is only meaningful with an embedding-output engine.
    using FF = Engine::Feature::Flags;
    if (!_engine->supports(FF::OUTPUT_EMBEDDINGS))
        throw std::runtime_error("engine must output embeddings");

    _kpis.init.update(start.elapsed_usec());
}
59
+
60
+ Embedding::~Embedding() {}
61
+
62
+ bool Embedding::process(std::vector<int32_t>& tokens, std::vector<float>& output) {
63
+ Timer start;
64
+
65
+ State::clear();
66
+
67
+ size_t n = _engine->process(tokens, output, false);
68
+ if (!n) {
69
+ State::error("engine prompt processing failed");
70
+ return false;
71
+ }
72
+
73
+ _n_prompt += tokens.size();
74
+
75
+ // Clean the buffer before using
76
+ _output_dimensions.clear();
77
+
78
+ uint64_t output_size = 1;
79
+ // push number of tokens present in the result.
80
+ _output_dimensions.push_back(n);
81
+ // push back the dimension of the each embedding
82
+ _output_dimensions.push_back(_ctx->n_embd());
83
+
84
+ output_size = n * _ctx->n_embd();
85
+
86
+ output.resize(output_size);
87
+
88
+ _kpis.prompt.update(start.elapsed_usec());
89
+
90
+ // Log latest KPIs in a single line
91
+ _env->logger().post(Logger::KPIS, kpis().dump(" "));
92
+
93
+ return true;
94
+ }
95
+
96
+ bool Embedding::query(const std::string& str, std::vector<float>& output) {
97
+ std::string p_str; // prompt string
98
+ std::vector<int32_t> p_vec; // prompt tokens
99
+
100
+ p_vec.reserve(_ctx->n_ctx());
101
+
102
+ p_str = _tags[0] + str + _tags[1];
103
+
104
+ _env->logger().debug(fmt::format("embedding-query: {}", str));
105
+ _env->logger().debug(fmt::format("embedding-prompt: {}", p_str));
106
+
107
+ _n_queries++;
108
+
109
+ _tokenizer->encode(p_str, p_vec);
110
+
111
+ _env->logger().debug(fmt::format("embedding-tokens: {}", p_vec));
112
+
113
+ if(p_vec.size() > (_ctx->n_ctx())){ // Condition to not allow input to exceed context.
114
+ if(_input_truncation == false){
115
+ throw std::runtime_error("Input exceeds the context of the model.");
116
+ }
117
+ else{
118
+ p_vec.resize(_ctx->n_ctx());
119
+ }
120
+ }
121
+
122
+ return process(p_vec, output);
123
+ }
124
+
125
+ // Embedding KPIs helpers
126
+
127
+
128
+ void Embedding::output_dimensions(std::vector<std::uint32_t>& outputDimensions){
129
+ outputDimensions = _output_dimensions;
130
+ }
131
+
132
+ // Get latest KPIs
133
+ Embedding::KPIs& Embedding::kpis() {
134
+ // Update TPS
135
+ if (_n_prompt) {
136
+ float t = _kpis.prompt.total_usec / _n_prompt;
137
+ _kpis.tps.prompt = 1000000.0 / (t ? t : 1000000.0);
138
+ }
139
+
140
+ // We could synthesize more KPIs from from other layers (engine, sampler, etc)
141
+ return _kpis;
142
+ }
143
+
144
+ std::string Embedding::KPIs::dump(std::string_view sep) const {
145
+ return fmt::format(
146
+ "init:[{}]{}prompt:[{}]{} tps-prompt:{:.2f}",
147
+ init.dump(),
148
+ sep,
149
+ prompt.dump(),
150
+ sep,
151
+ tps.prompt
152
+ );
153
+ }
154
+
155
+ void Embedding::KPIs::reset() {
156
+ init.reset();
157
+ prompt.reset();
158
+ tps.prompt = 0.0;
159
+ }
160
+
161
// Create API

// Build an Embedding directly from a parsed JSON config.
std::unique_ptr<Embedding> Embedding::create(
    std::shared_ptr<Env> env,
    const std::string& name,
    const qualla::json& conf
) {
    return std::make_unique<Embedding>(env, name, conf);
}

// Build an Embedding from a stream of JSON config text.
std::unique_ptr<Embedding> Embedding::create(
    std::shared_ptr<Env> env,
    const std::string& name,
    std::istream& json_stream
) {
    return create(env, name, json::parse(json_stream));
}

// Build an Embedding from a JSON config file on disk.
// Throws std::runtime_error when the file does not exist.
std::unique_ptr<Embedding> Embedding::create(
    std::shared_ptr<Env> env,
    const std::string& name,
    const fs::path& json_path
) {
    if (!fs::exists(json_path))
        throw std::runtime_error(json_path.string() + ": file does not exist");
    std::ifstream ifs(json_path);
    return create(env, name, ifs);
}
189
+
190
+ } // namespace qualla
Genie/Genie/src/qualla/engine.cpp ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #include <qualla/engine.hpp>
10
+ #include <qualla/detail/kpi.hpp>
11
+ #include <qualla/detail/config.hpp>
12
+
13
+ #include <functional>
14
+ #include <iostream>
15
+ #include <sstream>
16
+ #include <string>
17
+ #include <unordered_map>
18
+
19
+ #include <fmt/format.h>
20
+ #include <fmt/ranges.h>
21
+
22
+ namespace qualla {
23
+
24
// Base engine constructor: records the engine type, binds the owning context
// and environment, and reads the optional "role" config key (default
// "primary") that dialogs use to address engines.
Engine::Engine(Context& ctx, const std::string& type, const qualla::json& conf)
    : _type(type), _ctx(ctx), _env(ctx.env()) {
    _env.logger().debug(
        fmt::format("engine-new: {} ctx {} config {}", type, _ctx.name(), conf.dump())
    );

    using qc = qualla::Config;
    _role = qc::optional<std::string>(conf, "role", "primary");
}
33
+
34
+ Engine::~Engine() {}
35
+
36
// ---- Default (unsupported) implementations ----------------------------------
// Each virtual below is a base-class fallback: it logs an error identifying
// the engine type and returns a failure/neutral value. Concrete engines
// override the subset of operations they actually support.

size_t Engine::process(
    const std::vector<int32_t>& tokens,
    const std::vector<int32_t>& attention_map,
    std::vector<float>& output,
    bool output_all
) {
    _env.logger().error(fmt::format("{}-engine does not support attention_map", _type));
    return 0;
}

size_t Engine::process(const std::vector<int32_t>& tokens) {
    // Derived engines should overwrite this to avoid copying logits
    std::vector<float> logits;
    return process(tokens, logits);
}

size_t Engine::process(
    std::vector<uint8_t>& embeddings,
    const std::vector<int32_t>& attention_map,
    std::vector<float>& output,
    bool output_all
) {
    _env.logger().error(fmt::format("{}-engine does not support embedding as input", _type));
    return 0;
}

bool Engine::updateKV(size_t n_past) {
    _env.logger().error(fmt::format("{}-engine does not support sync", _type));
    return false;
}

bool Engine::updateKV(size_t n_past, const std::vector<bool>& selected) {
    _env.logger().error(fmt::format("{}-engine does not support sync with selected", _type));
    return false;
}

size_t Engine::restore(const std::string& name) {
    _env.logger().error(fmt::format("{}-engine does not support restore", _type));
    return 0;
}

bool Engine::save(const std::string& name) {
    _env.logger().error(fmt::format("{}-engine does not support save", _type));
    return false;
}

void Engine::reset() {
    _env.logger().error(fmt::format("{}-engine does not support reset", _type));
}

bool Engine::load() {
    _env.logger().error(fmt::format("{}-engine does not support dynamic load", _type));
    // NOTE(review): `0` converts to false — same meaning as the other stubs,
    // just spelled as an int literal.
    return 0;
}

bool Engine::unload() {
    _env.logger().error(fmt::format("{}-engine does not support dynamic unload", _type));
    return false;
}

bool Engine::set(qualla::json data) {
    _env.logger().error(fmt::format("{}-engine does not support set()", _type));
    return false;
}

qualla::json Engine::get() {
    _env.logger().error(fmt::format("{}-engine does not support get()", _type));
    // NOTE(review): this yields json(false), not an empty object — callers
    // presumably test truthiness; confirm before changing.
    return false;
}

bool Engine::cacheEosEmbedding(std::vector<uint8_t>& eosEmbedding) {
    _env.logger().error(fmt::format("{}-engine does not support cache eos embedding", _type));
    // NOTE(review): unlike the other stubs this returns true after logging an
    // error — looks like an intentional best-effort no-op; confirm.
    return true;
}

size_t Engine::getEmbeddingBufferSize() {
    _env.logger().error(fmt::format("{}-engine does not support embedding vectors", _type));
    return 0;
}

// Default input modality: plain token IDs.
qualla::InputType Engine::getInputType(){
    return qualla::InputType::TOKENS;
}
119
+
120
// Engine KPIs

// Render the four engine timing KPIs on one line, separated by `sep`.
std::string Engine::KPIs::dump(std::string_view sep) const {
    return fmt::format(
        "load:[{}]{}process:[{}]{}update-kv:[{}]{}unload:[{}]",
        load.dump(),
        sep,
        process.dump(),
        sep,
        update_kv.dump(),
        sep,
        unload.dump()
    );
}

// Zero all timing accumulators.
void Engine::KPIs::reset() {
    load.reset();
    process.reset();
    update_kv.reset();
    unload.reset();
}
141
+
142
// Engine registry type string + creator function
using Registry = std::unordered_map<std::string, Engine::Creator>;
// Created lazily on first registration: registrators run from static OnLoad
// hooks, so static-initialization order cannot be relied upon.
static std::unique_ptr<Registry> registry;

// Register (or replace) the creator function for engine type `type`.
void Engine::__register(const std::string& type, Creator func) {
    if (!registry) registry = std::make_unique<Registry>();

    Registry& r = *registry;
    r[type] = func;
}
152
+
153
+ std::unique_ptr<Engine> Engine::create(Context& ctx, const qualla::json& conf) {
154
+ using qc = qualla::Config;
155
+
156
+ std::string type = qc::mandatory<std::string>(conf, "type");
157
+
158
+
159
+ if (!registry) throw std::runtime_error(type + ": engine not found");
160
+
161
+ Registry& r = *registry;
162
+
163
+
164
+ if (!r.contains(type)) throw std::runtime_error(type + ": engine not found");
165
+
166
+
167
+ return std::unique_ptr<Engine>(r[type](ctx, conf));
168
+ }
169
+
170
// Build an engine from a stream of JSON config text.
std::unique_ptr<Engine> Engine::create(Context& ctx, std::istream& json_stream) {
    return create(ctx, json::parse(json_stream));
}

// Build an engine from a JSON config string.
std::unique_ptr<Engine> Engine::create(Context& ctx, const std::string& json_str) {
    return create(ctx, json::parse(json_str));
}
177
+
178
+ std::vector<std::string> Engine::list() {
179
+ std::vector<std::string> v;
180
+ if (!registry) return v;
181
+
182
+ Registry& r = *registry;
183
+
184
+ for (auto k : r)
185
+ v.push_back(k.first);
186
+ return v;
187
+ }
188
+
189
// Base-class fallbacks: LoRA adapters are only meaningful for engines that
// override these; log an error and report failure otherwise.
// NOTE(review): both take std::string by value — matches the declaration
// elsewhere, so the signature is left as-is.
bool Engine::applyLoraAdapter(std::string lora_adapter_name) {
    _env.logger().error(fmt::format("{}-engine does not support LoraAdapter", _type));
    return false;
}
bool Engine::applyLoraStrength(std::string tensor_name, float tensor_val) {
    _env.logger().error(fmt::format("{}-engine does not support setLoraStrength", _type));
    return false;
}
197
+
198
+ } // namespace qualla
Genie/Genie/src/qualla/engines/lib.cpp ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ // Just a stub for building qualla::engines when no built-in engines are enabled
Genie/Genie/src/qualla/engines/qnn-api/BackendExtensions.cpp ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #include "dlwrap.hpp"
10
+ #include "BackendExtensions.hpp"
11
+ #include "NetRunBackend.hpp"
12
+
13
// Store configuration and collaborator handles only; the backend interface
// itself is created later, in initialize().
BackendExtensions::BackendExtensions(
    BackendExtensionsConfigs backendExtensionsConfig,
    void* backendLibHandle,
    PerfProfile perfProfile,
    std::shared_ptr<ICommandLineManager> clManager,
    bool debug_qnn
)
    : m_backendExtensionsLibPath(backendExtensionsConfig.sharedLibraryPath),
      m_backendExtensionsConfigPath(backendExtensionsConfig.configFilePath),
      m_backendInterface(nullptr), m_isNetRunBackendInterface(false),
      m_createBackendInterfaceFn(nullptr), m_destroyBackendInterfaceFn(nullptr),
      m_backendLibHandle(backendLibHandle), m_perfProfile(perfProfile), m_clManager(clManager),
      m_debugQnn(debug_qnn) {
    // Silences the unused-member warning; m_perfProfile is consumed in initialize().
    (void)m_perfProfile;
}
28
+
29
+ BackendExtensions::~BackendExtensions() {
30
+ if (nullptr != m_backendInterface) {
31
+ if (m_isNetRunBackendInterface) {
32
+ QNN_DEBUG("Deleting NetRun Backend Interface");
33
+ delete m_backendInterface;
34
+ } else {
35
+ if (nullptr != m_destroyBackendInterfaceFn) {
36
+ QNN_DEBUG("Destroying Backend Interface");
37
+ m_destroyBackendInterfaceFn(m_backendInterface);
38
+ }
39
+ }
40
+ }
41
+ }
42
+
43
+ bool BackendExtensions::loadFunctionPointers() {
44
+
45
+ void* libHandle = dlopen(m_backendExtensionsLibPath.c_str(), RTLD_NOW | RTLD_LOCAL);
46
+ if (nullptr == libHandle) {
47
+ QNN_ERROR(
48
+ "Unable to load backend extensions lib: [%s]. dlerror(): [%s]",
49
+ m_backendExtensionsLibPath.c_str(),
50
+ dlerror()
51
+ );
52
+ return false;
53
+ }
54
+ m_createBackendInterfaceFn =
55
+ (CreateBackendInterfaceFnType_t)dlsym(libHandle, "createBackendInterface");
56
+ m_destroyBackendInterfaceFn =
57
+ (DestroyBackendInterfaceFnType_t)dlsym(libHandle, "destroyBackendInterface");
58
+ if (nullptr == m_createBackendInterfaceFn || nullptr == m_destroyBackendInterfaceFn) {
59
+ QNN_ERROR("Unable to find symbols. dlerror(): [%s]", dlerror());
60
+ return false;
61
+ }
62
+
63
+ return true;
64
+ }
65
+
66
+ void BackendExtensions::qnnLogCallback(
67
+ const char* fmt,
68
+ QnnLog_Level_t level,
69
+ uint64_t timestamp,
70
+ va_list args
71
+ ) {
72
+ char buffer[1024] = "";
73
+ const char* levelStr = "";
74
+ switch (level) {
75
+ case QNN_LOG_LEVEL_ERROR:
76
+ levelStr = " ERROR ";
77
+ break;
78
+ case QNN_LOG_LEVEL_WARN:
79
+ levelStr = "WARNING";
80
+ break;
81
+ case QNN_LOG_LEVEL_INFO:
82
+ levelStr = " INFO ";
83
+ break;
84
+ case QNN_LOG_LEVEL_DEBUG:
85
+ levelStr = " DEBUG ";
86
+ break;
87
+ case QNN_LOG_LEVEL_VERBOSE:
88
+ levelStr = "VERBOSE";
89
+ break;
90
+ case QNN_LOG_LEVEL_MAX:
91
+ levelStr = "UNKNOWN";
92
+ break;
93
+ }
94
+
95
+ int pos = snprintf(
96
+ buffer, sizeof(buffer), "QNN: [%s] time=%lu:", levelStr, (unsigned long)timestamp
97
+ );
98
+ vsnprintf(buffer + pos, sizeof(buffer) - pos, fmt, args);
99
+ printf("%s", buffer);
100
+ }
101
+
102
// Create and configure the backend interface. Ordering matters:
//   1. choose implementation (NetRun fallback vs. dlopen'd extensions lib)
//   2. optionally hook up QNN verbose logging
//   3. initialize the interface with the backend library handle
//   4. apply perf profile (failure is non-fatal), config file, CLI args
// Returns false on any fatal step.
bool BackendExtensions::initialize() {

    QNN_DEBUG("DEBUG: m_backendExtensionsLibPath=%s\n", m_backendExtensionsLibPath.c_str());
    QNN_DEBUG("DEBUG: m_backendExtensionsConfigPath=%s\n", m_backendExtensionsConfigPath.c_str());
    // No extensions lib configured: fall back to the built-in NetRun interface.
    if (m_backendExtensionsLibPath.empty() && m_backendExtensionsConfigPath.empty()) {
        QNN_WARN("No BackendExtensions lib provided; initializing NetRunBackend Interface");
        m_isNetRunBackendInterface = true;
        m_backendInterface = new NetRunBackend();
    } else {
        QNN_DEBUG("Loading supplied backend extensions lib.");
        QNN_DEBUG("Backend extensions lib path: %s", m_backendExtensionsLibPath.c_str());
        if (m_backendExtensionsConfigPath.empty()) {
            QNN_DEBUG("Backend extensions lib specified without a config file.");
        } else {
            QNN_DEBUG("Backend extensions config path: %s", m_backendExtensionsConfigPath.c_str());
        }
        if (!loadFunctionPointers()) {
            QNN_ERROR("Failed to load function pointers.");
            return false;
        }
        if (nullptr != m_createBackendInterfaceFn) {
            m_backendInterface = m_createBackendInterfaceFn();
        }
    }
    if (nullptr == m_backendInterface) {
        QNN_ERROR("Unable to load backend extensions interface.");
        return false;
    }
    if (m_debugQnn) {
        if (!(m_backendInterface->setupLogging(BackendExtensions::qnnLogCallback, QNN_LOG_LEVEL_VERBOSE))) {
            QNN_WARN("Unable to initialize logging in backend extensions.");
        }
    }
    if (!m_backendInterface->initialize(m_backendLibHandle)) {
        QNN_ERROR("Unable to initialize backend extensions interface.");
        return false;
    }
    // Perf-profile failure is deliberately tolerated (warn only).
    if (!m_backendInterface->setPerfProfile(m_perfProfile)) {
        QNN_WARN("Unable to set perf profile in backend extensions interface.");
        //return false;
    }
    if (!m_backendInterface->loadConfig(m_backendExtensionsConfigPath)) {
        QNN_ERROR("Unable to load backend extensions interface config.");
        return false;
    }

    if ((m_clManager != nullptr) && !m_backendInterface->loadCommandLineArgs(m_clManager)) {
        QNN_ERROR("Unable to load backend extensions' command line arguments.");
        return false;
    }

    return true;
}
155
+
156
// Expose the active IBackend implementation (NetRun fallback or the one
// created by the extensions library). Nullptr until initialize() succeeds.
IBackend* BackendExtensions::interface() {
    return m_backendInterface;
}
Genie/Genie/src/qualla/engines/qnn-api/BackendExtensions.hpp ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #pragma once
10
+
11
+ #include <string>
12
+
13
+ #include "IBackend.hpp"
14
+ #include "QnnConfig.hpp"
15
+ #include "Log.hpp"
16
+
17
+ // This is a wrapper class that handles resources/state related to
18
+ // backend extensions interface. This is used by QnnNetRun library
19
+ // to manage and call into an IBackend interface implementation.
20
+ // Functionality present in this class:
21
+ // 1. Receives the argument string related to backend_extensions
22
+ // argument from the front end and processes it to open the
23
+ // backend extensions library.
24
+ // 2. Locates and stores symbols for creating and destroying the
25
+ // IBackend interface implementation.
26
+ // 3. If there is no backend_extensions argument, this class creates
27
+ // the dummy IBackend implementation aka NetRunBackend.
28
+ // 4. Gives QnnNetRun access to the implementation itself through
29
+ // interface() function.
30
class BackendExtensions final {
  public:
    // Stores config; creates nothing until initialize() is called.
    BackendExtensions(
        BackendExtensionsConfigs backendExtensionsConfig,
        void* backendLibHandle,
        PerfProfile perfProfile,
        std::shared_ptr<ICommandLineManager> clManager =
            std::shared_ptr<ICommandLineManager>(nullptr),
        bool debug_qnn = false
    );
    ~BackendExtensions();
    // Creates/loads the IBackend implementation; false on failure.
    bool initialize();
    // Access the active IBackend (nullptr before a successful initialize()).
    IBackend* interface();

  private:
    // dlopen the extensions lib and resolve the create/destroy entry points.
    bool loadFunctionPointers();
    std::string m_backendExtensionsLibPath;     // path to extensions .so ("" = use NetRun fallback)
    std::string m_backendExtensionsConfigPath;  // optional config file for the extensions lib
    IBackend* m_backendInterface;               // owned: deleted/destroyed in dtor
    bool m_isNetRunBackendInterface;            // true when using the built-in fallback
    CreateBackendInterfaceFnType_t m_createBackendInterfaceFn;
    DestroyBackendInterfaceFnType_t m_destroyBackendInterfaceFn;
    void* m_backendLibHandle;                   // non-owning backend lib handle
    PerfProfile m_perfProfile;
    std::shared_ptr<ICommandLineManager> m_clManager;
    bool m_debugQnn{false};
    // Static bridge passed to the backend's setupLogging().
    static void qnnLogCallback(
        const char* fmt,
        QnnLog_Level_t level,
        uint64_t timestamp,
        va_list args
    );
};
Genie/Genie/src/qualla/engines/qnn-api/ClientBuffer.cpp ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #include "ClientBuffer.hpp"
10
+ #include "QnnTypeMacros.hpp"
11
+
12
// Return the raw data pointer held in the tensor's client buffer, or nullptr
// for a null tensor.
void* ClientBuffer::getBuffer(Qnn_Tensor_t* tensor) {
    if (!tensor) {
        QNN_WARN("getBuffer: received a null pointer to a tensor");
        return nullptr;
    }
    return QNN_TENSOR_GET_CLIENT_BUF(tensor).data;
}

// Return the size in bytes of the tensor's client buffer, or 0 for a null tensor.
size_t ClientBuffer::getBufferSize(Qnn_Tensor_t* tensor) {
    if (!tensor) {
        QNN_WARN("getBufferSize: received a null pointer to a tensor");
        return 0;
    }
    return QNN_TENSOR_GET_CLIENT_BUF(tensor).dataSize;
};
27
+
28
// Allocate a heap buffer of `tensorDataSize` bytes and attach it to the tensor
// as a RAW client buffer. Returns false on a null tensor or malloc failure.
// The buffer is released later via freeTensorBuffer().
bool ClientBuffer::allocateTensorBuffer(Qnn_Tensor_t* tensor, size_t tensorDataSize) {
    if (!tensor) {
        QNN_ERROR("Received nullptr for tensors");
        return false;
    }
    QNN_TENSOR_SET_MEM_TYPE(tensor, QNN_TENSORMEMTYPE_RAW);
    Qnn_ClientBuffer_t clientBuffer;
    clientBuffer.data = malloc(tensorDataSize);
    if (nullptr == clientBuffer.data) {
        QNN_ERROR("mem alloc failed for clientBuffer.data");
        return false;
    }
    clientBuffer.dataSize = tensorDataSize;
    QNN_TENSOR_SET_CLIENT_BUF(tensor, clientBuffer);
    return true;
}
44
+
45
+ bool ClientBuffer::freeTensorBuffer(Qnn_Tensor_t* tensor) {
46
+ if (!tensor) {
47
+ QNN_ERROR("Received nullptr for tensors");
48
+ return false;
49
+ }
50
+ if (QNN_TENSOR_GET_CLIENT_BUF(tensor).data) {
51
+ if (m_sameMemoryFreeTensors.find(tensor) == m_sameMemoryFreeTensors.end()) {
52
+ free(QNN_TENSOR_GET_CLIENT_BUF(tensor).data);
53
+ }
54
+ QNN_TENSOR_SET_CLIENT_BUF(tensor, Qnn_ClientBuffer_t({nullptr, 0u}));
55
+ QNN_TENSOR_SET_MEM_TYPE(tensor, QNN_TENSORMEMTYPE_UNDEFINED);
56
+ }
57
+ return true;
58
+ }
59
+
60
// Alias `dest` onto `src`'s client buffer: frees dest's own buffer first, then
// copies src's mem-type and buffer descriptor, and records dest as non-owning
// so a later freeTensorBuffer(dest) does not double-free src's memory.
bool ClientBuffer::useSameMemory(Qnn_Tensor_t* dest, Qnn_Tensor_t* src) {
    if (nullptr == dest || nullptr == src) {
        QNN_ERROR("Received nullptr");
        return false;
    }
    if (false == freeTensorBuffer(dest)) {
        return false;
    }

    QNN_TENSOR_SET_MEM_TYPE(dest, QNN_TENSOR_GET_MEM_TYPE(src));
    QNN_TENSOR_SET_CLIENT_BUF(dest, QNN_TENSOR_GET_CLIENT_BUF(src));
    m_sameMemoryFreeTensors.insert(dest);
    return true;
}
74
+
75
// Point `dest` at caller-owned memory `extMem`, keeping dest's current
// dataSize. The size is captured BEFORE freeTensorBuffer() zeroes the
// descriptor; dest is recorded as non-owning so the external memory is never
// free()d by this allocator.
bool ClientBuffer::useExternalMemory(Qnn_Tensor_t* dest, void* extMem) {
    if (nullptr == dest || nullptr == extMem) {
        QNN_ERROR("Received nullptr");
        return false;
    }

    Qnn_ClientBuffer_t clientBuffer;
    clientBuffer.data = extMem;
    // Capture the existing size before the descriptor is reset below.
    clientBuffer.dataSize = QNN_TENSOR_GET_CLIENT_BUF(dest).dataSize;
    if (false == freeTensorBuffer(dest)) {
        return false;
    }

    QNN_TENSOR_SET_MEM_TYPE(dest, QNN_TENSORMEMTYPE_RAW);
    QNN_TENSOR_SET_CLIENT_BUF(dest, clientBuffer);
    m_sameMemoryFreeTensors.insert(dest);
    return true;
}
93
+
94
// ---- Fused-buffer API --------------------------------------------------------
// Plain client (malloc) buffers do not support fused/shared allocations; these
// overrides are inert stubs that report failure or neutral values.

void* ClientBuffer::allocateTensorFusedBuffer(uint64_t bufferSize, int32_t* fd) {
    return nullptr;
}

bool ClientBuffer::mapFusedBufferOffset(
    Qnn_Tensor_t* tensor,
    size_t tensorDataSize,
    int32_t fd,
    uint32_t offset,
    uint64_t totalBufferSize,
    void* memPointer,
    Qnn_ContextHandle_t contextHandle
) {
    return false;
}

bool ClientBuffer::deregisterTensorFusedBuffer(Qnn_Tensor_t* tensor) {
    return false;
}

void ClientBuffer::freeFusedBuffers() {}

size_t ClientBuffer::getOffset(Qnn_Tensor_t* tensor) {
    return 0;
}

size_t ClientBuffer::getTotalBufferSize(Qnn_Tensor_t* tensor) {
    return 0;
}
Genie/Genie/src/qualla/engines/qnn-api/ClientBuffer.hpp ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #pragma once
10
+
11
+ #include "IBufferAlloc.hpp"
12
+ #include "Log.hpp"
13
+ #include <unordered_set>
14
+ #include <stdlib.h>
15
+
16
// Heap (malloc/free) backed implementation of IBufferAlloc, used when the
// platform shared-memory allocator is not requested. Fused-buffer operations
// are stubbed out — see ClientBuffer.cpp.
class ClientBuffer final : public IBufferAlloc {
 public:
  ClientBuffer() {};

  // Disable copy constructors, r-value referencing, etc
  ClientBuffer(const ClientBuffer&) = delete;

  ClientBuffer& operator=(const ClientBuffer&) = delete;

  ClientBuffer(ClientBuffer&&) = delete;

  ClientBuffer& operator=(ClientBuffer&&) = delete;

  // No setup is required for plain heap allocation.
  bool initialize() override { return true; };

  // Raw data pointer backing `tensor` (see .cpp).
  void* getBuffer(Qnn_Tensor_t* tensor) override;

  // Heap buffers have no file descriptor; warns and returns -1.
  int getFd(Qnn_Tensor_t* tensor) override {
    QNN_WARN("getFd: This is not ION memory");
    return -1;
  };

  size_t getOffset(Qnn_Tensor_t* tensor) override;
  size_t getBufferSize(Qnn_Tensor_t* tensor) override;
  size_t getTotalBufferSize(Qnn_Tensor_t* tensor) override;

  // malloc() a data buffer of tensorDataSize bytes and attach it to `tensor`.
  bool allocateTensorBuffer(Qnn_Tensor_t* tensor, size_t tensorDataSize) override;

  bool freeTensorBuffer(Qnn_Tensor_t* tensor) override;

  // Alias `dest` onto the buffer backing `src`. The offset overload is not
  // supported for heap buffers and always fails.
  bool useSameMemory(Qnn_Tensor_t* dest, Qnn_Tensor_t* src) override;
  bool useSameMemory(Qnn_Tensor_t* dest, Qnn_Tensor_t* src, int offset) override { return false; }

  // Back `dest` with caller-owned memory (never freed by this class).
  bool useExternalMemory(Qnn_Tensor_t* dest, void* extMem) override;

  // --- Fused-buffer API: unsupported for heap allocation; all stubs. ---
  void* allocateTensorFusedBuffer(uint64_t bufferSize, int32_t* fd) override;
  bool allocateBuffers(
      const std::map<int, std::map<std::string, size_t>>& allocs_per_chunk,
      std::map<std::string, std::pair<int, size_t>>& tensor_offsets
  ) override {
    return false;
  };

  bool mapFusedBufferOffset(
      Qnn_Tensor_t* tensor,
      size_t tensorDataSize,
      int32_t fd,
      uint32_t offset,
      uint64_t totalBufferSize,
      void* memPointer,
      Qnn_ContextHandle_t contextHandle
  ) override;
  bool deregisterTensorFusedBuffer(Qnn_Tensor_t* tensor) override;
  void freeFusedBuffers() override;

  bool mapFusedBufferOffset(
      Qnn_Tensor_t* tensor,
      int alloc_idx,
      size_t offset,
      Qnn_ContextHandle_t ctx,
      size_t size
  ) override {
    return false;
  }

  virtual ~ClientBuffer() {};

 private:
  // Tensors whose client buffer is aliased or externally owned; presumably
  // consulted by freeTensorBuffer() so these pointers are never free()d —
  // confirm against the .cpp implementation.
  std::unordered_set<Qnn_Tensor_t*> m_sameMemoryFreeTensors;
};
Genie/Genie/src/qualla/engines/qnn-api/IBackend.hpp ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #pragma once
10
+
11
+ #include <map>
12
+ #include "ICommandLineManager.hpp"
13
+ #include "QnnBackend.h"
14
+ #include "QnnContext.h"
15
+ #include "QnnGraph.h"
16
+ #include "QnnLog.h"
17
+ #include "QnnTypeDef.hpp"
18
+ #include "QnnProfile.h"
19
+ #include "QnnDevice.h"
20
+
21
+ // Compile-time definition to check for QNN SDK features using the QNN API version
22
+ #define QUALLA_QNN_API_VERSION \
23
+ (QNN_API_VERSION_MAJOR * 10000 + QNN_API_VERSION_MINOR * 100 + QNN_API_VERSION_PATCH)
24
+
25
+ const uint32_t g_profilingLevelNotSet = 0;
26
+
27
+ enum class PerfProfile {
28
+ LOW_BALANCED,
29
+ BALANCED,
30
+ DEFAULT,
31
+ HIGH_PERFORMANCE,
32
+ SUSTAINED_HIGH_PERFORMANCE,
33
+ BURST,
34
+ EXTREME_POWER_SAVER,
35
+ LOW_POWER_SAVER,
36
+ POWER_SAVER,
37
+ HIGH_POWER_SAVER,
38
+ SYSTEM_SETTINGS,
39
+ NO_USER_INPUT,
40
+ CUSTOM,
41
+ INVALID
42
+ };
43
+
44
+ // This is the interface that enables backend specific extensions in qnn-net-run.
45
+ // It is designed as hooks in the timeline of various events in NetRun.
46
+ // Backends that intend to implement custom features through qnn-net-run will have
47
+ // to implement this interface and add functionality in appropriate methods depending
48
+ // on where/when the custom functionality needs to be exercised.
49
+ // These functions/hooks will be called through the IBackend interface from within
50
+ // qnn-net-run wherever necessary.
51
class IBackend {
 public:
  virtual ~IBackend() {}

  // Route backend logging through the host's QNN log callback, capped at
  // maxLogLevel.
  virtual bool setupLogging(QnnLog_Callback_t callback, QnnLog_Level_t maxLogLevel) = 0;

  // One-time setup; backendLibHandle is the loaded backend library handle.
  virtual bool initialize(void* backendLibHandle) = 0;

  virtual bool setPerfProfile(PerfProfile perfProfile) = 0;

  virtual QnnProfile_Level_t getProfilingLevel() = 0;

  // Load backend-extension settings from a configuration file path.
  virtual bool loadConfig(std::string configFile) = 0;

  virtual bool loadCommandLineArgs(std::shared_ptr<ICommandLineManager> clManager) = 0;

  // --- Lifecycle hooks -------------------------------------------------
  // Each before*/after* pair brackets the corresponding QNN API call in the
  // host. The before* hooks may hand back backend-specific custom configs
  // through the out-parameters. NOTE(review): presumably a false return
  // aborts the surrounding operation — confirm against the NetRun caller.

  virtual bool beforeBackendInitialize(
      QnnBackend_Config_t*** customConfigs,
      uint32_t* configCount
  ) = 0;

  virtual bool afterBackendInitialize() = 0;

  virtual bool beforeContextCreate(
      QnnContext_Config_t*** customConfigs,
      uint32_t* configCount
  ) = 0;

  virtual bool afterContextCreate() = 0;

  virtual bool beforeComposeGraphs(
      GraphConfigInfo_t*** customGraphConfigs,
      uint32_t* graphCount
  ) = 0;

  virtual bool afterComposeGraphs() = 0;

#if QUALLA_QNN_API_VERSION >= 21700
  // Only available from QNN API 2.17.0: lets the backend amend per-graph
  // configs just before finalization.
  virtual bool beforeGraphFinalizeUpdateConfig(
      const char* graphName,
      Qnn_GraphHandle_t graphHandle,
      QnnGraph_Config_t*** customConfigs,
      uint32_t* configCount
  ) = 0;
#endif

  virtual bool beforeGraphFinalize() = 0;

  virtual bool afterGraphFinalize() = 0;

  virtual bool beforeRegisterOpPackages() = 0;

  virtual bool afterRegisterOpPackages() = 0;

  virtual bool beforeExecute(
      const char* graphName,
      QnnGraph_Config_t*** customConfigs,
      uint32_t* configCount
  ) = 0;

  virtual bool afterExecute() = 0;

  virtual bool beforeContextFree() = 0;

  virtual bool afterContextFree() = 0;

  virtual bool beforeBackendTerminate() = 0;

  virtual bool afterBackendTerminate() = 0;

  // Hooks around deserializing a context from a cached binary.
  virtual bool beforeCreateFromBinary(
      QnnContext_Config_t*** customConfigs,
      uint32_t* configCount
  ) = 0;

  virtual bool afterCreateFromBinary() = 0;

#if QUALLA_QNN_API_VERSION >= 21700
  // Hooks around creating multiple contexts from a binary list; per-context
  // custom configs are keyed by context name, with a common fallback set.
  virtual bool beforeCreateContextsFromBinaryList(
      std::map<std::string, std::tuple<QnnContext_Config_t**, uint32_t>>*
          contextKeyToCustomConfigsMap,
      QnnContext_Config_t*** commonCustomConfigs,
      uint32_t* commonConfigCount
  ) = 0;

  virtual bool afterCreateContextsFromBinaryList() = 0;
#endif

  virtual bool beforeCreateDevice(QnnDevice_Config_t*** deviceConfigs, uint32_t* configCount) = 0;

  virtual bool afterCreateDevice() = 0;

  virtual bool beforeFreeDevice() = 0;

  virtual bool afterFreeDevice() = 0;
};
147
+
148
+ // These are the function types that the backend extensions shared library is
149
+ // expected to expose. The first function helps NetRun obtain a valid implementation
150
+ // of IBackend interface and the second is used to destroy the same interface at the end.
151
+ // The function names themselves are expected to be these strings:
152
+ // 1. "createBackendInterface"
153
+ // 2. "destroyBackendInterface"
154
+ // These functions need to be tagged with extern "C" and their symbols need to be exposed.
155
+ typedef IBackend* (*CreateBackendInterfaceFnType_t)();
156
+ typedef void (*DestroyBackendInterfaceFnType_t)(IBackend*);
Genie/Genie/src/qualla/engines/qnn-api/IBufferAlloc.hpp ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #pragma once
10
+ #include "QnnTypes.h"
11
+ #include <map>
12
+ #include <string>
13
+ #include <vector>
14
+ #include <utility>
15
+ #include <unordered_map>
16
+
17
// Abstract allocator for QNN tensor data buffers. Concrete implementations
// include the plain heap allocator (ClientBuffer) and a shared-memory
// allocator; IOTensor selects between them at initialization.
class IBufferAlloc {
 public:
  virtual ~IBufferAlloc() {}
  IBufferAlloc() {}
  virtual bool initialize() = 0;
  // Raw data pointer backing `tensor`.
  virtual void* getBuffer(Qnn_Tensor_t* tensor) = 0;
  // File descriptor for shared-memory allocators; -1 where not applicable.
  virtual int getFd(Qnn_Tensor_t* tensor) = 0;
  // Byte offset of the tensor within its (possibly fused) allocation.
  virtual size_t getOffset(Qnn_Tensor_t* tensor) = 0;
  virtual size_t getBufferSize(Qnn_Tensor_t* tensor) = 0;
  // Size of the whole allocation containing this tensor (fused case).
  virtual size_t getTotalBufferSize(Qnn_Tensor_t* tensor) = 0;
  virtual bool allocateTensorBuffer(Qnn_Tensor_t* tensor, size_t tensorDataSize) = 0;
  virtual bool freeTensorBuffer(Qnn_Tensor_t* tensor) = 0;
  // Alias `dest` onto the buffer already backing `src` (optionally at a
  // byte offset into it).
  virtual bool useSameMemory(Qnn_Tensor_t* dest, Qnn_Tensor_t* src) = 0;
  virtual bool useSameMemory(Qnn_Tensor_t* dest, Qnn_Tensor_t* src, int offset) = 0;
  // Back `dest` with caller-owned memory (allocator must not free it).
  virtual bool useExternalMemory(Qnn_Tensor_t* dest, void* extMem) = 0;
  // --- Fused-buffer API: one shared allocation sliced across tensors. ---
  virtual void* allocateTensorFusedBuffer(uint64_t bufferSize, int32_t* fd) = 0;
  // Allocate one chunk per index in allocs_per_chunk and report each
  // tensor's (chunk index, byte offset) in tensor_offsets.
  virtual bool allocateBuffers(
      const std::map<int, std::map<std::string, size_t>>& allocs_per_chunk,
      std::map<std::string, std::pair<int, size_t>>& tensor_offsets
  ) = 0;
  virtual bool mapFusedBufferOffset(
      Qnn_Tensor_t* tensor,
      size_t tensorDataSize,
      int32_t fd,
      uint32_t offset,
      uint64_t totalBufferSize,
      void* memPointer,
      Qnn_ContextHandle_t contextHandle
  ) = 0;
  virtual bool mapFusedBufferOffset(
      Qnn_Tensor_t* tensor,
      int alloc_idx,
      size_t offset,
      Qnn_ContextHandle_t ctx,
      size_t size
  ) = 0;

  virtual bool deregisterTensorFusedBuffer(Qnn_Tensor_t* tensor) = 0;
  virtual void freeFusedBuffers() = 0;
};
Genie/Genie/src/qualla/engines/qnn-api/ICommandLineManager.hpp ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #pragma once
10
+
11
+ #include <cctype>
12
+ #include <memory>
13
+ #include <string>
14
+ #include <tuple>
15
+ #include <vector>
16
+
17
// Interface for parsing command line arguments of the form "--key=value" or
// "--key" and serving them to consumers exactly once. The static helpers
// below implement the shared key/value grammar and are usable without a
// concrete implementation.
class ICommandLineManager {
 public:
  enum class Error { SUCCESS, PARSE_FAILURE, UNUSED_ARGUMENTS, OVER_SUBSCRIBED_ARGUMENTS };

  using ValueList_t = std::vector<std::shared_ptr<const std::string>>;

  /**
   * @brief Parses provided command line arguments into key value pairs
   *
   * @param[in] argc Number of char* arguments in argv
   *
   * @param[in] argv Pointer to first element of null terminated character arrays
   *
   * @return Error code:
   *         - SUCCESS: provided command line arguments match expected format: --key=value, --key
   *         - PARSE_FAILURE: The provided command line arguments do not match expected format
   *
   */
  virtual Error parseClArgs(size_t argc, char** argv) = 0;

  /**
   * @brief Provides passed values for requested key if available
   *
   * @param[in] key Key string of option
   *
   * @return (False, empty) if key is not an available argument
   *
   */
  virtual std::tuple<bool, ValueList_t> serveArg(const std::string& key) = 0;

  /**
   * @brief Checks whether any provided commandline arguments remain unserved
   *
   * @return True if unconsumed arguments remain, False otherwise
   */
  virtual bool allArgumentsServed() const = 0;

  /**
   * @brief Validates command line arguments were correctly utilized
   *
   * @return Error code:
   *         - SUCCESS: provided command line arguments were utilized following implementations
   *           policy
   *         - UNUSED_ARGUMENTS: Some arguments passed were not consumed
   *         - OVER_SUBSCRIBED_ARGUMENTS: Some arguments were requested by multiple times
   *
   */
  virtual Error validateUsage() = 0;

  virtual ~ICommandLineManager() = default;

  // True when `arg` looks like an option key: the "--" prefix immediately
  // followed by an alphabetic character.
  static bool isKey(const std::string& arg) {
    // rfind(prefix, 0) anchors the prefix match at position 0 without
    // scanning the rest of the string (find would). The cast to unsigned
    // char is required: passing a plain char with a negative value to
    // std::isalpha is undefined behavior.
    return (arg.length() > keyPrefix().length()) && (arg.rfind(keyPrefix(), 0) == 0) &&
           std::isalpha(static_cast<unsigned char>(arg.at(keyPrefix().length())));
  }

  // Extract the key portion of `arg` ("--key=value" -> "--key"; a bare
  // "--key" is returned unchanged). Fails if `arg` is not a key at all.
  static Error parseKey(const std::string& arg, std::string& keyOut) {
    if (!isKey(arg)) {
      return Error::PARSE_FAILURE;
    }

    auto valueSplit = arg.find(keyValueSplit());
    keyOut = valueSplit != arg.npos ? arg.substr(0, valueSplit) : arg;
    return Error::SUCCESS;
  }

  // Extract the value portion of "--key=value". Fails when there is no '='
  // or when the value is empty ("--key=").
  static Error parseValue(const std::string& arg, std::string& valueOut) {
    auto valueSplit = arg.find(keyValueSplit());
    if (valueSplit == arg.npos || valueSplit == arg.length() - 1) {
      return Error::PARSE_FAILURE;
    }
    valueOut = arg.substr(valueSplit + 1);
    return Error::SUCCESS;
  }

 private:
  static const std::string keyPrefix() { return "--"; };
  static char keyValueSplit() { return '='; };
};
Genie/Genie/src/qualla/engines/qnn-api/IOTensor.cpp ADDED
@@ -0,0 +1,382 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+ #include <cstring>
9
+ #include <fstream>
10
+ #include <iostream>
11
+
12
+ #include "ClientBuffer.hpp"
13
+ #include "IBufferAlloc.hpp"
14
+ #include "IOTensor.hpp"
15
+ #include "RpcMem.hpp"
16
+ #include "QnnTypeMacros.hpp"
17
+
18
+ #ifdef _WIN32
19
+ #define __strdup _strdup
20
+ #else
21
+ #define __strdup strdup
22
+ #endif
23
+
24
// Construct with the requested allocator strategy. A malloc-based
// ClientBuffer is installed up front; it is swapped for the shared-memory
// allocator in initialize() when SHARED_BUFFER was requested.
IOTensor::IOTensor(BufferAlloc bufferAllocIn, QNN_INTERFACE_VER_TYPE* qnnInterface)
    : m_bufferAlloc(bufferAllocIn), m_qnnInterface(qnnInterface),
      m_bufferManager(new ClientBuffer()) {}

// Finish allocator setup. In SHARED_BUFFER mode the default ClientBuffer is
// replaced with an RpcMem allocator bound to contextHandle before being
// initialized. Returns false if the selected allocator fails to initialize.
bool IOTensor::initialize(Qnn_ContextHandle_t contextHandle) {
  if (m_bufferAlloc == BufferAlloc::SHARED_BUFFER) {
    m_bufferManager = std::unique_ptr<IBufferAlloc>(new RpcMem(contextHandle, m_qnnInterface));
  }

  if (true != m_bufferManager->initialize()) {
    QNN_ERROR("Failed to initialize buffer manager");
    return false;
  }

  return true;
}

// Shared-buffer mode owns fused allocations that must be released before the
// allocator itself is destroyed; the default allocator has nothing to do.
IOTensor::~IOTensor() {
  if (m_bufferAlloc == BufferAlloc::SHARED_BUFFER) {
    m_bufferManager->freeFusedBuffers();
  }
}
46
+
47
// Setup details for Qnn_Tensor_t for execution
// based on information in TensorWrapper provided by model.so.
//
// Deep-copies each wrapper's tensor metadata into a freshly calloc'd array
// (*tensors), attaches a data buffer to each element — either a per-tensor
// heap buffer, or (SHARED_BUFFER mode) a slice of one fused allocation — and
// records name -> Qnn_Tensor_t* in tensorNameToTensorPointer.
// With skipBufferAllocation set, shared-buffer mode copies metadata only and
// leaves the data mapping to a later mapFusedBufferOffset() call.
// Returns false after tearing down the tensors built so far on any failure.
bool IOTensor::setupTensors(
    Qnn_Tensor_t** tensors,
    std::unordered_map<std::string, void*>& tensorNameToTensorPointer,
    uint32_t tensorCount,
    TensorWrapper* tensorWrappers,
    std::unordered_map<std::string, size_t>& tensorsSize,
    Qnn_ContextHandle_t contextHandle,
    bool skipBufferAllocation
) {

  if (nullptr == tensorWrappers) {
    QNN_ERROR("tensorWrappers is nullptr");
    return false;
  }
  if (0 == tensorCount) {
    QNN_DEBUG("tensor count is 0. Nothing to setup.");
    return true;
  }

  // Zero-initialized so partially-built entries are safe to tear down.
  *tensors = (Qnn_Tensor_t*)calloc(1, tensorCount * sizeof(Qnn_Tensor_t));
  if (nullptr == *tensors) {
    QNN_ERROR("mem alloc failed for *tensors");
    return false;
  }

  auto returnStatus = true;

  uint64_t totalBufferSize = 0;
  void* memPointer = nullptr;
  int32_t fd = -1;
  if (m_bufferAlloc == BufferAlloc::SHARED_BUFFER) {
    // Calculate the total size of the tensors
    for (size_t tensorIdx = 0; tensorIdx < tensorCount; tensorIdx++) {
      auto wrapperTensorName =
          std::string(GET_TENSOR_WRAPPER_NAME(tensorWrappers[tensorIdx]));
      totalBufferSize += tensorsSize[wrapperTensorName];
    }
    QNN_DEBUG("Calculated total size %lu", totalBufferSize);

    if (!skipBufferAllocation) {
      // Allocate the buffer of this size
      memPointer = m_bufferManager->allocateTensorFusedBuffer(totalBufferSize, &fd);
      if (memPointer) {
        QNN_DEBUG(
            "Successfully allocated a buffer of size %lu, pointer %p, fd %d",
            (unsigned long)totalBufferSize,
            memPointer,
            fd
        );
      } else {
        QNN_ERROR(
            "Not able to allocate buffer of size %lu", (unsigned long)totalBufferSize
        );
        return false;
      }
    }
  }

  // Running byte offset of each tensor within the fused allocation.
  uint64_t offset = 0;

  for (size_t tensorIdx = 0; tensorIdx < tensorCount; tensorIdx++) {
    Qnn_Tensor_t wrapperTensor = GET_TENSOR_WRAPPER_TENSOR(tensorWrappers[tensorIdx]);
    auto wrapperTensorName = std::string(GET_TENSOR_WRAPPER_NAME(tensorWrappers[tensorIdx]));
    if (true == returnStatus) {
      (*tensors)[tensorIdx] = QNN_TENSOR_INIT;
      returnStatus = deepCopyQnnTensorInfo(((*tensors) + tensorIdx), &wrapperTensor);
    }
    if (true == returnStatus) {
      size_t tensorDataSize = tensorsSize[wrapperTensorName];
      if (m_bufferAlloc == BufferAlloc::SHARED_BUFFER) {
        if (!skipBufferAllocation) {
          returnStatus = m_bufferManager->mapFusedBufferOffset(
              ((*tensors) + tensorIdx),
              tensorDataSize,
              fd,
              offset,
              totalBufferSize,
              memPointer,
              contextHandle
          );
          offset += tensorDataSize;
        }
      } else {
        returnStatus = m_bufferManager->allocateTensorBuffer(
            ((*tensors) + tensorIdx), tensorDataSize
        );
      }
    }
    if (true != returnStatus) {
      QNN_ERROR("Failure in setupTensors, cleaning up resources");
      // NOTE(review): only tensors [0, tensorIdx) are torn down here; if the
      // current tensor's deepCopyQnnTensorInfo succeeded but its buffer setup
      // failed, its strdup'd name and dimension array leak — confirm.
      tearDownTensors(*tensors, tensorIdx);
      *tensors = nullptr;
      QNN_ERROR("Failure in setupTensors, done cleaning up resources");
      return false;
    } else {
      tensorNameToTensorPointer.insert({wrapperTensorName, ((*tensors) + tensorIdx)});
      // QNN_DEBUG("allocateBuffer successful");
    }
  }

  return returnStatus;
}
151
+
152
+ // Setup details for all input tensors for graph execution.
153
+ bool IOTensor::setupInputTensors(
154
+ Qnn_Tensor_t** inputs,
155
+ std::unordered_map<std::string, void*>& tensorNameToTensorPointer,
156
+ const GraphInfo_t& graphInfo,
157
+ std::unordered_map<std::string, size_t>& inputTensorsSize,
158
+ Qnn_ContextHandle_t contextHandle,
159
+ bool skipBufferAllocation
160
+ ) {
161
+
162
+ if (true != setupTensors(
163
+ inputs,
164
+ tensorNameToTensorPointer,
165
+ graphInfo.numInputTensors,
166
+ (graphInfo.inputTensors),
167
+ inputTensorsSize,
168
+ contextHandle,
169
+ skipBufferAllocation
170
+ )) {
171
+ QNN_ERROR("Failure in setupInputTensors, cleaning up resources");
172
+ if (nullptr != *inputs) {
173
+ QNN_DEBUG("cleaning up input tensors");
174
+ tearDownTensors(*inputs, graphInfo.numInputTensors);
175
+ *inputs = nullptr;
176
+ }
177
+ QNN_ERROR("Failure in setupInputTensors, done cleaning up resources");
178
+
179
+ return false;
180
+ }
181
+
182
+ return true;
183
+ }
184
+
185
+ // Setup details for all output tensors for graph execution.
186
+ bool IOTensor::setupOutputTensors(
187
+ Qnn_Tensor_t** outputs,
188
+ std::unordered_map<std::string, void*>& tensorNameToTensorPointer,
189
+ const GraphInfo_t& graphInfo,
190
+ std::unordered_map<std::string, size_t>& outputTensorsSize,
191
+ Qnn_ContextHandle_t contextHandle,
192
+ bool skipBufferAllocation
193
+ ) {
194
+
195
+ if (true != setupTensors(
196
+ outputs,
197
+ tensorNameToTensorPointer,
198
+ graphInfo.numOutputTensors,
199
+ (graphInfo.outputTensors),
200
+ outputTensorsSize,
201
+ contextHandle,
202
+ skipBufferAllocation
203
+ )) {
204
+ QNN_ERROR("Failure in setupOutputTensors, cleaning up resources");
205
+ if (nullptr != *outputs) {
206
+ QNN_DEBUG("cleaning up output tensors");
207
+ tearDownTensors(*outputs, graphInfo.numOutputTensors);
208
+ *outputs = nullptr;
209
+ }
210
+ QNN_ERROR("Failure in setupOutputTensors, done cleaning up resources");
211
+
212
+ return false;
213
+ }
214
+
215
+ return true;
216
+ }
217
+
218
// Map pre-allocated fused-buffer slices onto every input and output tensor
// of `graph_info`. graph_allocs maps tensor name -> (allocation index, byte
// offset, size); tensors absent from the map are skipped. Returns true only
// if every mapping attempt succeeded.
bool IOTensor::mapFusedBufferOffset(
    GraphInfo_t* graph_info,
    Qnn_ContextHandle_t context_handle,
    const std::map<std::string, std::tuple<int, size_t, size_t>>& graph_allocs
) {
  std::lock_guard lk(_tmp_lock);  // READ COMMENT IN IOTensor.hpp _tmp_lock

  bool ret = true;
  // First pass over inputs (mode == true), second over outputs.
  for (const bool mode : {true, false}) {
    TensorWrapper* tensor_bank = (mode) ? graph_info->inputTensors : graph_info->outputTensors;
    uint32_t num_tensors = (mode) ? graph_info->numInputTensors : graph_info->numOutputTensors;

    for (size_t tidx = 0; tidx < num_tensors; tidx++) {
      TensorWrapper& tensor_wrapper = tensor_bank[tidx];

      Qnn_Tensor_t* tensor = &GET_TENSOR_WRAPPER_TENSOR(tensor_wrapper);
      std::string tensor_name = std::string(GET_TENSOR_WRAPPER_NAME(tensor_wrapper));

      if (!graph_allocs.contains(tensor_name)) continue;
      auto& [alloc_idx, offset, size] = graph_allocs.at(tensor_name);
      // Accumulate success across all tensors; a single failure flips ret.
      ret &= m_bufferManager->mapFusedBufferOffset(
          tensor, alloc_idx, offset, context_handle, size
      );
    }
  }

  return ret;
}
246
+
247
// Clean up all tensors related data after execution.
// Frees each tensor's dimension array and its data buffer (or deregisters
// its fused-buffer mapping in SHARED_BUFFER mode), then frees the array.
// NOTE(review): the name string strdup'd by deepCopyQnnTensorInfo is not
// freed here — presumably because this is also invoked on wrapper tensors
// whose names are not heap-owned by us; confirm, since it leaks for tensors
// built by setupTensors.
bool IOTensor::tearDownTensors(Qnn_Tensor_t* tensors, uint32_t tensorCount) {

  if (nullptr != tensors) {
    QNN_DEBUG("cleaning up resources for tensors");
    for (size_t tensorIdx = 0; tensorIdx < tensorCount; tensorIdx++) {
      // QNN_DEBUG("freeing resources for tensor: %zu", tensorIdx);
      if (nullptr != QNN_TENSOR_GET_DIMENSIONS(&tensors[tensorIdx])) {
        // QNN_DEBUG("freeing maxDimensions");
        free(QNN_TENSOR_GET_DIMENSIONS(&tensors[tensorIdx]));
      }
      if (m_bufferAlloc == BufferAlloc::SHARED_BUFFER) {
        m_bufferManager->deregisterTensorFusedBuffer(&(tensors[tensorIdx]));
      } else {
        m_bufferManager->freeTensorBuffer(&(tensors[tensorIdx]));
      }
      // Record the now-dead pointer so callers can detect already-torn-down
      // tensors via getFreeTensorsPointerSet().
      m_freeTensorsPointerSet.insert(&(tensors[tensorIdx]));
    }
    free(tensors);
    // Note: assigns the local parameter only; the caller's pointer is left
    // dangling and must be nulled by the caller.
    tensors = nullptr;
  }

  return true;
}

// Clean up all tensors after execution.
// Tears down each array in `tensors`, assuming all share the same count.
bool IOTensor::tearDownTensors(std::vector<Qnn_Tensor_t*>& tensors, uint32_t numTensors) {

  for (Qnn_Tensor_t* tensor : tensors) {
    tearDownTensors(tensor, numTensors);
  }

  return true;
}

// Convenience overload for a contiguous vector of tensors.
bool IOTensor::tearDownTensors(std::vector<Qnn_Tensor_t>& tensors) {
  return tearDownTensors(tensors.data(), tensors.size());
}

// Clean up all tensors after execution.
// Each named tensor array is torn down with its count from tensorCountMap.
bool IOTensor::tearDownTensors(
    std::unordered_map<std::string, Qnn_Tensor_t*>& tensors,
    std::unordered_map<std::string, uint32_t>& tensorCountMap
) {

  for (auto& tensor : tensors) {
    tearDownTensors(tensor.second, tensorCountMap[tensor.first]);
  }

  return true;
}

// Clean up all tensors after execution.
// Applies the map overload to every map in the vector.
bool IOTensor::tearDownTensors(
    std::vector<std::unordered_map<std::string, Qnn_Tensor_t*>>& tensors,
    std::unordered_map<std::string, uint32_t>& tensorCountMap
) {

  for (auto& tensor : tensors) {
    tearDownTensors(tensor, tensorCountMap);
  }

  return true;
}
311
+
312
// Deep-copy a tensor's metadata (name, id, type, format, quantization
// params, rank, dimensions) from src into dest. Client-buffer/data contents
// are NOT copied; the caller attaches a buffer afterwards. The name is
// strdup'd and per-axis scale/offset tables and dimensions are malloc'd, so
// dest owns heap memory after a successful call.
bool IOTensor::deepCopyQnnTensorInfo(Qnn_Tensor_t* dest, Qnn_Tensor_t* src) {

  if (nullptr == dest || nullptr == src) {
    QNN_ERROR("Received nullptr");
    return false;
  }

  // set tensor.version before using QNN_TENSOR_SET macros, as they require the version to be set
  // to correctly assign values
  dest->version = src->version;
  const char* tensorName = QNN_TENSOR_GET_NAME(src);
  if (!tensorName) {
    QNN_TENSOR_SET_NAME(dest, nullptr);
  } else {
    // Heap-owned copy; see tearDownTensors for the corresponding (missing)
    // free — the copy outlives src's wrapper.
    QNN_TENSOR_SET_NAME(dest, __strdup(tensorName));
  }
  QNN_TENSOR_SET_ID(dest, QNN_TENSOR_GET_ID(src));
  QNN_TENSOR_SET_TYPE(dest, QNN_TENSOR_GET_TYPE(src));
  QNN_TENSOR_SET_DATA_FORMAT(dest, QNN_TENSOR_GET_DATA_FORMAT(src));
  QNN_TENSOR_SET_DATA_TYPE(dest, QNN_TENSOR_GET_DATA_TYPE(src));
  // Only per-tensor and per-axis scale/offset encodings are copied; any
  // other encoding degrades to UNDEFINED.
  Qnn_QuantizeParams_t qParams = QNN_QUANTIZE_PARAMS_INIT;
  qParams.encodingDefinition = QNN_TENSOR_GET_QUANT_PARAMS(src).encodingDefinition;
  qParams.quantizationEncoding = QNN_QUANTIZATION_ENCODING_UNDEFINED;
  if (QNN_TENSOR_GET_QUANT_PARAMS(src).quantizationEncoding ==
      QNN_QUANTIZATION_ENCODING_SCALE_OFFSET) {
    qParams.quantizationEncoding = QNN_TENSOR_GET_QUANT_PARAMS(src).quantizationEncoding;
    qParams.scaleOffsetEncoding = QNN_TENSOR_GET_QUANT_PARAMS(src).scaleOffsetEncoding;
  } else if (QNN_TENSOR_GET_QUANT_PARAMS(src).quantizationEncoding ==
             QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) {
    qParams.quantizationEncoding = QNN_TENSOR_GET_QUANT_PARAMS(src).quantizationEncoding;
    qParams.axisScaleOffsetEncoding.axis =
        QNN_TENSOR_GET_QUANT_PARAMS(src).axisScaleOffsetEncoding.axis;
    qParams.axisScaleOffsetEncoding.numScaleOffsets =
        QNN_TENSOR_GET_QUANT_PARAMS(src).axisScaleOffsetEncoding.numScaleOffsets;
    if (QNN_TENSOR_GET_QUANT_PARAMS(src).axisScaleOffsetEncoding.numScaleOffsets > 0) {
      qParams.axisScaleOffsetEncoding.scaleOffset = (Qnn_ScaleOffset_t*)malloc(
          QNN_TENSOR_GET_QUANT_PARAMS(src).axisScaleOffsetEncoding.numScaleOffsets *
          sizeof(Qnn_ScaleOffset_t)
      );
      // NOTE(review): a failed malloc is silently skipped here — dest then
      // claims numScaleOffsets entries with a null scaleOffset pointer;
      // confirm downstream consumers tolerate that.
      if (qParams.axisScaleOffsetEncoding.scaleOffset) {
        for (size_t idx = 0;
             idx < QNN_TENSOR_GET_QUANT_PARAMS(src).axisScaleOffsetEncoding.numScaleOffsets;
             idx++) {
          qParams.axisScaleOffsetEncoding.scaleOffset[idx].scale =
              QNN_TENSOR_GET_QUANT_PARAMS(src)
                  .axisScaleOffsetEncoding.scaleOffset[idx]
                  .scale;
          qParams.axisScaleOffsetEncoding.scaleOffset[idx].offset =
              QNN_TENSOR_GET_QUANT_PARAMS(src)
                  .axisScaleOffsetEncoding.scaleOffset[idx]
                  .offset;
        }
      }
    }
  }
  QNN_TENSOR_SET_QUANT_PARAMS(dest, qParams);
  QNN_TENSOR_SET_RANK(dest, QNN_TENSOR_GET_RANK(src));
  QNN_TENSOR_SET_DIMENSIONS(dest, nullptr);
  if (QNN_TENSOR_GET_RANK(src) > 0) {
    QNN_TENSOR_SET_DIMENSIONS(
        dest, (uint32_t*)malloc(QNN_TENSOR_GET_RANK(src) * sizeof(uint32_t))
    );
    // NOTE(review): on malloc failure dest keeps rank > 0 with null
    // dimensions and the function still returns true — confirm callers
    // handle this (or treat it as should-fail).
    if (QNN_TENSOR_GET_DIMENSIONS(dest)) {
      memcpy(QNN_TENSOR_GET_DIMENSIONS(dest),
             QNN_TENSOR_GET_DIMENSIONS(src),
             QNN_TENSOR_GET_RANK(src) * sizeof(uint32_t));
    }
  }

  return true;
}
Genie/Genie/src/qualla/engines/qnn-api/IOTensor.hpp ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+ #pragma once
9
+
10
+ #include <memory>
11
+ #include <queue>
12
+ #include <unordered_map>
13
+ #include <unordered_set>
14
+ #include <vector>
15
+ #include <mutex>
16
+
17
+ #include "IBufferAlloc.hpp"
18
+ #include "QnnTypeDef.hpp"
19
+ #include "Log.hpp"
20
+ #include "QnnBackend.h"
21
+ #include "QnnCommon.h"
22
+ #include "QnnContext.h"
23
+ #include "QnnGraph.h"
24
+ #include "QnnInterface.h"
25
+ #include "QnnProperty.h"
26
+ #include "QnnTensor.h"
27
+ #include "QnnTypes.h"
28
// Strategy for allocating tensor data buffers.
enum class BufferAlloc {
  DEFAULT,        // malloc based allocator
  SHARED_BUFFER,  // shared buffer allocator; actual allocator depends on the platform
  INVALID
};
class IBufferAlloc;
// Owns the allocation, mapping, and teardown of QNN graph input/output
// tensors, delegating raw memory management to an IBufferAlloc chosen by
// the BufferAlloc strategy (heap by default, shared memory on request).
class IOTensor {
 public:
  IOTensor(
      BufferAlloc bufferAllocIn = BufferAlloc::DEFAULT,
      QNN_INTERFACE_VER_TYPE* qnnInterface = nullptr
  );

  ~IOTensor();

  // Must be called before use; binds the shared-memory allocator to
  // contextHandle when SHARED_BUFFER mode is selected.
  bool initialize(Qnn_ContextHandle_t contextHandle = nullptr);

  // Allocate and register all input tensors of graphInfo; sizes are looked
  // up by tensor name in inputTensorsSize.
  bool setupInputTensors(
      Qnn_Tensor_t** inputs,
      std::unordered_map<std::string, void*>& tensorNameToTensorPointer,
      const GraphInfo_t& graphInfo,
      std::unordered_map<std::string, size_t>& inputTensorsSize,
      Qnn_ContextHandle_t contextHandle,
      bool skipBufferAllocation = false
  );

  // Allocate and register all output tensors of graphInfo.
  bool setupOutputTensors(
      Qnn_Tensor_t** outputs,
      std::unordered_map<std::string, void*>& tensorNameToTensorPointer,
      const GraphInfo_t& graphInfo,
      std::unordered_map<std::string, size_t>& outputTensorsSize,
      Qnn_ContextHandle_t contextHandle,
      bool skipBufferAllocation = false
  );

  // Teardown overloads: free dimension arrays and data buffers (or
  // deregister fused mappings) for every tensor, then the arrays.
  bool tearDownTensors(Qnn_Tensor_t* tensors, uint32_t tensorCount);

  bool tearDownTensors(std::vector<Qnn_Tensor_t*>& tensors, uint32_t tensorCount);
  bool tearDownTensors(std::vector<Qnn_Tensor_t>& tensors);
  bool tearDownTensors(
      std::unordered_map<std::string, Qnn_Tensor_t*>& tensors,
      std::unordered_map<std::string, uint32_t>& tensorCountMap
  );
  bool tearDownTensors(
      std::vector<std::unordered_map<std::string, Qnn_Tensor_t*>>& tensors,
      std::unordered_map<std::string, uint32_t>& tensorCountMap
  );

  // Tear down both banks of a graph; returns false if either side fails.
  bool tearDownTensors(const GraphInfo_t* graph_info) {
    bool status = true;
    if (!tearDownTensors(graph_info->inputTensors, graph_info->numInputTensors)) {
      status = false;
      QNN_ERROR("Failed to tear down input tensors for graph %s", graph_info->graphName);
    }

    if (!tearDownTensors(graph_info->outputTensors, graph_info->numOutputTensors)) {
      status = false;
      QNN_ERROR("Failed to tear down output tensors for graph %s", graph_info->graphName);
    }
    return status;
  }

  // --- Thin forwarders to the underlying IBufferAlloc. ---
  void* getBuffer(Qnn_Tensor_t* tensor) { return m_bufferManager->getBuffer(tensor); };

  int getFd(Qnn_Tensor_t* tensor) { return m_bufferManager->getFd(tensor); };

  size_t getOffset(Qnn_Tensor_t* tensor) { return m_bufferManager->getOffset(tensor); };

  size_t getBufferSize(Qnn_Tensor_t* tensor) { return m_bufferManager->getBufferSize(tensor); };

  size_t getTotalBufferSize(Qnn_Tensor_t* tensor) {
    return m_bufferManager->getTotalBufferSize(tensor);
  }

  void* allocateTensorFusedBuffer(uint64_t bufferSize, int32_t* fd) {
    return m_bufferManager->allocateTensorFusedBuffer(bufferSize, fd);
  }

  bool allocateBuffers(
      const std::map<int, std::map<std::string, size_t>>& allocs_per_chunk,
      std::map<std::string, std::pair<int, size_t>>& tensor_offsets
  ) {
    return m_bufferManager->allocateBuffers(allocs_per_chunk, tensor_offsets);
  }

  bool mapFusedBufferOffset(
      Qnn_Tensor_t* tensor,
      size_t tensorDataSize,
      int32_t fd,
      uint32_t offset,
      uint64_t totalBufferSize,
      void* memPointer,
      Qnn_ContextHandle_t contextHandle
  ) {
    return m_bufferManager->mapFusedBufferOffset(
        tensor, tensorDataSize, fd, offset, totalBufferSize, memPointer, contextHandle
    );
  }

  // Map pre-allocated fused slices onto all tensors of a graph (see .cpp).
  bool mapFusedBufferOffset(
      GraphInfo_t* graph_info,
      Qnn_ContextHandle_t context_handle,
      const std::map<std::string, std::tuple<int, size_t, size_t>>& graph_allocs
  );

  bool useSameMemory(Qnn_Tensor_t* dest, Qnn_Tensor_t* src) {
    return m_bufferManager->useSameMemory(dest, src);
  }

  bool useSameMemory(Qnn_Tensor_t* dest, Qnn_Tensor_t* src, int offset) {
    return m_bufferManager->useSameMemory(dest, src, offset);
  }

  bool useExternalMemory(Qnn_Tensor_t* dest, void* extMem) {
    return m_bufferManager->useExternalMemory(dest, extMem);
  }

  BufferAlloc getBufferAllocType() { return m_bufferAlloc; }

  // Pointers recorded by tearDownTensors; lets callers detect tensors that
  // have already been destroyed.
  std::unordered_set<void*>& getFreeTensorsPointerSet() { return m_freeTensorsPointerSet; }

 private:
  BufferAlloc m_bufferAlloc;
  QNN_INTERFACE_VER_TYPE* m_qnnInterface;
  std::unique_ptr<IBufferAlloc> m_bufferManager;
  std::unordered_set<void*> m_freeTensorsPointerSet;

  // There seems to be a race condition in mapFusedBufferOffset because we are
  // calling it from multiple threads. Maybe memRegister/memDeRegister is not thread-safe
  // Until I figure this out, adding a temporary lock here. TODO: Fix and remove this!
  std::mutex _tmp_lock;

  bool deepCopyQnnTensorInfo(Qnn_Tensor_t* dest, Qnn_Tensor_t* src);
  bool setupTensors(
      Qnn_Tensor_t** tensors,
      std::unordered_map<std::string, void*>& tensorNameToTensorPointer,
      uint32_t tensorCount,
      TensorWrapper* tensorsInfo,
      std::unordered_map<std::string, size_t>& tensorsSize,
      Qnn_ContextHandle_t contextHandle,
      bool skipBufferAllocation = false
  );
};
Genie/Genie/src/qualla/engines/qnn-api/Log.hpp ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
//==============================================================================
//
//  Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
//  All Rights Reserved.
//  Confidential and Proprietary - Qualcomm Technologies, Inc.
//
//==============================================================================

#pragma once

#include <stdio.h>

// FIXME: Use logger from qualla::Env

// Lightweight stderr logging macros for the qnn-api layer.
//
// Fix: the previous definitions stringized the format argument (#fmt). Since
// every call site passes a quoted string literal (e.g.
// QNN_ERROR("Failed to allocate memory for tensorWrappers.")), stringizing
// re-quoted it and every log line was printed wrapped in literal '"'
// characters. The format literal is now spliced in directly via adjacent
// string-literal concatenation, so messages print as written.
// ##__VA_ARGS__ (GNU/MSVC extension, as before) swallows the comma when no
// varargs are supplied.
#define QNN_INFO(fmt, ...) fprintf(stderr, "[INFO] " fmt "\n", ##__VA_ARGS__)
#define QNN_ERROR(fmt, ...) fprintf(stderr, "[ERROR] " fmt "\n", ##__VA_ARGS__)
#define QNN_WARN(fmt, ...) fprintf(stderr, "[WARN] " fmt "\n", ##__VA_ARGS__)

// Debug logging is compiled out by default; flip the `#if 0` to enable.
#if 0
// #define NSP_LOG_LEVEL 2
#define QNN_DEBUG(fmt, ...) fprintf(stderr, "[DEBUG] " fmt "\n", ##__VA_ARGS__)
#else
#define QNN_DEBUG(fmt, ...)
#endif
Genie/Genie/src/qualla/engines/qnn-api/NetRunBackend.hpp ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #pragma once
10
+
11
+ #include <string>
12
+
13
+ #include "ICommandLineManager.hpp"
14
+ #include "IBackend.hpp"
15
+
16
+ // This is an implementation of IBackend interface within qnn-net-run.
17
+ // NetRunBackend provides a dummy implementation of IBackend as a concrete
18
+ // implementation is needed in case there is no backend extensions library
19
+ // supplied by the user.
20
+ // This is built as part of QnnNetRun library and is used in case of no
21
+ // user supplied backend extensions implementation.
22
+ class NetRunBackend final : public IBackend {
23
+ public:
24
+ NetRunBackend() {}
25
+
26
+ virtual ~NetRunBackend() {}
27
+
28
+ virtual bool setupLogging(QnnLog_Callback_t callback, QnnLog_Level_t maxLogLevel) override {
29
+ ignore(callback);
30
+ ignore(maxLogLevel);
31
+ return true;
32
+ }
33
+
34
+ virtual bool initialize(void* backendLibHandle) override {
35
+ ignore(backendLibHandle);
36
+ return true;
37
+ }
38
+
39
+ virtual bool setPerfProfile(PerfProfile perfProfile) override {
40
+ ignore(perfProfile);
41
+ return true;
42
+ }
43
+
44
+ virtual QnnProfile_Level_t getProfilingLevel() override { return g_profilingLevelNotSet; }
45
+
46
+ virtual bool loadConfig(std::string configFile) override {
47
+ ignore(configFile);
48
+ return true;
49
+ }
50
+
51
+ virtual bool loadCommandLineArgs(std::shared_ptr<ICommandLineManager> clManager) override {
52
+ ignore(clManager);
53
+ return true;
54
+ }
55
+
56
+ virtual bool beforeBackendInitialize(
57
+ QnnBackend_Config_t*** customConfigs,
58
+ uint32_t* configCount
59
+ ) override {
60
+ ignore(customConfigs);
61
+ ignore(configCount);
62
+ return true;
63
+ }
64
+
65
+ virtual bool afterBackendInitialize() override { return true; }
66
+
67
+ virtual bool beforeContextCreate(QnnContext_Config_t*** customConfigs, uint32_t* configCount)
68
+ override {
69
+ ignore(customConfigs);
70
+ ignore(configCount);
71
+ return true;
72
+ }
73
+
74
+ virtual bool afterContextCreate() override { return true; }
75
+
76
+ virtual bool beforeComposeGraphs(GraphConfigInfo_t*** customGraphConfigs, uint32_t* graphCount)
77
+ override {
78
+ ignore(customGraphConfigs);
79
+ ignore(graphCount);
80
+ return true;
81
+ }
82
+
83
+ virtual bool afterComposeGraphs() override { return true; }
84
+
85
+ #if QUALLA_QNN_API_VERSION >= 21700
86
+ virtual bool beforeGraphFinalizeUpdateConfig(
87
+ const char* graphName,
88
+ Qnn_GraphHandle_t graphHandle,
89
+ QnnGraph_Config_t*** customConfigs,
90
+ uint32_t* configCount
91
+ ) override {
92
+ ignore(graphName);
93
+ ignore(graphHandle);
94
+ ignore(customConfigs);
95
+ ignore(configCount);
96
+ return true;
97
+ }
98
+ #endif
99
+
100
+ virtual bool beforeGraphFinalize() override { return true; }
101
+
102
+ virtual bool afterGraphFinalize() override { return true; }
103
+
104
+ virtual bool beforeRegisterOpPackages() override { return true; }
105
+
106
+ virtual bool afterRegisterOpPackages() override { return true; }
107
+
108
+ virtual bool beforeExecute(
109
+ const char* graphName,
110
+ QnnGraph_Config_t*** customConfigs,
111
+ uint32_t* configCount
112
+ ) override {
113
+ ignore(graphName);
114
+ ignore(customConfigs);
115
+ ignore(configCount);
116
+ return true;
117
+ }
118
+
119
+ virtual bool afterExecute() override { return true; }
120
+
121
+ virtual bool beforeContextFree() override { return true; }
122
+
123
+ virtual bool afterContextFree() override { return true; }
124
+
125
+ virtual bool beforeBackendTerminate() override { return true; }
126
+
127
+ virtual bool afterBackendTerminate() override { return true; }
128
+
129
+ virtual bool beforeCreateFromBinary(QnnContext_Config_t*** customConfigs, uint32_t* configCount)
130
+ override {
131
+ ignore(customConfigs);
132
+ ignore(configCount);
133
+ return true;
134
+ }
135
+
136
+ virtual bool afterCreateFromBinary() override { return true; }
137
+
138
+ #if QUALLA_QNN_API_VERSION >= 21700
139
+ virtual bool beforeCreateContextsFromBinaryList(
140
+ std::map<std::string, std::tuple<QnnContext_Config_t**, uint32_t>>*
141
+ contextKeyToCustomConfigsMap,
142
+ QnnContext_Config_t*** commonCustomConfigs,
143
+ uint32_t* commonConfigCount
144
+ ) override {
145
+ ignore(contextKeyToCustomConfigsMap);
146
+ ignore(commonCustomConfigs);
147
+ ignore(commonConfigCount);
148
+ return true;
149
+ }
150
+
151
+ virtual bool afterCreateContextsFromBinaryList() override { return true; }
152
+ #endif
153
+
154
+ virtual bool beforeCreateDevice(QnnDevice_Config_t*** deviceConfigs, uint32_t* configCount)
155
+ override {
156
+ ignore(deviceConfigs);
157
+ ignore(configCount);
158
+ return true;
159
+ }
160
+
161
+ virtual bool afterCreateDevice() override { return true; }
162
+
163
+ virtual bool beforeFreeDevice() override { return true; }
164
+
165
+ virtual bool afterFreeDevice() override { return true; }
166
+
167
+ private:
168
+ // Utility function to ignore compiler warnings when a variable
169
+ // is unused. Recommended by Herb Sutter in Sutter's Mill
170
+ // instead of (void)variable.
171
+ template <typename T>
172
+ void ignore(const T&) {}
173
+ };
Genie/Genie/src/qualla/engines/qnn-api/QnnApi.cpp ADDED
The diff for this file is too large to render. See raw diff
 
Genie/Genie/src/qualla/engines/qnn-api/QnnApi.hpp ADDED
@@ -0,0 +1,429 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #pragma once
10
+
11
+ #include "BackendExtensions.hpp"
12
+ #include "QnnConfig.hpp"
13
+ #include "QnnHtpPerfInfrastructure.h"
14
+ #include "QnnHtpDevice.h"
15
+ #include "qnn-utils.hpp"
16
+ #include "IOTensor.hpp"
17
+
18
+ #include <memory>
19
+ #include <mutex>
20
+
21
// Set to 1 to enable verbose IO-tensor debug output in this module.
#define QNN_IO_TENSOR_DEBUG 0

// Strategy for maintaining the KV cache between inference steps.
// NOTE(review): exact semantics of POINTER_SHIFT vs SHIFT_CONCAT are not
// visible here — confirm against the KV-manager implementation.
enum KVManagerMode { POINTER_SHIFT = 0x0, SHIFT_CONCAT = 0x1 };

using qualla::QnnUtils::QuantParam;

// Encodes the QNN API version as one comparable integer, e.g. 2.17.0 -> 21700.
#define QUALLA_QNN_API_VERSION \
    (QNN_API_VERSION_MAJOR * 10000 + QNN_API_VERSION_MINOR * 100 + QNN_API_VERSION_PATCH)

// Element size in bytes for each QNN data type; consumed by
// QnnApi::getDataTypeSize(). Types absent from this table yield 0 there.
// NOTE(review): declared `static` in a header, so every translation unit
// that includes this file gets its own copy of the map.
static std::map<Qnn_DataType_t, size_t> g_qnnDataTypeToSize = {
    {QNN_DATATYPE_INT_8, 1},
    {QNN_DATATYPE_INT_16, 2},
    {QNN_DATATYPE_INT_32, 4},
    {QNN_DATATYPE_INT_64, 8},
    {QNN_DATATYPE_UINT_8, 1},
    {QNN_DATATYPE_UINT_16, 2},
    {QNN_DATATYPE_UINT_32, 4},
    {QNN_DATATYPE_UINT_64, 8},
    {QNN_DATATYPE_FLOAT_16, 2},
    {QNN_DATATYPE_FLOAT_32, 4},
    {QNN_DATATYPE_SFIXED_POINT_8, 1},
    {QNN_DATATYPE_SFIXED_POINT_16, 2},
    {QNN_DATATYPE_SFIXED_POINT_32, 4},
    {QNN_DATATYPE_UFIXED_POINT_8, 1},
    {QNN_DATATYPE_UFIXED_POINT_16, 2},
    {QNN_DATATYPE_UFIXED_POINT_32, 4},
    {QNN_DATATYPE_BOOL_8, 1},
};
49
+
50
// Facade over the QNN C API: loads the backend/system libraries, creates the
// device and contexts (from a model library or cached context binaries),
// composes/finalizes graphs, executes them, and manages performance/profiling
// infrastructure. Obtain via getInstance().
class QnnApi {
  private:
    const uint32_t s_graphConfigsReserveCount = 16;

    // Model vars — entry points resolved from dynamically loaded libraries.
    typedef Qnn_ErrorHandle_t (*QnnInterfaceGetProvidersFn_t)(
        const QnnInterface_t*** providerList,
        uint32_t* numProviders
    );
    typedef Qnn_ErrorHandle_t (*QnnSystemInterfaceGetProvidersFn_t)(
        const QnnSystemInterface_t*** providerList,
        uint32_t* numProviders
    );

    // Graph Related Function Handle Types (resolved from the model library).
    typedef ModelError_t (*ComposeGraphsFnHandleType_t)(
        Qnn_BackendHandle_t,
        QNN_INTERFACE_VER_TYPE,
        Qnn_ContextHandle_t,
        const GraphConfigInfo_t**,
        const uint32_t,
        GraphInfo_t***,
        uint32_t*,
        bool,
        QnnLog_Callback_t,
        QnnLog_Level_t
    );

    // GenAI variant: additionally takes input/output/KV tensor dimensions and
    // extra op parameters.
    typedef ModelError_t (*GenAIComposeGraphsFnHandleType_t)(
        Qnn_BackendHandle_t,
        QNN_INTERFACE_VER_TYPE,
        Qnn_ContextHandle_t,
        const GraphConfigInfo_t**,
        const uint32_t,
        uint32_t* inputDim,
        uint32_t inputRank,
        uint32_t* outputDim,
        uint32_t outputRank,
        uint32_t* kvDim,
        uint32_t kvRank,
        Qnn_Param_t* params,
        uint32_t numParam,
        GraphInfo_t***,
        uint32_t*,
        bool,
        QnnLog_Callback_t,
        QnnLog_Level_t
    );

    typedef ModelError_t (*FreeGraphInfoFnHandleType_t)(GraphInfo_t***, uint32_t);

    // dlopen-style handles for the model and backend shared libraries.
    void* m_libModelHandle{nullptr};
    void* m_backendHandle{nullptr};
    void* m_backendLibraryHandle{nullptr};

    QNN_INTERFACE_VER_TYPE m_qnnInterface{nullptr};
    QNN_SYSTEM_INTERFACE_VER_TYPE m_qnnSystemInterface{nullptr};
    std::unique_ptr<BackendExtensions> m_backendExtensions{nullptr};
    ComposeGraphsFnHandleType_t m_composeGraphsFnHandle{nullptr};
    GenAIComposeGraphsFnHandleType_t m_genaiComposeGraphsFnHandle{nullptr};
    FreeGraphInfoFnHandleType_t m_freeGraphInfoFnHandle{nullptr};
    uint32_t m_backendId{0};
    Qnn_LogHandle_t m_logHandle{nullptr};
    Qnn_DeviceHandle_t m_deviceHandle{nullptr};

    Qnn_ProfileHandle_t m_profileBackendHandle{nullptr};

    // Context/graph bookkeeping. m_contextVec owns the order of creation;
    // the maps provide name/pointer-based lookups into the same handles.
    std::vector<Qnn_ContextHandle_t> m_contextVec;
    std::unordered_map<GraphInfo*, Qnn_ContextHandle_t> m_contextMap;
    uint32_t m_graphsCount{0};
    int32_t graphCountPerContext{-1};
    GraphInfo_t** m_graphsInfo;
    std::unordered_map<std::string, uint32_t> m_graphNameToIndex;
    std::unordered_map<std::string, GraphInfo*> m_graphNameToInfo;
    std::unordered_map<std::string, uint32_t> m_graphNameToContextIdx;
    std::unordered_map<uint32_t, Qnn_ContextHandle_t> m_contextIdtoHandle;
    // Guards the async create-from-binary callbacks (updateContext /
    // updateQnnApiGraphsandContextsInfo).
    std::mutex m_updateCallBackMutex;

    // Useful structures for IO estimation.
    std::unordered_map<int,qualla::QnnUtils::TensorMap> m_graphtoIOMap; // stores {GraphId -> IOTensorMap}
    typedef int CtxBitVector;
    std::map<CtxBitVector, std::map<std::string, size_t>> m_contextAllocMap; // stores {Translated ContextId -> {Tensor name, size}}
    std::map<std::string, std::pair<int, size_t>> m_tensorAllocInfo; // stores {Tensor name -> (fd of RPC buffer, offset)}
    std::unordered_map<uint32_t, uint32_t> m_graphIdxToContextIdx; // stores {Graph Idx -> Context Idx}
    std::unordered_map<std::string,std::shared_ptr<uint8_t>> m_adapterNameToBuffer;

    uint32_t m_backendConfigCount{0};
    QnnBackend_Config_t** m_backendConfigs{nullptr};

    // HTP performance-vote infrastructure (see boostPerformance/resetPerformance).
    QnnHtpDevice_PerfInfrastructure_t* m_perfInfra{nullptr};
    uint32_t m_powerConfigId = 1;

    // IO buffer manager and model geometry injected by the engine layer.
    IOTensor* m_ioBufferMgr{nullptr};
    int32_t m_ctxSize{-1};
    int32_t m_kvDim{-1};
    bool m_loraWeightEnabled{false};
    bool m_lmHeadWeightInput{false};
    KVManagerMode m_kvUpdateMethod{POINTER_SHIFT};

    bool m_isLogInitialized{false};
    bool m_isBackendInitialized{false};
    bool m_isContextCreated{false};

    // Variable to keep track of debug mode
    bool m_DebugModeRequested;
    bool m_debugQnn{false};

    // Variable to indicate whether to mmap context bins or read them in memory
    bool m_mmapContextBins;
    bool m_isDeviceCreated = false;

    // Raw (buffer, size) pairs to unmap/free when contexts are torn down.
    std::vector<std::pair<uint8_t*, uint64_t>> m_contextBinBuffersToBeCleared;

    void setDeviceStatus(bool status) { m_isDeviceCreated = status; }
    bool getDeviceStatus() { return m_isDeviceCreated; }
    bool getContextConfigs(
        QnnContext_Config_t*** configs,
        uint32_t& contextConfigCount,
        Qnn_Priority_t contextPriority,
        bool graphSwitching = false,
        const std::vector<std::string>& execSelectGraphs = {},
        bool loadSelectGraphs = false
    );
    bool mergeAllContextConfigs(
        QnnContext_Config_t*** allCustomContextConfigs,
        QnnContext_Config_t** customConfigs,
        QnnContext_Config_t** contextConfigs,
        uint32_t customConfigCount,
        uint32_t contextConfigCount
    );
    bool freeContextConfigs(QnnContext_Config_t** contextConfigs, uint32_t contextConfigCount);
    bool setGraphConfigsBeforeExecute(
        Qnn_GraphHandle_t graphHandle,
        QnnGraph_Config_t** graphConfigs,
        uint32_t configCount
    );

    // Library loading / lifecycle helpers used by the public initialize() paths.
    bool getQnnInterface(std::string backendPath);
    bool getQnnSystemInterface(std::string systemLibraryPath);
    bool loadModel(std::string model_path);
    bool initializeLogging(const QnnLog_Level_t& logLevel, bool debug_qnn);
    void terminateLog();
    bool initializeBackendExtensions(
        BackendExtensionsConfigs backendExtensionsConfig,
        PerfProfile parsedPerfProfile,
        bool debug_qnn
    );
    bool initializeBackend();
    bool terminateBackend();
    bool createDevice();
    bool freeDevice();
    bool createContext(ContextConfigs contextConfig);
    bool freeContext();
    bool composeGraphs(std::vector<GraphConfigs> graphConfigs);
    // GenAI overload: forwards tensor geometry and op params to the model lib.
    bool composeGraphs(
        std::vector<GraphConfigs> graphConfigs,
        uint32_t* inputDim,
        uint32_t inputRank,
        uint32_t* outputDim,
        uint32_t outputRank,
        uint32_t* kvDim,
        uint32_t kvRank,
        Qnn_Param_t* params,
        uint32_t numParams
    );
    bool mapAndGetContextBinaryInfo(
        const bool use_mmap,
        std::shared_ptr<uint8_t>& buffer,
        const std::string binaryPath,
        const uint64_t bufferSize,
        const size_t contextIdx,
        const bool graphSwitching,
        QnnSystemContext_Handle_t sysCtxHandle,
        const QnnSystemContext_BinaryInfo_t** binaryInfo
    );

    bool parseIOTensorsAndAccumulate();
    bool registerTensorsWithBackend(uint32_t& graphIdx);

    bool finalizeGraphs();
    bool initializePerformance();
    bool destroyPerformance();
    bool boostPerformance();
    bool resetPerformance();
    bool checkCapabilityOfCreateAsync(bool& propRet);

    // Profiling extraction. The timeLogs overloads accumulate
    // {event name -> (total time, count)} per graph; the parameterless
    // overloads only traverse/log.
    bool initProfiling();
    bool extractBackendProfilingInfo(
        Qnn_ProfileHandle_t profileHandle,
        std::map<std::string, std::pair<double, uint16_t>>& timeLogs,
        std::string graphName
    );
    bool extractProfilingSubEvents(
        QnnProfile_EventId_t profileEventId,
        std::map<std::string, std::pair<double, uint16_t>>& timeLogs,
        std::string graphName
    );
    bool extractProfilingEvent(
        QnnProfile_EventId_t profileEventId,
        std::map<std::string, std::pair<double, uint16_t>>& timeLogs,
        std::string graphName
    );
    bool extractBackendProfilingInfo(Qnn_ProfileHandle_t profileHandle);
    bool extractProfilingSubEvents(QnnProfile_EventId_t profileEventId);
    bool extractProfilingEvent(QnnProfile_EventId_t profileEventId);

    // NOTE(review): operator[] default-inserts a null handle for unknown ids.
    Qnn_ContextHandle_t getContextWithId(uint32_t contextId) {
        return m_contextIdtoHandle[contextId];
    }

  public:
    QnnApi() {};
    ~QnnApi();

    bool freeGraphs();
    static QnnApi& getInstance();
#if QUALLA_QNN_API_VERSION >= 21700
    // Callback invoked by QnnContext_createFromBinaryListAsync as contexts and
    // graphs become available.
    static void contextNotifyFn(
        Qnn_ContextHandle_t context,
        Qnn_GraphHandle_t graph,
        const char* graph_name,
        QnnContext_createFromBinaryAsyncNotifyType_t completeType,
        void* notifyParam,
        Qnn_ErrorHandle_t status
    );
#endif
    // Creates contexts from cached context binaries (synchronous path).
    bool createFromBinary(
        std::vector<std::string> cachedBinariesPathVec,
        ContextConfigs contextConfig,
        int64_t spill_fill_buffer_size = 0,
        uint64_t mmap_budget = 0,
        bool graphSwitching = false,
        const std::vector<std::string>& execSelectGraphs = {},
        bool loadSelectGraphs = false
    );
#if QUALLA_QNN_API_VERSION >= 21700
    // Async variant; completion is reported through contextNotifyFn.
    bool createFromBinaryListAsync(
        std::vector<std::string> cachedBinariesPathVec,
        ContextConfigs contextConfig,
        int64_t spill_fill_buffer_size = 0,
        uint64_t mmap_budget = 0,
        bool graphSwitching = false,
        const std::vector<std::string>& execSelectGraphs = {},
        bool loadSelectGraphs = false
    );
#endif
    // Full bring-up: backend + (optionally) device, contexts and graphs,
    // either from a model library or from cached binaries
    // (loadFromCachedBinary).
    bool initialize(
        std::string backendPath,
        std::vector<std::string> modelPathOrCachedBinaryPathVec,
        BackendExtensionsConfigs backendExtensionsConfig,
        PerfProfile parsedPerfProfile = PerfProfile::BURST,
        ContextConfigs contextConfig = ContextConfigs(),
        std::vector<GraphConfigs> graphConfigs = {},
        bool loadFromCachedBinary = false,
        std::string systemLibraryPath = "",
        bool debugModeRequested = false,
        int64_t spill_fill_buffer_size = 0,
        bool mmapContextBins = false,
        bool asyncInit = true,
        uint64_t mmap_budget = 0,
        bool debug_qnn = false,
        bool graphSwitching = false,
        const std::vector<std::string>& execSelectGraphs = {},
        bool loadSelectGraphs = false
    );

    bool registerOpPackage(std::string opPackagePath);

    // Non-owning: caller keeps ownership of the IOTensor manager.
    void setIOTensorBufferMgr(IOTensor* ioBufferMgr){
        m_ioBufferMgr = ioBufferMgr;
    }

    void setKVDim(int32_t kvDim){
        m_kvDim = kvDim;
    }

    void setContextSize(int32_t ctxSize){
        m_ctxSize = ctxSize;
    }

    void setKVUpdateMethod(KVManagerMode kvUpdateMethod){
        m_kvUpdateMethod = kvUpdateMethod ;
    }

    // Non-owning pointer into this object's map; valid while QnnApi lives.
    std::map<std::string, std::pair<int, size_t>>* getTensorAllocInfo(){
        return &m_tensorAllocInfo;
    }

    bool getLmHeadWeightInputEnabled(){
        return m_lmHeadWeightInput;
    }

    bool getLoraWeightEnabled(){
        return m_loraWeightEnabled;
    }
    // Initialize with OpPackage
    bool initialize(
        std::string backendPath,
        std::string modelPath,
        std::string opPackage,
        ContextConfigs contextConfig,
        std::vector<GraphConfigs> graphConfigs,
        uint32_t* inputDim,
        uint32_t inputRank,
        uint32_t* outputDim,
        uint32_t outputRank,
        uint32_t* kvDim,
        uint32_t kvRank,
        Qnn_Param_t* params,
        uint32_t numParams,
        bool debugModeRequested
    );

    // Runs one graph by name; profiling timings are accumulated into timeLogs.
    bool graphExecute(
        Qnn_Tensor_t* input,
        Qnn_Tensor_t* output,
        std::string graphName,
        std::map<std::string, std::pair<double, uint16_t>>& timeLogs
    );

    bool applyBinarySection(uint32_t binIndex, std::string binSectionPath,bool useMmap,bool graphSwitch);

    QNN_INTERFACE_VER_TYPE* getQnnInterfaceVer() { return &m_qnnInterface; };
    GraphInfo_t**& getGraphsInfo() { return m_graphsInfo; };
    uint32_t getGraphsCount() { return m_graphsCount; };
    int32_t getGraphCountPerContext() { return graphCountPerContext; }
    std::vector<Qnn_ContextHandle_t>& getContexts() { return m_contextVec; };
    // Throws std::out_of_range if the graph is unknown (map::at).
    const Qnn_ContextHandle_t getContexts(GraphInfo_t* const graph) {
        return m_contextMap.at(graph);
    };

    // Thread-safe: called from async context-creation callbacks.
    void updateContext(Qnn_ContextHandle_t context, uint32_t contextId) {
        std::lock_guard<std::mutex> lock(m_updateCallBackMutex);
        m_contextVec.push_back(context);
        m_contextIdtoHandle[contextId] = context;
    }

    // Thread-safe: records the finalized graph handle and its context index.
    void updateQnnApiGraphsandContextsInfo(
        std::string graphName,
        Qnn_GraphHandle_t graph,
        uint32_t contextId
    ) {
        // set graph handle to GraphInfo
        std::lock_guard<std::mutex> lock(m_updateCallBackMutex);
        m_graphNameToInfo[graphName]->graph = graph;
        m_graphNameToContextIdx[graphName] = contextId;
        m_graphsCount++;
    }

    // NOTE(review): map::operator[] inserts {datatype, 0} for types missing
    // from g_qnnDataTypeToSize, so unknown types report size 0.
    static inline size_t getDataTypeSize(const Qnn_DataType_t& datatype) {
        return g_qnnDataTypeToSize[datatype];
    }
    static inline std::string getTensorName(const TensorWrapper& tensorWrapper) {
        return GET_TENSOR_WRAPPER_NAME(tensorWrapper);
    }
    static bool getTensorQuantParams(
        const Qnn_Tensor_t* tensor,
        std::vector<QuantParam>& quantParamsVec
    );
    static bool getTensorShape(std::vector<size_t>& tensorDims, const TensorWrapper& tensorWrapper);
    static inline Qnn_DataType_t getTensorDtype(const Qnn_Tensor_t* tensor) {
        return QNN_TENSOR_GET_DATA_TYPE(tensor);
    }

    bool getTensorNameAndShape(
        std::string& tensorName,
        std::vector<size_t>& tensorDims,
        TensorWrapper& tensorWrapper
    );
    static void qnnLogCallback(
        const char* fmt,
        QnnLog_Level_t level,
        uint64_t timestamp,
        va_list args
    );
    bool updateIOEncodings(std::shared_ptr<uint8_t>& buffer,
                           uint64_t bufferSize,
                           uint32_t graphIndex);
};
Genie/Genie/src/qualla/engines/qnn-api/QnnApiUtils.cpp ADDED
@@ -0,0 +1,636 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #include "QnnApiUtils.hpp"
10
+ #include "QnnTypeMacros.hpp"
11
+
12
+ #include <algorithm>
13
+ #include <cstring>
14
+ #include <fstream>
15
+ #include <iostream>
16
+ #include <sstream>
17
+ #include <string>
18
+ #include <tuple>
19
+
20
+ #include <fcntl.h>
21
+ #include <errno.h>
22
+
23
+ #ifdef _WIN32
24
+ #include <windows.h>
25
+ #define __open ::_open
26
+ #define __strdup ::_strdup
27
+ #else
28
+ #include <unistd.h>
29
+ #include <sys/mman.h>
30
+ #define __open ::open
31
+ #define __strdup ::strdup
32
+ #endif
33
+
34
+ bool freeQnnTensorWrapper(TensorWrapper& tensorWrapper) {
35
+ // free all pointer allocations in struct
36
+ if (nullptr != GET_TENSOR_WRAPPER_NAME(tensorWrapper)) {
37
+ free((void*)GET_TENSOR_WRAPPER_NAME(tensorWrapper));
38
+ }
39
+
40
+ Qnn_Tensor_t& tensor = GET_TENSOR_WRAPPER_TENSOR(tensorWrapper);
41
+ free(QNN_TENSOR_GET_DIMENSIONS(tensor));
42
+ return true;
43
+ }
44
+
45
+ bool freeQnnTensorWrappers(TensorWrapper*& tensorWrappers, uint32_t numTensors) {
46
+ // free all pointer allocations in struct
47
+ for (size_t i = 0; i < numTensors; i++) {
48
+ freeQnnTensorWrapper(tensorWrappers[i]);
49
+ }
50
+ free(tensorWrappers);
51
+
52
+ return true;
53
+ }
54
+
55
// Releases an array of graphs produced by the compose/create path.
// Returns false when there is nothing to free, true otherwise.
//
// NOTE(review): the double free(**graphsInfo) + free(*graphsInfo) implies the
// GraphInfo_t structs live in ONE contiguous allocation addressed by
// (*graphsInfo)[0], with *graphsInfo a separately allocated pointer array
// into it — confirm against the allocation site before changing this.
bool freeGraphsInfo(GraphInfoPtr_t** graphsInfo, uint32_t numGraphs) {
    if (graphsInfo == nullptr || *graphsInfo == nullptr) {
        return false;
    }
    // First free each graph's owned name and tensor-wrapper arrays.
    for (uint32_t i = 0; i < numGraphs; i++) {
        if (nullptr != (*graphsInfo)[i]) {
            free((*graphsInfo)[i]->graphName);
            freeQnnTensorWrappers(
                (*graphsInfo)[i]->inputTensors, (*graphsInfo)[i]->numInputTensors
            );
            freeQnnTensorWrappers(
                (*graphsInfo)[i]->outputTensors, (*graphsInfo)[i]->numOutputTensors
            );
        }
    }
    // Then the contiguous struct block, then the pointer array; null the
    // caller's pointer so it cannot be reused.
    free(**graphsInfo);
    free(*graphsInfo);
    *graphsInfo = nullptr;

    return true;
}
76
+
77
+ bool freeGraphInfo(GraphInfo_t* graphInfo) {
78
+ if (graphInfo == nullptr) {
79
+ return false;
80
+ }
81
+ if (nullptr != graphInfo->graphName) {
82
+ free(graphInfo->graphName);
83
+ }
84
+ freeQnnTensorWrappers(graphInfo->inputTensors, graphInfo->numInputTensors);
85
+ freeQnnTensorWrappers(graphInfo->outputTensors, graphInfo->numOutputTensors);
86
+ free(graphInfo);
87
+ return true;
88
+ }
89
+
90
// Refreshes the metadata of already-constructed tensor wrappers from
// `tensorsInfoSrc`: id, type, data format, data type, quantization params,
// rank, and dimensions. Unlike copyTensorsInfo(), this assumes the wrappers
// exist: the tensor name is left untouched and dimensions are only memcpy'd
// into the existing dimensions buffer (never allocated here). Always returns
// true.
//
// NOTE(review): for axis-quantized tensors a fresh scaleOffset array is
// malloc'd and installed via QNN_TENSOR_SET_QUANT_PARAMS without freeing any
// previously installed array — repeated updates may leak; confirm ownership.
bool updateTensorInfo(const Qnn_Tensor_t* tensorsInfoSrc,
                      TensorWrapper* tensorWrappers,
                      uint32_t tensorsCount
){
    for (size_t tIdx = 0; tIdx < tensorsCount; tIdx++) {
        QNN_DEBUG("Extracting tensorInfo for tensor Idx: %d", (int)tIdx);
        Qnn_Tensor_t& tensor = GET_TENSOR_WRAPPER_TENSOR(tensorWrappers[tIdx]);

        // Plain scalar fields are copied directly.
        QNN_TENSOR_SET_ID(tensor, QNN_TENSOR_GET_ID(&tensorsInfoSrc[tIdx]));
        QNN_TENSOR_SET_TYPE(tensor, QNN_TENSOR_GET_TYPE(&tensorsInfoSrc[tIdx]));
        QNN_TENSOR_SET_DATA_FORMAT(tensor, QNN_TENSOR_GET_DATA_FORMAT(&tensorsInfoSrc[tIdx]));
        QNN_TENSOR_SET_DATA_TYPE(tensor, QNN_TENSOR_GET_DATA_TYPE(&tensorsInfoSrc[tIdx]));
        // Quantization params: only SCALE_OFFSET and AXIS_SCALE_OFFSET are
        // carried over; any other encoding is recorded as UNDEFINED.
        Qnn_QuantizeParams_t qParams = QNN_QUANTIZE_PARAMS_INIT;
        qParams.encodingDefinition =
            QNN_TENSOR_GET_QUANT_PARAMS(&tensorsInfoSrc[tIdx]).encodingDefinition;
        qParams.quantizationEncoding = QNN_QUANTIZATION_ENCODING_UNDEFINED;
        if (QNN_TENSOR_GET_QUANT_PARAMS(&tensorsInfoSrc[tIdx]).quantizationEncoding ==
            QNN_QUANTIZATION_ENCODING_SCALE_OFFSET) {
            qParams.quantizationEncoding =
                QNN_TENSOR_GET_QUANT_PARAMS(&tensorsInfoSrc[tIdx]).quantizationEncoding;
            qParams.scaleOffsetEncoding =
                QNN_TENSOR_GET_QUANT_PARAMS(&tensorsInfoSrc[tIdx]).scaleOffsetEncoding;
        } else if (QNN_TENSOR_GET_QUANT_PARAMS(&tensorsInfoSrc[tIdx]).quantizationEncoding ==
                   QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) {
            qParams.quantizationEncoding =
                QNN_TENSOR_GET_QUANT_PARAMS(&tensorsInfoSrc[tIdx]).quantizationEncoding;
            qParams.axisScaleOffsetEncoding.axis =
                QNN_TENSOR_GET_QUANT_PARAMS(&tensorsInfoSrc[tIdx])
                    .axisScaleOffsetEncoding.axis;
            qParams.axisScaleOffsetEncoding.numScaleOffsets =
                QNN_TENSOR_GET_QUANT_PARAMS(&tensorsInfoSrc[tIdx])
                    .axisScaleOffsetEncoding.numScaleOffsets;
            if (QNN_TENSOR_GET_QUANT_PARAMS(&tensorsInfoSrc[tIdx])
                    .axisScaleOffsetEncoding.numScaleOffsets > 0) {
                // Deep-copy the per-axis (scale, offset) pairs; the source
                // array belongs to the QNN runtime, not to this wrapper.
                qParams.axisScaleOffsetEncoding.scaleOffset = (Qnn_ScaleOffset_t*)malloc(
                    QNN_TENSOR_GET_QUANT_PARAMS(&tensorsInfoSrc[tIdx])
                        .axisScaleOffsetEncoding.numScaleOffsets *
                    sizeof(Qnn_ScaleOffset_t)
                );
                if (qParams.axisScaleOffsetEncoding.scaleOffset) {
                    for (size_t idx = 0;
                         idx < QNN_TENSOR_GET_QUANT_PARAMS(&tensorsInfoSrc[tIdx])
                                   .axisScaleOffsetEncoding.numScaleOffsets;
                         idx++) {
                        qParams.axisScaleOffsetEncoding.scaleOffset[idx].scale =
                            QNN_TENSOR_GET_QUANT_PARAMS(&tensorsInfoSrc[tIdx])
                                .axisScaleOffsetEncoding.scaleOffset[idx]
                                .scale;
                        qParams.axisScaleOffsetEncoding.scaleOffset[idx].offset =
                            QNN_TENSOR_GET_QUANT_PARAMS(&tensorsInfoSrc[tIdx])
                                .axisScaleOffsetEncoding.scaleOffset[idx]
                                .offset;
                    }
                }
            }
        }
        QNN_TENSOR_SET_QUANT_PARAMS(tensor, qParams);
        QNN_TENSOR_SET_RANK(tensor, QNN_TENSOR_GET_RANK(&tensorsInfoSrc[tIdx]));
        // Dimensions are refreshed in place only when a buffer already exists.
        if (QNN_TENSOR_GET_RANK(tensorsInfoSrc[tIdx]) > 0) {
            if (QNN_TENSOR_GET_DIMENSIONS(tensor)) {
                memcpy(QNN_TENSOR_GET_DIMENSIONS(tensor),
                       QNN_TENSOR_GET_DIMENSIONS(&tensorsInfoSrc[tIdx]),
                       QNN_TENSOR_GET_RANK(&tensorsInfoSrc[tIdx]) * sizeof(uint32_t));
            }
        }
    }
    return true;
}
158
+
159
+ bool copyTensorsInfo(
160
+ const Qnn_Tensor_t* tensorsInfoSrc,
161
+ TensorWrapper*& tensorWrappers,
162
+ uint32_t tensorsCount
163
+ ) {
164
+
165
+ auto returnStatus = true;
166
+ tensorWrappers = (TensorWrapper*)calloc(tensorsCount, sizeof(TensorWrapper));
167
+ if (nullptr == tensorWrappers) {
168
+ QNN_ERROR("Failed to allocate memory for tensorWrappers.");
169
+ return false;
170
+ }
171
+ if (returnStatus) {
172
+ for (size_t tIdx = 0; tIdx < tensorsCount; tIdx++) {
173
+ // QNN_DEBUG("Extracting tensorInfo for tensor Idx: %d", (int)tIdx);
174
+ Qnn_Tensor_t& tensor = GET_TENSOR_WRAPPER_TENSOR(tensorWrappers[tIdx]);
175
+ tensor = QNN_TENSOR_INIT;
176
+
177
+ const char* tensorName = QNN_TENSOR_GET_NAME(&tensorsInfoSrc[tIdx]);
178
+ if (!tensorName) {
179
+ QNN_TENSOR_SET_NAME(tensor, nullptr);
180
+ } else {
181
+ QNN_TENSOR_SET_NAME(tensor, __strdup(tensorName));
182
+ }
183
+
184
+ QNN_TENSOR_SET_ID(tensor, QNN_TENSOR_GET_ID(&tensorsInfoSrc[tIdx]));
185
+ QNN_TENSOR_SET_TYPE(tensor, QNN_TENSOR_GET_TYPE(&tensorsInfoSrc[tIdx]));
186
+ QNN_TENSOR_SET_DATA_FORMAT(tensor, QNN_TENSOR_GET_DATA_FORMAT(&tensorsInfoSrc[tIdx]));
187
+ QNN_TENSOR_SET_DATA_TYPE(tensor, QNN_TENSOR_GET_DATA_TYPE(&tensorsInfoSrc[tIdx]));
188
+ Qnn_QuantizeParams_t qParams = QNN_QUANTIZE_PARAMS_INIT;
189
+ qParams.encodingDefinition =
190
+ QNN_TENSOR_GET_QUANT_PARAMS(&tensorsInfoSrc[tIdx]).encodingDefinition;
191
+ qParams.quantizationEncoding = QNN_QUANTIZATION_ENCODING_UNDEFINED;
192
+ if (QNN_TENSOR_GET_QUANT_PARAMS(&tensorsInfoSrc[tIdx]).quantizationEncoding ==
193
+ QNN_QUANTIZATION_ENCODING_SCALE_OFFSET) {
194
+ qParams.quantizationEncoding =
195
+ QNN_TENSOR_GET_QUANT_PARAMS(&tensorsInfoSrc[tIdx]).quantizationEncoding;
196
+ qParams.scaleOffsetEncoding =
197
+ QNN_TENSOR_GET_QUANT_PARAMS(&tensorsInfoSrc[tIdx]).scaleOffsetEncoding;
198
+ } else if (QNN_TENSOR_GET_QUANT_PARAMS(&tensorsInfoSrc[tIdx]).quantizationEncoding ==
199
+ QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) {
200
+ qParams.quantizationEncoding =
201
+ QNN_TENSOR_GET_QUANT_PARAMS(&tensorsInfoSrc[tIdx]).quantizationEncoding;
202
+ qParams.axisScaleOffsetEncoding.axis =
203
+ QNN_TENSOR_GET_QUANT_PARAMS(&tensorsInfoSrc[tIdx])
204
+ .axisScaleOffsetEncoding.axis;
205
+ qParams.axisScaleOffsetEncoding.numScaleOffsets =
206
+ QNN_TENSOR_GET_QUANT_PARAMS(&tensorsInfoSrc[tIdx])
207
+ .axisScaleOffsetEncoding.numScaleOffsets;
208
+ if (QNN_TENSOR_GET_QUANT_PARAMS(&tensorsInfoSrc[tIdx])
209
+ .axisScaleOffsetEncoding.numScaleOffsets > 0) {
210
+ qParams.axisScaleOffsetEncoding.scaleOffset = (Qnn_ScaleOffset_t*)malloc(
211
+ QNN_TENSOR_GET_QUANT_PARAMS(&tensorsInfoSrc[tIdx])
212
+ .axisScaleOffsetEncoding.numScaleOffsets *
213
+ sizeof(Qnn_ScaleOffset_t)
214
+ );
215
+ if (qParams.axisScaleOffsetEncoding.scaleOffset) {
216
+ for (size_t idx = 0;
217
+ idx < QNN_TENSOR_GET_QUANT_PARAMS(&tensorsInfoSrc[tIdx])
218
+ .axisScaleOffsetEncoding.numScaleOffsets;
219
+ idx++) {
220
+ qParams.axisScaleOffsetEncoding.scaleOffset[idx].scale =
221
+ QNN_TENSOR_GET_QUANT_PARAMS(&tensorsInfoSrc[tIdx])
222
+ .axisScaleOffsetEncoding.scaleOffset[idx]
223
+ .scale;
224
+ qParams.axisScaleOffsetEncoding.scaleOffset[idx].offset =
225
+ QNN_TENSOR_GET_QUANT_PARAMS(&tensorsInfoSrc[tIdx])
226
+ .axisScaleOffsetEncoding.scaleOffset[idx]
227
+ .offset;
228
+ }
229
+ }
230
+ }
231
+ }
232
+ QNN_TENSOR_SET_QUANT_PARAMS(tensor, qParams);
233
+ QNN_TENSOR_SET_RANK(tensor, QNN_TENSOR_GET_RANK(&tensorsInfoSrc[tIdx]));
234
+ QNN_TENSOR_SET_DIMENSIONS(tensor, nullptr);
235
+ if (QNN_TENSOR_GET_RANK(tensorsInfoSrc[tIdx]) > 0) {
236
+ QNN_TENSOR_SET_DIMENSIONS(
237
+ tensor,
238
+ (uint32_t*)malloc(
239
+ QNN_TENSOR_GET_RANK(&tensorsInfoSrc[tIdx]) * sizeof(uint32_t)
240
+ )
241
+ );
242
+ if (QNN_TENSOR_GET_DIMENSIONS(tensor)) {
243
+ memcpy(QNN_TENSOR_GET_DIMENSIONS(tensor),
244
+ QNN_TENSOR_GET_DIMENSIONS(&tensorsInfoSrc[tIdx]),
245
+ QNN_TENSOR_GET_RANK(&tensorsInfoSrc[tIdx]) * sizeof(uint32_t));
246
+ }
247
+ }
248
+ }
249
+ }
250
+
251
+ return returnStatus;
252
+ }
253
+
254
+
255
+ bool updateGraphInfoV1(const QnnSystemContext_GraphInfoV1_t* graphInfoSrc,
256
+ GraphInfo_t* graphInfoDst
257
+ ){
258
+ if (graphInfoSrc->graphInputs) {
259
+ if (!updateTensorInfo(
260
+ graphInfoSrc->graphInputs,
261
+ graphInfoDst->inputTensors,
262
+ graphInfoSrc->numGraphInputs
263
+ )) {
264
+ return false;
265
+ }
266
+ }
267
+ if (graphInfoSrc->graphOutputs) {
268
+ if (!updateTensorInfo(
269
+ graphInfoSrc->graphOutputs,
270
+ graphInfoDst->outputTensors,
271
+ graphInfoSrc->numGraphOutputs
272
+ )) {
273
+ return false;
274
+ }
275
+ }
276
+ return true;
277
+ }
278
+
279
+
280
+ bool updateGraphInfoV3(const QnnSystemContext_GraphInfoV3_t* graphInfoSrc,
281
+ GraphInfo_t* graphInfoDst
282
+ ){
283
+ if (graphInfoSrc->graphInputs) {
284
+ if (!updateTensorInfo(
285
+ graphInfoSrc->graphInputs,
286
+ graphInfoDst->inputTensors,
287
+ graphInfoSrc->numGraphInputs
288
+ )) {
289
+ return false;
290
+ }
291
+ }
292
+ if (graphInfoSrc->graphOutputs) {
293
+ if (!updateTensorInfo(
294
+ graphInfoSrc->graphOutputs,
295
+ graphInfoDst->outputTensors,
296
+ graphInfoSrc->numGraphOutputs
297
+ )) {
298
+ return false;
299
+ }
300
+ }
301
+ return true;
302
+ }
303
+
304
+ bool copyGraphsInfoV1(
305
+ const QnnSystemContext_GraphInfoV1_t* graphInfoSrc,
306
+ GraphInfo_t* graphInfoDst
307
+ ) {
308
+ graphInfoDst->graphName = nullptr;
309
+ if (graphInfoSrc->graphName) {
310
+ graphInfoDst->graphName = __strdup(graphInfoSrc->graphName);
311
+ }
312
+ graphInfoDst->inputTensors = nullptr;
313
+ graphInfoDst->numInputTensors = 0;
314
+ if (graphInfoSrc->graphInputs) {
315
+ if (!copyTensorsInfo(
316
+ graphInfoSrc->graphInputs,
317
+ graphInfoDst->inputTensors,
318
+ graphInfoSrc->numGraphInputs
319
+ )) {
320
+ return false;
321
+ }
322
+ graphInfoDst->numInputTensors = graphInfoSrc->numGraphInputs;
323
+ }
324
+ graphInfoDst->outputTensors = nullptr;
325
+ graphInfoDst->numOutputTensors = 0;
326
+ if (graphInfoSrc->graphOutputs) {
327
+ if (!copyTensorsInfo(
328
+ graphInfoSrc->graphOutputs,
329
+ graphInfoDst->outputTensors,
330
+ graphInfoSrc->numGraphOutputs
331
+ )) {
332
+ return false;
333
+ }
334
+ graphInfoDst->numOutputTensors = graphInfoSrc->numGraphOutputs;
335
+ }
336
+ return true;
337
+ }
338
+
339
+ bool copyGraphsInfoV3(const QnnSystemContext_GraphInfoV3_t *graphInfoSrc,
340
+ GraphInfo_t *graphInfoDst) {
341
+ graphInfoDst->graphName = nullptr;
342
+ if (graphInfoSrc->graphName) {
343
+ graphInfoDst->graphName =
344
+ __strdup(graphInfoSrc->graphName);
345
+ }
346
+ graphInfoDst->inputTensors = nullptr;
347
+ graphInfoDst->numInputTensors = 0;
348
+ if (graphInfoSrc->graphInputs) {
349
+ if (!copyTensorsInfo(
350
+ graphInfoSrc->graphInputs, graphInfoDst->inputTensors, graphInfoSrc->numGraphInputs)) {
351
+ return false;
352
+ }
353
+ graphInfoDst->numInputTensors = graphInfoSrc->numGraphInputs;
354
+ }
355
+ graphInfoDst->outputTensors = nullptr;
356
+ graphInfoDst->numOutputTensors = 0;
357
+ if (graphInfoSrc->graphOutputs) {
358
+ if (!copyTensorsInfo(graphInfoSrc->graphOutputs,
359
+ graphInfoDst->outputTensors,
360
+ graphInfoSrc->numGraphOutputs)) {
361
+ return false;
362
+ }
363
+ graphInfoDst->numOutputTensors = graphInfoSrc->numGraphOutputs;
364
+ }
365
+ return true;
366
+ }
367
+
368
+ bool updateGraphInfo(const QnnSystemContext_GraphInfo_t* graphsInput,
369
+ const uint32_t numGraphs,
370
+ GraphInfo_t** graphsInfo,
371
+ uint32_t& graphsCount
372
+ ){
373
+
374
+ for (size_t gIdx = 0; gIdx < numGraphs; gIdx++) {
375
+ if (graphsInput[gIdx].version == QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_1) {
376
+ if(updateGraphInfoV1(&graphsInput[gIdx].graphInfoV1, graphsInfo[graphsCount]) == false) {
377
+ return false;
378
+ }
379
+ }
380
+ if (graphsInput[gIdx].version == QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_3) {
381
+ if(updateGraphInfoV3(&graphsInput[gIdx].graphInfoV3, graphsInfo[graphsCount]) == false) {
382
+ return false;
383
+ }
384
+ }
385
+ graphsCount++;
386
+ }
387
+ return true;
388
+ }
389
+
390
+
391
+ bool copyGraphsInfo(
392
+ const QnnSystemContext_GraphInfo_t* graphsInput,
393
+ const uint32_t numGraphs,
394
+ GraphInfo_t**& graphsInfo
395
+ ) {
396
+
397
+ if (!graphsInput) {
398
+ QNN_ERROR("Received nullptr for graphsInput.");
399
+ return false;
400
+ }
401
+ auto returnStatus = true;
402
+ graphsInfo = (GraphInfo_t**)calloc(numGraphs, sizeof(GraphInfo_t*));
403
+ GraphInfo_t* graphInfoArr = (GraphInfo_t*)calloc(numGraphs, sizeof(GraphInfo_t));
404
+ if (nullptr == graphsInfo || nullptr == graphInfoArr) {
405
+ QNN_ERROR("Failure to allocate memory for *graphInfo");
406
+ returnStatus = false;
407
+ }
408
+ if (true == returnStatus) {
409
+ for (size_t gIdx = 0; gIdx < numGraphs; gIdx++) {
410
+ QNN_DEBUG("Extracting graphsInfo for graph Idx: %d", (int)gIdx);
411
+ if (graphsInput[gIdx].version == QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_1) {
412
+ copyGraphsInfoV1(&graphsInput[gIdx].graphInfoV1, &graphInfoArr[gIdx]);
413
+ }
414
+ if (graphsInput[gIdx].version == QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_3) {
415
+ copyGraphsInfoV3(&graphsInput[gIdx].graphInfoV3, &graphInfoArr[gIdx]);
416
+ }
417
+ graphsInfo[gIdx] = graphInfoArr + gIdx;
418
+ }
419
+ }
420
+ if (true != returnStatus) {
421
+ QNN_DEBUG("Received an ERROR during extractGraphsInfo. Freeing resources.");
422
+ if (graphsInfo) {
423
+ for (uint32_t gIdx = 0; gIdx < numGraphs; gIdx++) {
424
+ if (graphsInfo[gIdx]) {
425
+ if (nullptr != graphsInfo[gIdx]->graphName) {
426
+ free(graphsInfo[gIdx]->graphName);
427
+ graphsInfo[gIdx]->graphName = nullptr;
428
+ }
429
+ freeQnnTensorWrappers(
430
+ graphsInfo[gIdx]->inputTensors, graphsInfo[gIdx]->numInputTensors
431
+ );
432
+ freeQnnTensorWrappers(
433
+ graphsInfo[gIdx]->outputTensors, graphsInfo[gIdx]->numOutputTensors
434
+ );
435
+ }
436
+ }
437
+ free(*graphsInfo);
438
+ }
439
+ free(graphsInfo);
440
+ graphsInfo = nullptr;
441
+ }
442
+
443
+ return true;
444
+ }
445
+
446
+ uint32_t getNumGraphInBinary(const QnnSystemContext_BinaryInfo_t* binaryInfo)
447
+ {
448
+ uint32_t numGraph = 0;
449
+ if (nullptr == binaryInfo) {
450
+ QNN_ERROR("binaryInfo is nullptr.");
451
+ return false;
452
+ }
453
+ if (binaryInfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_1) {
454
+ numGraph = binaryInfo->contextBinaryInfoV1.numGraphs;
455
+ }else if (binaryInfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_2) {
456
+ numGraph = binaryInfo->contextBinaryInfoV2.numGraphs;
457
+ }
458
+ else if (binaryInfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_3) {
459
+ numGraph = binaryInfo->contextBinaryInfoV3.numGraphs;
460
+ }
461
+ return numGraph;
462
+ }
463
+
464
+ bool updateMetaDataToGraphsInfo(const QnnSystemContext_BinaryInfo_t* binaryInfo,
465
+ GraphInfo_t** graphsInfo,
466
+ uint32_t& graphsCount
467
+ ){
468
+ if (nullptr == binaryInfo) {
469
+ QNN_ERROR("binaryInfo is nullptr.");
470
+ return false;
471
+ }
472
+ if (binaryInfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_1) {
473
+ if (binaryInfo->contextBinaryInfoV1.graphs) {
474
+ if (!updateGraphInfo(
475
+ binaryInfo->contextBinaryInfoV1.graphs,
476
+ binaryInfo->contextBinaryInfoV1.numGraphs,
477
+ graphsInfo,
478
+ graphsCount
479
+ )) {
480
+ QNN_ERROR("Failed while copying graphs Info.");
481
+ return false;
482
+ }
483
+ return true;
484
+ }
485
+ } else if (binaryInfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_2) {
486
+ if (binaryInfo->contextBinaryInfoV2.graphs) {
487
+ if (!updateGraphInfo(
488
+ binaryInfo->contextBinaryInfoV2.graphs,
489
+ binaryInfo->contextBinaryInfoV2.numGraphs,
490
+ graphsInfo,
491
+ graphsCount
492
+ )) {
493
+ QNN_ERROR("Failed while copying graphs Info.");
494
+ return false;
495
+ }
496
+ return true;
497
+ }
498
+ } else if (binaryInfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_3) {
499
+ if (binaryInfo->contextBinaryInfoV3.graphs) {
500
+ if (!updateGraphInfo(
501
+ binaryInfo->contextBinaryInfoV3.graphs,
502
+ binaryInfo->contextBinaryInfoV3.numGraphs,
503
+ graphsInfo,
504
+ graphsCount
505
+ )) {
506
+ QNN_ERROR("Failed while copying graphs Info.");
507
+ return false;
508
+ }
509
+ return true;
510
+ }
511
+ }
512
+ QNN_ERROR("Unrecognized system context binary info version.");
513
+ return false;
514
+ }
515
+
516
+ bool copyMetadataToGraphsInfo(
517
+ const QnnSystemContext_BinaryInfo_t* binaryInfo,
518
+ GraphInfo_t**& graphsInfo,
519
+ uint32_t& graphsCount
520
+ ) {
521
+ if (nullptr == binaryInfo) {
522
+ QNN_ERROR("binaryInfo is nullptr.");
523
+ return false;
524
+ }
525
+ graphsCount = 0;
526
+ if (binaryInfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_1) {
527
+ if (binaryInfo->contextBinaryInfoV1.graphs) {
528
+ if (!copyGraphsInfo(
529
+ binaryInfo->contextBinaryInfoV1.graphs,
530
+ binaryInfo->contextBinaryInfoV1.numGraphs,
531
+ graphsInfo
532
+ )) {
533
+ QNN_ERROR("Failed while copying graphs Info.");
534
+ return false;
535
+ }
536
+ graphsCount = binaryInfo->contextBinaryInfoV1.numGraphs;
537
+ return true;
538
+ }
539
+ } else if (binaryInfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_2) {
540
+ if (binaryInfo->contextBinaryInfoV2.graphs) {
541
+ if (!copyGraphsInfo(
542
+ binaryInfo->contextBinaryInfoV2.graphs,
543
+ binaryInfo->contextBinaryInfoV2.numGraphs,
544
+ graphsInfo
545
+ )) {
546
+ QNN_ERROR("Failed while copying graphs Info.");
547
+ return false;
548
+ }
549
+ graphsCount = binaryInfo->contextBinaryInfoV2.numGraphs;
550
+ return true;
551
+ }
552
+ } else if (binaryInfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_3) {
553
+ if (binaryInfo->contextBinaryInfoV3.graphs) {
554
+ if (!copyGraphsInfo(binaryInfo->contextBinaryInfoV3.graphs,
555
+ binaryInfo->contextBinaryInfoV3.numGraphs,
556
+ graphsInfo)) {
557
+ QNN_ERROR("Failed while copying graphs Info.");
558
+ return false;
559
+ }
560
+ graphsCount = binaryInfo->contextBinaryInfoV3.numGraphs;
561
+ return true;
562
+ }
563
+ }
564
+ QNN_ERROR("Unrecognized system context binary info version.");
565
+ return false;
566
+ }
567
+
568
+ size_t getFileSize(std::string filePath) {
569
+ std::ifstream in(filePath, std::ifstream::binary);
570
+ if (!in) {
571
+ QNN_ERROR("Failed to open input file: %s", filePath.c_str());
572
+ return 0;
573
+ }
574
+ in.seekg(0, in.end);
575
+ const size_t length = in.tellg();
576
+ in.seekg(0, in.beg);
577
+ return length;
578
+ }
579
+
580
+ bool readBinaryFromFile(std::string filePath, void* buffer, size_t bufferSize) {
581
+ if (nullptr == buffer) {
582
+ QNN_ERROR("buffer is nullptr");
583
+ return false;
584
+ }
585
+ std::ifstream in(filePath, std::ifstream::binary);
586
+ if (!in) {
587
+ QNN_ERROR("Failed to open input file: %s", filePath.c_str());
588
+ return false;
589
+ }
590
+ if (!in.read(reinterpret_cast<char*>(buffer), bufferSize)) {
591
+ QNN_ERROR("Failed to read the contents of: %s", filePath.c_str());
592
+ return false;
593
+ }
594
+ return true;
595
+ }
596
+
597
+ bool mmapBinaryFile(std::string filePath, void** buffer, size_t bufferSize) {
598
+ #ifndef _WIN32
599
+ int fd = open(filePath.c_str(), O_RDONLY);
600
+ int OFFSET = 0;
601
+
602
+ // read the binary file as memory map
603
+ *buffer = mmap(nullptr, bufferSize, PROT_READ, MAP_PRIVATE, fd, OFFSET);
604
+ close(fd);
605
+ if (madvise(*buffer, bufferSize, MADV_NOHUGEPAGE)) {
606
+ QNN_ERROR("Failed to advise OS on memory usage err: %s", strerror(errno));
607
+ }
608
+
609
+ return true;
610
+ #else
611
+ return false;
612
+ #endif
613
+ }
614
+
615
+ bool fillDims(std::vector<size_t>& dims, uint32_t* inDimensions, uint32_t rank) {
616
+ if (nullptr == inDimensions) {
617
+ QNN_ERROR("input dimensions is nullptr");
618
+ return false;
619
+ }
620
+
621
+ if (rank < 1) {
622
+ QNN_ERROR("invalid rank : %d", rank);
623
+ return false;
624
+ }
625
+
626
+ // In case, rank is less than 4, we are pushing 1s
627
+ for (size_t r = 0; r < 4 - rank; r++) {
628
+ dims.push_back(1);
629
+ }
630
+
631
+ for (size_t r = 0; r < rank; r++) {
632
+ dims.push_back(inDimensions[r]);
633
+ }
634
+
635
+ return true;
636
+ }
Genie/Genie/src/qualla/engines/qnn-api/QnnApiUtils.hpp ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #include "QnnInterface.h"
10
+ #include "QnnTypes.h"
11
+ #include "System/QnnSystemInterface.h"
12
+
13
+ #include <iostream>
14
+ #include <map>
15
+ #include <queue>
16
+ #include <string>
17
+ #include <unordered_map>
18
+ #include <vector>
19
+
20
+ #include "QnnTypeDef.hpp"
21
+ #include "Log.hpp"
22
+
23
+ /**
24
+ * @brief Frees all memory allocated tensor attributes.
25
+ *
26
+ * @param[in] tensorWrapper tensor object to free
27
+ *
28
+ * @return Error code
29
+ */
30
+ bool freeQnnTensorWrapper(TensorWrapper& tensorWrapper);
31
+
32
+ /**
33
+ * @brief Loops through and frees all memory allocated tensor attributes for each tensorWrapper
34
+ * object.
35
+ *
36
+ * @param[in] tensorWrappers array of tensor objects to free
37
+ *
38
+ * @param[in] numTensors length of the above tensorWrappers array
39
+ *
40
+ * @return Error code
41
+ */
42
+ bool freeQnnTensorWrappers(TensorWrapper*& tensorWrappers, uint32_t numTensors);
43
+
44
+ /**
45
+ * @brief A helper function to free memory malloced for communicating the Graph for a model(s)
46
+ *
47
+ * @param[in] graphsInfo Pointer pointing to location of graph objects
48
+ *
49
+ * @param[in] numGraphs The number of graph objects the above pointer is pointing to
50
+ *
51
+ * @return Error code
52
+ *
53
+ */
54
+ bool freeGraphsInfo(GraphInfoPtr_t** graphsInfo, uint32_t numGraphs);
55
+
56
+ bool freeGraphInfo(GraphInfo_t* graphInfo);
57
+
58
+ bool copyMetadataToGraphsInfo(
59
+ const QnnSystemContext_BinaryInfo_t* binaryInfo,
60
+ GraphInfo_t**& graphsInfo,
61
+ uint32_t& graphsCount
62
+ );
63
+
64
+ bool copyGraphsInfo(
65
+ const QnnSystemContext_GraphInfo_t* graphsInput,
66
+ const uint32_t numGraphs,
67
+ GraphInfo_t**& graphsInfo
68
+ );
69
+
70
+ bool copyGraphsInfoV1(
71
+ const QnnSystemContext_GraphInfoV1_t* graphInfoSrc,
72
+ GraphInfo_t* graphInfoDst
73
+ );
74
+
75
+ bool copyTensorsInfo(
76
+ const Qnn_Tensor_t* tensorsInfoSrc,
77
+ TensorWrapper*& tensorWrappers,
78
+ uint32_t tensorsCount
79
+ );
80
+
81
+ bool fillDims(std::vector<size_t>& dims, uint32_t* inDimensions, uint32_t rank);
82
+ size_t getFileSize(std::string filePath);
83
+ bool readBinaryFromFile(std::string filePath, void* buffer, size_t bufferSize);
84
+ bool mmapBinaryFile(std::string filePath, void** buffer, size_t bufferSize);
85
+ bool updateMetaDataToGraphsInfo(const QnnSystemContext_BinaryInfo_t* binaryInfo,GraphInfo_t** graphsInfo,uint32_t& graphsCount);
86
+ bool updateGraphInfo(const QnnSystemContext_GraphInfo_t* graphsInput,
87
+ const uint32_t currCount,
88
+ GraphInfo_t* graphsInfo);
89
+ bool updateGraphInfoV1(const QnnSystemContext_GraphInfoV1_t* graphInfoSrc,
90
+ GraphInfo_t* graphInfoDst);
91
+ bool updateTensorInfo(const Qnn_Tensor_t* tensorsInfoSrc,
92
+ TensorWrapper* tensorWrappers,
93
+ uint32_t tensorsCount);
94
+ uint32_t getNumGraphInBinary(const QnnSystemContext_BinaryInfo_t* binaryInfo);
Genie/Genie/src/qualla/engines/qnn-api/QnnConfig.hpp ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+ #pragma once
9
+
10
+ #include "QnnGraph.h"
11
+ #include "QnnTypes.h"
12
+ #include <vector>
13
+
14
+ struct BackendExtensionsConfigs {
15
+ std::string sharedLibraryPath;
16
+ std::string configFilePath;
17
+ BackendExtensionsConfigs() : sharedLibraryPath(""), configFilePath("") {}
18
+ BackendExtensionsConfigs(std::string sharedLibraryPath, std::string configFilePath)
19
+ : sharedLibraryPath(sharedLibraryPath), configFilePath(configFilePath) {}
20
+ };
21
+
22
+ struct ContextConfigs {
23
+ bool priorityPresent;
24
+ Qnn_Priority_t priority;
25
+ ContextConfigs() : priorityPresent(false), priority(QNN_PRIORITY_UNDEFINED) {}
26
+ ContextConfigs(Qnn_Priority_t priority) : priorityPresent(true), priority(priority) {}
27
+ };
28
+
29
+ struct GraphConfigs {
30
+ std::string graphName;
31
+ bool priorityPresent;
32
+ Qnn_Priority_t priority;
33
+ GraphConfigs()
34
+ : graphName(),
35
+ priorityPresent(false), priority(QNN_PRIORITY_UNDEFINED) {
36
+ }
37
+ };
38
+
39
+ struct ConfigOptions {
40
+ BackendExtensionsConfigs backendExtensionsConfigs;
41
+ ContextConfigs contextConfigs;
42
+ std::vector<GraphConfigs> graphConfigs;
43
+ ConfigOptions() : backendExtensionsConfigs(), contextConfigs(), graphConfigs() {}
44
+ };
Genie/Genie/src/qualla/engines/qnn-api/QnnTypeDef.hpp ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
//==============================================================================
//
//  Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
//  All Rights Reserved.
//  Confidential and Proprietary - Qualcomm Technologies, Inc.
//
//==============================================================================

#ifndef QNN_TYPE_DEF_H_
#define QNN_TYPE_DEF_H_

#include "QnnInterface.h"
#include "QnnTypes.h"
#include "Log.hpp"
#include "QnnTypeMacros.hpp"

// Error categories shared by the model/graph loading helpers.
typedef enum ModelError {
  MODEL_NO_ERROR               = 0,
  MODEL_TENSOR_ERROR           = 1,
  MODEL_PARAMS_ERROR           = 2,
  MODEL_NODES_ERROR            = 3,
  MODEL_GRAPH_ERROR            = 4,
  MODEL_CONTEXT_ERROR          = 5,
  MODEL_GENERATION_ERROR       = 6,
  MODEL_SETUP_ERROR            = 7,
  MODEL_INVALID_ARGUMENT_ERROR = 8,
  MODEL_FILE_ERROR             = 9,
  MODEL_MEMORY_ALLOCATE_ERROR  = 10,
  // Value selected to ensure 32 bits.
  MODEL_UNKNOWN_ERROR = 0x7FFFFFFF
} ModelError_t;

// A TensorWrapper is currently just a bare Qnn_Tensor_t; these macros keep
// call sites insulated from that representation choice.
using TensorWrapper = Qnn_Tensor_t;
#define GET_TENSOR_WRAPPER_TENSOR(tensorWrapper) tensorWrapper
#define GET_TENSOR_WRAPPER_NAME(tensorWrapper) QNN_TENSOR_GET_NAME(tensorWrapper)

// Per-graph bookkeeping: the graph handle plus deep-copied I/O tensor
// metadata (graphName and the tensor arrays are heap-owned copies made by
// the copyGraphsInfo* helpers and released by freeGraphsInfo/freeGraphInfo).
typedef struct GraphInfo {
  Qnn_GraphHandle_t graph;      // graph handle (populated outside this header)
  char* graphName;              // heap-owned copy of the graph name
  TensorWrapper* inputTensors;  // heap-owned array of input tensor descriptors
  uint32_t numInputTensors;
  TensorWrapper* outputTensors; // heap-owned array of output tensor descriptors
  uint32_t numOutputTensors;
} GraphInfo_t;
typedef GraphInfo_t* GraphInfoPtr_t;

// Optional per-graph configuration overrides, keyed by graph name.
typedef struct GraphConfigInfo {
  char* graphName;
  const QnnGraph_Config_t** graphConfigs;
} GraphConfigInfo_t;

#endif  // QNN_TYPE_DEF_H_
Genie/Genie/src/qualla/engines/qnn-api/QnnTypeMacros.hpp ADDED
@@ -0,0 +1,702 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
//==============================================================================
//
//  Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
//  All Rights Reserved.
//  Confidential and Proprietary - Qualcomm Technologies, Inc.
//
//==============================================================================

#pragma once

#include "QnnTypes.h"

// -----------------------------------------------------------------------------
// Version-dispatching accessors for Qnn_OpConfig_t.
//
// Each accessor comes in a by-reference and a by-pointer overload; the pointer
// overload simply forwards to the reference one. Only QNN_OPCONFIG_VERSION_1
// is handled: getters return NULL/0 for any other version and setters are
// silent no-ops.
// NOTE(review): the pointer overloads dereference without a null check —
// callers must pass non-null pointers.
// -----------------------------------------------------------------------------

// True when the op config uses the (only supported) v1 layout.
#define QNN_OP_CFG_VALID(opConfig) ((opConfig).version == QNN_OPCONFIG_VERSION_1)

// Returns a value-initialized op config stamped with the requested version.
inline Qnn_OpConfig_t createQnnOpConfig(const Qnn_OpConfigVersion_t version) {
    Qnn_OpConfig_t opConfig = QNN_OPCONFIG_INIT;
    opConfig.version = version;
    if (version == QNN_OPCONFIG_VERSION_1) {
        opConfig.v1 = QNN_OPCONFIG_V1_INIT;
    }
    return opConfig;
}

// --- getters ---------------------------------------------------------------

inline const char* getQnnOpConfigName(const Qnn_OpConfig_t& opConfig) {
    if (opConfig.version == QNN_OPCONFIG_VERSION_1) {
        return opConfig.v1.name;
    }
    return NULL;
}

inline const char* getQnnOpConfigName(const Qnn_OpConfig_t* const opConfig) {
    return getQnnOpConfigName(*opConfig);
}

inline const char* getQnnOpConfigPackageName(const Qnn_OpConfig_t& opConfig) {
    if (opConfig.version == QNN_OPCONFIG_VERSION_1) {
        return opConfig.v1.packageName;
    }
    return NULL;
}

inline const char* getQnnOpConfigPackageName(const Qnn_OpConfig_t* const opConfig) {
    return getQnnOpConfigPackageName(*opConfig);
}

inline const char* getQnnOpConfigTypeName(const Qnn_OpConfig_t& opConfig) {
    if (opConfig.version == QNN_OPCONFIG_VERSION_1) {
        return opConfig.v1.typeName;
    }
    return NULL;
}

inline const char* getQnnOpConfigTypeName(const Qnn_OpConfig_t* const opConfig) {
    return getQnnOpConfigTypeName(*opConfig);
}

inline uint32_t getQnnOpConfigNumParams(const Qnn_OpConfig_t& opConfig) {
    if (opConfig.version == QNN_OPCONFIG_VERSION_1) {
        return opConfig.v1.numOfParams;
    }
    return 0u;
}

inline uint32_t getQnnOpConfigNumParams(const Qnn_OpConfig_t* const opConfig) {
    return getQnnOpConfigNumParams(*opConfig);
}

inline const Qnn_Param_t* getQnnOpConfigParams(const Qnn_OpConfig_t& opConfig) {
    if (opConfig.version == QNN_OPCONFIG_VERSION_1) {
        return opConfig.v1.params;
    }
    return NULL;
}

inline const Qnn_Param_t* getQnnOpConfigParams(const Qnn_OpConfig_t* const opConfig) {
    return getQnnOpConfigParams(*opConfig);
}

inline uint32_t getQnnOpConfigNumInputs(const Qnn_OpConfig_t& opConfig) {
    if (opConfig.version == QNN_OPCONFIG_VERSION_1) {
        return opConfig.v1.numOfInputs;
    }
    return 0u;
}

inline uint32_t getQnnOpConfigNumInputs(const Qnn_OpConfig_t* const opConfig) {
    return getQnnOpConfigNumInputs(*opConfig);
}

inline const Qnn_Tensor_t* getQnnOpConfigInputs(const Qnn_OpConfig_t& opConfig) {
    if (opConfig.version == QNN_OPCONFIG_VERSION_1) {
        return opConfig.v1.inputTensors;
    }
    return NULL;
}

inline const Qnn_Tensor_t* getQnnOpConfigInputs(const Qnn_OpConfig_t* const opConfig) {
    return getQnnOpConfigInputs(*opConfig);
}

inline uint32_t getQnnOpConfigNumOutputs(const Qnn_OpConfig_t& opConfig) {
    if (opConfig.version == QNN_OPCONFIG_VERSION_1) {
        return opConfig.v1.numOfOutputs;
    }
    return 0u;
}

inline uint32_t getQnnOpConfigNumOutputs(const Qnn_OpConfig_t* const opConfig) {
    return getQnnOpConfigNumOutputs(*opConfig);
}

inline const Qnn_Tensor_t* getQnnOpConfigOutputs(const Qnn_OpConfig_t& opConfig) {
    if (opConfig.version == QNN_OPCONFIG_VERSION_1) {
        return opConfig.v1.outputTensors;
    }
    return NULL;
}

inline const Qnn_Tensor_t* getQnnOpConfigOutputs(const Qnn_OpConfig_t* const opConfig) {
    return getQnnOpConfigOutputs(*opConfig);
}

// --- setters (no-ops for unsupported versions) -----------------------------

inline void setQnnOpConfigName(Qnn_OpConfig_t& opConfig, const char* const name) {
    if (opConfig.version == QNN_OPCONFIG_VERSION_1) {
        opConfig.v1.name = name;
    }
}

inline void setQnnOpConfigName(Qnn_OpConfig_t* const opConfig, const char* const name) {
    setQnnOpConfigName(*opConfig, name);
}

inline void setQnnOpConfigPackageName(Qnn_OpConfig_t& opConfig, const char* const packageName) {
    if (opConfig.version == QNN_OPCONFIG_VERSION_1) {
        opConfig.v1.packageName = packageName;
    }
}

inline void setQnnOpConfigPackageName(
    Qnn_OpConfig_t* const opConfig,
    const char* const packageName
) {
    setQnnOpConfigPackageName(*opConfig, packageName);
}

inline void setQnnOpConfigTypeName(Qnn_OpConfig_t& opConfig, const char* const typeName) {
    if (opConfig.version == QNN_OPCONFIG_VERSION_1) {
        opConfig.v1.typeName = typeName;
    }
}

inline void setQnnOpConfigTypeName(Qnn_OpConfig_t* const opConfig, const char* const typeName) {
    setQnnOpConfigTypeName(*opConfig, typeName);
}

// Sets the parameter count and array together so they stay consistent.
inline void setQnnOpConfigParams(
    Qnn_OpConfig_t& opConfig,
    uint32_t const numOfParams,
    Qnn_Param_t* const params
) {
    if (opConfig.version == QNN_OPCONFIG_VERSION_1) {
        opConfig.v1.numOfParams = numOfParams;
        opConfig.v1.params      = params;
    }
}

inline void setQnnOpConfigParams(
    Qnn_OpConfig_t* const opConfig,
    uint32_t const numOfParams,
    Qnn_Param_t* const params
) {
    setQnnOpConfigParams(*opConfig, numOfParams, params);
}

// Sets the input count and tensor array together so they stay consistent.
inline void setQnnOpConfigInputs(
    Qnn_OpConfig_t& opConfig,
    uint32_t const numOfInputs,
    Qnn_Tensor_t* const inputTensors
) {
    if (opConfig.version == QNN_OPCONFIG_VERSION_1) {
        opConfig.v1.numOfInputs  = numOfInputs;
        opConfig.v1.inputTensors = inputTensors;
    }
}

inline void setQnnOpConfigInputs(
    Qnn_OpConfig_t* const opConfig,
    uint32_t const numOfInputs,
    Qnn_Tensor_t* const inputTensors
) {
    setQnnOpConfigInputs(*opConfig, numOfInputs, inputTensors);
}

// Sets the output count and tensor array together so they stay consistent.
inline void setQnnOpConfigOutputs(
    Qnn_OpConfig_t& opConfig,
    uint32_t const numOfOutputs,
    Qnn_Tensor_t* const outputTensors
) {
    if (opConfig.version == QNN_OPCONFIG_VERSION_1) {
        opConfig.v1.numOfOutputs  = numOfOutputs;
        opConfig.v1.outputTensors = outputTensors;
    }
}

inline void setQnnOpConfigOutputs(
    Qnn_OpConfig_t* const opConfig,
    uint32_t const numOfOutputs,
    Qnn_Tensor_t* const outputTensors
) {
    setQnnOpConfigOutputs(*opConfig, numOfOutputs, outputTensors);
}
212
+
213
// -----------------------------------------------------------------------------
// Version-dispatching accessors for Qnn_Tensor_t (v1 and v2 layouts).
//
// Fields that are layout-compatible between v1 and v2 are read through the v1
// member without a version check ("TensorCompatTest" below refers to that
// justification). v2-only fields (dynamic dimensions, sparse params) are
// version-checked explicitly.
// NOTE(review): some pointer overloads null-check (QuantParams, Rank) while
// others dereference unconditionally (Id, Name, Dimensions, MemType, ...) —
// callers of the unchecked ones must pass non-null pointers.
// -----------------------------------------------------------------------------

// Returns a value-initialized tensor stamped with the requested version.
inline Qnn_Tensor_t createQnnTensor(const Qnn_TensorVersion_t version) {
    Qnn_Tensor_t tensor = QNN_TENSOR_INIT;
    tensor.version = version;
    if (version == QNN_TENSOR_VERSION_1) {
        tensor.v1 = QNN_TENSOR_V1_INIT;
    } else if (version == QNN_TENSOR_VERSION_2) {
        tensor.v2 = QNN_TENSOR_V2_INIT;
    }
    return tensor;
}

inline uint32_t getQnnTensorId(const Qnn_Tensor_t& tensor) {
    // TensorCompatTest justifies no need to check version
    return tensor.v1.id;
}

inline uint32_t getQnnTensorId(const Qnn_Tensor_t* const tensor) {
    return getQnnTensorId(*tensor);
}

inline const char* getQnnTensorName(const Qnn_Tensor_t& tensor) {
    // TensorCompatTest justifies no need to check version
    return tensor.v1.name;
}

inline const char* getQnnTensorName(const Qnn_Tensor_t* const tensor) {
    return getQnnTensorName(*tensor);
}

inline Qnn_TensorType_t getQnnTensorType(const Qnn_Tensor_t& tensor) {
    // TensorCompatTest justifies no need to check version
    return tensor.v1.type;
}

inline Qnn_TensorType_t getQnnTensorType(const Qnn_Tensor_t* const tensor) {
    return getQnnTensorType(*tensor);
}

inline Qnn_TensorDataFormat_t getQnnTensorDataFormat(const Qnn_Tensor_t& tensor) {
    // TensorCompatTest justifies no need to check version
    return tensor.v1.dataFormat;
}

inline Qnn_TensorDataFormat_t getQnnTensorDataFormat(const Qnn_Tensor_t* const tensor) {
    return getQnnTensorDataFormat(*tensor);
}

inline Qnn_DataType_t getQnnTensorDataType(const Qnn_Tensor_t& tensor) {
    // TensorCompatTest justifies no need to check version
    return tensor.v1.dataType;
}

inline Qnn_DataType_t getQnnTensorDataType(const Qnn_Tensor_t* const tensor) {
    return getQnnTensorDataType(*tensor);
}

inline Qnn_QuantizeParams_t getQnnTensorQuantParams(const Qnn_Tensor_t& tensor) {
    // TensorCompatTest justifies no need to check version
    return tensor.v1.quantizeParams;
}

// Null-tolerant: returns default-initialized params for a null tensor.
inline Qnn_QuantizeParams_t getQnnTensorQuantParams(const Qnn_Tensor_t* const tensor) {
    if (tensor != nullptr) {
        return getQnnTensorQuantParams(*tensor);
    }
    return QNN_QUANTIZE_PARAMS_INIT;
}

inline uint32_t getQnnTensorRank(const Qnn_Tensor_t& tensor) {
    // TensorCompatTest justifies no need to check version
    return tensor.v1.rank;
}

// Null-tolerant: returns rank 0 for a null tensor.
inline uint32_t getQnnTensorRank(const Qnn_Tensor_t* const tensor) {
    if (tensor != nullptr) {
        return getQnnTensorRank(*tensor);
    }
    return 0u;
}

inline uint32_t* getQnnTensorDimensions(const Qnn_Tensor_t& tensor) {
    // TensorCompatTest justifies no need to check version
    return tensor.v1.dimensions;
}

inline uint32_t* getQnnTensorDimensions(const Qnn_Tensor_t* const tensor) {
    return getQnnTensorDimensions(*tensor);
}

// v2-only field: NULL for v1 (and any unknown) tensor versions.
inline uint8_t* getQnnTensorIsDynamicDimensions(const Qnn_Tensor_t& tensor) {
    if (tensor.version == QNN_TENSOR_VERSION_1) {
        return NULL;
    } else if (tensor.version == QNN_TENSOR_VERSION_2) {
        return tensor.v2.isDynamicDimensions;
    }
    return NULL;
}

inline uint8_t* getQnnTensorIsDynamicDimensions(const Qnn_Tensor_t* tensor) {
    return getQnnTensorIsDynamicDimensions(*tensor);
}

// v2-only field: default-initialized params for v1 (and unknown) versions.
inline Qnn_SparseParams_t getQnnTensorSparseParams(const Qnn_Tensor_t& tensor) {
    if (tensor.version == QNN_TENSOR_VERSION_1) {
        return QNN_SPARSE_PARAMS_INIT;
    } else if (tensor.version == QNN_TENSOR_VERSION_2) {
        return tensor.v2.sparseParams;
    }
    return QNN_SPARSE_PARAMS_INIT;
}

inline Qnn_SparseParams_t getQnnTensorSparseParams(const Qnn_Tensor_t* tensor) {
    return getQnnTensorSparseParams(*tensor);
}

inline Qnn_TensorMemType_t getQnnTensorMemType(const Qnn_Tensor_t& tensor) {
    // TensorCompatTest justifies no need to check version
    return tensor.v1.memType;
}

inline Qnn_TensorMemType_t getQnnTensorMemType(const Qnn_Tensor_t* const tensor) {
    return getQnnTensorMemType(*tensor);
}
336
+
337
+ inline Qnn_ClientBuffer_t getQnnTensorClientBuf(const Qnn_Tensor_t& tensor) {
338
+ // TensorCompatTest justifies no need to check version
339
+ return tensor.v1.clientBuf;
340
+ }
341
+
342
+ inline Qnn_ClientBuffer_t getQnnTensorClientBuf(const Qnn_Tensor_t* const tensor) {
343
+ return getQnnTensorClientBuf(*tensor);
344
+ }
345
+
346
+ inline Qnn_MemHandle_t getQnnTensorMemHandle(const Qnn_Tensor_t& tensor) {
347
+ // TensorCompatTest justifies no need to check version
348
+ return tensor.v1.memHandle;
349
+ }
350
+
351
+ inline Qnn_MemHandle_t getQnnTensorMemHandle(const Qnn_Tensor_t* const tensor) {
352
+ return getQnnTensorMemHandle(*tensor);
353
+ }
354
+
355
+ inline void setQnnTensorId(Qnn_Tensor_t& tensor, const uint32_t id) {
356
+ // TensorCompatTest justifies no need to check version
357
+ tensor.v1.id = id;
358
+ }
359
+
360
+ inline void setQnnTensorId(Qnn_Tensor_t* const tensor, const uint32_t id) {
361
+ setQnnTensorId(*tensor, id);
362
+ }
363
+
364
+ inline void setQnnTensorName(Qnn_Tensor_t& tensor, const char* const name) {
365
+ // TensorCompatTest justifies no need to check version
366
+ tensor.v1.name = name;
367
+ }
368
+
369
+ inline void setQnnTensorName(Qnn_Tensor_t* const tensor, const char* const name) {
370
+ setQnnTensorName(*tensor, name);
371
+ }
372
+
373
+ inline void setQnnTensorType(Qnn_Tensor_t& tensor, const Qnn_TensorType_t type) {
374
+ // TensorCompatTest justifies no need to check version
375
+ tensor.v1.type = type;
376
+ }
377
+
378
+ inline void setQnnTensorType(Qnn_Tensor_t* const tensor, const Qnn_TensorType_t type) {
379
+ setQnnTensorType(*tensor, type);
380
+ }
381
+
382
+ inline void setQnnTensorDataFormat(Qnn_Tensor_t& tensor, const Qnn_TensorDataFormat_t dataFormat) {
383
+ // TensorCompatTest justifies no need to check version
384
+ tensor.v1.dataFormat = dataFormat;
385
+ }
386
+
387
+ inline void setQnnTensorDataFormat(
388
+ Qnn_Tensor_t* const tensor,
389
+ const Qnn_TensorDataFormat_t format
390
+ ) {
391
+ setQnnTensorDataFormat(*tensor, format);
392
+ }
393
+
394
+ inline void setQnnTensorDataType(Qnn_Tensor_t& tensor, const Qnn_DataType_t dataType) {
395
+ // TensorCompatTest justifies no need to check version
396
+ tensor.v1.dataType = dataType;
397
+ }
398
+
399
+ inline void setQnnTensorDataType(Qnn_Tensor_t* const tensor, const Qnn_DataType_t dataType) {
400
+ setQnnTensorDataType(*tensor, dataType);
401
+ }
402
+
403
+ inline void setQnnTensorQuantParams(
404
+ Qnn_Tensor_t& tensor,
405
+ const Qnn_QuantizeParams_t quantizeParams
406
+ ) {
407
+ // TensorCompatTest justifies no need to check version
408
+ tensor.v1.quantizeParams = quantizeParams;
409
+ }
410
+
411
+ inline void setQnnTensorQuantParams(Qnn_Tensor_t* const tensor, const Qnn_QuantizeParams_t params) {
412
+ setQnnTensorQuantParams(*tensor, params);
413
+ }
414
+
415
+ inline void setQnnTensorRank(Qnn_Tensor_t& tensor, const uint32_t rank) {
416
+ // TensorCompatTest justifies no need to check version
417
+ tensor.v1.rank = rank;
418
+ }
419
+
420
+ inline void setQnnTensorRank(Qnn_Tensor_t* const tensor, const uint32_t rank) {
421
+ setQnnTensorRank(*tensor, rank);
422
+ }
423
+
424
+ inline void setQnnTensorDimensions(Qnn_Tensor_t& tensor, uint32_t* const dimensions) {
425
+ // TensorCompatTest justifies no need to check version
426
+ tensor.v1.dimensions = dimensions;
427
+ }
428
+
429
+ inline void setQnnTensorDimensions(Qnn_Tensor_t* const tensor, uint32_t* const dimensions) {
430
+ setQnnTensorDimensions(*tensor, dimensions);
431
+ }
432
+
433
+ inline void setQnnTensorIsDynamicDimensions(
434
+ Qnn_Tensor_t& tensor,
435
+ uint8_t* const isDynamicDimensions
436
+ ) {
437
+ if (tensor.version == QNN_TENSOR_VERSION_2) {
438
+ tensor.v2.isDynamicDimensions = isDynamicDimensions;
439
+ }
440
+ }
441
+
442
+ inline void setQnnTensorIsDynamicDimensions(
443
+ Qnn_Tensor_t* tensor,
444
+ uint8_t* const isDynamicDimensions
445
+ ) {
446
+ setQnnTensorIsDynamicDimensions(*tensor, isDynamicDimensions);
447
+ }
448
+
449
+ inline void setQnnTensorSparseParams(Qnn_Tensor_t& tensor, const Qnn_SparseParams_t sparseParams) {
450
+ if (tensor.version == QNN_TENSOR_VERSION_2) {
451
+ tensor.v2.sparseParams = sparseParams;
452
+ }
453
+ }
454
+
455
+ inline void setQnnTensorSparseParams(Qnn_Tensor_t* tensor, Qnn_SparseParams_t sparseParams) {
456
+ setQnnTensorSparseParams(*tensor, sparseParams);
457
+ }
458
+
459
+ inline void setQnnTensorMemType(Qnn_Tensor_t& tensor, const Qnn_TensorMemType_t memType) {
460
+ // TensorCompatTest justifies no need to check version
461
+ tensor.v1.memType = memType;
462
+ }
463
+
464
+ inline void setQnnTensorMemType(Qnn_Tensor_t* const tensor, const Qnn_TensorMemType_t memType) {
465
+ setQnnTensorMemType(*tensor, memType);
466
+ }
467
+
468
+ inline void setQnnTensorClientBuf(Qnn_Tensor_t& tensor, const Qnn_ClientBuffer_t clientBuf) {
469
+ // TensorCompatTest justifies no need to check version
470
+ tensor.v1.clientBuf = clientBuf;
471
+ }
472
+
473
+ inline void setQnnTensorClientBuf(Qnn_Tensor_t* const tensor, const Qnn_ClientBuffer_t clientBuf) {
474
+ setQnnTensorClientBuf(*tensor, clientBuf);
475
+ }
476
+
477
+ inline void setQnnTensorMemHandle(Qnn_Tensor_t& tensor, const Qnn_MemHandle_t memHandle) {
478
+ // TensorCompatTest justifies no need to check version
479
+ tensor.v1.memHandle = memHandle;
480
+ }
481
+
482
+ inline void setQnnTensorMemHandle(Qnn_Tensor_t* const tensor, const Qnn_MemHandle_t handle) {
483
+ setQnnTensorMemHandle(*tensor, handle);
484
+ }
485
+
486
+ inline Qnn_TensorSet_t createQnnTensorSet(const Qnn_TensorSetVersion_t version) {
487
+ Qnn_TensorSet_t tensorSet = QNN_TENSOR_SET_INIT;
488
+ tensorSet.version = version;
489
+ if (version == QNN_TENSOR_SET_VERSION_1) {
490
+ tensorSet.v1 = QNN_TENSOR_SET_V1_INIT;
491
+ }
492
+ return tensorSet;
493
+ }
494
+
495
+ inline uint32_t getQnnTensorSetNumInputs(const Qnn_TensorSet_t& tensorSet) {
496
+ if (tensorSet.version == QNN_TENSOR_SET_VERSION_1) {
497
+ return tensorSet.v1.numInputs;
498
+ }
499
+ return 0;
500
+ }
501
+
502
+ inline uint32_t getQnnTensorSetNumInputs(const Qnn_TensorSet_t* tensorSet) {
503
+ return getQnnTensorSetNumInputs(*tensorSet);
504
+ }
505
+
506
+ inline Qnn_Tensor_t* getQnnTensorSetInputTensors(const Qnn_TensorSet_t& tensorSet) {
507
+ if (tensorSet.version == QNN_TENSOR_SET_VERSION_1) {
508
+ return tensorSet.v1.inputs;
509
+ }
510
+ return 0;
511
+ }
512
+
513
+ inline Qnn_Tensor_t* getQnnTensorSetInputTensors(const Qnn_TensorSet_t* tensorSet) {
514
+ return getQnnTensorSetInputTensors(*tensorSet);
515
+ }
516
+
517
+ inline uint32_t getQnnTensorSetNumOutputs(const Qnn_TensorSet_t& tensorSet) {
518
+ if (tensorSet.version == QNN_TENSOR_SET_VERSION_1) {
519
+ return tensorSet.v1.numOutputs;
520
+ }
521
+ return 0;
522
+ }
523
+
524
+ inline uint32_t getQnnTensorSetNumOutputs(const Qnn_TensorSet_t* tensorSet) {
525
+ return getQnnTensorSetNumOutputs(*tensorSet);
526
+ }
527
+
528
+ inline Qnn_Tensor_t* getQnnTensorSetOutputTensors(const Qnn_TensorSet_t& tensorSet) {
529
+ if (tensorSet.version == QNN_TENSOR_SET_VERSION_1) {
530
+ return tensorSet.v1.outputs;
531
+ }
532
+ return 0;
533
+ }
534
+
535
+ inline Qnn_Tensor_t* getQnnTensorSetOutputTensors(const Qnn_TensorSet_t* tensorSet) {
536
+ return getQnnTensorSetOutputTensors(*tensorSet);
537
+ }
538
+
539
+ inline void setQnnTensorSetInputTensors(
540
+ Qnn_TensorSet_t& tensorSet,
541
+ Qnn_Tensor_t* inputTensors,
542
+ uint32_t const numInputs
543
+ ) {
544
+ if (tensorSet.version == QNN_TENSOR_SET_VERSION_1) {
545
+ tensorSet.v1.inputs = inputTensors;
546
+ tensorSet.v1.numInputs = numInputs;
547
+ }
548
+ }
549
+
550
+ inline void setQnnTensorSetInputTensors(
551
+ Qnn_TensorSet_t* tensorSet,
552
+ Qnn_Tensor_t* inputTensors,
553
+ uint32_t const numInputs
554
+ ) {
555
+ setQnnTensorSetInputTensors(*tensorSet, inputTensors, numInputs);
556
+ }
557
+
558
+ inline void setQnnTensorSetOutputTensors(
559
+ Qnn_TensorSet_t& tensorSet,
560
+ Qnn_Tensor_t* outputTensors,
561
+ const uint32_t numOutputs
562
+ ) {
563
+ if (tensorSet.version == QNN_TENSOR_SET_VERSION_1) {
564
+ tensorSet.v1.outputs = outputTensors;
565
+ tensorSet.v1.numOutputs = numOutputs;
566
+ }
567
+ }
568
+
569
+ inline void setQnnTensorSetOutputTensors(
570
+ Qnn_TensorSet_t* tensorSet,
571
+ Qnn_Tensor_t* outputTensors,
572
+ const uint32_t numOutputs
573
+ ) {
574
+ setQnnTensorSetOutputTensors(*tensorSet, outputTensors, numOutputs);
575
+ }
// ---------------------------------------------------------------------------
// Convenience macros wrapping the overloaded helper functions above, so call
// sites read uniformly whether they hold a struct, a reference, or a pointer.
// ---------------------------------------------------------------------------

// --- QNN Op Config: creator ---
#define QNN_OP_CFG_CREATE(version) createQnnOpConfig(version)

// --- QNN Op Config: accessors ---
#define QNN_OP_CFG_GET_NAME(opConfig) getQnnOpConfigName(opConfig)
#define QNN_OP_CFG_GET_PACKAGE_NAME(opConfig) getQnnOpConfigPackageName(opConfig)
#define QNN_OP_CFG_GET_TYPE_NAME(opConfig) getQnnOpConfigTypeName(opConfig)
#define QNN_OP_CFG_GET_NUM_PARAMS(opConfig) getQnnOpConfigNumParams(opConfig)
#define QNN_OP_CFG_GET_PARAMS(opConfig) getQnnOpConfigParams(opConfig)
#define QNN_OP_CFG_GET_NUM_INPUTS(opConfig) getQnnOpConfigNumInputs(opConfig)
#define QNN_OP_CFG_GET_INPUTS(opConfig) getQnnOpConfigInputs(opConfig)
#define QNN_OP_CFG_GET_NUM_OUTPUTS(opConfig) getQnnOpConfigNumOutputs(opConfig)
#define QNN_OP_CFG_GET_OUTPUTS(opConfig) getQnnOpConfigOutputs(opConfig)

// --- QNN Op Config: modifiers ---
#define QNN_OP_CFG_SET_NAME(opConfig, value) setQnnOpConfigName(opConfig, value)
#define QNN_OP_CFG_SET_PACKAGE_NAME(opConfig, value) setQnnOpConfigPackageName(opConfig, value)
#define QNN_OP_CFG_SET_TYPE_NAME(opConfig, value) setQnnOpConfigTypeName(opConfig, value)
#define QNN_OP_CFG_SET_PARAMS(opConfig, numOfParams, params) \
    setQnnOpConfigParams(opConfig, numOfParams, params)
#define QNN_OP_CFG_SET_INPUTS(opConfig, numOfInputs, inputTensors) \
    setQnnOpConfigInputs(opConfig, numOfInputs, inputTensors)
#define QNN_OP_CFG_SET_OUTPUTS(opConfig, numOfOutputs, outputTensors) \
    setQnnOpConfigOutputs(opConfig, numOfOutputs, outputTensors)

// --- QNN Tensor: creator ---
#define QNN_TENSOR_CREATE(version) createQnnTensor(version)

// --- QNN Tensor: accessors ---
#define QNN_TENSOR_GET_ID(tensor) getQnnTensorId(tensor)
#define QNN_TENSOR_GET_NAME(tensor) getQnnTensorName(tensor)
#define QNN_TENSOR_GET_TYPE(tensor) getQnnTensorType(tensor)
#define QNN_TENSOR_GET_DATA_FORMAT(tensor) getQnnTensorDataFormat(tensor)
#define QNN_TENSOR_GET_DATA_TYPE(tensor) getQnnTensorDataType(tensor)
#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) getQnnTensorQuantParams(tensor)
#define QNN_TENSOR_GET_RANK(tensor) getQnnTensorRank(tensor)
#define QNN_TENSOR_GET_DIMENSIONS(tensor) getQnnTensorDimensions(tensor)
#define QNN_TENSOR_GET_IS_DYNAMIC_DIMENSIONS(tensor) getQnnTensorIsDynamicDimensions(tensor)
#define QNN_TENSOR_GET_SPARSE_PARAMS(tensor) getQnnTensorSparseParams(tensor)
#define QNN_TENSOR_GET_MEM_TYPE(tensor) getQnnTensorMemType(tensor)
#define QNN_TENSOR_GET_CLIENT_BUF(tensor) getQnnTensorClientBuf(tensor)
#define QNN_TENSOR_GET_MEM_HANDLE(tensor) getQnnTensorMemHandle(tensor)

// --- QNN Tensor: modifiers ---
#define QNN_TENSOR_SET_ID(tensor, value) setQnnTensorId(tensor, value)
#define QNN_TENSOR_SET_NAME(tensor, value) setQnnTensorName(tensor, value)
#define QNN_TENSOR_SET_TYPE(tensor, value) setQnnTensorType(tensor, value)
#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) setQnnTensorDataFormat(tensor, value)
#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) setQnnTensorDataType(tensor, value)
#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) setQnnTensorQuantParams(tensor, value)
#define QNN_TENSOR_SET_RANK(tensor, value) setQnnTensorRank(tensor, value)
#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) setQnnTensorDimensions(tensor, value)
#define QNN_TENSOR_SET_IS_DYNAMIC_DIMENSIONS(tensor, value) \
    setQnnTensorIsDynamicDimensions(tensor, value)
#define QNN_TENSOR_SET_SPARSE_PARAMS(tensor, value) setQnnTensorSparseParams(tensor, value)
#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) setQnnTensorMemType(tensor, value)
#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) setQnnTensorClientBuf(tensor, value)
#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) setQnnTensorMemHandle(tensor, value)

// --- QNN Tensor Set: creator ---
#define QNN_TENSORSET_CREATE(version) createQnnTensorSet(version)

// --- QNN Tensor Set: accessors ---
#define QNN_TENSORSET_GET_NUM_INPUTS(tensorSet) getQnnTensorSetNumInputs(tensorSet)
#define QNN_TENSORSET_GET_INPUT_TENSORS(tensorSet) getQnnTensorSetInputTensors(tensorSet)
#define QNN_TENSORSET_GET_NUM_OUTPUTS(tensorSet) getQnnTensorSetNumOutputs(tensorSet)
#define QNN_TENSORSET_GET_OUTPUT_TENSORS(tensorSet) getQnnTensorSetOutputTensors(tensorSet)

// --- QNN Tensor Set: modifiers ---
#define QNN_TENSORSET_SET_INPUT_TENSORS(tensorSet, inputTensors, numInputs) \
    setQnnTensorSetInputTensors(tensorSet, inputTensors, numInputs)
#define QNN_TENSORSET_SET_OUTPUT_TENSORS(tensorSet, outputTensors, numOutputs) \
    setQnnTensorSetOutputTensors(tensorSet, outputTensors, numOutputs)
650
+
651
+ inline bool isQnnTensorV1Compatible(const Qnn_Tensor_t& tensor) {
652
+ if (tensor.version == QNN_TENSOR_VERSION_2) {
653
+ if (tensor.v2.isDynamicDimensions != NULL) {
654
+ return false;
655
+ }
656
+
657
+ if (tensor.v2.dataFormat == QNN_TENSOR_DATA_FORMAT_SPARSE) {
658
+ return false;
659
+ }
660
+ }
661
+
662
+ return true;
663
+ }
664
+
665
+ inline bool isQnnTensorV1Compatible(const Qnn_Tensor_t* const tensor) {
666
+ return isQnnTensorV1Compatible(*tensor);
667
+ }
668
+
669
+ inline bool isQnnTensorV1Compatible(const Qnn_OpConfig_t& opConfig) {
670
+ if ((QNN_OP_CFG_GET_INPUTS(opConfig) != NULL) && (QNN_OP_CFG_GET_NUM_INPUTS(opConfig) > 0u)) {
671
+ for (uint32_t tensorIdx = 0u; tensorIdx < QNN_OP_CFG_GET_NUM_INPUTS(opConfig);
672
+ tensorIdx++) {
673
+ if (!isQnnTensorV1Compatible(QNN_OP_CFG_GET_INPUTS(opConfig)[tensorIdx])) {
674
+ return false;
675
+ }
676
+ }
677
+ }
678
+ if ((QNN_OP_CFG_GET_OUTPUTS(opConfig) != NULL) && (QNN_OP_CFG_GET_NUM_OUTPUTS(opConfig) > 0u)) {
679
+ for (uint32_t tensorIdx = 0u; tensorIdx < QNN_OP_CFG_GET_NUM_OUTPUTS(opConfig);
680
+ tensorIdx++) {
681
+ if (!isQnnTensorV1Compatible(QNN_OP_CFG_GET_OUTPUTS(opConfig)[tensorIdx])) {
682
+ return false;
683
+ }
684
+ }
685
+ }
686
+ if ((QNN_OP_CFG_GET_PARAMS(opConfig) != NULL) && (QNN_OP_CFG_GET_NUM_PARAMS(opConfig) > 0)) {
687
+ for (uint32_t paramIdx = 0u; paramIdx < QNN_OP_CFG_GET_NUM_PARAMS(opConfig); paramIdx++) {
688
+ const Qnn_Param_t& param = QNN_OP_CFG_GET_PARAMS(opConfig)[paramIdx];
689
+ if (QNN_PARAMTYPE_TENSOR == param.paramType) {
690
+ if (!isQnnTensorV1Compatible(param.tensorParam)) {
691
+ return false;
692
+ }
693
+ }
694
+ }
695
+ }
696
+
697
+ return true;
698
+ }
699
+
700
+ inline bool isQnnTensorV1Compatible(const Qnn_OpConfig_t* const opConfig) {
701
+ return isQnnTensorV1Compatible(*opConfig);
702
+ }
Genie/Genie/src/qualla/engines/qnn-api/RpcMem.cpp ADDED
@@ -0,0 +1,481 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #include "QnnMem.h"
10
+ #include "QnnHtpMem.h"
11
+ #include "RpcMem.hpp"
12
+ #include "QnnTypeMacros.hpp"
13
+ #include "dlwrap.hpp"
14
+
15
+ #define RPCMEM_HEAP_ID_SYSTEM 25
16
+ #define RPCMEM_DEFAULT_FLAGS 1
17
+
18
+ #if 1
19
+ #define TRACE_MEMORY_ALLOC QNN_DEBUG
20
+ #else
21
+ #define TRACE_MEMORY_ALLOC(fmt, ...)
22
+ #endif
23
+
24
+ RpcMem::RpcMem(Qnn_ContextHandle_t contextHandle, QNN_INTERFACE_VER_TYPE* qnnInterface)
25
+ : m_libCdspRpc(nullptr), m_rpcMemAlloc(nullptr), m_rpcMemFree(nullptr), m_rpcMemToFd(nullptr),
26
+ m_qnnInterface(qnnInterface), m_contextHandle(contextHandle) {
27
+ (void)m_contextHandle;
28
+ }
29
+
30
+ bool RpcMem::initialize() {
31
+ // On Android, 32-bit and 64-bit libcdsprpc.so can be found at /vendor/lib and /vendor/lib64 respectively.
32
+ // On Windows, it's installed into something like this
33
+ // c:\Windows\System32\DriverStore\FileRepository\qcnspmcdm8380.inf_arm64_30b9cc995571de6a\libcdsprpc.dll
34
+ #ifdef _WIN32
35
+ const char* dsprpc_so = "libcdsprpc.dll";
36
+ #else
37
+ const char* dsprpc_so = "libcdsprpc.so";
38
+ #endif
39
+
40
+ m_libCdspRpc = dlopen(dsprpc_so, RTLD_NOW | RTLD_LOCAL);
41
+ if (nullptr == m_libCdspRpc) {
42
+ QNN_ERROR("Unable to load backend. dlerror(): %s", dlerror());
43
+ return false;
44
+ }
45
+ m_rpcMemAlloc = (RpcMemAllocFn_t)dlsym(m_libCdspRpc, "rpcmem_alloc");
46
+ m_rpcMemFree = (RpcMemFreeFn_t)dlsym(m_libCdspRpc, "rpcmem_free");
47
+ m_rpcMemToFd = (RpcMemToFdFn_t)dlsym(m_libCdspRpc, "rpcmem_to_fd");
48
+ if (nullptr == m_rpcMemAlloc || nullptr == m_rpcMemFree || nullptr == m_rpcMemToFd) {
49
+ QNN_ERROR("Unable to access symbols in libcdsprpc. dlerror(): %s", dlerror());
50
+ return false;
51
+ }
52
+
53
+ return true;
54
+ }
55
+
56
+ RpcMem::~RpcMem() {
57
+ if (m_libCdspRpc) {
58
+ QNN_DEBUG("Closing libcdsprpc.so handle");
59
+ dlclose(m_libCdspRpc);
60
+ }
61
+ }
62
+
63
+ RpcMemTensorData* RpcMem::getRpcMemTensorData(Qnn_Tensor_t* tensor) {
64
+ if (tensor == nullptr) return nullptr;
65
+ Qnn_MemHandle_t mem_handle = QNN_TENSOR_GET_MEM_HANDLE(tensor);
66
+ if (mem_handle == nullptr) return nullptr;
67
+ return &m_memHandleToRpcMem.at(mem_handle);
68
+ }
69
+
70
+ void* RpcMem::getBuffer(Qnn_Tensor_t* tensor) {
71
+ RpcMemTensorData* data = getRpcMemTensorData(tensor);
72
+ if (data == nullptr) {
73
+ QNN_ERROR("getBuffer : Couldn't find tensor %p", tensor);
74
+ return nullptr;
75
+ }
76
+ return data->memPointer;
77
+ }
78
+
79
+ int RpcMem::getFd(Qnn_Tensor_t* tensor) {
80
+ RpcMemTensorData* data = getRpcMemTensorData(tensor);
81
+ if (data == nullptr) {
82
+ QNN_ERROR("getFd : Couldn't find tensor %p", tensor);
83
+ return -1;
84
+ }
85
+ return data->fd;
86
+ }
87
+
88
+ size_t RpcMem::getOffset(Qnn_Tensor_t* tensor) {
89
+ RpcMemTensorData* data = getRpcMemTensorData(tensor);
90
+ if (data == nullptr) {
91
+ QNN_ERROR("getOffset : Couldn't find tensor %p", tensor);
92
+ return 0;
93
+ }
94
+ return data->offset;
95
+ }
96
+
97
+ size_t RpcMem::getBufferSize(Qnn_Tensor_t* tensor) {
98
+ RpcMemTensorData* data = getRpcMemTensorData(tensor);
99
+ if (data == nullptr) {
100
+ QNN_ERROR("getBufferSize : Couldn't find tensor %p", tensor);
101
+ return 0;
102
+ }
103
+ return data->size;
104
+ };
105
+
106
+ size_t RpcMem::getTotalBufferSize(Qnn_Tensor_t* tensor) {
107
+ RpcMemTensorData* data = getRpcMemTensorData(tensor);
108
+ if (data == nullptr) {
109
+ QNN_ERROR("getTotalBufferSize : Couldn't find tensor %p", tensor);
110
+ return 0;
111
+ }
112
+ return data->totalBufferSize;
113
+ }
114
+
115
+ bool RpcMem::allocateTensorBuffer(Qnn_Tensor_t* tensor, size_t tensorDataSize) {
116
+ if (m_libCdspRpc == nullptr) {
117
+ QNN_ERROR("RpcMem not initialized");
118
+ return false;
119
+ }
120
+ if (!tensor) {
121
+ QNN_ERROR("Received nullptr for tensor");
122
+ return false;
123
+ }
124
+ if (m_tensorToRpcMem.find(tensor) != m_tensorToRpcMem.end()) {
125
+ QNN_ERROR("Tensor already allocated");
126
+ return false;
127
+ }
128
+
129
+ auto memPointer = m_rpcMemAlloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, tensorDataSize);
130
+ auto status = true;
131
+ if (!memPointer) {
132
+ QNN_ERROR("rpcmem_alloc failure");
133
+ status = false;
134
+ }
135
+ int memfd = -1;
136
+ if (status == true) {
137
+ memfd = m_rpcMemToFd(memPointer);
138
+ if (memfd == -1) {
139
+ QNN_ERROR("rpcmem_to_fd failure");
140
+ status = false;
141
+ }
142
+ }
143
+ if (status == true) {
144
+ Qnn_MemDescriptor_t memDescriptor = {
145
+ {QNN_TENSOR_GET_RANK(tensor), QNN_TENSOR_GET_DIMENSIONS(tensor), nullptr},
146
+ QNN_TENSOR_GET_DATA_TYPE(tensor),
147
+ QNN_MEM_TYPE_ION,
148
+ {{-1}}
149
+ };
150
+ memDescriptor.ionInfo.fd = memfd;
151
+ QNN_TENSOR_SET_MEM_TYPE(tensor, QNN_TENSORMEMTYPE_MEMHANDLE);
152
+ QNN_TENSOR_SET_MEM_HANDLE(tensor, nullptr);
153
+
154
+ Qnn_MemHandle_t memHandle = QNN_TENSOR_GET_MEM_HANDLE(tensor);
155
+ if (QNN_SUCCESS != m_qnnInterface->memRegister(
156
+ m_contextHandle,
157
+ &memDescriptor,
158
+ 1,
159
+ &(memHandle)
160
+ )) {
161
+ const char* tname = QNN_TENSOR_GET_NAME(tensor);
162
+ QNN_ERROR("memRegister fail %s (ctx=%p fd=%d)", tname, m_contextHandle, memfd);
163
+ status = false;
164
+ }
165
+ QNN_TENSOR_SET_MEM_HANDLE(tensor, memHandle);
166
+ }
167
+ if (status == true) {
168
+ m_tensorToRpcMem.insert({tensor, RpcMemTensorData(memfd, memPointer, tensorDataSize)});
169
+ }
170
+ if (status == false) {
171
+ if (m_rpcMemFree) {
172
+ m_rpcMemFree(memPointer);
173
+ }
174
+ }
175
+ return status;
176
+ }
177
+
178
+ bool RpcMem::freeTensorBuffer(Qnn_Tensor_t* tensor) {
179
+ if (!tensor) {
180
+ QNN_ERROR("Received nullptr for tensor");
181
+ return false;
182
+ }
183
+
184
+ if (m_sameMemoryFreeTensors.find(tensor) != m_sameMemoryFreeTensors.end()) {
185
+ if (m_tensorToRpcMem.find(tensor) == m_tensorToRpcMem.end()) {
186
+ QNN_ERROR("Tensor not found");
187
+ return false;
188
+ }
189
+ m_tensorToRpcMem.erase(tensor);
190
+ } else {
191
+ auto memHandle = QNN_TENSOR_GET_MEM_HANDLE(tensor);
192
+ if (QNN_SUCCESS != m_qnnInterface->memDeRegister(&memHandle, 1)) {
193
+ QNN_ERROR("Failed to deregister ion memory with the backend");
194
+ return false;
195
+ }
196
+ QNN_TENSOR_SET_MEM_TYPE(tensor, QNN_TENSORMEMTYPE_UNDEFINED);
197
+ if (m_tensorToRpcMem.find(tensor) == m_tensorToRpcMem.end()) {
198
+ QNN_ERROR("Tensor not found");
199
+ return false;
200
+ }
201
+ if (m_rpcMemFree) {
202
+ m_rpcMemFree(m_tensorToRpcMem[tensor].memPointer);
203
+ }
204
+ m_tensorToRpcMem.erase(tensor);
205
+ }
206
+
207
+ return true;
208
+ }
209
+
210
+ bool RpcMem::useSameMemory(Qnn_Tensor_t* dest, Qnn_Tensor_t* src) {
211
+ if (nullptr == dest || nullptr == src) {
212
+ QNN_ERROR("Received nullptr");
213
+ return false;
214
+ }
215
+ if (m_tensorToRpcMem.find(src) == m_tensorToRpcMem.end()) {
216
+ QNN_ERROR("Src Tensor not found");
217
+ return false;
218
+ }
219
+
220
+ if (false == freeTensorBuffer(dest)) {
221
+ return false;
222
+ }
223
+
224
+ QNN_TENSOR_SET_MEM_TYPE(dest, QNN_TENSOR_GET_MEM_TYPE(src));
225
+ QNN_TENSOR_SET_MEM_HANDLE(dest, QNN_TENSOR_GET_MEM_HANDLE(src));
226
+ m_tensorToRpcMem.insert({dest, m_tensorToRpcMem[src]});
227
+ m_sameMemoryFreeTensors.insert(dest);
228
+
229
+ return true;
230
+ }
231
+
232
+ bool RpcMem::useSameMemory(Qnn_Tensor_t* dest, Qnn_Tensor_t* src, int offset) {
233
+ if (nullptr == dest || nullptr == src) {
234
+ QNN_ERROR("Received nullptr");
235
+ return false;
236
+ }
237
+ if (m_tensorToRpcMem.find(src) == m_tensorToRpcMem.end()) {
238
+ QNN_ERROR("Src Tensor not found");
239
+ return false;
240
+ }
241
+
242
+ if (false == freeTensorBuffer(dest)) {
243
+ return false;
244
+ }
245
+
246
+ QNN_TENSOR_SET_MEM_TYPE(dest, QNN_TENSOR_GET_MEM_TYPE(src));
247
+ QNN_TENSOR_SET_MEM_HANDLE(dest, QNN_TENSOR_GET_MEM_HANDLE(src));
248
+ m_tensorToRpcMem.insert({dest, m_tensorToRpcMem[src]});
249
+ m_sameMemoryFreeTensors.insert(dest);
250
+
251
+ return true;
252
+ }
253
+
254
+ bool RpcMem::useExternalMemory(Qnn_Tensor_t* dest, void* extMem) {
255
+ QNN_ERROR("We don't support external memory feature for shared buffers yet!");
256
+ return false;
257
+ }
258
+
259
+ void* RpcMem::allocateTensorFusedBuffer(uint64_t bufferSize, int32_t* fd) {
260
+ *fd = -1;
261
+ if (m_libCdspRpc == nullptr) {
262
+ QNN_ERROR("RpcMem not initialized for fused buffer");
263
+ return nullptr;
264
+ }
265
+
266
+ void* memPointer = m_rpcMemAlloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, bufferSize);
267
+ if (!memPointer) {
268
+ QNN_ERROR("Not able to allocate fused buffer of size: %lu", (unsigned long)bufferSize);
269
+ return nullptr;
270
+ }
271
+
272
+ m_fusedBuffers.push_back({memPointer, bufferSize});
273
+ QNN_DEBUG(
274
+ "Successfully allocated fused buffer at %p with size %lu",
275
+ memPointer,
276
+ (unsigned long)bufferSize
277
+ );
278
+
279
+ if ((*fd = m_rpcMemToFd(memPointer)) == -1) {
280
+ QNN_ERROR(
281
+ "Not able to get fd for the fused buffer of size: %lu", (unsigned long)bufferSize
282
+ );
283
+ return nullptr;
284
+ }
285
+
286
+ QNN_DEBUG("Retrieved fd %d for pointer %p", *fd, memPointer);
287
+ return memPointer;
288
+ }
289
+
290
+ bool RpcMem::allocateBuffers(
291
+ const std::map<int, std::map<std::string, size_t>>& allocs_per_chunk,
292
+ std::map<std::string, std::pair<int, size_t>>& tensor_offsets
293
+ ) {
294
+ int alloc_chunk_idx = m_fusedBuffers.size();
295
+ int num_alloc_chunks = 0;
296
+ size_t total_alloc_size = 0;
297
+
298
+ for (auto& [_, tensor_sizes] : allocs_per_chunk) {
299
+ // Calculate total allocation chunk size
300
+ size_t alloc_chunk_size = 0;
301
+ for (const auto& [tensor_name, tensor_size] : tensor_sizes) {
302
+ tensor_offsets[tensor_name] = {alloc_chunk_idx, alloc_chunk_size};
303
+ alloc_chunk_size += tensor_size;
304
+ }
305
+
306
+ // Allocate chunk for this unique context set
307
+ if (alloc_chunk_size <= 0) {
308
+ QNN_ERROR("Unexpected chunk size detected. Please re-check IO allocations");
309
+ return false;
310
+ }
311
+
312
+ m_fusedFds.push_back(0);
313
+ if (!allocateTensorFusedBuffer(alloc_chunk_size, &m_fusedFds.back())) //
314
+ return false;
315
+ total_alloc_size += alloc_chunk_size;
316
+ alloc_chunk_idx++;
317
+ num_alloc_chunks++;
318
+ }
319
+ QNN_INFO(
320
+ "Allocated total size = %lu across %d buffers",
321
+ (unsigned long)total_alloc_size,
322
+ num_alloc_chunks
323
+ );
324
+ return true;
325
+ }
326
+
327
+ bool RpcMem::mapFusedBufferOffset(
328
+ Qnn_Tensor_t* tensor,
329
+ size_t tensorDataSize,
330
+ int32_t fd,
331
+ uint32_t offset,
332
+ uint64_t totalBufferSize,
333
+ void* memPointer,
334
+ Qnn_ContextHandle_t contextHandle
335
+ ) {
336
+ if (m_libCdspRpc == nullptr) {
337
+ QNN_ERROR("RpcMem not initialized");
338
+ return false;
339
+ }
340
+ if (!tensor) {
341
+ QNN_ERROR("Received nullptr for tensor");
342
+ return false;
343
+ }
344
+
345
+ Qnn_ErrorHandle_t ret;
346
+ const char* tname = QNN_TENSOR_GET_NAME(tensor);
347
+
348
+ // Check if tensor already has a memHandle assigned
349
+ Qnn_MemHandle_t cur_mem_handle = QNN_TENSOR_GET_MEM_HANDLE(tensor);
350
+ if (cur_mem_handle != nullptr) {
351
+ // Check if memHandle is already identical to requested buffer and offset
352
+ RpcMemTensorData& cur_rpc_mem_data = m_memHandleToRpcMem.at(cur_mem_handle);
353
+ if (cur_rpc_mem_data.fd == fd && cur_rpc_mem_data.offset == offset) {
354
+ return true;
355
+ }
356
+
357
+ // updated offset, deregister previous mem_handle
358
+ if (tensorDataSize == 0) tensorDataSize = cur_rpc_mem_data.size;
359
+ // clang-format off
360
+ TRACE_MEMORY_ALLOC( "memDeRegister %-20s (fd=%d offset=%lu) memHandle=%p",
361
+ tname, cur_rpc_mem_data.fd, cur_rpc_mem_data.offset, cur_mem_handle);
362
+ // clang-format on
363
+ m_memHandleToRpcMem.erase(cur_mem_handle);
364
+ if ((ret = m_qnnInterface->memDeRegister(&cur_mem_handle, 1)) != QNN_SUCCESS) {
365
+ QNN_ERROR(
366
+ "memDeRegister ERROR(%lu) - %s memHandle=%p",
367
+ (unsigned long)ret,
368
+ tname,
369
+ cur_mem_handle
370
+ );
371
+ return false;
372
+ }
373
+ } else {
374
+ // For inital tensors, we need to check if the tensor can re-use a memHandle
375
+ // from another tensor in the same context
376
+ auto memConfig = std::make_tuple(fd, offset, contextHandle);
377
+ if (memConfigList.contains(memConfig)) {
378
+ auto& parentTensor = memConfigList[memConfig];
379
+ Qnn_MemHandle_t parentMemHandle = QNN_TENSOR_GET_MEM_HANDLE(parentTensor);
380
+ QNN_TENSOR_SET_MEM_TYPE(tensor, QNN_TENSORMEMTYPE_MEMHANDLE);
381
+ QNN_TENSOR_SET_MEM_HANDLE(tensor, parentMemHandle);
382
+ TRACE_MEMORY_ALLOC("%-20s : Mapping to memHandle %p", tname, parentMemHandle);
383
+ return true;
384
+ }
385
+ }
386
+
387
+ // Register a new memHandle based on function arguments
388
+ QnnMemHtp_Descriptor_t htp_mem_desciptor = {QNN_HTP_MEM_SHARED_BUFFER, totalBufferSize, {0}};
389
+ htp_mem_desciptor.sharedBufferConfig.fd = fd;
390
+ htp_mem_desciptor.sharedBufferConfig.offset = offset;
391
+
392
+ Qnn_MemDescriptor_t mem_descriptor = {
393
+ {QNN_TENSOR_GET_RANK(tensor), QNN_TENSOR_GET_DIMENSIONS(tensor), nullptr},
394
+ QNN_TENSOR_GET_DATA_TYPE(tensor),
395
+ QNN_MEM_TYPE_CUSTOM,
396
+ {{-1}}
397
+ };
398
+ mem_descriptor.customInfo = &htp_mem_desciptor;
399
+
400
+ Qnn_MemHandle_t mem_handle = nullptr;
401
+ ret = m_qnnInterface->memRegister(contextHandle, &mem_descriptor, 1, &mem_handle);
402
+ if (ret != QNN_SUCCESS) {
403
+ QNN_ERROR("%-20s (ctx=%p fd=%d offset=%u)", tname, contextHandle, fd, offset);
404
+ QNN_ERROR("memRegister ERROR(%lu)", (unsigned long)ret);
405
+ return false;
406
+ }
407
+
408
+ // clang-format off
409
+ TRACE_MEMORY_ALLOC("%-20s (ctx=%p fd=%d offset=%u) memPointer=%p memHandle=%p",
410
+ tname, contextHandle, fd, offset, ((uint8_t*)memPointer) + offset, mem_handle);
411
+ // clang-format on
412
+ m_memHandleToRpcMem[mem_handle] = RpcMemTensorData(
413
+ fd, ((uint8_t*)memPointer) + offset, tensorDataSize, totalBufferSize, offset
414
+ );
415
+
416
+ QNN_TENSOR_SET_MEM_TYPE(tensor, QNN_TENSORMEMTYPE_MEMHANDLE);
417
+ QNN_TENSOR_SET_MEM_HANDLE(tensor, mem_handle);
418
+ if (cur_mem_handle == nullptr) // Cache memory config for initial memRegisters only
419
+ memConfigList[std::make_tuple(fd, offset, contextHandle)] = tensor;
420
+
421
+ return true;
422
+ }
423
+
424
+ bool RpcMem::mapFusedBufferOffset(
425
+ Qnn_Tensor_t* tensor,
426
+ int alloc_idx,
427
+ size_t offset,
428
+ Qnn_ContextHandle_t ctx,
429
+ size_t size
430
+ ) {
431
+ return mapFusedBufferOffset(
432
+ tensor,
433
+ size,
434
+ m_fusedFds[alloc_idx],
435
+ offset,
436
+ m_fusedBuffers[alloc_idx].second,
437
+ m_fusedBuffers[alloc_idx].first,
438
+ ctx
439
+ );
440
+ }
441
+
442
+ bool RpcMem::deregisterTensorFusedBuffer(Qnn_Tensor_t* tensor) {
443
+ if (!tensor) {
444
+ QNN_ERROR("Received nullptr for tensor");
445
+ return false;
446
+ }
447
+
448
+ if (m_tensorToRpcMem.find(tensor) == m_tensorToRpcMem.end()) {
449
+ QNN_ERROR("Tensor not found");
450
+ return false;
451
+ }
452
+
453
+ // We are not freeing memhandles here since they are already freed when
454
+ // freeContext() gets called in the destructor of QnnApi class which
455
+ // happens before this point
456
+
457
+ // Qnn_MemHandle_t memHandle = QNN_TENSOR_GET_MEM_HANDLE(tensor);
458
+ // QNN_ERROR("Interface handle %p memhandle %p", m_qnnInterface, memHandle);
459
+ // if (QNN_SUCCESS != m_qnnInterface->memDeRegister(&memHandle, 1)) {
460
+ // QNN_ERROR("Failed to deregister ion memory with the backend");
461
+ // return false;
462
+ // }
463
+
464
+ QNN_TENSOR_SET_MEM_TYPE(tensor, QNN_TENSORMEMTYPE_UNDEFINED);
465
+ QNN_TENSOR_SET_MEM_HANDLE(tensor, nullptr);
466
+ m_tensorToRpcMem.erase(tensor);
467
+ return true;
468
+ }
469
+
470
+ void RpcMem::freeFusedBuffers() {
471
+ // for (auto& memHandle : m_orphanedMemHandles) {
472
+ // if (QNN_SUCCESS != m_qnnInterface->memDeRegister(&memHandle, 1)) {
473
+ // QNN_ERROR("Failed to deregister ion memory with the backend");
474
+ // }
475
+ // }
476
+
477
+ for (auto& [mem_ptr, buffer_size] : m_fusedBuffers) {
478
+ QNN_DEBUG("Freeing fused buffer %p (size=%lu)", mem_ptr, buffer_size);
479
+ m_rpcMemFree(mem_ptr);
480
+ }
481
+ }
Genie/Genie/src/qualla/engines/qnn-api/RpcMem.hpp ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #pragma once
10
+
11
+ #include <unordered_set>
12
+
13
+ #include "IBufferAlloc.hpp"
14
+ #include "QnnInterface.h"
15
+ #include "Log.hpp"
16
+
17
+ typedef void* (*RpcMemAllocFn_t)(int, uint32_t, int);
18
+ typedef void (*RpcMemFreeFn_t)(void*);
19
+ typedef int (*RpcMemToFdFn_t)(void*);
20
+
21
// Bookkeeping record for one RPC-memory-backed tensor: the rpcmem file
// descriptor, the host-visible base pointer, the tensor's own size, and (for
// fused buffers) the total buffer size plus the tensor's byte offset in it.
struct RpcMemTensorData {
  int fd{-1};
  void* memPointer{nullptr};
  size_t size{0};
  // In-class initializers: the default and 3-arg constructors previously left
  // totalBufferSize and offset uninitialized (read later as garbage).
  size_t totalBufferSize{0};
  size_t offset{0};
  RpcMemTensorData() = default;
  // Dedicated (non-fused) allocation: total size equals the tensor size.
  RpcMemTensorData(int fdIn, void* memPointerIn, size_t sizeIn)
      : fd(fdIn), memPointer(memPointerIn), size(sizeIn) {}
  // Fused allocation: tensor occupies [offset, offset + size) of a larger buffer.
  RpcMemTensorData(
      int fdIn,
      void* memPointerIn,
      size_t sizeIn,
      size_t totalBufferSizeIn,
      size_t offsetIn
  )
      : fd(fdIn), memPointer(memPointerIn), size(sizeIn), totalBufferSize(totalBufferSizeIn),
        offset(offsetIn) {}
};
40
+
41
// Allocator that backs QNN tensors with RPC shared memory obtained through
// libcdsprpc.so; implements the IBufferAlloc interface. Tracks per-tensor
// allocations, large "fused" multi-tensor buffers, and their registration
// with the QNN backend. Non-copyable / non-movable: it owns the dlopen
// handle and the raw buffers.
// NOTE(review): this header uses std::map, std::vector, std::unordered_map
// and std::tuple but includes none of them directly — it relies on
// transitive includes; consider adding them explicitly.
class RpcMem final : public IBufferAlloc {
 public:
  RpcMem(Qnn_ContextHandle_t contextHandle, QNN_INTERFACE_VER_TYPE* qnnInterface);
  // Disable copy constructors, r-value referencing, etc
  RpcMem(const RpcMem&) = delete;
  RpcMem& operator=(const RpcMem&) = delete;
  RpcMem(RpcMem&&) = delete;
  RpcMem& operator=(RpcMem&&) = delete;
  // Presumably dlopens libcdsprpc.so and resolves the rpcmem_* entry points
  // (see the m_libCdspRpc / m_rpcMem* members below) — confirm in RpcMem.cpp.
  bool initialize() override;
  // Accessors for the RpcMemTensorData recorded for `tensor`.
  void* getBuffer(Qnn_Tensor_t* tensor) override;
  int getFd(Qnn_Tensor_t* tensor) override;

  // Byte offset of `tensor` inside its (possibly fused) buffer.
  size_t getOffset(Qnn_Tensor_t* tensor) override;

  // Size in bytes of the tensor's own region.
  size_t getBufferSize(Qnn_Tensor_t* tensor) override;

  // Size in bytes of the whole underlying buffer (relevant for fused buffers).
  size_t getTotalBufferSize(Qnn_Tensor_t* tensor) override;

  // Allocates a dedicated RPC buffer of `tensorDataSize` bytes for `tensor`.
  bool allocateTensorBuffer(Qnn_Tensor_t* tensor, size_t tensorDataSize) override;

  bool freeTensorBuffer(Qnn_Tensor_t* tensor) override;
  // Make `dest` share `src`'s backing memory (optionally at a byte offset).
  bool useSameMemory(Qnn_Tensor_t* dest, Qnn_Tensor_t* src) override;
  bool useSameMemory(Qnn_Tensor_t* dest, Qnn_Tensor_t* src, int offset) override;

  // Back `dest` with caller-provided memory; ownership stays with the caller.
  bool useExternalMemory(Qnn_Tensor_t* dest, void* extMem) override;

  // Allocates one large "fused" buffer meant to be shared by many tensors;
  // reports the rpcmem file descriptor through `fd`.
  void* allocateTensorFusedBuffer(uint64_t bufferSize, int32_t* fd) override;
  // Allocates one fused chunk per entry of `allocs_per_chunk` and reports each
  // tensor's (chunk index, offset) placement through `tensor_offsets`.
  bool allocateBuffers(
      const std::map<int, std::map<std::string, size_t>>& allocs_per_chunk,
      std::map<std::string, std::pair<int, size_t>>& tensor_offsets
  ) override;

  // Registers `tensor` at byte `offset` within an existing fused buffer
  // identified by (fd, memPointer) under the given context.
  bool mapFusedBufferOffset(
      Qnn_Tensor_t* tensor,
      size_t tensorDataSize,
      int32_t fd,
      uint32_t offset,
      uint64_t totalBufferSize,
      void* memPointer,
      Qnn_ContextHandle_t contextHandle
  ) override;
  bool deregisterTensorFusedBuffer(Qnn_Tensor_t* tensor) override;
  void freeFusedBuffers() override;
  // Convenience overload: selects the fused chunk by index into
  // m_fusedFds / m_fusedBuffers.
  bool mapFusedBufferOffset(
      Qnn_Tensor_t* tensor,
      int alloc_idx,
      size_t offset,
      Qnn_ContextHandle_t ctx,
      size_t size
  ) override;
  virtual ~RpcMem();

 private:
  RpcMemTensorData* getRpcMemTensorData(Qnn_Tensor_t* tensor);

  // Pointer to the dlopen'd libcdsprpc.so shared library which contains
  // rpcmem_alloc, rpcmem_free, rpcmem_to_fd APIs
  void* m_libCdspRpc;
  // Function pointer to rpcmem_alloc
  RpcMemAllocFn_t m_rpcMemAlloc;
  // Function pointer to rpcmem_free
  RpcMemFreeFn_t m_rpcMemFree;
  // Function pointer to rpcmem_to_fd
  RpcMemToFdFn_t m_rpcMemToFd;
  QNN_INTERFACE_VER_TYPE* m_qnnInterface;
  Qnn_ContextHandle_t m_contextHandle;

  // Bookkeeping for every tensor registered through this allocator.
  std::unordered_map<Qnn_Tensor_t*, RpcMemTensorData> m_tensorToRpcMem;
  // Presumably tensors that alias another tensor's memory via useSameMemory()
  // and therefore must not be freed directly — confirm in RpcMem.cpp.
  std::unordered_set<Qnn_Tensor_t*> m_sameMemoryFreeTensors;
  std::vector<std::pair<void*, size_t>> m_fusedBuffers; // vector<<memPointer, bufferSize>>
  std::vector<int32_t> m_fusedFds;
  // Mem handles whose owning tensor went away before deregistration.
  std::unordered_set<Qnn_MemHandle_t> m_orphanedMemHandles;
  std::unordered_map<Qnn_MemHandle_t, RpcMemTensorData> m_memHandleToRpcMem;
  // (fd, offset, context) -> tensor already registered for that mapping
  // (populated by mapFusedBufferOffset in RpcMem.cpp).
  std::map<std::tuple<int, size_t, Qnn_ContextHandle_t>, Qnn_Tensor_t*> memConfigList;
};
Genie/Genie/src/qualla/engines/qnn-api/dlwrap.cpp ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #ifdef _WIN32
10
+
11
+ #pragma warning(disable : 4133 4996)
12
+
13
+ #include <inttypes.h>
14
+ #include <stdio.h>
15
+ #include <stdlib.h>
16
+ #include <string.h>
17
+ #include <windows.h>
18
+ #include <wchar.h>
19
+
20
+ #include "dlwrap.hpp"
21
+
22
+ static const char* last_func;
23
+ static long last_err;
24
+
25
+ void* dlopen(const char* dll, int flags) {
26
+ HINSTANCE h = LoadLibraryA(dll);
27
+ if (h == NULL) {
28
+ last_err = GetLastError();
29
+ last_func = "dlopen";
30
+ }
31
+
32
+ return h;
33
+ }
34
+
35
+ int dlclose(void* h) {
36
+ if (!FreeLibrary((HINSTANCE)h)) {
37
+ last_err = GetLastError();
38
+ last_func = "dlclose";
39
+ return -1;
40
+ }
41
+
42
+ return 0;
43
+ }
44
+
45
+ void* dlsym(void* h, const char* name) {
46
+ FARPROC p = GetProcAddress((HINSTANCE)h, name);
47
+ if (!p) {
48
+ last_err = GetLastError();
49
+ last_func = "dlsym";
50
+ }
51
+ return (void*)(intptr_t)p;
52
+ }
53
+
54
+ const char* dlerror(void) {
55
+ static char str[88];
56
+
57
+ if (!last_err) return NULL;
58
+
59
+ sprintf(str, "%s error #%ld", last_func, last_err);
60
+ last_err = 0;
61
+ last_func = NULL;
62
+
63
+ return str;
64
+ }
65
+
66
+ #endif // _WIN32
Genie/Genie/src/qualla/engines/qnn-api/dlwrap.hpp ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #ifndef DLWRAP_HPP
10
+ #define DLWRAP_HPP
11
+
12
+ #ifndef _WIN32
13
+
14
+ // Just include regular dlfcn
15
+ #include <dlfcn.h>
16
+
17
+ #else // _WIN32
18
+
19
+ // Define basic set dl functions and flags
20
+
21
+ #define RTLD_GLOBAL 0x100
22
+ #define RTLD_LOCAL 0x000
23
+ #define RTLD_LAZY 0x000
24
+ #define RTLD_NOW 0x001
25
+
26
+ void* dlopen(const char* filename, int flag);
27
+ int dlclose(void* handle);
28
+ void* dlsym(void* handle, const char* name);
29
+ const char* dlerror(void);
30
+
31
+ #endif // _WIN32
32
+
33
+ #endif // DLWRAP_HPP
Genie/Genie/src/qualla/engines/qnn-api/qnn-utils.cpp ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #include "qnn-utils.hpp"
10
+
11
+ #include <string>
12
+ #include <fstream>
13
+ #include <filesystem>
14
+ #include <sstream>
15
+ #include "QnnApi.hpp"
16
+ #include <fmt/format.h>
17
+
18
+ namespace fs = std::filesystem;
19
+
20
+ namespace qualla {
21
+ namespace QnnUtils {
22
+ // Alternate implementation for bw() = lambda x: (10 * ((x & 0xf0)>>4) + (x & 0xf)) // 8
23
+ int DataType::bw() { return (_dtype == QNN_DATATYPE_UNDEFINED) ? -1 : QnnApi::getDataTypeSize(_dtype);}
24
+ int DataType::type() {return (_dtype == QNN_DATATYPE_UNDEFINED) ? -1 : _dtype >> 4; }
25
+
26
+ int32_t DataType::val() { return static_cast<int32_t>(_dtype); }
27
+
28
// Writes `size` bytes from `data` to `path`, creating any missing parent
// directories first. Returns true on success, false when the directories
// cannot be created or the file cannot be opened/fully written.
bool writeRawData(void* data, size_t size, const std::filesystem::path& path) {
  const auto parent = path.parent_path();
  // Only attempt creation when a parent component exists: create_directories("")
  // fails, which previously made this function reject bare filenames in the
  // current working directory.
  if (!parent.empty() && !std::filesystem::exists(parent) &&
      !std::filesystem::create_directories(parent)) {
    return false;
  }

  std::ofstream f(path, std::ofstream::binary);
  if (!f.is_open()) return false;
  f.write(static_cast<const char*>(data), static_cast<std::streamsize>(size));
  // Report write failures instead of the original unconditional `true`.
  return f.good();
}
38
+
39
// Reads exactly `size` bytes from `path` into `data`.
// Throws std::runtime_error when the on-disk size differs from `size`;
// returns false when the file cannot be opened or fully read.
bool readRawData(void* data, size_t size, const std::filesystem::path& path) {
  // Stat once and reuse (the original called fs::file_size twice).
  const auto actual = std::filesystem::file_size(path);
  if (actual != size) {
    // Message built with the standard library; also fixes the "doesnot" typo.
    throw std::runtime_error(
        "file size does not match: " + path.string() + " size " +
        std::to_string(actual) + ", buf-size " + std::to_string(size)
    );
  }

  std::ifstream f(path, std::ifstream::binary);
  if (!f.is_open()) return false;
  f.read(static_cast<char*>(data), static_cast<std::streamsize>(size));
  // Report short/failed reads instead of the original unconditional `true`.
  return f.good();
}
55
+
56
+ void getQuantParamString(
57
+ const std::vector<QuantParam>& quantParam,
58
+ std::string& scale_string,
59
+ std::string& offset_string
60
+ ) {
61
+ std::ostringstream scales_s;
62
+ std::ostringstream offsets_s;
63
+ for (int i = 0; i < quantParam.size(); i++) {
64
+ if (i != 0) {
65
+ scales_s << ", ";
66
+ offsets_s << ", ";
67
+ }
68
+ scales_s << std::fixed << std::setprecision(20) << quantParam[i].scale;
69
+ offsets_s << quantParam[i].offset;
70
+ }
71
+ scale_string = std::move(scales_s.str());
72
+ offset_string = std::move(offsets_s.str());
73
+ }
74
+
75
// Returns the canonical QNN enum name for the wrapped datatype; any value not
// listed (including QNN_DATATYPE_UNDEFINED itself) maps to
// "QNN_DATATYPE_UNDEFINED".
const char* DataType::str() {
  // clang-format off
  switch (_dtype) {
    case QNN_DATATYPE_INT_8: return "QNN_DATATYPE_INT_8";
    case QNN_DATATYPE_INT_16: return "QNN_DATATYPE_INT_16";
    case QNN_DATATYPE_INT_32: return "QNN_DATATYPE_INT_32";
    case QNN_DATATYPE_INT_64: return "QNN_DATATYPE_INT_64";
    case QNN_DATATYPE_UINT_8: return "QNN_DATATYPE_UINT_8";
    case QNN_DATATYPE_UINT_16: return "QNN_DATATYPE_UINT_16";
    case QNN_DATATYPE_UINT_32: return "QNN_DATATYPE_UINT_32";
    case QNN_DATATYPE_UINT_64: return "QNN_DATATYPE_UINT_64";
    case QNN_DATATYPE_FLOAT_16: return "QNN_DATATYPE_FLOAT_16";
    case QNN_DATATYPE_FLOAT_32: return "QNN_DATATYPE_FLOAT_32";
    case QNN_DATATYPE_FLOAT_64: return "QNN_DATATYPE_FLOAT_64";
    case QNN_DATATYPE_SFIXED_POINT_4: return "QNN_DATATYPE_SFIXED_POINT_4";
    case QNN_DATATYPE_SFIXED_POINT_8: return "QNN_DATATYPE_SFIXED_POINT_8";
    case QNN_DATATYPE_SFIXED_POINT_16: return "QNN_DATATYPE_SFIXED_POINT_16";
    case QNN_DATATYPE_SFIXED_POINT_32: return "QNN_DATATYPE_SFIXED_POINT_32";
    case QNN_DATATYPE_UFIXED_POINT_4: return "QNN_DATATYPE_UFIXED_POINT_4";
    case QNN_DATATYPE_UFIXED_POINT_8: return "QNN_DATATYPE_UFIXED_POINT_8";
    case QNN_DATATYPE_UFIXED_POINT_16: return "QNN_DATATYPE_UFIXED_POINT_16";
    case QNN_DATATYPE_UFIXED_POINT_32: return "QNN_DATATYPE_UFIXED_POINT_32";
    case QNN_DATATYPE_BOOL_8: return "QNN_DATATYPE_BOOL_8";
    case QNN_DATATYPE_STRING: return "QNN_DATATYPE_STRING";
    default: return "QNN_DATATYPE_UNDEFINED";
  }
  // clang-format on
}
103
+ } // namespace QnnUtils
104
+ } // namespace qualla
Genie/Genie/src/qualla/engines/qnn-api/qnn-utils.hpp ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #pragma once
10
+
11
+ #ifdef _MSC_VER
12
+ #pragma warning(disable : 4068)
13
+ #endif
14
+
15
+ #include <string>
16
+ #include <vector>
17
+ #include <algorithm>
18
+ #include <filesystem>
19
+ #include "QnnApiUtils.hpp"
20
+ #include "QnnInterface.h"
21
+
22
+ namespace qualla {
23
+
24
+ namespace QnnUtils {
25
// Thin wrapper around Qnn_DataType_t adding convenience queries (byte width,
// type family, printable name). Implicitly convertible back to
// Qnn_DataType_t so it works directly in switch statements and comparisons.
class DataType {
 private:
  Qnn_DataType_t _dtype{QNN_DATATYPE_UNDEFINED};

 public:
  DataType() = default;
  // Capture the datatype of an existing tensor.
  DataType(const Qnn_Tensor_t* tensor) : _dtype(QNN_TENSOR_GET_DATA_TYPE(tensor)) {}
  DataType(Qnn_DataType_t dtype) : _dtype(dtype) {};

  // Enable switch and comparisons
  constexpr operator Qnn_DataType_t() const { return _dtype; }

  // Byte width of one element, or -1 when undefined (defined in qnn-utils.cpp).
  int bw();
  // High nibble of the datatype encoding, or -1 when undefined.
  int type();

  // Raw enum value as int32_t.
  int32_t val();

  // Canonical enum name, e.g. "QNN_DATATYPE_UINT_8".
  const char* str();
};
44
+
45
+ bool writeRawData(void* tensorData, size_t tensorSize, const std::filesystem::path& path);
46
+ bool readRawData(void* tensorData, size_t tensorSize, const std::filesystem::path& path);
47
+
48
// Tensor geometry helper: batch x height x width x channel plus the element
// byte width; size helpers below derive counts/bytes from these fields.
struct Dims {
  int32_t batch = 1;
  int32_t height, width, channel, bitWidth;
  Dims() : height(0), width(0), channel(0), bitWidth(0) {}
  Dims(int32_t height, int32_t width, int32_t channel, int32_t bitWidth)
      : height(height), width(width), channel(channel), bitWidth(bitWidth) {}
  // NOTE(review): assumes tDims has at least 5 entries, presumably
  // [batch, height, width, channel, bitWidth] — confirm at call sites.
  Dims(std::vector<size_t>& tDims)
      : height((int32_t)tDims[1]), width((int32_t)tDims[2]), channel((int32_t)tDims[3]),
        bitWidth((int32_t)tDims[4]) {
    // Hack to mix batch dimension
    if (tDims[0] != 1 && tDims[1] == 1) height = tDims[0];
    if (tDims[0] > 1 && tDims[1] != 1) batch = tDims[0];
  }
  // Equality deliberately ignores batch: only per-sample geometry is compared.
  bool operator==(const Dims& rhs) const {
    return (height == rhs.height) && (width == rhs.width) && (channel == rhs.channel) &&
           (bitWidth == rhs.bitWidth);
  }
  bool operator!=(const Dims& rhs) const { return !(operator==(rhs)); }
  // Element count for one sample (batch and bitWidth excluded).
  size_t getNumElements() const { return (size_t)(height * width * channel); }
  // Total byte size across the whole batch.
  size_t getSize() const { return (size_t)(batch * height * width * channel * bitWidth); }
  // getSize() rounded up to the next multiple of 8 bytes.
  size_t getAlignedSize() const {
    size_t size = getSize();
    if ((size & uint64_t{7}) != uint64_t{0}) {
      size += (uint64_t{8} - (size & uint64_t{7}));
    }
    return size;
  }
  int32_t getMaxDim() const { return std::max({height, width, channel}); };
  // Transposed copy (height/width swapped); batch resets to the default 1.
  Dims T() const { return Dims(width, height, channel, bitWidth); }
};
78
+
79
// One quantization parameter pair; used by the quantize helpers below as
// q = value / scale - offset.
struct QuantParam {
  // In-class initializers: the original default constructor left both
  // members uninitialized (read as garbage).
  double scale{0.0};
  int32_t offset{0};
  QuantParam() = default;
  QuantParam(double scale_val, int32_t offset_val) : scale(scale_val), offset(offset_val) {}
};
85
+
86
// Bundles a QNN tensor pointer with its geometry, quantization parameters and
// datatype. Non-owning: `tensor` must outlive this struct.
struct Tensor {
  Qnn_Tensor_t* tensor = nullptr;
  Dims dims;
  std::vector<QuantParam> quantParam;
  DataType dtype;
  Tensor() {}
  // dtype is derived from the tensor itself, not passed in.
  Tensor(Qnn_Tensor_t* tensorVal, Dims dimsVal, std::vector<QuantParam> quantParamVec)
      : tensor(tensorVal), dims(dimsVal), quantParam(quantParamVec),
        dtype(QNN_TENSOR_GET_DATA_TYPE(tensorVal)) {}
};
96
+
97
// Maps tensor name -> QnnUtils::Tensor (Qnn_Tensor_t*, dims, quant params, dtype)
typedef std::map<std::string, Tensor> TensorMap;
99
+
100
// Narrows an unsigned 16-bit value to 8 bits with round-to-nearest
// (add half an LSB, then take the high byte), saturating at 0xFF when the
// rounding bias would wrap past 0xFFFF.
static inline uint8_t sat_round(const uint16_t x) {
  uint16_t biased = static_cast<uint16_t>(x + 0x80);  // + 0.5 in fixed point
  if (biased < x) biased = x;                         // wrapped -> clamp high
  return static_cast<uint8_t>(biased >> 8);           // keep high byte (/256)
}

// Element-wise sat_round over `nmemb` values from `src` into `dest`.
static inline void downcast_u16_to_u8(uint8_t* dest, const uint16_t* src, size_t nmemb) {
  for (size_t idx = 0; idx < nmemb; ++idx) {
    dest[idx] = sat_round(src[idx]);
  }
}
111
+
112
// Uniform quantization of `nmemb` elements: q[i] = f[i] / scale - offset,
// truncated into IntType. One (scale, offset) pair for the whole tensor.
template <typename FloatType, typename IntType>
static inline void quantizeTensorPtr(
    FloatType* tensor_float,
    IntType* tensor_quant,
    int32_t offset,
    double scale,
    size_t nmemb
) {
#pragma clang loop vectorize(enable) interleave(enable)
  for (size_t idx = 0; idx < nmemb; ++idx) {
    const double as_double = tensor_float[idx];
    tensor_quant[idx] = static_cast<IntType>(as_double / scale - offset);
  }
}
126
+
127
+ template <typename FloatType, typename IntType>
128
+ static inline void perWidthQuantizeTensorPtr(
129
+ FloatType* tensor_float,
130
+ IntType* tensor_quant,
131
+ std::vector<QnnUtils::QuantParam>& quantParam,
132
+ int32_t height,
133
+ int32_t width,
134
+ int32_t channel
135
+ ) {
136
+ for (size_t h = 0; h < height; h++) {
137
+ for (size_t w = 0; w < width; w++) {
138
+ double scale = quantParam[w].scale;
139
+ int32_t offset = quantParam[w].offset;
140
+ #pragma clang loop vectorize(enable) interleave(enable)
141
+ for (size_t c = 0; c < channel; c++) {
142
+ int32_t i = (h * width * channel) + (w * channel) + c;
143
+ double val = tensor_float[i];
144
+ tensor_quant[i] = static_cast<IntType>(val / scale - offset);
145
+ }
146
+ }
147
+ }
148
+ }
149
+
150
// Renders `quantParam` into two comma-separated strings: scales (fixed
// notation, 20-digit precision) and integer offsets. Defined in qnn-utils.cpp.
void getQuantParamString(
    const std::vector<QuantParam>& quantParam,
    std::string& scale_string,
    std::string& offset_string
);
155
+
156
+ } // namespace QnnUtils
157
+ } // namespace qualla