jstzwjr commited on
Commit
c71c7c5
·
1 Parent(s): 11481cd

add genie2.29

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. Genie/Genie/GenieSymbols.default +5 -0
  2. Genie/Genie/make/Android.mk +2 -0
  3. Genie/Genie/make/Application.mk +1 -1
  4. Genie/Genie/make/Makefile.linux-x86_64 +14 -5
  5. Genie/Genie/src/Dialog.cpp +54 -80
  6. Genie/Genie/src/Dialog.hpp +6 -1
  7. Genie/Genie/src/Embedding.cpp +740 -0
  8. Genie/Genie/src/Embedding.hpp +56 -0
  9. Genie/Genie/src/Exception.hpp +1 -0
  10. Genie/Genie/src/GenieDialog.cpp +19 -1
  11. Genie/Genie/src/GenieEmbedding.cpp +118 -0
  12. Genie/Genie/src/GenieSampler.cpp +93 -0
  13. Genie/Genie/src/Macro.hpp +2 -0
  14. Genie/Genie/src/Sampler.cpp +275 -0
  15. Genie/Genie/src/Sampler.hpp +60 -0
  16. Genie/Genie/src/qualla/context.cpp +8 -0
  17. Genie/Genie/src/qualla/dialogs/ssd-q1.cpp +2 -2
  18. Genie/Genie/src/qualla/engine.cpp +1 -1
  19. Genie/Genie/src/qualla/engines/qnn-api/DmaBufAllocator.cpp +317 -0
  20. Genie/Genie/src/qualla/engines/qnn-api/DmaBufAllocator.hpp +128 -0
  21. Genie/Genie/src/qualla/engines/qnn-api/IBufferAlloc.hpp +15 -1
  22. Genie/Genie/src/qualla/engines/qnn-api/IOTensor.cpp +76 -1
  23. Genie/Genie/src/qualla/engines/qnn-api/IOTensor.hpp +25 -0
  24. Genie/Genie/src/qualla/engines/qnn-api/QnnApi.cpp +369 -90
  25. Genie/Genie/src/qualla/engines/qnn-api/QnnApi.hpp +9 -0
  26. Genie/Genie/src/qualla/engines/qnn-api/qnn-utils.hpp +5 -5
  27. Genie/Genie/src/qualla/engines/qnn-cpu.cpp +55 -3
  28. Genie/Genie/src/qualla/engines/qnn-cpu/cpu-model.cpp +51 -0
  29. Genie/Genie/src/qualla/engines/qnn-cpu/cpu-model.hpp +25 -0
  30. Genie/Genie/src/qualla/engines/qnn-gpu.cpp +193 -0
  31. Genie/Genie/src/qualla/engines/qnn-gpu/gpu-model.cpp +603 -0
  32. Genie/Genie/src/qualla/engines/qnn-gpu/gpu-model.hpp +136 -0
  33. Genie/Genie/src/qualla/engines/qnn-htp.cpp +2 -2
  34. Genie/Genie/src/qualla/engines/qnn-htp.hpp +1 -1
  35. Genie/Genie/src/qualla/engines/qnn-htp/nsp-kvmanager.cpp +9 -3
  36. Genie/Genie/src/qualla/engines/qnn-htp/nsp-kvmanager.hpp +2 -1
  37. Genie/Genie/src/qualla/engines/qnn-htp/nsp-model.cpp +8 -4
  38. Genie/Genie/src/qualla/engines/qnn-htp/nsp-model.hpp +4 -6
  39. Genie/Genie/src/qualla/include/qualla/detail/basic-sampler.hpp +1 -0
  40. Genie/Genie/src/qualla/include/qualla/dialog.hpp +1 -0
  41. Genie/Genie/src/qualla/include/qualla/engine.hpp +1 -1
  42. Genie/Genie/src/qualla/include/qualla/sampler.hpp +1 -0
  43. Genie/Genie/src/qualla/sampler.cpp +4 -0
  44. Genie/Genie/src/qualla/samplers/basic.cpp +8 -0
  45. Genie/Genie/src/qualla/tokenizers/rust/Cargo.lock +26 -26
  46. Genie/Model/model.cpp +23 -2
  47. Genie/configs/llama2-7b/llama2-7b-draft-htp-target-htp-spd.json +2 -1
  48. Genie/configs/llama2-7b/llama2-7b-genaitransformer-lora.json +62 -0
  49. Genie/configs/llama2-7b/llama2-7b-genaitransformer.json +4 -1
  50. Genie/configs/llama2-7b/llama2-7b-gpu.json +43 -0
Genie/Genie/GenieSymbols.default CHANGED
@@ -14,6 +14,11 @@
14
  GenieDialogConfig_free*;
15
  GenieDialog_create*;
16
  GenieDialog_query*;
 
 
 
 
 
17
  GenieDialog_tokenQuery*;
18
  GenieDialog_embeddingQuery*;
19
  GenieDialog_save*;
 
14
  GenieDialogConfig_free*;
15
  GenieDialog_create*;
16
  GenieDialog_query*;
17
+ GenieDialog_getSampler*;
18
+ GenieSampler_applyConfig*;
19
+ GenieSamplerConfig_createFromJson*;
20
+ GenieSamplerConfig_setParam*;
21
+ GenieSamplerConfig_free*;
22
  GenieDialog_tokenQuery*;
23
  GenieDialog_embeddingQuery*;
24
  GenieDialog_save*;
Genie/Genie/make/Android.mk CHANGED
@@ -29,6 +29,7 @@ PACKAGE_C_INCLUDES += -I $(LOCAL_PATH)/../../../../include/QNN/HTP
29
  PACKAGE_C_INCLUDES += -I $(LOCAL_PATH)/../src/qualla/tokenizers
30
  PACKAGE_C_INCLUDES += -I $(LOCAL_PATH)/../src/qualla/engines/qnn-api
31
  PACKAGE_C_INCLUDES += -I $(LOCAL_PATH)/../src/qualla/engines/qnn-cpu
 
32
  PACKAGE_C_INCLUDES += -I $(LOCAL_PATH)/../src/qualla/engines/qnn-htp
33
 
34
  #========================== Define T2T Lib variables =============================================
@@ -45,6 +46,7 @@ MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/dialogs
45
  MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/engines/*.cpp)
46
  MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/engines/qnn-api/*.cpp)
47
  MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/engines/qnn-cpu/*.cpp)
 
48
  MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/engines/qnn-htp/*.cpp)
49
  MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/utils/*.cpp)
50
  MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/loggers/*.cpp)
 
29
  PACKAGE_C_INCLUDES += -I $(LOCAL_PATH)/../src/qualla/tokenizers
30
  PACKAGE_C_INCLUDES += -I $(LOCAL_PATH)/../src/qualla/engines/qnn-api
31
  PACKAGE_C_INCLUDES += -I $(LOCAL_PATH)/../src/qualla/engines/qnn-cpu
32
+ PACKAGE_C_INCLUDES += -I $(LOCAL_PATH)/../src/qualla/engines/qnn-gpu
33
  PACKAGE_C_INCLUDES += -I $(LOCAL_PATH)/../src/qualla/engines/qnn-htp
34
 
35
  #========================== Define T2T Lib variables =============================================
 
46
  MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/engines/*.cpp)
47
  MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/engines/qnn-api/*.cpp)
48
  MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/engines/qnn-cpu/*.cpp)
49
+ MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/engines/qnn-gpu/*.cpp)
50
  MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/engines/qnn-htp/*.cpp)
51
  MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/utils/*.cpp)
52
  MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/loggers/*.cpp)
Genie/Genie/make/Application.mk CHANGED
@@ -10,5 +10,5 @@ APP_ABI := arm64-v8a
10
  APP_STL := c++_shared
11
  APP_PLATFORM := android-21
12
  APP_MODULES := Genie
13
- APP_CPPFLAGS += -std=c++2a -O3 -Wall -frtti -fexceptions -fvisibility=hidden -DGENIE_API="__attribute__((visibility(\"default\")))" -DSPILLFILL -DQUALLA_ENGINE_QNN_HTP=TRUE -DQUALLA_ENGINE_QNN_CPU=TRUE -DQUALLA_APPS=OFF -DFMT_HEADER_ONLY -DGENIE_SAMPLE -DQUALLA_INTERNAL_QNN_SDK -DGENIE_SSD_FEATURE -DGENIE_SPD_FEATURE -DGENIE_LADE_FEATURE -DGENIE_MULTISTREAM_FEATURE -DGENIE_LORA_FEATURE -DGENIE_E2T_FEATURE
14
  APP_LDFLAGS += -lc -lm -ldl -Wl,--version-script=GenieSymbols.default -Wl,--strip-all
 
10
  APP_STL := c++_shared
11
  APP_PLATFORM := android-21
12
  APP_MODULES := Genie
13
+ APP_CPPFLAGS += -std=c++2a -O3 -Wall -frtti -fexceptions -fvisibility=hidden -DGENIE_API="__attribute__((visibility(\"default\")))" -DSPILLFILL -DQUALLA_ENGINE_QNN_HTP=TRUE -DQUALLA_ENGINE_QNN_CPU=TRUE -DQUALLA_ENGINE_QNN_GPU=TRUE -DQUALLA_APPS=OFF -DFMT_HEADER_ONLY -DGENIE_SAMPLE -DQUALLA_INTERNAL_QNN_SDK -DGENIE_SSD_FEATURE -DGENIE_SPD_FEATURE -DGENIE_LADE_FEATURE -DGENIE_MULTISTREAM_FEATURE -DGENIE_LORA_FEATURE -DGENIE_E2T_FEATURE
14
  APP_LDFLAGS += -lc -lm -ldl -Wl,--version-script=GenieSymbols.default -Wl,--strip-all
Genie/Genie/make/Makefile.linux-x86_64 CHANGED
@@ -17,6 +17,7 @@ SRC_DIR_SAMPLE_DIALOGS := src/qualla/dialogs
17
  SRC_DIR_GENIE_ENGINES := src/qualla/engines
18
  SRC_DIR_GENIE_QNN_API := src/qualla/engines/qnn-api
19
  SRC_DIR_GENIE_ENGINES_CPU := src/qualla/engines/qnn-cpu
 
20
  SRC_DIR_GENIE_UTILS := src/qualla/utils
21
  #
22
  SRC_DIR_GENIE_LOGGERS := src/qualla/loggers
@@ -29,6 +30,7 @@ SRC_DIR_GENIE := src
29
 
30
  # Includes
31
  GENIE_ENGINES_CPU_INCLUDE := src/qualla/engines/qnn-cpu
 
32
  GENIE_ENGINES_API_INCLUDE := src/qualla/engines/qnn-api
33
  GENIE_ENGINES_HTP_INCLUDE := src/qualla/engines/qnn-htp
34
  GENIE_TOKENIZER_INCLUDE := src/qualla/tokenizers
@@ -62,7 +64,7 @@ endif
62
  GENIE_all: $(libGenie)
63
 
64
  # Include paths
65
- INCLUDES += -I$(GENIE_INCLUDE) -I$(QUALLA_INCLUDE) -I$(SRC_DIR_GENIE_TOKENIZERS) -I$(QNN_API_INCLUDE) -I$(GENIE_ENGINES_CPU_INCLUDE) -I$(QNN_API_HTP_INCLUDE) -I$(GENIE_ENGINES_API_INCLUDE) -I$(GENIE_TOKENIZER_INCLUDE) -I$(GENIE_C_API_HEADERS_INCLUDE)
66
 
67
  # set compiler flags
68
  COMMON_CXXFLAGS = -std=c++2a -frtti -fPIC -Wall -pg -pthread -nostdinc++ -stdlib=libc++ -idirafter /usr/lib/llvm-14/include/c++/v1 -nostdinc -idirafter /usr/lib/llvm-14/lib/clang/14.0.0/include/ -idirafter /usr/include $(INCLUDES)
@@ -71,11 +73,11 @@ COMMON_LDFLAGS = -shared -s -fPIC -pthread -L/usr/lib/x86_64-linux-gnu -L./src/
71
  COMMON_CFLAGS = -nostdinc -idirafter /usr/lib/llvm-14/lib/clang/14.0.0/include/ -idirafter /usr/include
72
 
73
  ifdef QNN_DEBUG_ENABLE
74
- CXXFLAGS += $(COMMON_CXXFLAGS) -march=x86-64 -O0 -g -DQNN_API="" -DSPILLFILL -DQUALLA_ENGINE_QNN_CPU=TRUE -DQUALLA_APPS=OFF -DFMT_HEADER_ONLY -DGENIE_SAMPLE -DQUALLA_INTERNAL_QNN_SDK -DGENIE_SSD_FEATURE -DGENIE_SPD_FEATURE -DGENIE_LADE_FEATURE -DGENIE_MULTISTREAM_FEATURE -DGENIE_LORA_FEATURE -DGENIE_E2T_FEATURE
75
  CFLAGS += $(COMMON_CFLAGS)
76
  LDFLAGS += $(COMMON_LDFLAGS)
77
  else
78
- CXXFLAGS += $(COMMON_CXXFLAGS) -march=x86-64 -O3 -Wno-write-strings -fvisibility=hidden -DGENIE_API="__attribute__((visibility(\"default\")))" -DSPILLFILL -DQUALLA_ENGINE_QNN_CPU=TRUE -DQUALLA_APPS=OFF -DFMT_HEADER_ONLY -DGENIE_SAMPLE -DQUALLA_INTERNAL_QNN_SDK -DGENIE_SSD_FEATURE -DGENIE_SPD_FEATURE -DGENIE_LADE_FEATURE -DGENIE_MULTISTREAM_FEATURE -DGENIE_LORA_FEATURE -DGENIE_E2T_FEATURE
79
  CFLAGS += $(COMMON_CFLAGS)
80
  LDFLAGS += $(COMMON_LDFLAGS) -fvisibility=hidden -flto
81
  endif
@@ -89,6 +91,7 @@ SOURCES_GENIE_QNN_API_CPP := $(wildcard $(SRC_DIR_GENIE_QNN_API)/*.cpp)
89
  SOURCES_GENIE_ENGINES_CPP := $(filter-out $(SRC_DIR_GENIE_ENGINES)/qnn-htp.cpp, $(wildcard $(SRC_DIR_GENIE_ENGINES)/*.cpp))
90
  SOURCES_GENIE_DIALOGS_CPP := $(wildcard $(SRC_DIR_SAMPLE_DIALOGS)/*.cpp)
91
  SOURCES_GENIE_ENGINES_CPU_CPP := $(wildcard $(SRC_DIR_GENIE_ENGINES_CPU)/*.cpp)
 
92
  SOURCES_GENIE_UTILS_CPP := $(wildcard $(SRC_DIR_GENIE_UTILS)/*.cpp)
93
 
94
 
@@ -108,6 +111,8 @@ OBJ_DIR_GENIE_ENGINES := $(OBJ_DIR_QUALLA)/engines
108
  OBJ_DIR_GENIE_UTILS := $(OBJ_DIR_QUALLA)/utils
109
  OBJ_DIR_GENIE_ENGINES_CPU := $(OBJ_DIR_QUALLA)/engines/qnn-cpu
110
  $(shell mkdir -p $(OBJ_DIR_GENIE_ENGINES_CPU))
 
 
111
 
112
  OBJ_DIR_GENIE_LOGGERS := obj/$(QNN_TARGET)/qualla/loggers
113
  OBJ_DIR_GENIE_SAMPLERS := obj/$(QNN_TARGET)/qualla/samplers
@@ -125,6 +130,7 @@ OBJECTS_GENIE_ENGINES := $(patsubst %.cpp,$(OBJ_DIR_GENIE_ENGINES)/%.o,$(foreach
125
  OBJECTS_GENIE_DIALOGS := $(patsubst %.cpp,$(OBJ_DIR_GENIE_DIALOGS)/%.o,$(foreach x,$(SOURCES_GENIE_DIALOGS_CPP),$(notdir $(x))))
126
  OBJECTS_GENIE_UTILS := $(patsubst %.cpp,$(OBJ_DIR_GENIE_UTILS)/%.o,$(foreach x,$(SOURCES_GENIE_UTILS_CPP),$(notdir $(x))))
127
  OBJECTS_GENIE_ENGINES_CPU := $(patsubst %.cpp,$(OBJ_DIR_GENIE_ENGINES_CPU)/%.o,$(foreach x,$(SOURCES_GENIE_ENGINES_CPU_CPP),$(notdir $(x))))
 
128
 
129
  OBJECTS_GENIE_LOGGERS := $(patsubst %.cpp,$(OBJ_DIR_GENIE_LOGGERS)/%.o,$(foreach x,$(SOURCES_GENIE_LOGGERS_CPP),$(notdir $(x))))
130
  OBJECTS_GENIE_SAMPLERS := $(patsubst %.cpp,$(OBJ_DIR_GENIE_SAMPLERS)/%.o,$(foreach x,$(SOURCES_GENIE_SAMPLERS_CPP),$(notdir $(x))))
@@ -157,16 +163,18 @@ $(OBJ_DIR_GENIE_UTILS)/%.o: $(SRC_DIR_GENIE_UTILS)/%.cpp $(CXX) $(CXXFLAGS) -c $
157
 
158
  $(OBJ_DIR_GENIE_ENGINES_CPU)/%.o: $(SRC_DIR_GENIE_ENGINES_CPU)/%.cpp $(CXX) $(CXXFLAGS) -c $^ -o $@
159
 
 
 
160
  $(OBJ_DIR_GENIE_LOGGERS)/%.o: $(SRC_DIR_GENIE_LOGGERS)/%.cpp $(CXX) $(CXXFLAGS) -c $^ -o $@
161
 
162
  $(OBJ_DIR_GENIE_SAMPLERS)/%.o: $(SRC_DIR_GENIE_SAMPLERS)/%.cpp $(CXX) $(CXXFLAGS) -c $^ -o $@
163
 
164
 
165
  # set up resources
166
- directories := $(TARGET_DIR) $(OBJ_DIR_GENIE) $(OBJ_DIR_GENIE_QNN_API) $(OBJ_DIR_QUALLA) $(OBJ_DIR_GENIE_TOKENIZERS) $(OBJ_DIR_GENIE_ENGINES) $(OBJ_DIR_GENIE_DIALOGS) $(OBJ_DIR_GENIE_UTILS) $(OBJ_DIR_GENIE_ENGINES_CPU) $(OBJ_DIR_GENIE_LOGGERS) $(OBJ_DIR_GENIE_SAMPLERS)
167
 
168
  # Compile
169
- $(libGenie): $(OBJECTS_GENIE) $(OBJECTS_QUALLA) $(OBJECTS_GENIE_QNN_API) $(OBJECTS_GENIE_TOKENIZERS) $(OBJECTS_GENIE_ENGINES) $(OBJECTS_GENIE_DIALOGS) $(OBJECTS_GENIE_UTILS) $(OBJECTS_GENIE_ENGINES_CPU) $(OBJECTS_GENIE_LOGGERS) $(OBJECTS_GENIE_SAMPLERS) | $(directories)
170
  $(CXX) $(CXXFLAGS) -shared -o $@ $^ $(LIBS) $(libtokenizers)
171
 
172
 
@@ -179,6 +187,7 @@ $(OBJECTS_GENIE_ENGINES): | $(OBJ_DIR_GENIE_ENGINES)
179
  $(OBJECTS_GENIE_DIALOGS): | $(OBJ_DIR_GENIE_DIALOGS)
180
  $(OBJECTS_GENIE_UTILS): | $(OBJ_DIR_GENIE_UTILS)
181
  $(OBJECTS_GENIE_ENGINES_CPU): | $(OBJ_DIR_GENIE_ENGINES_CPU)
 
182
  $(OBJECTS_GENIE_LOGGERS): | $(OBJ_DIR_GENIE_LOGGERS)
183
  $(OBJECTS_GENIE_SAMPLERS): | $(OBJ_DIR_GENIE_SAMPLERS)
184
 
 
17
  SRC_DIR_GENIE_ENGINES := src/qualla/engines
18
  SRC_DIR_GENIE_QNN_API := src/qualla/engines/qnn-api
19
  SRC_DIR_GENIE_ENGINES_CPU := src/qualla/engines/qnn-cpu
20
+ SRC_DIR_GENIE_ENGINES_GPU := src/qualla/engines/qnn-gpu
21
  SRC_DIR_GENIE_UTILS := src/qualla/utils
22
  #
23
  SRC_DIR_GENIE_LOGGERS := src/qualla/loggers
 
30
 
31
  # Includes
32
  GENIE_ENGINES_CPU_INCLUDE := src/qualla/engines/qnn-cpu
33
+ GENIE_ENGINES_GPU_INCLUDE := src/qualla/engines/qnn-gpu
34
  GENIE_ENGINES_API_INCLUDE := src/qualla/engines/qnn-api
35
  GENIE_ENGINES_HTP_INCLUDE := src/qualla/engines/qnn-htp
36
  GENIE_TOKENIZER_INCLUDE := src/qualla/tokenizers
 
64
  GENIE_all: $(libGenie)
65
 
66
  # Include paths
67
+ INCLUDES += -I$(GENIE_INCLUDE) -I$(QUALLA_INCLUDE) -I$(SRC_DIR_GENIE_TOKENIZERS) -I$(QNN_API_INCLUDE) -I$(GENIE_ENGINES_CPU_INCLUDE) -I$(GENIE_ENGINES_GPU_INCLUDE) -I$(QNN_API_HTP_INCLUDE) -I$(GENIE_ENGINES_API_INCLUDE) -I$(GENIE_TOKENIZER_INCLUDE) -I$(GENIE_C_API_HEADERS_INCLUDE)
68
 
69
  # set compiler flags
70
  COMMON_CXXFLAGS = -std=c++2a -frtti -fPIC -Wall -pg -pthread -nostdinc++ -stdlib=libc++ -idirafter /usr/lib/llvm-14/include/c++/v1 -nostdinc -idirafter /usr/lib/llvm-14/lib/clang/14.0.0/include/ -idirafter /usr/include $(INCLUDES)
 
73
  COMMON_CFLAGS = -nostdinc -idirafter /usr/lib/llvm-14/lib/clang/14.0.0/include/ -idirafter /usr/include
74
 
75
  ifdef QNN_DEBUG_ENABLE
76
+ CXXFLAGS += $(COMMON_CXXFLAGS) -march=x86-64 -O0 -g -DQNN_API="" -DSPILLFILL -DQUALLA_ENGINE_QNN_CPU=TRUE -DQUALLA_ENGINE_QNN_GPU=TRUE -DQUALLA_APPS=OFF -DFMT_HEADER_ONLY -DGENIE_SAMPLE -DQUALLA_INTERNAL_QNN_SDK -DGENIE_SSD_FEATURE -DGENIE_SPD_FEATURE -DGENIE_LADE_FEATURE -DGENIE_MULTISTREAM_FEATURE -DGENIE_LORA_FEATURE -DGENIE_E2T_FEATURE
77
  CFLAGS += $(COMMON_CFLAGS)
78
  LDFLAGS += $(COMMON_LDFLAGS)
79
  else
80
+ CXXFLAGS += $(COMMON_CXXFLAGS) -march=x86-64 -O3 -Wno-write-strings -fvisibility=hidden -DGENIE_API="__attribute__((visibility(\"default\")))" -DSPILLFILL -DQUALLA_ENGINE_QNN_CPU=TRUE -DQUALLA_ENGINE_QNN_GPU=TRUE -DQUALLA_APPS=OFF -DFMT_HEADER_ONLY -DGENIE_SAMPLE -DQUALLA_INTERNAL_QNN_SDK -DGENIE_SSD_FEATURE -DGENIE_SPD_FEATURE -DGENIE_LADE_FEATURE -DGENIE_MULTISTREAM_FEATURE -DGENIE_LORA_FEATURE -DGENIE_E2T_FEATURE
81
  CFLAGS += $(COMMON_CFLAGS)
82
  LDFLAGS += $(COMMON_LDFLAGS) -fvisibility=hidden -flto
83
  endif
 
91
  SOURCES_GENIE_ENGINES_CPP := $(filter-out $(SRC_DIR_GENIE_ENGINES)/qnn-htp.cpp, $(wildcard $(SRC_DIR_GENIE_ENGINES)/*.cpp))
92
  SOURCES_GENIE_DIALOGS_CPP := $(wildcard $(SRC_DIR_SAMPLE_DIALOGS)/*.cpp)
93
  SOURCES_GENIE_ENGINES_CPU_CPP := $(wildcard $(SRC_DIR_GENIE_ENGINES_CPU)/*.cpp)
94
+ SOURCES_GENIE_ENGINES_GPU_CPP := $(wildcard $(SRC_DIR_GENIE_ENGINES_GPU)/*.cpp)
95
  SOURCES_GENIE_UTILS_CPP := $(wildcard $(SRC_DIR_GENIE_UTILS)/*.cpp)
96
 
97
 
 
111
  OBJ_DIR_GENIE_UTILS := $(OBJ_DIR_QUALLA)/utils
112
  OBJ_DIR_GENIE_ENGINES_CPU := $(OBJ_DIR_QUALLA)/engines/qnn-cpu
113
  $(shell mkdir -p $(OBJ_DIR_GENIE_ENGINES_CPU))
114
+ OBJ_DIR_GENIE_ENGINES_GPU := $(OBJ_DIR_QUALLA)/engines/qnn-gpu
115
+ $(shell mkdir -p $(OBJ_DIR_GENIE_ENGINES_GPU))
116
 
117
  OBJ_DIR_GENIE_LOGGERS := obj/$(QNN_TARGET)/qualla/loggers
118
  OBJ_DIR_GENIE_SAMPLERS := obj/$(QNN_TARGET)/qualla/samplers
 
130
  OBJECTS_GENIE_DIALOGS := $(patsubst %.cpp,$(OBJ_DIR_GENIE_DIALOGS)/%.o,$(foreach x,$(SOURCES_GENIE_DIALOGS_CPP),$(notdir $(x))))
131
  OBJECTS_GENIE_UTILS := $(patsubst %.cpp,$(OBJ_DIR_GENIE_UTILS)/%.o,$(foreach x,$(SOURCES_GENIE_UTILS_CPP),$(notdir $(x))))
132
  OBJECTS_GENIE_ENGINES_CPU := $(patsubst %.cpp,$(OBJ_DIR_GENIE_ENGINES_CPU)/%.o,$(foreach x,$(SOURCES_GENIE_ENGINES_CPU_CPP),$(notdir $(x))))
133
+ OBJECTS_GENIE_ENGINES_GPU := $(patsubst %.cpp,$(OBJ_DIR_GENIE_ENGINES_GPU)/%.o,$(foreach x,$(SOURCES_GENIE_ENGINES_GPU_CPP),$(notdir $(x))))
134
 
135
  OBJECTS_GENIE_LOGGERS := $(patsubst %.cpp,$(OBJ_DIR_GENIE_LOGGERS)/%.o,$(foreach x,$(SOURCES_GENIE_LOGGERS_CPP),$(notdir $(x))))
136
  OBJECTS_GENIE_SAMPLERS := $(patsubst %.cpp,$(OBJ_DIR_GENIE_SAMPLERS)/%.o,$(foreach x,$(SOURCES_GENIE_SAMPLERS_CPP),$(notdir $(x))))
 
163
 
164
  $(OBJ_DIR_GENIE_ENGINES_CPU)/%.o: $(SRC_DIR_GENIE_ENGINES_CPU)/%.cpp $(CXX) $(CXXFLAGS) -c $^ -o $@
165
 
166
+ $(OBJ_DIR_GENIE_ENGINES_GPU)/%.o: $(SRC_DIR_GENIE_ENGINES_GPU)/%.cpp $(CXX) $(CXXFLAGS) -c $^ -o $@
167
+
168
  $(OBJ_DIR_GENIE_LOGGERS)/%.o: $(SRC_DIR_GENIE_LOGGERS)/%.cpp $(CXX) $(CXXFLAGS) -c $^ -o $@
169
 
170
  $(OBJ_DIR_GENIE_SAMPLERS)/%.o: $(SRC_DIR_GENIE_SAMPLERS)/%.cpp $(CXX) $(CXXFLAGS) -c $^ -o $@
171
 
172
 
173
  # set up resources
174
+ directories := $(TARGET_DIR) $(OBJ_DIR_GENIE) $(OBJ_DIR_GENIE_QNN_API) $(OBJ_DIR_QUALLA) $(OBJ_DIR_GENIE_TOKENIZERS) $(OBJ_DIR_GENIE_ENGINES) $(OBJ_DIR_GENIE_DIALOGS) $(OBJ_DIR_GENIE_UTILS) $(OBJ_DIR_GENIE_ENGINES_CPU) $(OBJ_DIR_GENIE_ENGINES_GPU) $(OBJ_DIR_GENIE_LOGGERS) $(OBJ_DIR_GENIE_SAMPLERS)
175
 
176
  # Compile
177
+ $(libGenie): $(OBJECTS_GENIE) $(OBJECTS_QUALLA) $(OBJECTS_GENIE_QNN_API) $(OBJECTS_GENIE_TOKENIZERS) $(OBJECTS_GENIE_ENGINES) $(OBJECTS_GENIE_DIALOGS) $(OBJECTS_GENIE_UTILS) $(OBJECTS_GENIE_ENGINES_CPU) $(OBJECTS_GENIE_ENGINES_GPU) $(OBJECTS_GENIE_LOGGERS) $(OBJECTS_GENIE_SAMPLERS) | $(directories)
178
  $(CXX) $(CXXFLAGS) -shared -o $@ $^ $(LIBS) $(libtokenizers)
179
 
180
 
 
187
  $(OBJECTS_GENIE_DIALOGS): | $(OBJ_DIR_GENIE_DIALOGS)
188
  $(OBJECTS_GENIE_UTILS): | $(OBJ_DIR_GENIE_UTILS)
189
  $(OBJECTS_GENIE_ENGINES_CPU): | $(OBJ_DIR_GENIE_ENGINES_CPU)
190
+ $(OBJECTS_GENIE_ENGINES_GPU): | $(OBJ_DIR_GENIE_ENGINES_GPU)
191
  $(OBJECTS_GENIE_LOGGERS): | $(OBJ_DIR_GENIE_LOGGERS)
192
  $(OBJECTS_GENIE_SAMPLERS): | $(OBJ_DIR_GENIE_SAMPLERS)
193
 
Genie/Genie/src/Dialog.cpp CHANGED
@@ -95,81 +95,6 @@ static void translateContextConfig(const qualla::json& genieConfig, qualla::json
95
  }
96
  }
97
 
98
- //=============================================================================
99
- // Sampler::Config functions
100
- //=============================================================================
101
-
102
- static void validateSamplerConfig(const qualla::json& config) {
103
- if (!config.is_object()) {
104
- throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "sampler config is not an object");
105
- }
106
-
107
- std::set<std::string> mandatoryFields{"version"};
108
- for (const auto& field : mandatoryFields) {
109
- if (!config.contains(field)) {
110
- throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing sampler field: " + field);
111
- }
112
- }
113
-
114
- // component is used in the "ENFORCE" macros
115
- std::string component = "sampler";
116
-
117
- for (auto& item : config.items()) {
118
- if (item.key() == "version") {
119
- JSON_ENFORCE_NUMERIC();
120
- if (item.value().get<int>() != 1) {
121
- throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
122
- "Invalid sampler config: unsupported version: " + item.value().dump());
123
- }
124
- } else if (item.key() == "seed") {
125
- JSON_ENFORCE_NUMERIC();
126
- } else if (item.key() == "temp") {
127
- JSON_ENFORCE_NUMERIC();
128
- } else if (item.key() == "top-k") {
129
- JSON_ENFORCE_NUMERIC();
130
- } else if (item.key() == "top-p") {
131
- JSON_ENFORCE_NUMERIC();
132
- } else if (item.key() == "greedy") {
133
- JSON_ENFORCE_BOOLEAN();
134
- } else {
135
- throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown sampler config key: " + item.key());
136
- }
137
- }
138
- }
139
-
140
- static void translateSamplerConfig(const qualla::json& genieConfig, qualla::json& quallaConfig) {
141
- if (genieConfig["dialog"].contains("sampler")) {
142
- quallaConfig["sampler"]["type"] = "basic";
143
-
144
- if (genieConfig["dialog"]["sampler"].contains("seed")) {
145
- quallaConfig["sampler"]["seed"] = genieConfig["dialog"]["sampler"]["seed"];
146
- }
147
- if (genieConfig["dialog"]["sampler"].contains("temp")) {
148
- quallaConfig["sampler"]["temp"] = genieConfig["dialog"]["sampler"]["temp"];
149
- }
150
-
151
- quallaConfig["sampler"]["role"] = "primary";
152
- #if defined(GENIE_SPD_FEATURE)
153
- if (genieConfig["dialog"]["type"] == "spd") {
154
- quallaConfig["sampler"]["role"] = "target";
155
- }
156
- #endif
157
-
158
- if (genieConfig["dialog"]["sampler"].contains("top-k")) {
159
- quallaConfig["sampler"]["top-k"] = genieConfig["dialog"]["sampler"]["top-k"];
160
- }
161
- if (genieConfig["dialog"]["sampler"].contains("top-p")) {
162
- quallaConfig["sampler"]["top-p"] = genieConfig["dialog"]["sampler"]["top-p"];
163
- }
164
- if (genieConfig["dialog"]["sampler"].contains("greedy")) {
165
- quallaConfig["sampler"]["greedy"] = genieConfig["dialog"]["sampler"]["greedy"];
166
- }
167
- if (genieConfig["dialog"]["sampler"].contains("seed")) {
168
- quallaConfig["sampler"]["seed"] = genieConfig["dialog"]["sampler"]["seed"];
169
- }
170
- }
171
- }
172
-
173
  //=============================================================================
174
  // Tokenizer::Config functions
175
  //=============================================================================
@@ -322,6 +247,8 @@ static void validateBackendHtpConfig(const qualla::json& config) {
322
  } else if (item.key() == "rope-theta") {
323
  rope_theta_set = true;
324
  JSON_ENFORCE_NUMERIC();
 
 
325
  } else {
326
  throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown QnnHtp config key: " + item.key());
327
  }
@@ -410,7 +337,7 @@ static void validateBackendConfig(const qualla::json& config) {
410
  htp = true;
411
  } else if (type == "QnnGenAiTransformer") {
412
  genai = true;
413
- } else {
414
  throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
415
  "Invalid backend config: unsupported type: " + item.value().dump());
416
  }
@@ -629,6 +556,9 @@ static void validateModelLibraryConfig(const qualla::json& config) {
629
  }
630
  } else if (item.key() == "model-bin") {
631
  JSON_ENFORCE_STRING();
 
 
 
632
  } else {
633
  throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown library config key: " + item.key());
634
  }
@@ -956,6 +886,10 @@ static void translateEngineConfig(const qualla::json& genieEngineConfig,
956
  quallaEngineConfig["use-async-Init"] =
957
  genieEngineConfig["backend"]["QnnHtp"]["allow-async-init"];
958
  }
 
 
 
 
959
  } else if (genieEngineConfig["backend"]["type"] == "QnnGenAiTransformer") {
960
  quallaEngineConfig["type"] = "qnn-cpu";
961
  quallaEngineConfig["backend-lib"] = getLibName("QnnGenAiTransformer");
@@ -979,6 +913,8 @@ static void translateEngineConfig(const qualla::json& genieEngineConfig,
979
  quallaEngineConfig["n_heads"] =
980
  genieEngineConfig["backend"]["QnnGenAiTransformer"]["n-heads"];
981
  }
 
 
982
  }
983
 
984
  if (genieEngineConfig["backend"].contains("extensions")) {
@@ -1020,6 +956,21 @@ static void translateEngineConfig(const qualla::json& genieEngineConfig,
1020
  quallaEngineConfig["model-bin-path"] = genieEngineConfig["model"]["library"]["model-bin"];
1021
  quallaEngineConfig["op-package"] =
1022
  getLibName("QnnGenAiTransformerCpuOpPkg") + ":QnnOpPackage_interfaceProvider";
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1023
  }
1024
  if (genieEngineConfig["model"].contains("positional-encoding")) {
1025
  quallaEngineConfig["positional-encoding"]["type"] =
@@ -1424,7 +1375,7 @@ static void validateDialogConfig(const qualla::json& config) {
1424
  validateTokenizerConfig(item.value());
1425
  } else if (item.key() == "sampler") {
1426
  JSON_ENFORCE_OBJECT();
1427
- validateSamplerConfig(item.value());
1428
  } else if (item.key() == "engine") {
1429
  JSON_ENFORCE_ARRAY_OR_OBJECT();
1430
  } else if (item.key() == "embedding") {
@@ -1550,7 +1501,7 @@ static void translateDialogConfig(const qualla::json& genieConfig, qualla::json&
1550
 
1551
  translateContextConfig(genieConfig, quallaConfig);
1552
  translateTokenizerConfig(genieConfig, quallaConfig);
1553
- translateSamplerConfig(genieConfig, quallaConfig);
1554
  translateMultiEngineConfig(genieConfig, quallaConfig);
1555
  translateEmbeddingConfig(genieConfig, quallaConfig);
1556
  }
@@ -1611,7 +1562,7 @@ Dialog::Config::Config(const char* configStr) {
1611
  m_config = config;
1612
  }
1613
 
1614
- qualla::json Dialog::Config::getJson() const { return m_config; }
1615
 
1616
  //=============================================================================
1617
  // Dialog functions
@@ -1640,6 +1591,27 @@ Dialog::Dialog(std::shared_ptr<Config> config) {
1640
  if (!m_quallaDialog) {
1641
  throw Exception(GENIE_STATUS_ERROR_MEM_ALLOC, "Could not create a dialog object");
1642
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1643
  }
1644
 
1645
  static_assert(qualla::Sentence::Code::COMPLETE ==
@@ -1801,4 +1773,6 @@ int32_t Dialog::tokenQuery(const uint32_t* tokens,
1801
  kpis.generate.last_usec,
1802
  kpis.tps.generate);
1803
  return (status) ? (GENIE_STATUS_SUCCESS) : (GENIE_STATUS_ERROR_QUERY_FAILED);
1804
- }
 
 
 
95
  }
96
  }
97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  //=============================================================================
99
  // Tokenizer::Config functions
100
  //=============================================================================
 
247
  } else if (item.key() == "rope-theta") {
248
  rope_theta_set = true;
249
  JSON_ENFORCE_NUMERIC();
250
+ } else if (item.key() == "enable-graph-switching") {
251
+ JSON_ENFORCE_BOOLEAN();
252
  } else {
253
  throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown QnnHtp config key: " + item.key());
254
  }
 
337
  htp = true;
338
  } else if (type == "QnnGenAiTransformer") {
339
  genai = true;
340
+ } else if (type != "QnnGpu") {
341
  throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
342
  "Invalid backend config: unsupported type: " + item.value().dump());
343
  }
 
556
  }
557
  } else if (item.key() == "model-bin") {
558
  JSON_ENFORCE_STRING();
559
+ } else if (item.key() == "lora") {
560
+ JSON_ENFORCE_OBJECT();
561
+ validateLoraConfig(item.value());
562
  } else {
563
  throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown library config key: " + item.key());
564
  }
 
886
  quallaEngineConfig["use-async-Init"] =
887
  genieEngineConfig["backend"]["QnnHtp"]["allow-async-init"];
888
  }
889
+ if (genieEngineConfig["backend"]["QnnHtp"].contains("enable-graph-switching")) {
890
+ quallaEngineConfig["enable-graph-switching"] =
891
+ genieEngineConfig["backend"]["QnnHtp"]["enable-graph-switching"];
892
+ }
893
  } else if (genieEngineConfig["backend"]["type"] == "QnnGenAiTransformer") {
894
  quallaEngineConfig["type"] = "qnn-cpu";
895
  quallaEngineConfig["backend-lib"] = getLibName("QnnGenAiTransformer");
 
913
  quallaEngineConfig["n_heads"] =
914
  genieEngineConfig["backend"]["QnnGenAiTransformer"]["n-heads"];
915
  }
916
+ } else if (genieEngineConfig["backend"]["type"] == "QnnGpu") {
917
+ quallaEngineConfig["type"] = "qnn-gpu";
918
  }
919
 
920
  if (genieEngineConfig["backend"].contains("extensions")) {
 
956
  quallaEngineConfig["model-bin-path"] = genieEngineConfig["model"]["library"]["model-bin"];
957
  quallaEngineConfig["op-package"] =
958
  getLibName("QnnGenAiTransformerCpuOpPkg") + ":QnnOpPackage_interfaceProvider";
959
+ if (genieEngineConfig["model"]["library"].contains("lora")) {
960
+ for (int i = 0; i < genieEngineConfig["model"]["library"]["lora"]["adapters"].size(); i++) {
961
+ quallaEngineConfig["lora"][i]["adapter-name"] =
962
+ genieEngineConfig["model"]["library"]["lora"]["adapters"][i]["name"];
963
+ if (genieEngineConfig["model"]["library"]["lora"].contains("alpha-tensor-name")) {
964
+ quallaEngineConfig["lora"][i]["alpha-tensor-name"] =
965
+ genieEngineConfig["model"]["library"]["lora"]
966
+ ["alpha-tensor-name"];
967
+ }
968
+ quallaEngineConfig["lora"][i]["alpha-tensor-value"] = 1.0f;
969
+ quallaEngineConfig["lora"][i]["binsection-basedir"] = "";
970
+ quallaEngineConfig["lora"][i]["bin-sections"] =
971
+ genieEngineConfig["model"]["library"]["lora"]["adapters"][i]["bin-sections"];
972
+ }
973
+ }
974
  }
975
  if (genieEngineConfig["model"].contains("positional-encoding")) {
976
  quallaEngineConfig["positional-encoding"]["type"] =
 
1375
  validateTokenizerConfig(item.value());
1376
  } else if (item.key() == "sampler") {
1377
  JSON_ENFORCE_OBJECT();
1378
+ Sampler::SamplerConfig::validateSamplerConfig(item.value());
1379
  } else if (item.key() == "engine") {
1380
  JSON_ENFORCE_ARRAY_OR_OBJECT();
1381
  } else if (item.key() == "embedding") {
 
1501
 
1502
  translateContextConfig(genieConfig, quallaConfig);
1503
  translateTokenizerConfig(genieConfig, quallaConfig);
1504
+ Sampler::SamplerConfig::translateSamplerConfig(genieConfig, quallaConfig);
1505
  translateMultiEngineConfig(genieConfig, quallaConfig);
1506
  translateEmbeddingConfig(genieConfig, quallaConfig);
1507
  }
 
1562
  m_config = config;
1563
  }
1564
 
1565
+ qualla::json& Dialog::Config::getJson() { return m_config; }
1566
 
1567
  //=============================================================================
1568
  // Dialog functions
 
1591
  if (!m_quallaDialog) {
1592
  throw Exception(GENIE_STATUS_ERROR_MEM_ALLOC, "Could not create a dialog object");
1593
  }
1594
+ /*
1595
+ * spec-dec has a mandatory "target" sampler and an optional "draft" sampler
1596
+ * Check their availability and pass their references to Dialog Sampler to update with
1597
+ * applyConfig()
1598
+ */
1599
+ std::shared_ptr<Sampler> sampler;
1600
+ std::vector<std::reference_wrapper<qualla::Sampler>> quallaSamplers;
1601
+ if (quallaConfig["type"] == "spec-dec") {
1602
+ quallaSamplers.push_back(m_quallaDialog->sampler("target"));
1603
+ if (m_quallaDialog->isSamplerPresent("draft"))
1604
+ quallaSamplers.push_back(m_quallaDialog->sampler("draft"));
1605
+ sampler = std::make_shared<Sampler>(config->getJson()["dialog"], quallaSamplers);
1606
+ } else {
1607
+ quallaSamplers.push_back(m_quallaDialog->sampler()); // Default role is "primary"
1608
+ sampler = std::make_shared<Sampler>(config->getJson()["dialog"], quallaSamplers);
1609
+ }
1610
+ m_samplerHandle = Sampler::add(sampler);
1611
+ }
1612
+
1613
+ GenieSampler_Handle_t Dialog::getSamplerHandle(std::shared_ptr<Dialog> dialog) {
1614
+ return dialog->m_samplerHandle;
1615
  }
1616
 
1617
  static_assert(qualla::Sentence::Code::COMPLETE ==
 
1773
  kpis.generate.last_usec,
1774
  kpis.tps.generate);
1775
  return (status) ? (GENIE_STATUS_SUCCESS) : (GENIE_STATUS_ERROR_QUERY_FAILED);
1776
+ }
1777
+
1778
+ Dialog::~Dialog() { Sampler::remove(m_samplerHandle); }
Genie/Genie/src/Dialog.hpp CHANGED
@@ -10,11 +10,13 @@
10
 
11
  #include <atomic>
12
  #include <memory>
 
13
 
14
  #include "GenieDialog.h"
15
  #include "Util/HandleManager.hpp"
16
  #include "qualla/dialog.hpp"
17
  #include "qualla/DialogCallback.hpp"
 
18
 
19
  namespace genie {
20
 
@@ -33,7 +35,7 @@ class Dialog {
33
  static void remove(GenieDialogConfig_Handle_t handle);
34
 
35
  Config(const char* configStr);
36
- qualla::json getJson() const;
37
 
38
  private:
39
  static qnn::util::HandleManager<Config> s_manager;
@@ -43,10 +45,12 @@ class Dialog {
43
  static GenieDialog_Handle_t add(std::shared_ptr<Dialog> dialog);
44
  static std::shared_ptr<Dialog> get(GenieDialog_Handle_t handle);
45
  static void remove(GenieDialog_Handle_t handle);
 
46
 
47
  qualla::DialogCallback dialogCallback;
48
 
49
  Dialog(std::shared_ptr<Config> config);
 
50
 
51
  Dialog(const Dialog&) = delete;
52
  Dialog& operator=(const Dialog&) = delete;
@@ -91,5 +95,6 @@ class Dialog {
91
  uint32_t m_tokenLimit{UINT32_MAX};
92
  static qnn::util::HandleManager<Dialog> s_manager;
93
  static std::atomic<std::uint32_t> s_nameCounter;
 
94
  };
95
  } // namespace genie
 
10
 
11
  #include <atomic>
12
  #include <memory>
13
+ #include <functional>
14
 
15
  #include "GenieDialog.h"
16
  #include "Util/HandleManager.hpp"
17
  #include "qualla/dialog.hpp"
18
  #include "qualla/DialogCallback.hpp"
19
+ #include "Sampler.hpp"
20
 
21
  namespace genie {
22
 
 
35
  static void remove(GenieDialogConfig_Handle_t handle);
36
 
37
  Config(const char* configStr);
38
+ qualla::json& getJson();
39
 
40
  private:
41
  static qnn::util::HandleManager<Config> s_manager;
 
45
  static GenieDialog_Handle_t add(std::shared_ptr<Dialog> dialog);
46
  static std::shared_ptr<Dialog> get(GenieDialog_Handle_t handle);
47
  static void remove(GenieDialog_Handle_t handle);
48
+ static GenieSampler_Handle_t getSamplerHandle(std::shared_ptr<genie::Dialog> dialog);
49
 
50
  qualla::DialogCallback dialogCallback;
51
 
52
  Dialog(std::shared_ptr<Config> config);
53
+ ~Dialog();
54
 
55
  Dialog(const Dialog&) = delete;
56
  Dialog& operator=(const Dialog&) = delete;
 
95
  uint32_t m_tokenLimit{UINT32_MAX};
96
  static qnn::util::HandleManager<Dialog> s_manager;
97
  static std::atomic<std::uint32_t> s_nameCounter;
98
+ GenieSampler_Handle_t m_samplerHandle;
99
  };
100
  } // namespace genie
Genie/Genie/src/Embedding.cpp ADDED
@@ -0,0 +1,740 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #include <exception>
10
+ #include <set>
11
+ #include <sstream>
12
+
13
+ #include "Embedding.hpp"
14
+ #include "Exception.hpp"
15
+ #include "Macro.hpp"
16
+ #include "qualla/detail/json.hpp"
17
+ #include "qualla/env.hpp"
18
+
19
+ using namespace genie;
20
+
21
+ #ifdef _WIN32
22
+ inline std::string libPrefix = "";
23
+ inline std::string libSuffix = ".dll";
24
+ #else
25
+ inline std::string libPrefix = "lib";
26
+ inline std::string libSuffix = ".so";
27
+ #endif
28
+
29
+ inline std::string getLibName(std::string baseName) { return libPrefix + baseName + libSuffix; }
30
+
31
+ //=============================================================================
32
+ // Context::Config functions
33
+ //=============================================================================
34
+
35
+ static void validateContextConfig(const qualla::json& config) {
36
+ if (!config.is_object()) {
37
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "context config is not an object");
38
+ }
39
+
40
+ std::set<std::string> mandatoryFields{
41
+ "version", "n-vocab", "ctx-size", "embed-size", "pad-token"};
42
+ for (const auto& field : mandatoryFields) {
43
+ if (!config.contains(field)) {
44
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing context field: " + field);
45
+ }
46
+ }
47
+
48
+ // component is used in the "ENFORCE" macros
49
+ std::string component = "context";
50
+
51
+ for (auto& item : config.items()) {
52
+ if (item.key() == "version") {
53
+ JSON_ENFORCE_NUMERIC();
54
+ if (item.value().get<int>() != 1) {
55
+ throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
56
+ "Invalid context config: unsupported version: " + item.value().dump());
57
+ }
58
+ } else if (item.key() == "n-vocab") {
59
+ JSON_ENFORCE_NUMERIC();
60
+ } else if (item.key() == "ctx-size") {
61
+ JSON_ENFORCE_NUMERIC();
62
+ } else if (item.key() == "embed-size") {
63
+ JSON_ENFORCE_NUMERIC();
64
+ } else if (item.key() == "pad-token") {
65
+ JSON_ENFORCE_NUMERIC();
66
+ } else {
67
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown context config key: " + item.key());
68
+ }
69
+ }
70
+ }
71
+
72
+ static void translateContextConfig(const qualla::json& genieConfig, qualla::json& quallaConfig) {
73
+ quallaConfig["n-vocab"] = genieConfig["n-vocab"];
74
+ quallaConfig["size"] = genieConfig["ctx-size"];
75
+ quallaConfig["n-embd"] = genieConfig["embed-size"];
76
+ quallaConfig["pad-token"] = genieConfig["pad-token"];
77
+ }
78
+
79
+ //=============================================================================
80
+ // Tokenizer::Config functions
81
+ //=============================================================================
82
+
83
+ static void validateTokenizerConfig(const qualla::json& config) {
84
+ if (!config.is_object()) {
85
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "tokenizer config is not an object");
86
+ }
87
+
88
+ std::set<std::string> mandatoryFields{"version", "path"};
89
+ for (const auto& field : mandatoryFields) {
90
+ if (!config.contains(field)) {
91
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing tokenizer field: " + field);
92
+ }
93
+ }
94
+
95
+ // component is used in the "ENFORCE" macros
96
+ std::string component = "tokenizer";
97
+
98
+ for (auto& item : config.items()) {
99
+ if (item.key() == "version") {
100
+ JSON_ENFORCE_NUMERIC();
101
+ if (item.value().get<int>() != 1) {
102
+ throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
103
+ "Invalid tokenizer config: unsupported version: " + item.value().dump());
104
+ }
105
+ } else if (item.key() == "path") {
106
+ JSON_ENFORCE_STRING();
107
+ // Note: the existence of this file is checked by qualla
108
+ } else {
109
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
110
+ "Unknown tokenizer config key: " + item.key());
111
+ }
112
+ }
113
+ }
114
+
115
+ static void translateTokenizerConfig(const qualla::json& genieConfig, qualla::json& quallaConfig) {
116
+ quallaConfig["tokenizer"] = genieConfig["path"];
117
+ }
118
+
119
+ //=============================================================================
120
+ // Backend::Config functions
121
+ //=============================================================================
122
+
123
+ static void validateBackendHtpConfig(const qualla::json& config) {
124
+ if (!config.is_object()) {
125
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "QnnHtp config is not an object");
126
+ }
127
+
128
+ std::set<std::string> mandatoryFields{
129
+ "version", "spill-fill-bufsize", "use-mmap", "pooled-output", "allow-async-init"};
130
+ for (const auto& field : mandatoryFields) {
131
+ if (!config.contains(field)) {
132
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing QnnHtp field: " + field);
133
+ }
134
+ }
135
+
136
+ // component is used in the "ENFORCE" macros
137
+ std::string component = "QnnHtp";
138
+
139
+ for (auto& item : config.items()) {
140
+ if (item.key() == "version") {
141
+ JSON_ENFORCE_NUMERIC();
142
+ if (item.value().get<int>() != 1) {
143
+ throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
144
+ "Invalid QnnHtp config: unsupported version: " + item.value().dump());
145
+ }
146
+ } else if (item.key() == "spill-fill-bufsize") {
147
+ JSON_ENFORCE_NUMERIC();
148
+ } else if (item.key() == "use-mmap") {
149
+ JSON_ENFORCE_BOOLEAN();
150
+ } else if (item.key() == "pooled-output") {
151
+ JSON_ENFORCE_BOOLEAN();
152
+ } else if (item.key() == "allow-async-init") {
153
+ JSON_ENFORCE_BOOLEAN();
154
+ } else if (item.key() == "disable-kv-cache") {
155
+ JSON_ENFORCE_BOOLEAN();
156
+ } else {
157
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown QnnHtp config key: " + item.key());
158
+ }
159
+ }
160
+ }
161
+
162
+ static void validateBackendGenaiConfig(const qualla::json& config) {
163
+ if (!config.is_object()) {
164
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "QnnGenAiTransformer config is not an object");
165
+ }
166
+
167
+ std::set<std::string> mandatoryFields{"version"};
168
+ for (const auto& field : mandatoryFields) {
169
+ if (!config.contains(field)) {
170
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
171
+ "Missing QnnGenAiTransformer field: " + field);
172
+ }
173
+ }
174
+
175
+ // component is used in the "ENFORCE" macros
176
+ std::string component = "QnnGenAiTransformer";
177
+
178
+ for (auto& item : config.items()) {
179
+ if (item.key() == "version") {
180
+ JSON_ENFORCE_NUMERIC();
181
+ if (item.value().get<int>() != 1) {
182
+ throw Exception(
183
+ GENIE_STATUS_ERROR_JSON_VALUE,
184
+ "Invalid QnnGenAiTransformer config: unsupported version: " + item.value().dump());
185
+ }
186
+ } else if (item.key() == "n-logits") {
187
+ JSON_ENFORCE_NUMERIC();
188
+ } else if (item.key() == "n-layer") {
189
+ JSON_ENFORCE_NUMERIC();
190
+ } else if (item.key() == "n-embd") {
191
+ JSON_ENFORCE_NUMERIC();
192
+ } else if (item.key() == "n-heads") {
193
+ JSON_ENFORCE_NUMERIC();
194
+ } else {
195
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
196
+ "Unknown QnnGenAiTransformer config key: " + item.key());
197
+ }
198
+ }
199
+ }
200
+
201
+ static void validateBackendConfig(const qualla::json& config) {
202
+ if (!config.is_object()) {
203
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "backend config is not an object");
204
+ }
205
+
206
+ std::set<std::string> mandatoryFields{"version", "type"};
207
+ for (const auto& field : mandatoryFields) {
208
+ if (!config.contains(field)) {
209
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing backend field: " + field);
210
+ }
211
+ }
212
+
213
+ // component is used in the "ENFORCE" macros
214
+ std::string component = "backend";
215
+
216
+ std::string type;
217
+ bool htp = false;
218
+ qualla::json htpConfig;
219
+ bool genai = false;
220
+ qualla::json genaiConfig;
221
+
222
+ for (auto& item : config.items()) {
223
+ if (item.key() == "version") {
224
+ JSON_ENFORCE_NUMERIC();
225
+ if (item.value().get<int>() != 1) {
226
+ throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
227
+ "Invalid backend config: unsupported version: " + item.value().dump());
228
+ }
229
+ } else if (item.key() == "type") {
230
+ JSON_ENFORCE_STRING();
231
+ type = item.value().get<std::string>();
232
+ if (type == "QnnHtp") {
233
+ htp = true;
234
+ } else if (type == "QnnGenAiTransformer") {
235
+ genai = true;
236
+ } else {
237
+ throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
238
+ "Invalid backend config: unsupported type: " + item.value().dump());
239
+ }
240
+ } else if (item.key() == "extensions") {
241
+ JSON_ENFORCE_STRING();
242
+ } else if (item.key() == "QnnHtp") {
243
+ JSON_ENFORCE_OBJECT();
244
+ htpConfig = item.value();
245
+ } else if (item.key() == "QnnGenAiTransformer") {
246
+ JSON_ENFORCE_OBJECT();
247
+ genaiConfig = item.value();
248
+ } else {
249
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown backend config key: " + item.key());
250
+ }
251
+ }
252
+
253
+ if (htp) {
254
+ if (!htpConfig.is_object()) {
255
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing QnnHtp embedding config");
256
+ }
257
+ validateBackendHtpConfig(htpConfig);
258
+ } else {
259
+ if (htpConfig.is_object()) {
260
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
261
+ "QnnHtp backend config for incorrect backend type: " + type);
262
+ }
263
+ }
264
+
265
+ if (genai) {
266
+ if (!genaiConfig.is_object()) {
267
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
268
+ "Missing QnnGenAiTransformer embedding config");
269
+ }
270
+ validateBackendGenaiConfig(genaiConfig);
271
+ } else {
272
+ if (genaiConfig.is_object()) {
273
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
274
+ "QnnGenAiTransformer backend config for incorrect backend type: " + type);
275
+ }
276
+ }
277
+ }
278
+
279
+ //=============================================================================
280
+ // Model::Config functions
281
+ //=============================================================================
282
+
283
+ static void validateModelBinaryConfig(const qualla::json& config) {
284
+ if (!config.is_object()) {
285
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "binary config is not an object");
286
+ }
287
+
288
+ std::set<std::string> mandatoryFields{"version", "ctx-bins"};
289
+ for (const auto& field : mandatoryFields) {
290
+ if (!config.contains(field)) {
291
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing binary field: " + field);
292
+ }
293
+ }
294
+
295
+ // component is used in the "ENFORCE" macros
296
+ std::string component = "binary";
297
+
298
+ for (auto& item : config.items()) {
299
+ if (item.key() == "version") {
300
+ JSON_ENFORCE_NUMERIC();
301
+ if (item.value().get<int>() != 1) {
302
+ throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
303
+ "Invalid binary config: unsupported version: " + item.value().dump());
304
+ }
305
+ } else if (item.key() == "ctx-bins") {
306
+ JSON_ENFORCE_ARRAY();
307
+ for (auto& elem : item.value()) {
308
+ if (!elem.is_string()) {
309
+ throw Exception(GENIE_STATUS_ERROR_JSON_VALUE, "ctx-bins must be an array of strings");
310
+ }
311
+ }
312
+ } else {
313
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown binary config key: " + item.key());
314
+ }
315
+ }
316
+ }
317
+
318
+ static void validateModelLibraryConfig(const qualla::json& config) {
319
+ if (!config.is_object()) {
320
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "library config is not an object");
321
+ }
322
+
323
+ std::set<std::string> mandatoryFields{"version", "model-bin"};
324
+ for (const auto& field : mandatoryFields) {
325
+ if (!config.contains(field)) {
326
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing library field: " + field);
327
+ }
328
+ }
329
+
330
+ // component is used in the "ENFORCE" macros
331
+ std::string component = "library";
332
+
333
+ for (auto& item : config.items()) {
334
+ if (item.key() == "version") {
335
+ JSON_ENFORCE_NUMERIC();
336
+ if (item.value().get<int>() != 1) {
337
+ throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
338
+ "Invalid library config: unsupported version: " + item.value().dump());
339
+ }
340
+ } else if (item.key() == "model-bin") {
341
+ JSON_ENFORCE_STRING();
342
+ } else {
343
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown library config key: " + item.key());
344
+ }
345
+ }
346
+ }
347
+
348
+ static void validateModelConfig(const qualla::json& config) {
349
+ if (!config.is_object()) {
350
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "model config is not an object");
351
+ }
352
+
353
+ std::set<std::string> mandatoryFields{"version", "type"};
354
+ for (const auto& field : mandatoryFields) {
355
+ if (!config.contains(field)) {
356
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing model field: " + field);
357
+ }
358
+ }
359
+
360
+ // component is used in the "ENFORCE" macros
361
+ std::string component = "model";
362
+
363
+ std::string type;
364
+ bool binary = false;
365
+ qualla::json binaryConfig;
366
+ bool library = false;
367
+ qualla::json libraryConfig;
368
+
369
+ for (auto& item : config.items()) {
370
+ if (item.key() == "version") {
371
+ JSON_ENFORCE_NUMERIC();
372
+ if (item.value().get<int>() != 1) {
373
+ throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
374
+ "Invalid model config: unsupported version: " + item.value().dump());
375
+ }
376
+ } else if (item.key() == "type") {
377
+ JSON_ENFORCE_STRING();
378
+ type = item.value().get<std::string>();
379
+ if (type == "binary") {
380
+ binary = true;
381
+ } else if (type == "library") {
382
+ library = true;
383
+ } else {
384
+ throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
385
+ "Invalid model config: unsupported type: " + item.value().dump());
386
+ }
387
+ } else if (item.key() == "binary") {
388
+ JSON_ENFORCE_OBJECT();
389
+ binaryConfig = item.value();
390
+ } else if (item.key() == "library") {
391
+ JSON_ENFORCE_OBJECT();
392
+ libraryConfig = item.value();
393
+ } else {
394
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown model config key: " + item.key());
395
+ }
396
+ }
397
+
398
+ if (binary) {
399
+ if (!binaryConfig.is_object()) {
400
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing binary model config");
401
+ }
402
+ validateModelBinaryConfig(binaryConfig);
403
+ } else {
404
+ if (binaryConfig.is_object()) {
405
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
406
+ "binary model config for incorrect model type: " + type);
407
+ }
408
+ }
409
+
410
+ if (library) {
411
+ if (!libraryConfig.is_object()) {
412
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing library model config");
413
+ }
414
+ validateModelLibraryConfig(libraryConfig);
415
+ } else {
416
+ if (libraryConfig.is_object()) {
417
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
418
+ "library model config for incorrect model type: " + type);
419
+ }
420
+ }
421
+ }
422
+
423
+ //=============================================================================
424
+ // Engine::Config functions
425
+ //=============================================================================
426
+
427
+ static void validateEngineConfig(const qualla::json& config) {
428
+ if (!config.is_object()) {
429
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "engine config is not an object");
430
+ }
431
+
432
+ std::set<std::string> mandatoryFields{"version", "backend", "model"};
433
+ for (const auto& field : mandatoryFields) {
434
+ if (!config.contains(field)) {
435
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing engine field: " + field);
436
+ }
437
+ }
438
+
439
+ // component is used in the "ENFORCE" macros
440
+ std::string component = "engine";
441
+
442
+ for (auto& item : config.items()) {
443
+ if (item.key() == "version") {
444
+ JSON_ENFORCE_NUMERIC();
445
+ if (item.value().get<int>() != 1) {
446
+ throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
447
+ "Invalid engine config: unsupported version: " + item.value().dump());
448
+ }
449
+ } else if (item.key() == "backend") {
450
+ JSON_ENFORCE_OBJECT();
451
+ validateBackendConfig(item.value());
452
+ } else if (item.key() == "model") {
453
+ JSON_ENFORCE_OBJECT();
454
+ validateModelConfig(item.value());
455
+ } else if (item.key() == "n-threads") {
456
+ JSON_ENFORCE_NUMERIC();
457
+ } else {
458
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown engine config key: " + item.key());
459
+ }
460
+ }
461
+ }
462
+
463
+ static void translateEngineConfig(const qualla::json& genieEngineConfig,
464
+ qualla::json& quallaEngineConfig) {
465
+ if (genieEngineConfig["version"] == 1) {
466
+ if (genieEngineConfig.contains("n-threads"))
467
+ quallaEngineConfig["n-threads"] = genieEngineConfig["n-threads"];
468
+
469
+ if (genieEngineConfig["backend"]["type"] == "QnnHtp") {
470
+ quallaEngineConfig["type"] = "qnn-htp";
471
+ quallaEngineConfig["model-architecture-type"] = "encoder",
472
+ quallaEngineConfig["backend-lib"] = getLibName("QnnHtp");
473
+ quallaEngineConfig["use-mmap"] = genieEngineConfig["backend"]["QnnHtp"]["use-mmap"];
474
+ quallaEngineConfig["spill-fill-bufsize"] =
475
+ genieEngineConfig["backend"]["QnnHtp"]["spill-fill-bufsize"];
476
+ quallaEngineConfig["pooled-output"] = genieEngineConfig["backend"]["QnnHtp"]["pooled-output"];
477
+ if (genieEngineConfig["backend"]["QnnHtp"].contains("disable-kv-cache")) {
478
+ quallaEngineConfig["disable-kv-cache"] =
479
+ genieEngineConfig["backend"]["QnnHtp"]["disable-kv-cache"];
480
+ }
481
+ // By default, Qualla will default to the async init path.
482
+ // For now, we are forcing async init off unless explicitly
483
+ // specified in the Genie config. It is HTP specific feature only.
484
+ quallaEngineConfig["use-async-Init"] = false;
485
+ if (genieEngineConfig["backend"]["QnnHtp"].contains("allow-async-init")) {
486
+ quallaEngineConfig["use-async-Init"] =
487
+ genieEngineConfig["backend"]["QnnHtp"]["allow-async-init"];
488
+ }
489
+ } else if (genieEngineConfig["backend"]["type"] == "QnnGenAiTransformer") {
490
+ quallaEngineConfig["type"] = "qnn-cpu";
491
+ quallaEngineConfig["model-output"] = "embeddings";
492
+ quallaEngineConfig["backend-lib"] = getLibName("QnnGenAiTransformer");
493
+ if (genieEngineConfig["backend"]["QnnGenAiTransformer"].contains("n-logits")) {
494
+ quallaEngineConfig["n_logits"] =
495
+ genieEngineConfig["backend"]["QnnGenAiTransformer"]["n-logits"];
496
+ }
497
+ if (genieEngineConfig["backend"]["QnnGenAiTransformer"].contains("n-layer")) {
498
+ quallaEngineConfig["n_layer"] =
499
+ genieEngineConfig["backend"]["QnnGenAiTransformer"]["n-layer"];
500
+ }
501
+ if (genieEngineConfig["backend"]["QnnGenAiTransformer"].contains("n-embd")) {
502
+ quallaEngineConfig["n_embd"] =
503
+ genieEngineConfig["backend"]["QnnGenAiTransformer"]["n-embd"];
504
+ }
505
+ if (genieEngineConfig["backend"]["QnnGenAiTransformer"].contains("n-heads")) {
506
+ quallaEngineConfig["n_heads"] =
507
+ genieEngineConfig["backend"]["QnnGenAiTransformer"]["n-heads"];
508
+ }
509
+ }
510
+
511
+ if (genieEngineConfig["backend"].contains("extensions")) {
512
+ quallaEngineConfig["backend-ext-conf"] = genieEngineConfig["backend"]["extensions"];
513
+ }
514
+
515
+ if (genieEngineConfig["model"]["type"] == "binary") {
516
+ quallaEngineConfig["model-list"] = genieEngineConfig["model"]["binary"]["ctx-bins"];
517
+ } else if (genieEngineConfig["model"]["type"] == "library") {
518
+ quallaEngineConfig["model"] = getLibName("QnnGenAiTransformerModel");
519
+ quallaEngineConfig["model-bin-path"] = genieEngineConfig["model"]["library"]["model-bin"];
520
+ quallaEngineConfig["op-package"] =
521
+ getLibName("QnnGenAiTransformerCpuOpPkg") + ":QnnOpPackage_interfaceProvider";
522
+ }
523
+ }
524
+ }
525
+
526
+ //=============================================================================
527
+ // Prompt::Config functions
528
+ //=============================================================================
529
+
530
+ static void validatePromptConfig(const qualla::json& config) {
531
+ if (!config.is_object()) {
532
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "prompt config is not an object");
533
+ }
534
+
535
+ std::set<std::string> mandatoryFields{"version", "prompt-template"};
536
+ for (const auto& field : mandatoryFields) {
537
+ if (!config.contains(field)) {
538
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing prompt field: " + field);
539
+ }
540
+ }
541
+
542
+ // component is used in the "ENFORCE" macros
543
+ std::string component = "prompt";
544
+
545
+ for (auto& item : config.items()) {
546
+ if (item.key() == "version") {
547
+ JSON_ENFORCE_NUMERIC();
548
+ if (item.value().get<int>() != 1) {
549
+ throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
550
+ "Invalid context config: unsupported version: " + item.value().dump());
551
+ }
552
+ } else if (item.key() == "prompt-template") {
553
+ JSON_ENFORCE_ARRAY();
554
+ for (auto& elem : item.value()) {
555
+ if (!elem.is_string()) {
556
+ throw Exception(GENIE_STATUS_ERROR_JSON_VALUE, "prompt tags must be an array of strings");
557
+ }
558
+ }
559
+ } else {
560
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown context config key: " + item.key());
561
+ }
562
+ }
563
+ }
564
+
565
+ static void translatePromptConfig(const qualla::json& genieConfig, qualla::json& quallaConfig) {
566
+ quallaConfig["tags"] = genieConfig["prompt-template"];
567
+ }
568
+
569
+ //=============================================================================
570
+ // Embedding::Config functions
571
+ //=============================================================================
572
+
573
+ qnn::util::HandleManager<Embedding::Config> Embedding::Config::s_manager;
574
+
575
+ GenieEmbeddingConfig_Handle_t Embedding::Config::add(std::shared_ptr<Embedding::Config> config) {
576
+ return (GenieEmbeddingConfig_Handle_t)s_manager.add(config);
577
+ }
578
+
579
+ std::shared_ptr<Embedding::Config> Embedding::Config::get(GenieEmbeddingConfig_Handle_t handle) {
580
+ return s_manager.get((qnn::util::Handle_t)handle);
581
+ }
582
+
583
+ void Embedding::Config::remove(GenieEmbeddingConfig_Handle_t handle) {
584
+ s_manager.remove((qnn::util::Handle_t)handle);
585
+ }
586
+
587
+ static void validateEmbeddingConfig(const qualla::json& config) {
588
+ if (!config.is_object()) {
589
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Embedding config is not an object");
590
+ }
591
+
592
+ std::set<std::string> mandatoryFields{"version", "context", "tokenizer", "engine"};
593
+ for (const auto& field : mandatoryFields) {
594
+ if (!config.contains(field)) {
595
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing embedding field: " + field);
596
+ }
597
+ }
598
+
599
+ // component is used in the "ENFORCE" macros
600
+ std::string component = "embedding";
601
+
602
+ for (auto& item : config.items()) {
603
+ if (item.key() == "version") {
604
+ JSON_ENFORCE_NUMERIC();
605
+ if (item.value().get<int>() != 1) {
606
+ throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
607
+ "Invalid embedding config: unsupported version: " + item.value().dump());
608
+ }
609
+ } else if (item.key() == "context") {
610
+ JSON_ENFORCE_OBJECT();
611
+ validateContextConfig(item.value());
612
+ } else if (item.key() == "tokenizer") {
613
+ JSON_ENFORCE_OBJECT();
614
+ validateTokenizerConfig(item.value());
615
+ } else if (item.key() == "prompt") { // optional parameter
616
+ JSON_ENFORCE_OBJECT();
617
+ validatePromptConfig(item.value());
618
+ } else if (item.key() == "truncate-input") { // optional parameter
619
+ JSON_ENFORCE_BOOLEAN();
620
+ } else if (item.key() == "engine") {
621
+ JSON_ENFORCE_OBJECT();
622
+ validateEngineConfig(config["engine"]);
623
+ } else {
624
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
625
+ "Unknown embedding config key: " + item.key());
626
+ }
627
+ }
628
+ }
629
+
630
+ static void translateEmbeddingConfig(const qualla::json& genieConfig, qualla::json& quallaConfig) {
631
+ translateContextConfig(genieConfig["context"], quallaConfig["context"]);
632
+ translatePromptConfig(genieConfig["prompt"], quallaConfig["prompt"]);
633
+ translateTokenizerConfig(genieConfig["tokenizer"], quallaConfig);
634
+ translateEngineConfig(genieConfig["engine"], quallaConfig["engine"]);
635
+
636
+ if (genieConfig.contains(
637
+ "truncate-input")) { // to allow truncation of input incase it exceeds the context.
638
+ quallaConfig["truncate-input"] = genieConfig["truncate-input"];
639
+ }
640
+ }
641
+
642
+ Embedding::Config::Config(const char* configStr) {
643
+ qualla::json config;
644
+
645
+ {
646
+ std::set<qualla::json> keys;
647
+
648
+ auto callback = [&keys](int depth, qualla::json::parse_event_t event, qualla::json& parsed) {
649
+ if ((depth == 1) && (event == qualla::json::parse_event_t::key)) {
650
+ if (keys.count(parsed) > 0) {
651
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
652
+ "Multiple embedding config key: " + parsed.dump());
653
+ }
654
+ keys.insert(parsed);
655
+ }
656
+ return true;
657
+ };
658
+
659
+ config = qualla::json::parse(configStr, callback);
660
+ }
661
+
662
+ if (!config.is_object()) {
663
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Embedding config is not an object");
664
+ }
665
+
666
+ std::set<std::string> mandatoryFields{"embedding"};
667
+ for (const auto& field : mandatoryFields) {
668
+ if (!config.contains(field)) {
669
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing embedding field: " + field);
670
+ }
671
+ }
672
+
673
+ // component is used in the "ENFORCE" macros
674
+ std::string component = "embedding";
675
+
676
+ for (auto& item : config.items()) {
677
+ if (item.key() == "embedding") {
678
+ JSON_ENFORCE_OBJECT();
679
+ validateEmbeddingConfig(item.value());
680
+ } else {
681
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
682
+ "Unknown embedding config key: " + item.key());
683
+ }
684
+ }
685
+ m_config = config;
686
+ }
687
+
688
+ qualla::json Embedding::Config::getJson() const { return m_config; }
689
+
690
+ //=============================================================================
691
+ // Embedding functions
692
+ //=============================================================================
693
+
694
+ qnn::util::HandleManager<Embedding> Embedding::s_manager;
695
+ std::atomic<std::uint32_t> Embedding::s_nameCounter{0u};
696
+
697
+ GenieEmbedding_Handle_t Embedding::add(std::shared_ptr<Embedding> embedding) {
698
+ return (GenieEmbedding_Handle_t)s_manager.add(embedding);
699
+ }
700
+
701
+ std::shared_ptr<Embedding> Embedding::get(GenieEmbedding_Handle_t handle) {
702
+ return s_manager.get((qnn::util::Handle_t)handle);
703
+ }
704
+
705
+ void Embedding::remove(GenieEmbedding_Handle_t handle) {
706
+ s_manager.remove((qnn::util::Handle_t)handle);
707
+ }
708
+
709
+ Embedding::Embedding(std::shared_ptr<Config> config) {
710
+ auto env = qualla::Env::create(qualla::json{});
711
+ qualla::json quallaConfig;
712
+ translateEmbeddingConfig(config->getJson()["embedding"], quallaConfig);
713
+ m_quallaEmbedding = qualla::Embedding::create(
714
+ env, "embedding" + std::to_string(s_nameCounter.fetch_add(1u)), quallaConfig);
715
+ if (!m_quallaEmbedding) {
716
+ throw Exception(GENIE_STATUS_ERROR_MEM_ALLOC, "Could not create a embedding object");
717
+ }
718
+ }
719
+
720
+ int32_t Embedding::generate(const char* queryStr,
721
+ GenieEmbedding_GenerateCallback_t callback,
722
+ const void* userData) {
723
+ std::string query(queryStr);
724
+ std::vector<float> outputEmbedding;
725
+ bool status = false;
726
+ status = m_quallaEmbedding->query(query, outputEmbedding);
727
+ if (status) {
728
+ std::vector<uint32_t> dimensions;
729
+ m_quallaEmbedding->output_dimensions(dimensions);
730
+ callback(dimensions.data(), dimensions.size(), outputEmbedding.data(), userData);
731
+ qualla::Embedding::KPIs kpis = m_quallaEmbedding->kpis();
732
+ printf(
733
+ "\n\n[KPIS]:\nInit Time: %zu us\nPrompt Processing Time: %zu us, Prompt Processing Rate : "
734
+ "%f toks/sec\n",
735
+ kpis.init.total_usec,
736
+ kpis.prompt.last_usec,
737
+ kpis.tps.prompt);
738
+ }
739
+ return (status) ? (GENIE_STATUS_SUCCESS) : (GENIE_STATUS_ERROR_GENERATE_FAILED);
740
+ }
Genie/Genie/src/Embedding.hpp ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #pragma once
10
+
11
+ #include <atomic>
12
+ #include <memory>
13
+
14
+ #include "GenieEmbedding.h"
15
+ #include "Util/HandleManager.hpp"
16
+ #include "qualla/embedding.hpp"
17
+
18
+ namespace genie {
19
+
20
+ class Embedding {
21
+ public:
22
+ class Config {
23
+ public:
24
+ static GenieEmbeddingConfig_Handle_t add(std::shared_ptr<Config> config);
25
+ static std::shared_ptr<Config> get(GenieEmbeddingConfig_Handle_t handle);
26
+ static void remove(GenieEmbeddingConfig_Handle_t handle);
27
+
28
+ Config(const char* configStr);
29
+ qualla::json getJson() const;
30
+
31
+ private:
32
+ static qnn::util::HandleManager<Config> s_manager;
33
+ qualla::json m_config;
34
+ };
35
+
36
+ static GenieEmbedding_Handle_t add(std::shared_ptr<Embedding> embedding);
37
+ static std::shared_ptr<Embedding> get(GenieEmbedding_Handle_t handle);
38
+ static void remove(GenieEmbedding_Handle_t handle);
39
+
40
+ Embedding(std::shared_ptr<Config> config);
41
+
42
+ Embedding(const Embedding&) = delete;
43
+ Embedding& operator=(const Embedding&) = delete;
44
+ Embedding(Embedding&&) = delete;
45
+ Embedding& operator=(Embedding&&) = delete;
46
+
47
+ int32_t generate(const char* queryStr,
48
+ GenieEmbedding_GenerateCallback_t callback,
49
+ const void* userData);
50
+
51
+ private:
52
+ std::unique_ptr<qualla::Embedding> m_quallaEmbedding;
53
+ static qnn::util::HandleManager<Embedding> s_manager;
54
+ static std::atomic<std::uint32_t> s_nameCounter;
55
+ };
56
+ } // namespace genie
Genie/Genie/src/Exception.hpp CHANGED
@@ -9,6 +9,7 @@
9
  #pragma once
10
 
11
  #include <exception>
 
12
  #include <string>
13
 
14
  #include "GenieCommon.h"
 
9
  #pragma once
10
 
11
  #include <exception>
12
+ #include <stdexcept>
13
  #include <string>
14
 
15
  #include "GenieCommon.h"
Genie/Genie/src/GenieDialog.cpp CHANGED
@@ -232,6 +232,24 @@ Genie_Status_t GenieDialog_tokenQuery(const GenieDialog_Handle_t dialogHandle,
232
  return status;
233
  }
234
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  GENIE_API
236
  Genie_Status_t GenieDialog_free(const GenieDialog_Handle_t dialogHandle) {
237
  try {
@@ -246,4 +264,4 @@ Genie_Status_t GenieDialog_free(const GenieDialog_Handle_t dialogHandle) {
246
  return GENIE_STATUS_ERROR_GENERAL;
247
  }
248
  return GENIE_STATUS_SUCCESS;
249
- }
 
232
  return status;
233
  }
234
 
235
+ GENIE_API
236
+ Genie_Status_t GenieDialog_getSampler(const GenieDialog_Handle_t dialogHandle,
237
+ GenieSampler_Handle_t* dialogSamplerHandle) {
238
+ try {
239
+ GENIE_ENSURE(dialogHandle, GENIE_STATUS_ERROR_INVALID_HANDLE);
240
+ auto dialog = genie::Dialog::get(dialogHandle);
241
+ GENIE_ENSURE(dialog, GENIE_STATUS_ERROR_INVALID_HANDLE);
242
+ GENIE_ENSURE(dialogSamplerHandle, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
243
+ *dialogSamplerHandle = genie::Dialog::getSamplerHandle(dialog);
244
+ GENIE_ENSURE(*dialogSamplerHandle, GENIE_STATUS_ERROR_INVALID_HANDLE);
245
+ } catch (const std::exception& e) {
246
+ std::cerr << e.what() << std::endl;
247
+ return GENIE_STATUS_ERROR_GET_HANDLE_FAILED;
248
+ }
249
+
250
+ return GENIE_STATUS_SUCCESS;
251
+ }
252
+
253
  GENIE_API
254
  Genie_Status_t GenieDialog_free(const GenieDialog_Handle_t dialogHandle) {
255
  try {
 
264
  return GENIE_STATUS_ERROR_GENERAL;
265
  }
266
  return GENIE_STATUS_SUCCESS;
267
+ }
Genie/Genie/src/GenieEmbedding.cpp ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //=============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //=============================================================================
8
+
9
+ #include "Embedding.hpp"
10
+ #include "Exception.hpp"
11
+ #include "GenieEmbedding.h"
12
+ #include "Macro.hpp"
13
+ #include "Util/HandleManager.hpp"
14
+ #include "qualla/detail/json.hpp"
15
+
16
+ using namespace genie;
17
+
18
+ GENIE_API
19
+ Genie_Status_t GenieEmbeddingConfig_createFromJson(const char* str,
20
+ GenieEmbeddingConfig_Handle_t* configHandle) {
21
+ try {
22
+ GENIE_ENSURE(str, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
23
+ GENIE_ENSURE(configHandle, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
24
+ auto config = std::make_shared<Embedding::Config>(str);
25
+ GENIE_ENSURE(config, GENIE_STATUS_ERROR_MEM_ALLOC);
26
+ *configHandle = genie::Embedding::Config::add(config);
27
+ } catch (const qualla::json::parse_error& e) {
28
+ std::cerr << e.what() << std::endl;
29
+ return GENIE_STATUS_ERROR_JSON_FORMAT;
30
+ } catch (const Exception& e) {
31
+ std::cerr << e.what() << std::endl;
32
+ return e.status();
33
+ } catch (const std::exception& e) {
34
+ std::cerr << e.what() << std::endl;
35
+ return GENIE_STATUS_ERROR_GENERAL;
36
+ }
37
+ return GENIE_STATUS_SUCCESS;
38
+ }
39
+
40
+ GENIE_API
41
+ Genie_Status_t GenieEmbeddingConfig_free(const GenieEmbeddingConfig_Handle_t configHandle) {
42
+ try {
43
+ GENIE_ENSURE(configHandle, GENIE_STATUS_ERROR_INVALID_HANDLE);
44
+ {
45
+ // Check if the embedding actually exists
46
+ auto configObj = genie::Embedding::Config::get(configHandle);
47
+ GENIE_ENSURE(configObj, GENIE_STATUS_ERROR_INVALID_HANDLE);
48
+ }
49
+ genie::Embedding::Config::remove(configHandle);
50
+ } catch (const std::exception& e) {
51
+ return GENIE_STATUS_ERROR_GENERAL;
52
+ }
53
+ return GENIE_STATUS_SUCCESS;
54
+ }
55
+
56
+ GENIE_API
57
+ Genie_Status_t GenieEmbedding_create(const GenieEmbeddingConfig_Handle_t configHandle,
58
+ GenieEmbedding_Handle_t* embeddingHandle) {
59
+ try {
60
+ GENIE_ENSURE(embeddingHandle, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
61
+
62
+ // Get config object
63
+ auto configObj = genie::Embedding::Config::get(configHandle);
64
+ GENIE_ENSURE(configObj, GENIE_STATUS_ERROR_INVALID_HANDLE);
65
+
66
+ // Create embedding
67
+ auto embedding = std::make_shared<genie::Embedding>(configObj);
68
+ GENIE_ENSURE(embedding, GENIE_STATUS_ERROR_MEM_ALLOC);
69
+
70
+ // Create Handle
71
+ *embeddingHandle = genie::Embedding::add(embedding);
72
+ } catch (const std::exception& e) {
73
+ std::cerr << e.what() << std::endl;
74
+ return GENIE_STATUS_ERROR_GENERAL;
75
+ }
76
+
77
+ // Return SUCCESS
78
+ return GENIE_STATUS_SUCCESS;
79
+ }
80
+
81
+ GENIE_API
82
+ Genie_Status_t GenieEmbedding_generate(const GenieEmbedding_Handle_t embeddingHandle,
83
+ const char* queryStr,
84
+ const GenieEmbedding_GenerateCallback_t callback,
85
+ const void* userData) {
86
+ int32_t status;
87
+
88
+ try {
89
+ GENIE_ENSURE(embeddingHandle, GENIE_STATUS_ERROR_INVALID_HANDLE);
90
+ auto embedding = genie::Embedding::get(embeddingHandle);
91
+ GENIE_ENSURE(embedding, GENIE_STATUS_ERROR_INVALID_HANDLE);
92
+ GENIE_ENSURE(queryStr, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
93
+ GENIE_ENSURE(callback, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
94
+
95
+ status = embedding->generate(queryStr, callback, userData);
96
+ } catch (const std::exception& e) {
97
+ std::cerr << e.what() << std::endl;
98
+ return GENIE_STATUS_ERROR_GENERAL;
99
+ }
100
+
101
+ return status;
102
+ }
103
+
104
+ GENIE_API
105
+ Genie_Status_t GenieEmbedding_free(const GenieEmbedding_Handle_t embeddingHandle) {
106
+ try {
107
+ GENIE_ENSURE(embeddingHandle, GENIE_STATUS_ERROR_INVALID_HANDLE);
108
+ {
109
+ // Check if the embedding actually exists
110
+ auto embedding = genie::Embedding::get(embeddingHandle);
111
+ GENIE_ENSURE(embedding, GENIE_STATUS_ERROR_INVALID_HANDLE);
112
+ }
113
+ genie::Embedding::remove(embeddingHandle);
114
+ } catch (const std::exception& e) {
115
+ return GENIE_STATUS_ERROR_GENERAL;
116
+ }
117
+ return GENIE_STATUS_SUCCESS;
118
+ }
Genie/Genie/src/GenieSampler.cpp ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //=============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //=============================================================================
8
+
9
+ #include <iostream>
10
+
11
+ #include "Exception.hpp"
12
+ #include "GenieSampler.h"
13
+ #include "Macro.hpp"
14
+ #include "Sampler.hpp"
15
+ #include "Util/HandleManager.hpp"
16
+ #include "qualla/detail/json.hpp"
17
+
18
+ using namespace genie;
19
+ GENIE_API
20
+ Genie_Status_t GenieSamplerConfig_createFromJson(const char* str,
21
+ GenieSamplerConfig_Handle_t* configHandle) {
22
+ try {
23
+ GENIE_ENSURE(str, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
24
+ GENIE_ENSURE(configHandle, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
25
+ auto config = std::make_shared<Sampler::Sampler::SamplerConfig>(str);
26
+ GENIE_ENSURE(config, GENIE_STATUS_ERROR_MEM_ALLOC);
27
+ *configHandle = Sampler::Sampler::SamplerConfig::add(config);
28
+ } catch (const qualla::json::parse_error& e) {
29
+ std::cerr << e.what() << std::endl;
30
+ return GENIE_STATUS_ERROR_JSON_FORMAT;
31
+ } catch (const Exception& e) {
32
+ std::cerr << e.what() << std::endl;
33
+ return e.status();
34
+ } catch (const std::exception& e) {
35
+ std::cerr << e.what() << std::endl;
36
+ return GENIE_STATUS_ERROR_GENERAL;
37
+ }
38
+ return GENIE_STATUS_SUCCESS;
39
+ }
40
+
41
+ GENIE_API
42
+ Genie_Status_t GenieSamplerConfig_free(const GenieSamplerConfig_Handle_t configHandle) {
43
+ try {
44
+ GENIE_ENSURE(configHandle, GENIE_STATUS_ERROR_INVALID_HANDLE);
45
+ {
46
+ // Check if the dialog actually exists
47
+ auto configObj = Sampler::SamplerConfig::get(configHandle);
48
+ GENIE_ENSURE(configObj, GENIE_STATUS_ERROR_INVALID_HANDLE);
49
+ }
50
+ Sampler::SamplerConfig::remove(configHandle);
51
+ } catch (const std::exception& e) {
52
+ return GENIE_STATUS_ERROR_GENERAL;
53
+ }
54
+ return GENIE_STATUS_SUCCESS;
55
+ }
56
+
57
+ GENIE_API
58
+ Genie_Status_t GenieSamplerConfig_setParam(const GenieSamplerConfig_Handle_t configHandle,
59
+ const char* keyStr,
60
+ const char* valueStr) {
61
+ try {
62
+ GENIE_ENSURE(configHandle, GENIE_STATUS_ERROR_INVALID_HANDLE);
63
+ auto samplerConfig = Sampler::SamplerConfig::get(configHandle);
64
+ GENIE_ENSURE(samplerConfig, GENIE_STATUS_ERROR_INVALID_HANDLE);
65
+ samplerConfig->setParam(keyStr, valueStr);
66
+ } catch (const std::exception& e) {
67
+ std::cerr << e.what() << std::endl;
68
+ return GENIE_STATUS_ERROR_SET_PARAMS_FAILED;
69
+ }
70
+ return GENIE_STATUS_SUCCESS;
71
+ }
72
+
73
+ GENIE_API
74
+ Genie_Status_t GenieSampler_applyConfig(const GenieSampler_Handle_t samplerHandle,
75
+ const GenieSamplerConfig_Handle_t configHandle) {
76
+ try {
77
+ GENIE_ENSURE(samplerHandle, GENIE_STATUS_ERROR_INVALID_HANDLE);
78
+ GENIE_ENSURE(configHandle, GENIE_STATUS_ERROR_INVALID_HANDLE);
79
+
80
+ auto sampler = Sampler::get(samplerHandle);
81
+ GENIE_ENSURE(sampler, GENIE_STATUS_ERROR_INVALID_HANDLE);
82
+
83
+ auto samplerConfig = Sampler::SamplerConfig::get(configHandle);
84
+ GENIE_ENSURE(samplerConfig, GENIE_STATUS_ERROR_INVALID_HANDLE);
85
+
86
+ sampler->applyConfig(samplerConfig->getJson());
87
+
88
+ } catch (const std::exception& e) {
89
+ std::cerr << e.what() << std::endl;
90
+ return GENIE_STATUS_ERROR_APPLY_CONFIG_FAILED;
91
+ }
92
+ return GENIE_STATUS_SUCCESS;
93
+ }
Genie/Genie/src/Macro.hpp CHANGED
@@ -8,6 +8,8 @@
8
 
9
  #pragma once
10
 
 
 
11
  //======================================================================================================================
12
  // Error generation macros
13
  //======================================================================================================================
 
8
 
9
  #pragma once
10
 
11
+ #define ENABLE_DEBUG_LOGS 0
12
+
13
  //======================================================================================================================
14
  // Error generation macros
15
  //======================================================================================================================
Genie/Genie/src/Sampler.cpp ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+ #include <exception>
9
+ #include <set>
10
+
11
+ #include "Exception.hpp"
12
+ #include "Macro.hpp"
13
+ #include "Sampler.hpp"
14
+ #include "qualla/detail/json.hpp"
15
+
16
+ using namespace genie;
17
+
18
+ //=============================================================================
19
+ // Sampler functions
20
+ //=============================================================================
21
+
22
+ qnn::util::HandleManager<Sampler> Sampler::s_manager;
23
+
24
+ GenieSampler_Handle_t Sampler::add(std::shared_ptr<Sampler> config) {
25
+ return (GenieSampler_Handle_t)s_manager.add(config);
26
+ }
27
+
28
+ std::shared_ptr<Sampler> Sampler::get(GenieSampler_Handle_t handle) {
29
+ return s_manager.get((qnn::util::Handle_t)handle);
30
+ }
31
+
32
+ void Sampler::remove(GenieSampler_Handle_t handle) {
33
+ s_manager.remove((qnn::util::Handle_t)handle);
34
+ }
35
+
36
+ Sampler::Sampler(qualla::json& origJson,
37
+ std::vector<std::reference_wrapper<qualla::Sampler>>& quallaSamplers)
38
+ : m_origJson(origJson), m_quallaSamplers(quallaSamplers) {}
39
+
40
+ void Sampler::applyConfig(qualla::json samplerConfigJson) {
41
+ m_origJson["sampler"]["seed"] = qualla::Config::optional<int32_t>(
42
+ samplerConfigJson["sampler"], "seed", m_origJson["sampler"]["seed"]);
43
+ m_origJson["sampler"]["temp"] = qualla::Config::optional<float>(
44
+ samplerConfigJson["sampler"], "temp", m_origJson["sampler"]["temp"]);
45
+ m_origJson["sampler"]["top-k"] = qualla::Config::optional<size_t>(
46
+ samplerConfigJson["sampler"], "top-k", m_origJson["sampler"]["top-k"]);
47
+ m_origJson["sampler"]["top-p"] = qualla::Config::optional<float>(
48
+ samplerConfigJson["sampler"], "top-p", m_origJson["sampler"]["top-p"]);
49
+ m_origJson["sampler"]["version"] =
50
+ qualla::Config::optional<int32_t>(samplerConfigJson["sampler"], "version", 1);
51
+ m_origJson["sampler"]["type"] = "basic";
52
+
53
+ #if ENABLE_DEBUG_LOGS
54
+ std::cout << "Updated sampler config: " << std::endl;
55
+ std::cout << "temp: " << m_origJson["sampler"]["temp"].get<double>() << std::endl;
56
+ std::cout << "top-k: " << m_origJson["sampler"]["top-k"] << std::endl;
57
+ std::cout << "top-p: " << m_origJson["sampler"]["top-p"].get<double>() << std::endl;
58
+ std::cout << "seed: " << m_origJson["sampler"]["seed"] << std::endl;
59
+ #endif
60
+ // Loop through the live qualla sampler instances and update the parameters
61
+ for (auto& quallaSampler : m_quallaSamplers) {
62
+ quallaSampler.get().applyConfig(m_origJson["sampler"]);
63
+ }
64
+ }
65
+
66
+ //=============================================================================
67
+ // Sampler::SamplerConfig functions
68
+ //=============================================================================
69
+
70
+ qnn::util::HandleManager<Sampler::SamplerConfig> Sampler::SamplerConfig::s_manager;
71
+
72
+ GenieSamplerConfig_Handle_t Sampler::SamplerConfig::add(
73
+ std::shared_ptr<Sampler::SamplerConfig> config) {
74
+ return (GenieSamplerConfig_Handle_t)s_manager.add(config);
75
+ }
76
+
77
+ std::shared_ptr<Sampler::SamplerConfig> Sampler::SamplerConfig::get(
78
+ GenieSamplerConfig_Handle_t handle) {
79
+ return s_manager.get((qnn::util::Handle_t)handle);
80
+ }
81
+
82
+ void Sampler::SamplerConfig::remove(GenieSamplerConfig_Handle_t handle) {
83
+ s_manager.remove((qnn::util::Handle_t)handle);
84
+ }
85
+
86
+ Sampler::SamplerConfig::SamplerConfig(const char* configStr) {
87
+ qualla::json quallaConfig;
88
+ qualla::json config;
89
+ {
90
+ std::set<qualla::json> keys;
91
+
92
+ auto callback = [&keys](int depth, qualla::json::parse_event_t event, qualla::json& parsed) {
93
+ if ((depth == 1) && (event == qualla::json::parse_event_t::key)) {
94
+ if (keys.count(parsed) > 0) {
95
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
96
+ "Multiple sampler config key: " + parsed.dump());
97
+ }
98
+ keys.insert(parsed);
99
+ }
100
+ return true;
101
+ };
102
+
103
+ config = qualla::json::parse(configStr, callback);
104
+ }
105
+
106
+ if (!config.is_object()) {
107
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Sampler config is not an object");
108
+ }
109
+
110
+ if (!config.contains("sampler")) {
111
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing field: sampler");
112
+ }
113
+
114
+ // component is used in the "ENFORCE" macros
115
+ const std::string component = "sampler";
116
+ for (auto& item : config.items()) {
117
+ if (item.key() == "sampler") {
118
+ JSON_ENFORCE_OBJECT();
119
+ validateSamplerConfig(item.value());
120
+ } else {
121
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown sampler config key: " + item.key());
122
+ }
123
+ }
124
+
125
+ if (config["sampler"].contains("seed"))
126
+ quallaConfig["sampler"]["seed"] = config["sampler"]["seed"];
127
+ if (config["sampler"].contains("temp"))
128
+ quallaConfig["sampler"]["temp"] = config["sampler"]["temp"];
129
+ if (config["sampler"].contains("top-k"))
130
+ quallaConfig["sampler"]["top-k"] = config["sampler"]["top-k"];
131
+ if (config["sampler"].contains("top-p"))
132
+ quallaConfig["sampler"]["top-p"] = config["sampler"]["top-p"];
133
+ if (config["sampler"].contains("greedy"))
134
+ quallaConfig["sampler"]["greedy"] = config["sampler"]["greedy"];
135
+ if (config["sampler"].contains("version"))
136
+ quallaConfig["sampler"]["version"] = config["sampler"]["version"];
137
+ else
138
+ quallaConfig["sampler"]["version"] = 1;
139
+
140
+ quallaConfig["sampler"]["type"] = "basic";
141
+
142
+ m_config = quallaConfig;
143
+ }
144
+
145
+ void Sampler::SamplerConfig::setParam(const std::string& keyStr, const std::string& valueStr) {
146
+ if (!keyStr.empty()) {
147
+ // Case 1: Only the parameter mentioned in keyStr is to be updated by valueStr
148
+ std::set<std::string> validParams = {"seed", "top-p", "top-k", "temp"};
149
+ if (std::find(validParams.begin(), validParams.end(), keyStr) == validParams.end()) {
150
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Invalid key obtained: " + keyStr);
151
+ }
152
+ try {
153
+ if (keyStr == "seed")
154
+ m_config["sampler"]["seed"] = std::stoi(valueStr);
155
+ else if (keyStr == "top-p")
156
+ m_config["sampler"]["top-p"] = std::stof(valueStr);
157
+ else if (keyStr == "top-k")
158
+ m_config["sampler"]["top-k"] = std::stof(valueStr);
159
+ else if (keyStr == "temp")
160
+ m_config["sampler"]["temp"] = std::stof(valueStr);
161
+ } catch (const std::invalid_argument& e) {
162
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
163
+ "Invalid value obtained: " + valueStr + " for key: " + keyStr);
164
+ }
165
+ } else {
166
+ // Case 2: User has passed entire json as a string in valueStr
167
+
168
+ if (valueStr.empty())
169
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Both keyStr and valueStr cannot be empty");
170
+
171
+ qualla::json config = qualla::json::parse(valueStr);
172
+ if (!config.contains("sampler")) {
173
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing field: sampler");
174
+ }
175
+
176
+ // component is used in the "ENFORCE" macros
177
+ const std::string component = "sampler";
178
+ for (auto& item : config.items()) {
179
+ if (item.key() == "sampler") {
180
+ JSON_ENFORCE_OBJECT();
181
+ validateSamplerConfig(item.value());
182
+ } else {
183
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
184
+ "Unknown sampler config key: " + item.key());
185
+ }
186
+ }
187
+
188
+ m_config["sampler"]["seed"] =
189
+ qualla::Config::optional<int32_t>(config["sampler"], "seed", m_config["sampler"]["seed"]);
190
+ m_config["sampler"]["temp"] =
191
+ qualla::Config::optional<float>(config["sampler"], "temp", m_config["sampler"]["temp"]);
192
+ m_config["sampler"]["top-k"] =
193
+ qualla::Config::optional<size_t>(config["sampler"], "top-k", m_config["sampler"]["top-k"]);
194
+ m_config["sampler"]["top-p"] =
195
+ qualla::Config::optional<float>(config["sampler"], "top-p", m_config["sampler"]["top-p"]);
196
+ m_config["sampler"]["version"] = qualla::Config::optional<int32_t>(
197
+ config["sampler"], "version", m_config["sampler"]["version"]);
198
+
199
+ m_config["sampler"]["type"] = "basic";
200
+ }
201
+ }
202
+
203
+ void Sampler::SamplerConfig::validateSamplerConfig(const qualla::json& config) {
204
+ if (!config.is_object()) {
205
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "sampler config is not an object");
206
+ }
207
+
208
+ const std::set<std::string> mandatoryFields{"version"};
209
+ for (const auto& field : mandatoryFields) {
210
+ if (!config.contains(field)) {
211
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing sampler field: " + field);
212
+ }
213
+ }
214
+
215
+ // component is used in the "ENFORCE" macros
216
+ const std::string component = "sampler";
217
+
218
+ for (auto& item : config.items()) {
219
+ if (item.key() == "version") {
220
+ JSON_ENFORCE_NUMERIC();
221
+ if (item.value().get<int>() != 1) {
222
+ throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
223
+ "Invalid sampler config: unsupported version: " + item.value().dump());
224
+ }
225
+ } else if (item.key() == "seed") {
226
+ JSON_ENFORCE_NUMERIC();
227
+ } else if (item.key() == "temp") {
228
+ JSON_ENFORCE_NUMERIC();
229
+ } else if (item.key() == "top-k") {
230
+ JSON_ENFORCE_NUMERIC();
231
+ } else if (item.key() == "top-p") {
232
+ JSON_ENFORCE_NUMERIC();
233
+ } else if (item.key() == "greedy") {
234
+ JSON_ENFORCE_BOOLEAN();
235
+ } else {
236
+ throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown sampler config key: " + item.key());
237
+ }
238
+ }
239
+ }
240
+
241
+ void Sampler::SamplerConfig::translateSamplerConfig(const qualla::json& genieConfig,
242
+ qualla::json& quallaConfig) {
243
+ if (genieConfig["dialog"].contains("sampler")) {
244
+ quallaConfig["sampler"]["type"] = "basic";
245
+
246
+ if (genieConfig["dialog"]["sampler"].contains("seed")) {
247
+ quallaConfig["sampler"]["seed"] = genieConfig["dialog"]["sampler"]["seed"];
248
+ }
249
+ if (genieConfig["dialog"]["sampler"].contains("temp")) {
250
+ quallaConfig["sampler"]["temp"] = genieConfig["dialog"]["sampler"]["temp"];
251
+ }
252
+
253
+ quallaConfig["sampler"]["role"] = "primary";
254
+ #if defined(GENIE_SPD_FEATURE)
255
+ if (genieConfig["dialog"]["type"] == "spd") {
256
+ quallaConfig["sampler"]["role"] = "target";
257
+ }
258
+ #endif
259
+
260
+ if (genieConfig["dialog"]["sampler"].contains("top-k")) {
261
+ quallaConfig["sampler"]["top-k"] = genieConfig["dialog"]["sampler"]["top-k"];
262
+ }
263
+ if (genieConfig["dialog"]["sampler"].contains("top-p")) {
264
+ quallaConfig["sampler"]["top-p"] = genieConfig["dialog"]["sampler"]["top-p"];
265
+ }
266
+ if (genieConfig["dialog"]["sampler"].contains("greedy")) {
267
+ quallaConfig["sampler"]["greedy"] = genieConfig["dialog"]["sampler"]["greedy"];
268
+ }
269
+ if (genieConfig["dialog"]["sampler"].contains("seed")) {
270
+ quallaConfig["sampler"]["seed"] = genieConfig["dialog"]["sampler"]["seed"];
271
+ }
272
+ }
273
+ }
274
+
275
+ qualla::json Sampler::SamplerConfig::getJson() const { return m_config; }
Genie/Genie/src/Sampler.hpp ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #pragma once
10
+ #include <memory>
11
+
12
+ #include "GenieSampler.h"
13
+ #include "Util/HandleManager.hpp"
14
+ #include "qualla/env.hpp"
15
+ #include "qualla/sampler.hpp"
16
+
17
+ namespace genie {
18
+ class Sampler {
19
+ public:
20
+ class SamplerConfig {
21
+ public:
22
+ static GenieSamplerConfig_Handle_t add(std::shared_ptr<SamplerConfig> config);
23
+
24
+ static std::shared_ptr<SamplerConfig> get(GenieSamplerConfig_Handle_t handle);
25
+
26
+ static void remove(GenieSamplerConfig_Handle_t handle);
27
+
28
+ static void validateSamplerConfig(const qualla::json& config);
29
+
30
+ static void translateSamplerConfig(const qualla::json& genieConfig, qualla::json& quallaConfig);
31
+
32
+ SamplerConfig(const char* configStr);
33
+
34
+ void setParam(const std::string& keyStr, const std::string& valueStr);
35
+
36
+ qualla::json getJson() const;
37
+
38
+ private:
39
+ static qnn::util::HandleManager<SamplerConfig> s_manager;
40
+ qualla::json m_config;
41
+ };
42
+
43
+ static GenieSampler_Handle_t add(std::shared_ptr<Sampler> sampler);
44
+ static std::shared_ptr<Sampler> get(GenieSampler_Handle_t handle);
45
+ static void remove(GenieSampler_Handle_t handle);
46
+
47
+ Sampler(qualla::json& origJson,
48
+ std::vector<std::reference_wrapper<qualla::Sampler>>& quallaSamplers);
49
+
50
+ void applyConfig(qualla::json samplerConfigJson);
51
+
52
+ const qualla::json& getJson();
53
+
54
+ private:
55
+ qualla::json m_origJson;
56
+ static qnn::util::HandleManager<Sampler> s_manager;
57
+ std::vector<std::reference_wrapper<qualla::Sampler>> m_quallaSamplers;
58
+ };
59
+
60
+ } // namespace genie
Genie/Genie/src/qualla/context.cpp CHANGED
@@ -93,6 +93,10 @@ extern void needQnnHtpEngine();
93
  extern void needQnnCpuEngine();
94
  #endif
95
 
 
 
 
 
96
  static OnLoad needs([]() {
97
  needStdoutLogger();
98
  needFileLogger();
@@ -111,6 +115,10 @@ static OnLoad needs([]() {
111
  #ifdef QUALLA_ENGINE_QNN_CPU
112
  needQnnCpuEngine();
113
  #endif
 
 
 
 
114
  });
115
 
116
  #endif
 
93
  extern void needQnnCpuEngine();
94
  #endif
95
 
96
+ #ifdef QUALLA_ENGINE_QNN_GPU
97
+ extern void needQnnGpuEngine();
98
+ #endif
99
+
100
  static OnLoad needs([]() {
101
  needStdoutLogger();
102
  needFileLogger();
 
115
  #ifdef QUALLA_ENGINE_QNN_CPU
116
  needQnnCpuEngine();
117
  #endif
118
+
119
+ #ifdef QUALLA_ENGINE_QNN_GPU
120
+ needQnnGpuEngine();
121
+ #endif
122
  });
123
 
124
  #endif
Genie/Genie/src/qualla/dialogs/ssd-q1.cpp CHANGED
@@ -161,7 +161,7 @@ SelfSpecDecDialog::SelfSpecDecDialog(
161
  m_inputType = _engine["primary"]->getInputType();
162
  // Load KV prefix
163
  Timer timer;
164
- size_t n_restored_prefix = _engine["primary"]->restore(_kv_prefix_name);
165
  if (n_restored_prefix != _forecast_prefix) {
166
  // clang-format off
167
  throw std::runtime_error( fmt::format( "SSD : Loaded {} KV$ from {} but expected {} KV$",
@@ -1001,7 +1001,7 @@ bool SelfSpecDecDialog::process(std::vector<int32_t>& tokens, Dialog::Callback c
1001
  void SelfSpecDecDialog::reset() {
1002
  Dialog::reset();
1003
  _n_past = _forecast_prefix;
1004
- size_t n_restored_prefix = _engine["primary"]->restore(_kv_prefix_name);
1005
  if (n_restored_prefix != _forecast_prefix) {
1006
  // clang-format off
1007
  throw std::runtime_error( fmt::format( "SSD : Loaded {} KV$ from {} but expected {} KV$",
 
161
  m_inputType = _engine["primary"]->getInputType();
162
  // Load KV prefix
163
  Timer timer;
164
+ size_t n_restored_prefix = _engine["primary"]->restore(_kv_prefix_name, true);
165
  if (n_restored_prefix != _forecast_prefix) {
166
  // clang-format off
167
  throw std::runtime_error( fmt::format( "SSD : Loaded {} KV$ from {} but expected {} KV$",
 
1001
  void SelfSpecDecDialog::reset() {
1002
  Dialog::reset();
1003
  _n_past = _forecast_prefix;
1004
+ size_t n_restored_prefix = _engine["primary"]->restore(_kv_prefix_name, true);
1005
  if (n_restored_prefix != _forecast_prefix) {
1006
  // clang-format off
1007
  throw std::runtime_error( fmt::format( "SSD : Loaded {} KV$ from {} but expected {} KV$",
Genie/Genie/src/qualla/engine.cpp CHANGED
@@ -69,7 +69,7 @@ bool Engine::updateKV(size_t n_past, const std::vector<bool>& selected) {
69
  return false;
70
  }
71
 
72
- size_t Engine::restore(const std::string& name) {
73
  _env.logger().error(fmt::format("{}-engine does not support restore", _type));
74
  return 0;
75
  }
 
69
  return false;
70
  }
71
 
72
+ size_t Engine::restore(const std::string& name, bool chooseHigherVariant) {
73
  _env.logger().error(fmt::format("{}-engine does not support restore", _type));
74
  return 0;
75
  }
Genie/Genie/src/qualla/engines/qnn-api/DmaBufAllocator.cpp ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+
9
+ #include "QnnMem.h"
10
+ #include "DmaBufAllocator.hpp"
11
+ #include "QnnTypeMacros.hpp"
12
+
13
+ #include <dlfcn.h>
14
+ #include <fcntl.h>
15
+ #include <linux/dma-buf.h>
16
+ #include <pthread.h>
17
+ #include <stdlib.h>
18
+ #include <sys/ioctl.h>
19
+ #include <sys/mman.h>
20
+ #include <unistd.h>
21
+
22
+ #include <cstdlib>
23
+ #include <fstream>
24
+ #include <iostream>
25
+ #include <numeric>
26
+ #include <string>
27
+ #include <vector>
28
+
29
+
30
+ DmaBufferAllocator::DmaBufferAllocator(Qnn_ContextHandle_t contextHandle, QNN_INTERFACE_VER_TYPE* qnnInterface)
31
+ : m_libDmaBufHeapHandle(nullptr),
32
+ m_dmaBufCreate(nullptr),
33
+ m_dmaBufAlloc(nullptr),
34
+ m_dmaBufDeinit(nullptr),
35
+ m_qnnInterface(qnnInterface),
36
+ m_contextHandle(contextHandle) {}
37
+
38
+ bool DmaBufferAllocator::initialize() {
39
+ // On Android, 32-bit and 64-bit libdmaBufheap.so can be found at /system/lib and /system/lib64
40
+ // respectively.
41
+ m_libDmaBufHeapHandle = dlopen("libdmabufheap.so", RTLD_NOW | RTLD_LOCAL);
42
+ if (nullptr == m_libDmaBufHeapHandle) {
43
+ QNN_ERROR("Unable to load backend. dlerror(): %s", dlerror());
44
+ return false;
45
+ }
46
+ m_dmaBufCreate = (DmaBufCreateFn_t)dlsym(
47
+ m_libDmaBufHeapHandle, "CreateDmabufHeapBufferAllocator");
48
+ m_dmaBufAlloc =
49
+ (DmaBufAllocFn_t)dlsym(m_libDmaBufHeapHandle, "DmabufHeapAlloc");
50
+ m_dmaBufDeinit = (DmaBufDeinitFn_t)dlsym(
51
+ m_libDmaBufHeapHandle, "FreeDmabufHeapBufferAllocator");
52
+ if (nullptr == m_dmaBufCreate || nullptr == m_dmaBufAlloc || nullptr == m_dmaBufDeinit) {
53
+ QNN_ERROR("Unable to access symbols in libdmaBufheap. dlerror(): %s", dlerror());
54
+ return false;
55
+ }
56
+ return true;
57
+ }
58
+
59
+ DmaBufferAllocator::~DmaBufferAllocator() {
60
+ if (m_libDmaBufHeapHandle) {
61
+ dlclose(m_libDmaBufHeapHandle);
62
+ m_libDmaBufHeapHandle = nullptr;
63
+ }
64
+ }
65
+
66
+ DmaBufferData* DmaBufferAllocator::getDmaBufTensorData(Qnn_Tensor_t* tensor) {
67
+ if (tensor == nullptr) return nullptr;
68
+ Qnn_MemHandle_t mem_handle = QNN_TENSOR_GET_MEM_HANDLE(tensor);
69
+ if (mem_handle == nullptr) return nullptr;
70
+ return &m_memHandleToDmaBufMem.at(mem_handle);
71
+ }
72
+
73
+ void* DmaBufferAllocator::getBuffer(Qnn_Tensor_t* tensor) {
74
+ if (!tensor) {
75
+ QNN_WARN("DmaBufferAllocator: getBuffer: received a null pointer to a tensor");
76
+ return nullptr;
77
+ }
78
+ if (m_tensorToDmaBufferData.find(tensor) == m_tensorToDmaBufferData.end()) {
79
+ QNN_ERROR("DmaBufferAllocator: Tensor not found with address = %p", tensor);
80
+ return nullptr;
81
+ }
82
+ DmaBufferData dmaBufferData = m_tensorToDmaBufferData[tensor];
83
+ return dmaBufferData.memPointer;
84
+ }
85
+
86
+
87
+
88
+ int DmaBufferAllocator::getFd(Qnn_Tensor_t* tensor) {
89
+ DmaBufferData* data = getDmaBufTensorData(tensor);
90
+ if (data == nullptr) {
91
+ QNN_ERROR("DmaBufferAllocator: getFd : Couldn't find tensor %p", tensor);
92
+ return -1;
93
+ }
94
+ return data->fd;
95
+ }
96
+
97
+ size_t DmaBufferAllocator::getOffset(Qnn_Tensor_t* tensor) {
98
+ DmaBufferData* data = getDmaBufTensorData(tensor);
99
+ if (data == nullptr) {
100
+ QNN_ERROR("DmaBufferAllocator: getOffset : Couldn't find tensor %p", tensor);
101
+ return 0;
102
+ }
103
+ return data->offset;
104
+ }
105
+
106
+ size_t DmaBufferAllocator::getBufferSize(Qnn_Tensor_t* tensor) {
107
+ DmaBufferData* data = getDmaBufTensorData(tensor);
108
+ if (data == nullptr) {
109
+ QNN_ERROR("DmaBufferAllocator: getBufferSize : Couldn't find tensor %p", tensor);
110
+ return 0;
111
+ }
112
+ return data->totalBufferSize;
113
+ };
114
+
115
+ size_t DmaBufferAllocator::getTotalBufferSize(Qnn_Tensor_t* tensor) {
116
+ DmaBufferData* data = getDmaBufTensorData(tensor);
117
+ if (data == nullptr) {
118
+ QNN_ERROR("DmaBufferAllocator: getTotalBufferSize : Couldn't find tensor %p", tensor);
119
+ return 0;
120
+ }
121
+ return data->totalBufferSize;
122
+ }
123
+
124
+ bool DmaBufferAllocator::allocateTensorBuffer(Qnn_Tensor_t* tensor, size_t tensorDataSize) {
125
+ if (m_libDmaBufHeapHandle == nullptr) {
126
+ QNN_ERROR("DmaBufferAllocator not initialized");
127
+ return false;
128
+ }
129
+
130
+ if (!tensor) {
131
+ QNN_ERROR("DmaBufferAllocator: Received nullptr for tensor");
132
+ return false;
133
+ }
134
+
135
+ if (m_tensorToDmaBufferData.find(tensor) != m_tensorToDmaBufferData.end()) {
136
+ QNN_ERROR("DmaBufferAllocator: Tensor already allocated");
137
+ return false;
138
+ }
139
+
140
+ void* dmaBufferAllocator = m_dmaBufCreate();
141
+ if (dmaBufferAllocator == nullptr) {
142
+ QNN_ERROR("DmaBufferAllocator: nullptr returned for CreateDmabufHeapBufferAllocator().");
143
+ return false;
144
+ }
145
+
146
+ int fd = m_dmaBufAlloc(dmaBufferAllocator, "qcom,system", tensorDataSize, 0, 0);
147
+ if (fd < 0) {
148
+ QNN_ERROR("DmaBufAlloc returned a invalid file descriptor = %d", fd);
149
+ return false;
150
+ }
151
+
152
+ void* memPointer = mmap(nullptr, tensorDataSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
153
+ if (MAP_FAILED == memPointer) {
154
+ printf("DmaBufferAllocator: Unable to open file returned by DmaBufAlloc with mmap");
155
+ return false;
156
+ }
157
+
158
+ Qnn_MemDescriptor_t memDescriptor = {
159
+ {QNN_TENSOR_GET_RANK(tensor), QNN_TENSOR_GET_DIMENSIONS(tensor), nullptr},
160
+ QNN_TENSOR_GET_DATA_TYPE(tensor),
161
+ QNN_MEM_TYPE_DMA_BUF,
162
+ {.dmaBufInfo = {fd, memPointer}}};
163
+ QNN_TENSOR_SET_MEM_TYPE(tensor, QNN_TENSORMEMTYPE_MEMHANDLE);
164
+ QNN_TENSOR_SET_MEM_HANDLE(tensor, nullptr);
165
+ Qnn_MemHandle_t memHandle = QNN_TENSOR_GET_MEM_HANDLE(tensor);
166
+
167
+ if (QNN_SUCCESS !=
168
+ m_qnnInterface->memRegister(m_contextHandle, &memDescriptor, 1, &(memHandle))) {
169
+ QNN_ERROR("DmaBufferAllocator: Failure to register ion memory with the backend");
170
+ return false;
171
+ }
172
+ QNN_DEBUG("DmaBufferAllocator: Memregister successful with handle %p for DMA buffer with size: %zu and fd %d",
173
+ memHandle,
174
+ tensorDataSize,
175
+ fd);
176
+ QNN_TENSOR_SET_MEM_HANDLE(tensor, memHandle);
177
+ m_tensorToDmaBufferData.insert(
178
+ {tensor, DmaBufferData(dmaBufferAllocator, fd, memPointer, tensorDataSize)});
179
+
180
+ return true;
181
+ }
182
+
183
+ bool DmaBufferAllocator::freeTensorBuffer(Qnn_Tensor_t* tensor) {
184
+ if (!tensor) {
185
+ QNN_ERROR("DmaBufferAllocator: Received nullptr for tensor");
186
+ return false;
187
+ }
188
+ auto memHandle = QNN_TENSOR_GET_MEM_HANDLE(tensor);
189
+ if (QNN_SUCCESS != m_qnnInterface->memDeRegister(&memHandle, 1)) {
190
+ QNN_ERROR("DmaBufferAllocator: Failed to deregister custom memory handle with the backend");
191
+ return false;
192
+ }
193
+ if (m_tensorToDmaBufferData.find(tensor) == m_tensorToDmaBufferData.end()) {
194
+ QNN_ERROR("DmaBufferAllocator: Tensor not found with address = %p", tensor);
195
+ return false;
196
+ }
197
+ DmaBufferData dmaBufferData = m_tensorToDmaBufferData[tensor];
198
+ if (!m_dmaBufDeinit) {
199
+ QNN_ERROR("DmaBufferAllocator: DmaBuf Deinit function pointer is null");
200
+ return false;
201
+ }
202
+ munmap(dmaBufferData.memPointer, dmaBufferData.totalBufferSize);
203
+ m_dmaBufDeinit(dmaBufferData.dmaBufferAllocator);
204
+ m_tensorToDmaBufferData.erase(tensor);
205
+ return true;
206
+ }
207
+
208
+ bool DmaBufferAllocator::useSameMemory(Qnn_Tensor_t* dest, Qnn_Tensor_t* src) {
209
+ if (nullptr == dest || nullptr == src) {
210
+ QNN_ERROR("DmaBufferAllocator: Received nullptr");
211
+ return false;
212
+ }
213
+ if (m_tensorToDmaBufferData.find(src) == m_tensorToDmaBufferData.end()) {
214
+ QNN_ERROR("DmaBufferAllocator: Src Tensor not found");
215
+ return false;
216
+ }
217
+
218
+ QNN_TENSOR_SET_MEM_TYPE(dest, QNN_TENSOR_GET_MEM_TYPE(src));
219
+ QNN_TENSOR_SET_MEM_HANDLE(dest, QNN_TENSOR_GET_MEM_HANDLE(src));
220
+ m_tensorToDmaBufferData.insert({dest, m_tensorToDmaBufferData[src]});
221
+ m_sameMemoryFreeTensors.insert(dest);
222
+ return true;
223
+ }
224
+
225
+
226
+
227
+ bool DmaBufferAllocator::beforeWriteToBuffer(Qnn_Tensor_t* tensor) {
228
+ if (!tensor) {
229
+ QNN_WARN("beforeWriteToBuffer: received a null pointer to a tensor");
230
+ return false;
231
+ }
232
+ if (m_tensorToDmaBufferData.find(tensor) == m_tensorToDmaBufferData.end()) {
233
+ QNN_ERROR("beforeWriteToBuffer: Tensor not found with address = %p", tensor);
234
+ return false;
235
+ }
236
+ DmaBufferData dmaBufferData = m_tensorToDmaBufferData[tensor];
237
+ struct dma_buf_sync buf_sync = {};
238
+ buf_sync.flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE;
239
+ auto ioctlReturnValue = ioctl(dmaBufferData.fd, DMA_BUF_IOCTL_SYNC, &buf_sync);
240
+ if (ioctlReturnValue) {
241
+ QNN_ERROR(
242
+ "beforeWriteToBuffer: Error preparing the cache for buffer writes."
243
+ "The DMA_BUF_IOCTL_SYNC operation returned %d",
244
+ ioctlReturnValue);
245
+ return false;
246
+ }
247
+ return true;
248
+ }
249
+
250
+ bool DmaBufferAllocator::afterWriteToBuffer(Qnn_Tensor_t* tensor) {
251
+ if (!tensor) {
252
+ QNN_WARN("afterWriteToBuffer: received a null pointer to a tensor");
253
+ return false;
254
+ }
255
+ if (m_tensorToDmaBufferData.find(tensor) == m_tensorToDmaBufferData.end()) {
256
+ QNN_ERROR("afterWriteToBuffer: Tensor not found with address = %p", tensor);
257
+ return false;
258
+ }
259
+ DmaBufferData dmaBufferData = m_tensorToDmaBufferData[tensor];
260
+ struct dma_buf_sync buf_sync = {};
261
+ buf_sync.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE;
262
+ auto ioctlReturnValue = ioctl(dmaBufferData.fd, DMA_BUF_IOCTL_SYNC, &buf_sync);
263
+ if (ioctlReturnValue) {
264
+ QNN_ERROR(
265
+ "afterWriteToBuffer: Error close the cache after buffer writing."
266
+ "The DMA_BUF_IOCTL_SYNC operation returned %d",
267
+ ioctlReturnValue);
268
+ return false;
269
+ }
270
+ return true;
271
+ }
272
+
273
+ bool DmaBufferAllocator::beforeReadFromBuffer(Qnn_Tensor_t* tensor) {
274
+ if (!tensor) {
275
+ QNN_WARN("beforeReadFromBuffer: received a null pointer to a tensor");
276
+ return false;
277
+ }
278
+ if (m_tensorToDmaBufferData.find(tensor) == m_tensorToDmaBufferData.end()) {
279
+ QNN_ERROR("beforeReadFromBuffer: Tensor not found with address = %p", tensor);
280
+ return false;
281
+ }
282
+ DmaBufferData dmaBufferData = m_tensorToDmaBufferData[tensor];
283
+ struct dma_buf_sync buf_sync = {};
284
+ buf_sync.flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_READ;
285
+ auto ioctlReturnValue = ioctl(dmaBufferData.fd, DMA_BUF_IOCTL_SYNC, &buf_sync);
286
+ if (ioctlReturnValue) {
287
+ QNN_ERROR(
288
+ "beforeReadFromBuffer: Error preparing the cache for buffer reading."
289
+ "The DMA_BUF_IOCTL_SYNC operation returned %d",
290
+ ioctlReturnValue);
291
+ return false;
292
+ }
293
+ return true;
294
+ }
295
+
296
+ bool DmaBufferAllocator::afterReadFromBuffer(Qnn_Tensor_t* tensor) {
297
+ if (!tensor) {
298
+ QNN_WARN("afterReadFromBuffer: received a null pointer to a tensor");
299
+ return false;
300
+ }
301
+ if (m_tensorToDmaBufferData.find(tensor) == m_tensorToDmaBufferData.end()) {
302
+ QNN_ERROR("afterReadFromBuffer: Tensor not found with address = %p", tensor);
303
+ return false;
304
+ }
305
+ DmaBufferData dmaBufferData = m_tensorToDmaBufferData[tensor];
306
+ struct dma_buf_sync buf_sync = {};
307
+ buf_sync.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_READ;
308
+ auto ioctlReturnValue = ioctl(dmaBufferData.fd, DMA_BUF_IOCTL_SYNC, &buf_sync);
309
+ if (ioctlReturnValue) {
310
+ QNN_ERROR(
311
+ "afterReadFromBuffer: Error closing the cache after buffer reading."
312
+ "The DMA_BUF_IOCTL_SYNC operation returned %d",
313
+ ioctlReturnValue);
314
+ return false;
315
+ }
316
+ return true;
317
+ }
Genie/Genie/src/qualla/engines/qnn-api/DmaBufAllocator.hpp ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //==============================================================================
2
+ //
3
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4
+ // All Rights Reserved.
5
+ // Confidential and Proprietary - Qualcomm Technologies, Inc.
6
+ //
7
+ //==============================================================================
8
+ #pragma once
9
+
10
+ #include <map>
11
+ #include <unordered_map>
12
+ #include <unordered_set>
13
+ #include <vector>
14
+
15
+ #include "IBufferAlloc.hpp"
16
+ #include "QnnInterface.h"
17
+ #include "Log.hpp"
18
+
19
+ typedef void *(*DmaBufCreateFn_t)();
20
+ typedef int (*DmaBufAllocFn_t)(void *, const char *, size_t, unsigned int, size_t);
21
+ typedef void (*DmaBufDeinitFn_t)(void *);
22
+
23
// Bookkeeping record for one DMA-BUF backed allocation.
struct DmaBufferData {
  void* dmaBufferAllocator;  // opaque handle from CreateDmabufHeapBufferAllocator()
  int fd;                    // file descriptor returned by DmabufHeapAlloc()
  void* memPointer;          // CPU-visible mmap()ed address of the buffer
  size_t totalBufferSize;    // size of the mapping, in bytes
  int offset{0};             // not set by this allocator; stays 0
  // Default record: "no allocation".
  DmaBufferData() : dmaBufferAllocator(nullptr), fd(-1), memPointer(nullptr), totalBufferSize(0) {}
  // Record for a live allocation.
  DmaBufferData(void* bufferAllocator, int fdIn, void* memPointerIn, size_t sizeIn)
      : dmaBufferAllocator(bufferAllocator),
        fd(fdIn),
        memPointer(memPointerIn),
        totalBufferSize(sizeIn) {}
};
33
+
34
+ class DmaBufferAllocator final : public IBufferAlloc {
35
+ public:
36
+ DmaBufferAllocator(Qnn_ContextHandle_t contextHandle, QNN_INTERFACE_VER_TYPE* qnnInterface);
37
+ // Disable copy constructors, r-value referencing, etc
38
+ DmaBufferAllocator(const DmaBufferAllocator&) = delete;
39
+ DmaBufferAllocator& operator=(const DmaBufferAllocator&) = delete;
40
+ DmaBufferAllocator(DmaBufferAllocator&&) = delete;
41
+ DmaBufferAllocator& operator=(DmaBufferAllocator&&) = delete;
42
+
43
+ bool initialize() override;
44
+ void* getBuffer(Qnn_Tensor_t* tensor) override;
45
+ int getFd(Qnn_Tensor_t* tensor) override;
46
+ size_t getOffset(Qnn_Tensor_t* tensor) override;
47
+ size_t getBufferSize(Qnn_Tensor_t* tensor) override;
48
+ size_t getTotalBufferSize(Qnn_Tensor_t* tensor) override;
49
+
50
+ bool freeTensorBuffer(Qnn_Tensor_t* tensor) override;
51
+
52
+ bool allocateTensorBuffer(Qnn_Tensor_t* tensor, size_t tensorDataSize) override;
53
+ bool useSameMemory(Qnn_Tensor_t* dest, Qnn_Tensor_t* src) override;
54
+
55
+ virtual ~DmaBufferAllocator();
56
+
57
+ bool beforeWriteToBuffer(Qnn_Tensor_t *tensor) override;
58
+ bool afterWriteToBuffer(Qnn_Tensor_t *tensor) override;
59
+ bool beforeReadFromBuffer(Qnn_Tensor_t *tensor) override;
60
+ bool afterReadFromBuffer(Qnn_Tensor_t *tensor) override;
61
+
62
+
63
+ bool useSameMemory(Qnn_Tensor_t* dest, Qnn_Tensor_t* src, int offset) override {
64
+ QNN_WARN("Offset based tensors not supported!!");
65
+ return false;;
66
+ }
67
+ bool useExternalMemory(Qnn_Tensor_t* dest, void* extMem) override {
68
+ QNN_WARN("External Memory not supported!!");
69
+ return false;;
70
+ }
71
+ void* allocateTensorFusedBuffer(uint64_t bufferSize, int32_t* fd) override {
72
+ QNN_WARN("Fused Buffers not supported\n");
73
+ return nullptr;
74
+ };
75
+ bool allocateBuffers(
76
+ const std::map<int, std::map<std::string, size_t>>& allocs_per_chunk,
77
+ std::map<std::string, std::pair<int, size_t>>& tensor_offsets
78
+ ) override {
79
+ QNN_WARN("Fused Buffers not supported\n");
80
+ return false;
81
+ };
82
+ bool mapFusedBufferOffset(
83
+ Qnn_Tensor_t* tensor,
84
+ size_t tensorDataSize,
85
+ int32_t fd,
86
+ uint32_t offset,
87
+ uint64_t totalBufferSize,
88
+ void* memPointer,
89
+ Qnn_ContextHandle_t contextHandle
90
+ ) override {
91
+ QNN_WARN("Fused Buffers not supported\n");
92
+ return false;
93
+ };
94
+ bool deregisterTensorFusedBuffer(Qnn_Tensor_t* tensor) override {
95
+ QNN_WARN("Fused Buffers not supported\n");
96
+ return false;
97
+ };
98
+ void freeFusedBuffers() override {
99
+ return;
100
+ };
101
+ bool mapFusedBufferOffset(
102
+ Qnn_Tensor_t* tensor,
103
+ int alloc_idx,
104
+ size_t offset,
105
+ Qnn_ContextHandle_t ctx,
106
+ size_t size
107
+ ) override {
108
+ QNN_WARN("Fused Buffers not supported\n");
109
+ return false;
110
+ };
111
+
112
+ private:
113
+ DmaBufferData * getDmaBufTensorData(Qnn_Tensor_t* tensor);
114
+
115
+ // Pointer to the dlopen'd libdmabufheap.so shared library which contains
116
+ // dmaBufCreate, dmaBufAlloc, dmaBufDeinit
117
+ void *m_libDmaBufHeapHandle;
118
+ DmaBufCreateFn_t m_dmaBufCreate;
119
+ DmaBufAllocFn_t m_dmaBufAlloc;
120
+ DmaBufDeinitFn_t m_dmaBufDeinit;
121
+
122
+ QNN_INTERFACE_VER_TYPE* m_qnnInterface;
123
+ Qnn_ContextHandle_t m_contextHandle;
124
+
125
+ std::unordered_map<Qnn_Tensor_t *, DmaBufferData> m_tensorToDmaBufferData;
126
+ std::unordered_set<Qnn_Tensor_t*> m_sameMemoryFreeTensors;
127
+ std::unordered_map<Qnn_MemHandle_t, DmaBufferData> m_memHandleToDmaBufMem;
128
+ };
Genie/Genie/src/qualla/engines/qnn-api/IBufferAlloc.hpp CHANGED
@@ -53,4 +53,18 @@ class IBufferAlloc {
53
 
54
  virtual bool deregisterTensorFusedBuffer(Qnn_Tensor_t* tensor) = 0;
55
  virtual void freeFusedBuffers() = 0;
56
- };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
  virtual bool deregisterTensorFusedBuffer(Qnn_Tensor_t* tensor) = 0;
55
  virtual void freeFusedBuffers() = 0;
56
+
57
+ // Functions to sync memory buffers for Read/Write using DmaBuf.
58
+ virtual bool beforeWriteToBuffer(Qnn_Tensor_t *tensor) {
59
+ return false;
60
+ };
61
+ virtual bool afterWriteToBuffer(Qnn_Tensor_t *tensor) {
62
+ return false;
63
+ };
64
+ virtual bool beforeReadFromBuffer(Qnn_Tensor_t *tensor) {
65
+ return false;
66
+ };
67
+ virtual bool afterReadFromBuffer(Qnn_Tensor_t *tensor) {
68
+ return false;
69
+ };
70
+ };
Genie/Genie/src/qualla/engines/qnn-api/IOTensor.cpp CHANGED
@@ -10,6 +10,9 @@
10
  #include <iostream>
11
 
12
  #include "ClientBuffer.hpp"
 
 
 
13
  #include "IBufferAlloc.hpp"
14
  #include "IOTensor.hpp"
15
  #include "RpcMem.hpp"
@@ -28,6 +31,14 @@ IOTensor::IOTensor(BufferAlloc bufferAllocIn, QNN_INTERFACE_VER_TYPE* qnnInterfa
28
  bool IOTensor::initialize(Qnn_ContextHandle_t contextHandle) {
29
  if (m_bufferAlloc == BufferAlloc::SHARED_BUFFER) {
30
  m_bufferManager = std::unique_ptr<IBufferAlloc>(new RpcMem(contextHandle, m_qnnInterface));
 
 
 
 
 
 
 
 
31
  }
32
 
33
  if (true != m_bufferManager->initialize()) {
@@ -39,7 +50,7 @@ bool IOTensor::initialize(Qnn_ContextHandle_t contextHandle) {
39
  }
40
 
41
  IOTensor::~IOTensor() {
42
- if (m_bufferAlloc == BufferAlloc::SHARED_BUFFER) {
43
  m_bufferManager->freeFusedBuffers();
44
  }
45
  }
@@ -215,6 +226,70 @@ bool IOTensor::setupOutputTensors(
215
  return true;
216
  }
217
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  bool IOTensor::mapFusedBufferOffset(
219
  GraphInfo_t* graph_info,
220
  Qnn_ContextHandle_t context_handle,
 
10
  #include <iostream>
11
 
12
  #include "ClientBuffer.hpp"
13
+ #ifndef _WIN32
14
+ #include "DmaBufAllocator.hpp"
15
+ #endif
16
  #include "IBufferAlloc.hpp"
17
  #include "IOTensor.hpp"
18
  #include "RpcMem.hpp"
 
31
  bool IOTensor::initialize(Qnn_ContextHandle_t contextHandle) {
32
  if (m_bufferAlloc == BufferAlloc::SHARED_BUFFER) {
33
  m_bufferManager = std::unique_ptr<IBufferAlloc>(new RpcMem(contextHandle, m_qnnInterface));
34
+ } else if (m_bufferAlloc == BufferAlloc::DMABUF) {
35
+ #ifdef _WIN32
36
+ return false;
37
+ #else
38
+ m_bufferManager =
39
+ std::unique_ptr<IBufferAlloc>(new DmaBufferAllocator(contextHandle, m_qnnInterface)
40
+ );
41
+ #endif
42
  }
43
 
44
  if (true != m_bufferManager->initialize()) {
 
50
  }
51
 
52
  IOTensor::~IOTensor() {
53
+ if (m_bufferAlloc == BufferAlloc::SHARED_BUFFER || m_bufferAlloc == BufferAlloc::DMABUF) {
54
  m_bufferManager->freeFusedBuffers();
55
  }
56
  }
 
226
  return true;
227
  }
228
 
229
+ // Setup details for Qnn_Tensor_t for execution.
230
+ // Reuse same memory handle for KV input and output tensor.
231
+ bool IOTensor::setupOutputWithSharedTensors(
232
+ Qnn_Tensor_t** tensors,
233
+ std::unordered_map<std::string, void*>& tensorNameToTensorPointer,
234
+ const GraphInfo_t& graphInfo,
235
+ std::unordered_map<std::string, size_t>& tensorsSize,
236
+ Qnn_ContextHandle_t contextHandle,
237
+ std::unordered_map<std::string, Qnn_Tensor_t*> sharedTensorMap
238
+ ) {
239
+ uint32_t tensorCount = graphInfo.numOutputTensors;
240
+ TensorWrapper* tensorWrappers = graphInfo.outputTensors;
241
+ if (nullptr == tensorWrappers) {
242
+ QNN_ERROR("tensorWrappers is nullptr");
243
+ return false;
244
+ }
245
+
246
+ if (0 == tensorCount) {
247
+ QNN_DEBUG("tensor count is 0. Nothing to setup.");
248
+ return true;
249
+ }
250
+
251
+ *tensors = (Qnn_Tensor_t*)calloc(1, tensorCount * sizeof(Qnn_Tensor_t));
252
+ if (nullptr == *tensors) {
253
+ QNN_ERROR("mem alloc failed for *tensors");
254
+ return false;
255
+ }
256
+
257
+ bool returnStatus = true;
258
+ for (size_t tensorIdx = 0; tensorIdx < tensorCount; tensorIdx++) {
259
+ Qnn_Tensor_t wrapperTensor = GET_TENSOR_WRAPPER_TENSOR(tensorWrappers[tensorIdx]);
260
+ auto wrapperTensorName = std::string(GET_TENSOR_WRAPPER_NAME(tensorWrappers[tensorIdx]));
261
+ if (true == returnStatus) {
262
+ (*tensors)[tensorIdx] = QNN_TENSOR_INIT;
263
+ returnStatus = deepCopyQnnTensorInfo(((*tensors) + tensorIdx), &wrapperTensor);
264
+ }
265
+ if (true == returnStatus) {
266
+ if (sharedTensorMap.find(wrapperTensorName) == sharedTensorMap.end()) {
267
+ QNN_DEBUG("IoTensor :: Create Buffer for Tensor %s", wrapperTensorName.c_str());
268
+ size_t tensorDataSize = tensorsSize[wrapperTensorName];
269
+ returnStatus = m_bufferManager->allocateTensorBuffer(
270
+ ((*tensors) + tensorIdx), tensorDataSize
271
+ );
272
+ } else {
273
+ std::string inputName = QNN_TENSOR_GET_NAME(sharedTensorMap[wrapperTensorName]);
274
+ QNN_DEBUG("IoTensor :: Reuse Buffer %s for Tensor %s", inputName.c_str(), wrapperTensorName.c_str());
275
+ returnStatus = m_bufferManager->useSameMemory(
276
+ ((*tensors) + tensorIdx), sharedTensorMap[wrapperTensorName]
277
+ );
278
+ }
279
+ }
280
+ if (true != returnStatus) {
281
+ QNN_ERROR("Failure in setupTensors, cleaning up resources");
282
+ tearDownTensors(*tensors, tensorIdx);
283
+ *tensors = nullptr;
284
+ QNN_ERROR("Failure in setupTensors, done cleaning up resources");
285
+ break;
286
+ } else {
287
+ tensorNameToTensorPointer.insert({wrapperTensorName, ((*tensors) + tensorIdx)});
288
+ }
289
+ }
290
+ return returnStatus;
291
+ }
292
+
293
  bool IOTensor::mapFusedBufferOffset(
294
  GraphInfo_t* graph_info,
295
  Qnn_ContextHandle_t context_handle,
Genie/Genie/src/qualla/engines/qnn-api/IOTensor.hpp CHANGED
@@ -28,6 +28,7 @@
28
  enum class BufferAlloc {
29
  DEFAULT, // malloc based allocator
30
  SHARED_BUFFER, // shared buffer allocator; actual allocator depends on the platform
 
31
  INVALID
32
  };
33
  class IBufferAlloc;
@@ -60,6 +61,16 @@ class IOTensor {
60
  bool skipBufferAllocation = false
61
  );
62
 
 
 
 
 
 
 
 
 
 
 
63
  bool tearDownTensors(Qnn_Tensor_t* tensors, uint32_t tensorCount);
64
 
65
  bool tearDownTensors(std::vector<Qnn_Tensor_t*>& tensors, uint32_t tensorCount);
@@ -146,6 +157,20 @@ class IOTensor {
146
 
147
  std::unordered_set<void*>& getFreeTensorsPointerSet() { return m_freeTensorsPointerSet; }
148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  private:
150
  BufferAlloc m_bufferAlloc;
151
  QNN_INTERFACE_VER_TYPE* m_qnnInterface;
 
28
  enum class BufferAlloc {
29
  DEFAULT, // malloc based allocator
30
  SHARED_BUFFER, // shared buffer allocator; actual allocator depends on the platform
31
+ DMABUF, // dma buffer allocator
32
  INVALID
33
  };
34
  class IBufferAlloc;
 
61
  bool skipBufferAllocation = false
62
  );
63
 
64
+ bool setupOutputWithSharedTensors(
65
+ Qnn_Tensor_t** outputs,
66
+ std::unordered_map<std::string, void*>& tensorNameToTensorPointer,
67
+ const GraphInfo_t& graphInfo,
68
+ std::unordered_map<std::string, size_t>& outputTensorsSize,
69
+ Qnn_ContextHandle_t contextHandle,
70
+ std::unordered_map<std::string, Qnn_Tensor_t *> sharedTensorMap
71
+ );
72
+
73
+
74
  bool tearDownTensors(Qnn_Tensor_t* tensors, uint32_t tensorCount);
75
 
76
  bool tearDownTensors(std::vector<Qnn_Tensor_t*>& tensors, uint32_t tensorCount);
 
157
 
158
  std::unordered_set<void*>& getFreeTensorsPointerSet() { return m_freeTensorsPointerSet; }
159
 
160
+ // Functions to sync memory buffers for Read/Write using DmaBuf.
161
+ bool beforeWriteToBuffer(Qnn_Tensor_t *tensor) {
162
+ return m_bufferManager->beforeWriteToBuffer(tensor);
163
+ }
164
+ bool afterWriteToBuffer(Qnn_Tensor_t *tensor){
165
+ return m_bufferManager->afterWriteToBuffer(tensor);
166
+ }
167
+ bool beforeReadFromBuffer(Qnn_Tensor_t *tensor){
168
+ return m_bufferManager->beforeReadFromBuffer(tensor);
169
+ }
170
+ bool afterReadFromBuffer(Qnn_Tensor_t *tensor){
171
+ return m_bufferManager->afterReadFromBuffer(tensor);
172
+ }
173
+
174
  private:
175
  BufferAlloc m_bufferAlloc;
176
  QNN_INTERFACE_VER_TYPE* m_qnnInterface;
Genie/Genie/src/qualla/engines/qnn-api/QnnApi.cpp CHANGED
@@ -106,11 +106,17 @@ bool QnnApi::getContextConfigs(
106
  ) {
107
  std::vector<QnnContext_Config_t*> contextConfigPtrsVec;
108
 
109
- if (contextPriority != QNN_PRIORITY_DEFAULT) {
110
- contextConfigPtrsVec.push_back((QnnContext_Config_t*)malloc(sizeof(QnnContext_Config_t)));
111
  contextConfigPtrsVec.back()->option =
112
- QnnContext_ConfigOption_t::QNN_CONTEXT_CONFIG_OPTION_PRIORITY;
113
- contextConfigPtrsVec.back()->priority = contextPriority;
 
 
 
 
 
 
114
  }
115
 
116
  const char** graphNames = nullptr;
@@ -891,6 +897,8 @@ bool QnnApi::composeGraphs(
891
  QnnLog_Level_t::QNN_LOG_LEVEL_VERBOSE
892
  );
893
 
 
 
894
  if (status == MODEL_NO_ERROR) {
895
  return true;
896
  }
@@ -1163,33 +1171,6 @@ bool QnnApi::createFromBinary(
1163
  }
1164
  }
1165
 
1166
- QnnContext_Config_t** contextConfigs = nullptr;
1167
- uint32_t contextConfigCount = 0;
1168
- if (true != getContextConfigs(
1169
- &contextConfigs,
1170
- contextConfigCount,
1171
- contextConfig.priority,
1172
- graphSwitching,
1173
- execSelectGraphs,
1174
- loadSelectGraphs
1175
- )) {
1176
- QNN_ERROR("Couldn't populate context configs");
1177
- return false;
1178
- }
1179
-
1180
- // Merge BE specific and agnostic configs
1181
- QnnContext_Config_t** allContextConfigs{nullptr};
1182
- if (true != mergeAllContextConfigs(
1183
- &allContextConfigs,
1184
- customConfigs,
1185
- contextConfigs,
1186
- customConfigCount,
1187
- contextConfigCount
1188
- )) {
1189
- QNN_ERROR("Error merging custom and context configs");
1190
- return false;
1191
- }
1192
-
1193
  if (nullptr == m_qnnSystemInterface.systemContextCreate ||
1194
  nullptr == m_qnnSystemInterface.systemContextGetBinaryInfo ||
1195
  nullptr == m_qnnSystemInterface.systemContextFree) {
@@ -1299,9 +1280,36 @@ bool QnnApi::createFromBinary(
1299
  }
1300
 
1301
  bool isIOBufferMgrInitialized = false;
1302
-
1303
  for (size_t contextIdx = 0; contextIdx < cachedBinariesPathVec.size(); contextIdx++) {
1304
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1305
  if (nullptr == m_qnnInterface.contextCreateFromBinary) {
1306
  QNN_ERROR(
1307
  "contextCreateFromBinaryFnHandle is nullptr for context index = %zu", contextIdx
@@ -1498,7 +1506,13 @@ bool QnnApi::createFromBinary(
1498
  first_contextHandle = contextHandle;
1499
  }
1500
  #endif
1501
-
 
 
 
 
 
 
1502
  }
1503
 
1504
  m_isContextCreated = true;
@@ -1507,14 +1521,6 @@ bool QnnApi::createFromBinary(
1507
  "Initialized %u graphs from %lu contexts", m_graphsCount, cachedBinariesPathVec.size()
1508
  );
1509
 
1510
- if (true != freeContextConfigs(contextConfigs, contextConfigCount)) {
1511
- QNN_ERROR("Couldn't free context configs");
1512
- return false;
1513
- }
1514
- if (allContextConfigs) {
1515
- free(allContextConfigs);
1516
- }
1517
-
1518
  if (nullptr != m_backendExtensions && m_backendExtensions->interface()) {
1519
  if (!m_backendExtensions->interface()->afterCreateFromBinary()) {
1520
  QNN_ERROR("Extensions Failure in afterCreateFromBinary()");
@@ -1599,34 +1605,6 @@ bool QnnApi::createFromBinaryListAsync(
1599
  }
1600
  }
1601
 
1602
-
1603
- QnnContext_Config_t** contextConfigs = nullptr;
1604
- uint32_t contextConfigCount = 0;
1605
- if (true != getContextConfigs(
1606
- &contextConfigs,
1607
- contextConfigCount,
1608
- contextConfig.priority,
1609
- graphSwitching,
1610
- execSelectGraphs,
1611
- loadSelectGraphs
1612
- )) {
1613
- QNN_ERROR("Couldn't populate context configs");
1614
- return false;
1615
- }
1616
-
1617
- // Merge BE specific and agnostic configs
1618
- QnnContext_Config_t** allContextConfigs{nullptr};
1619
- if (true != mergeAllContextConfigs(
1620
- &allContextConfigs,
1621
- customConfigs,
1622
- contextConfigs,
1623
- customConfigCount,
1624
- contextConfigCount
1625
- )) {
1626
- QNN_ERROR("Error merging custom and context configs");
1627
- return false;
1628
- }
1629
-
1630
  if (nullptr == m_qnnSystemInterface.systemContextCreate ||
1631
  nullptr == m_qnnSystemInterface.systemContextGetBinaryInfo ||
1632
  nullptr == m_qnnSystemInterface.systemContextFree) {
@@ -1642,6 +1620,8 @@ bool QnnApi::createFromBinaryListAsync(
1642
  GraphInfo_t*** graphsInfo =
1643
  (GraphInfo_t***)calloc(cachedBinariesPathVec.size(), sizeof(GraphInfo_t**));
1644
  uint32_t graphsTotalNum = 0;
 
 
1645
 
1646
  for (size_t contextIdx = 0; contextIdx < cachedBinariesPathVec.size(); contextIdx++) {
1647
  auto _startPerContext = std::chrono::steady_clock::now();
@@ -1710,17 +1690,41 @@ bool QnnApi::createFromBinaryListAsync(
1710
  m_qnnSystemInterface.systemContextFree(sysCtxHandle);
1711
  sysCtxHandle = nullptr;
1712
 
1713
- uint32_t customConfigCountSF = 0;
 
 
 
 
 
 
 
 
 
 
 
 
1714
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1715
  if (mmap_budget > 0) {
1716
  QnnHtpContext_CustomConfig_t customConfigReadBudget;
1717
  customConfigReadBudget.option = QNN_HTP_CONTEXT_CONFIG_OPTION_FILE_READ_MEMORY_BUDGET;
1718
  customConfigReadBudget.fileReadMemoryBudgetInMb = mmap_budget;
1719
 
1720
  QnnContext_Config_t** cfgs{nullptr};
1721
-
1722
  uint32_t customConfigCountReadBudget = 1;
1723
-
1724
  cfgs = (QnnContext_Config_t**)malloc(
1725
  customConfigCountReadBudget * sizeof(QnnContext_Config_t*)
1726
  );
@@ -1729,15 +1733,16 @@ bool QnnApi::createFromBinaryListAsync(
1729
  cfgs[0]->customConfig =
1730
  reinterpret_cast<QnnContext_CustomConfig_t>(&customConfigReadBudget);
1731
  if (true != mergeAllContextConfigs(
1732
- &allContextConfigs,
1733
  cfgs,
1734
- allContextConfigs,
1735
  customConfigCountReadBudget,
1736
  contextConfigCount + customConfigCount + customConfigCountSF
1737
  )) {
1738
  QNN_ERROR("Error merging custom and context configs");
1739
  return false;
1740
  }
 
1741
  }
1742
 
1743
  if (m_profileBackendHandle) {
@@ -1751,7 +1756,7 @@ bool QnnApi::createFromBinaryListAsync(
1751
  .version = QNN_CONTEXT_PARAMS_VERSION_1,
1752
  .v1 =
1753
  QnnContext_ParamsV1_t{
1754
- (const QnnContext_Config_t**)allContextConfigs,
1755
  (const void*)buffer.get(),
1756
  bufferSize,
1757
  nullptr,
@@ -1778,18 +1783,15 @@ bool QnnApi::createFromBinaryListAsync(
1778
  }
1779
 
1780
  auto start = std::chrono::steady_clock::now();
1781
-
1782
-
1783
  auto errCode = m_qnnInterface.contextCreateFromBinaryListAsync(
1784
  m_backendHandle,
1785
  m_deviceHandle,
1786
  const_cast<const QnnContext_Params_t**>(context_params_list.data()),
1787
- (const QnnContext_Config_t**)allContextConfigs,
1788
  nullptr
1789
  );
1790
-
1791
-
1792
  auto stop = std::chrono::steady_clock::now();
 
1793
  QNN_DEBUG(
1794
  "Initializing %lu context with %u graphs took: %lld us",
1795
  cachedBinariesPathVec.size(),
@@ -1824,26 +1826,24 @@ bool QnnApi::createFromBinaryListAsync(
1824
 
1825
  m_isContextCreated = true;
1826
 
1827
- if (true != freeContextConfigs(contextConfigs, contextConfigCount)) {
1828
- QNN_ERROR("Couldn't free context configs");
1829
- return false;
1830
- }
1831
-
1832
  if (true != freeContextParams(context_params_list.data(), cachedBinariesPathVec.size())) {
1833
  QNN_ERROR("Couldn't free context params list");
1834
  return false;
1835
  }
1836
 
1837
- if (allContextConfigs) {
1838
- free(allContextConfigs);
1839
- }
1840
-
1841
  if (nullptr != m_backendExtensions && m_backendExtensions->interface()) {
1842
  if (!m_backendExtensions->interface()->afterCreateContextsFromBinaryList()) {
1843
  QNN_ERROR("Extensions Failure in afterCreateContextsFromBinaryList()");
1844
  return false;
1845
  }
1846
  }
 
 
 
 
 
 
 
1847
  return true;
1848
  }
1849
  #endif
@@ -2543,6 +2543,64 @@ bool QnnApi::extractProfilingEvent(QnnProfile_EventId_t profileEventId) {
2543
  return true;
2544
  }
2545
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2546
  bool QnnApi::applyBinarySection(uint32_t binIndex, std::string binSectionPath,bool useMmap,bool graphSwitch) {
2547
  #if QUALLA_QNN_API_VERSION < 21700
2548
  QNN_ERROR("LoRA adaptors require QNN SDK >= 2.25.1. Please update your libraries");
@@ -2650,7 +2708,7 @@ bool QnnApi::applyBinarySection(uint32_t binIndex, std::string binSectionPath,bo
2650
  #endif
2651
  }
2652
 
2653
- bool QnnApi::updateIOEncodings(std::shared_ptr<uint8_t>& buffer,uint64_t bufferSize,uint32_t graphIndex){
2654
 
2655
  QNN_DEBUG("Applying adapter Encodings");
2656
  QnnSystemContext_Handle_t sysCtxHandle{nullptr};
@@ -2679,3 +2737,224 @@ bool QnnApi::updateIOEncodings(std::shared_ptr<uint8_t>& buffer,uint64_t buffer
2679
  QNN_DEBUG(" updateIOEncodings success ");
2680
  return true;
2681
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  ) {
107
  std::vector<QnnContext_Config_t*> contextConfigPtrsVec;
108
 
109
+ if (contextPriority == QNN_PRIORITY_UNDEFINED) {
110
+ contextConfigPtrsVec.push_back((QnnContext_Config_t *) malloc(sizeof(QnnContext_Config_t)));
111
  contextConfigPtrsVec.back()->option =
112
+ QnnContext_ConfigOption_t::QNN_CONTEXT_CONFIG_UNDEFINED;
113
+ } else {
114
+ if (contextPriority != QNN_PRIORITY_DEFAULT) {
115
+ contextConfigPtrsVec.push_back((QnnContext_Config_t *) malloc(sizeof(QnnContext_Config_t)));
116
+ contextConfigPtrsVec.back()->option =
117
+ QnnContext_ConfigOption_t::QNN_CONTEXT_CONFIG_OPTION_PRIORITY;
118
+ contextConfigPtrsVec.back()->priority = contextPriority;
119
+ }
120
  }
121
 
122
  const char** graphNames = nullptr;
 
897
  QnnLog_Level_t::QNN_LOG_LEVEL_VERBOSE
898
  );
899
 
900
+ graphCountPerContext = m_graphsCount;
901
+
902
  if (status == MODEL_NO_ERROR) {
903
  return true;
904
  }
 
1171
  }
1172
  }
1173
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1174
  if (nullptr == m_qnnSystemInterface.systemContextCreate ||
1175
  nullptr == m_qnnSystemInterface.systemContextGetBinaryInfo ||
1176
  nullptr == m_qnnSystemInterface.systemContextFree) {
 
1280
  }
1281
 
1282
  bool isIOBufferMgrInitialized = false;
 
1283
  for (size_t contextIdx = 0; contextIdx < cachedBinariesPathVec.size(); contextIdx++) {
1284
 
1285
+ // Create context configs for each context
1286
+ QnnContext_Config_t** contextConfigs = nullptr;
1287
+ uint32_t contextConfigCount = 0;
1288
+ if (true != getContextConfigs(
1289
+ &contextConfigs,
1290
+ contextConfigCount,
1291
+ contextConfig.priority,
1292
+ graphSwitching,
1293
+ execSelectGraphs,
1294
+ loadSelectGraphs
1295
+ )) {
1296
+ QNN_ERROR("Couldn't populate context configs");
1297
+ return false;
1298
+ }
1299
+
1300
+ // Merge BE specific and agnostic configs
1301
+ QnnContext_Config_t** allContextConfigs{nullptr};
1302
+ if (true != mergeAllContextConfigs(
1303
+ &allContextConfigs,
1304
+ customConfigs,
1305
+ contextConfigs,
1306
+ customConfigCount,
1307
+ contextConfigCount
1308
+ )) {
1309
+ QNN_ERROR("Error merging custom and context configs");
1310
+ return false;
1311
+ }
1312
+
1313
  if (nullptr == m_qnnInterface.contextCreateFromBinary) {
1314
  QNN_ERROR(
1315
  "contextCreateFromBinaryFnHandle is nullptr for context index = %zu", contextIdx
 
1506
  first_contextHandle = contextHandle;
1507
  }
1508
  #endif
1509
+ if (true != freeContextConfigs(contextConfigs, contextConfigCount)) {
1510
+ QNN_ERROR("Couldn't free context configs");
1511
+ return false;
1512
+ }
1513
+ if (allContextConfigs) {
1514
+ free(allContextConfigs);
1515
+ }
1516
  }
1517
 
1518
  m_isContextCreated = true;
 
1521
  "Initialized %u graphs from %lu contexts", m_graphsCount, cachedBinariesPathVec.size()
1522
  );
1523
 
 
 
 
 
 
 
 
 
1524
  if (nullptr != m_backendExtensions && m_backendExtensions->interface()) {
1525
  if (!m_backendExtensions->interface()->afterCreateFromBinary()) {
1526
  QNN_ERROR("Extensions Failure in afterCreateFromBinary()");
 
1605
  }
1606
  }
1607
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1608
  if (nullptr == m_qnnSystemInterface.systemContextCreate ||
1609
  nullptr == m_qnnSystemInterface.systemContextGetBinaryInfo ||
1610
  nullptr == m_qnnSystemInterface.systemContextFree) {
 
1620
  GraphInfo_t*** graphsInfo =
1621
  (GraphInfo_t***)calloc(cachedBinariesPathVec.size(), sizeof(GraphInfo_t**));
1622
  uint32_t graphsTotalNum = 0;
1623
+ std::vector<QnnContext_Config_t**> allContextConfigs{(unsigned int)cachedBinariesPathVec.size(), nullptr};
1624
+ std::vector<uint32_t> allContextConfigsSize{(unsigned int)cachedBinariesPathVec.size()};
1625
 
1626
  for (size_t contextIdx = 0; contextIdx < cachedBinariesPathVec.size(); contextIdx++) {
1627
  auto _startPerContext = std::chrono::steady_clock::now();
 
1690
  m_qnnSystemInterface.systemContextFree(sysCtxHandle);
1691
  sysCtxHandle = nullptr;
1692
 
1693
+ uint32_t contextConfigCount = 0;
1694
+ if (true != getContextConfigs(
1695
+ &allContextConfigs[contextIdx],
1696
+ contextConfigCount,
1697
+ contextConfig.priority,
1698
+ graphSwitching,
1699
+ execSelectGraphs,
1700
+ loadSelectGraphs
1701
+ )) {
1702
+ QNN_ERROR("Couldn't populate context configs");
1703
+ return false;
1704
+ }
1705
+ allContextConfigsSize[contextIdx] = contextConfigCount;
1706
 
1707
+ // Merge BE specific and agnostic configs
1708
+ if (true != mergeAllContextConfigs(
1709
+ &allContextConfigs[contextIdx],
1710
+ customConfigs,
1711
+ allContextConfigs[contextIdx],
1712
+ customConfigCount,
1713
+ contextConfigCount
1714
+ )) {
1715
+ QNN_ERROR("Error merging custom and context configs");
1716
+ return false;
1717
+ }
1718
+ allContextConfigsSize[contextIdx] += customConfigCount;
1719
+
1720
+ uint32_t customConfigCountSF = 0;
1721
  if (mmap_budget > 0) {
1722
  QnnHtpContext_CustomConfig_t customConfigReadBudget;
1723
  customConfigReadBudget.option = QNN_HTP_CONTEXT_CONFIG_OPTION_FILE_READ_MEMORY_BUDGET;
1724
  customConfigReadBudget.fileReadMemoryBudgetInMb = mmap_budget;
1725
 
1726
  QnnContext_Config_t** cfgs{nullptr};
 
1727
  uint32_t customConfigCountReadBudget = 1;
 
1728
  cfgs = (QnnContext_Config_t**)malloc(
1729
  customConfigCountReadBudget * sizeof(QnnContext_Config_t*)
1730
  );
 
1733
  cfgs[0]->customConfig =
1734
  reinterpret_cast<QnnContext_CustomConfig_t>(&customConfigReadBudget);
1735
  if (true != mergeAllContextConfigs(
1736
+ &allContextConfigs[contextIdx],
1737
  cfgs,
1738
+ allContextConfigs[contextIdx],
1739
  customConfigCountReadBudget,
1740
  contextConfigCount + customConfigCount + customConfigCountSF
1741
  )) {
1742
  QNN_ERROR("Error merging custom and context configs");
1743
  return false;
1744
  }
1745
+ allContextConfigsSize[contextIdx] += customConfigCountReadBudget;
1746
  }
1747
 
1748
  if (m_profileBackendHandle) {
 
1756
  .version = QNN_CONTEXT_PARAMS_VERSION_1,
1757
  .v1 =
1758
  QnnContext_ParamsV1_t{
1759
+ (const QnnContext_Config_t**)allContextConfigs[contextIdx],
1760
  (const void*)buffer.get(),
1761
  bufferSize,
1762
  nullptr,
 
1783
  }
1784
 
1785
  auto start = std::chrono::steady_clock::now();
 
 
1786
  auto errCode = m_qnnInterface.contextCreateFromBinaryListAsync(
1787
  m_backendHandle,
1788
  m_deviceHandle,
1789
  const_cast<const QnnContext_Params_t**>(context_params_list.data()),
1790
+ (const QnnContext_Config_t**)customConfigs,
1791
  nullptr
1792
  );
 
 
1793
  auto stop = std::chrono::steady_clock::now();
1794
+
1795
  QNN_DEBUG(
1796
  "Initializing %lu context with %u graphs took: %lld us",
1797
  cachedBinariesPathVec.size(),
 
1826
 
1827
  m_isContextCreated = true;
1828
 
 
 
 
 
 
1829
  if (true != freeContextParams(context_params_list.data(), cachedBinariesPathVec.size())) {
1830
  QNN_ERROR("Couldn't free context params list");
1831
  return false;
1832
  }
1833
 
 
 
 
 
1834
  if (nullptr != m_backendExtensions && m_backendExtensions->interface()) {
1835
  if (!m_backendExtensions->interface()->afterCreateContextsFromBinaryList()) {
1836
  QNN_ERROR("Extensions Failure in afterCreateContextsFromBinaryList()");
1837
  return false;
1838
  }
1839
  }
1840
+
1841
+ for (size_t contextIdx = 0; contextIdx < cachedBinariesPathVec.size(); contextIdx++) {
1842
+ if (true != freeContextConfigs(allContextConfigs[contextIdx], allContextConfigsSize[contextIdx])) {
1843
+ QNN_ERROR("Couldn't free context configs");
1844
+ return false;
1845
+ }
1846
+ }
1847
  return true;
1848
  }
1849
  #endif
 
2543
  return true;
2544
  }
2545
 
2546
+ bool QnnApi::applyBinarySection(uint32_t graphId, std::string binSectionPath) {
2547
+ #if QUALLA_QNN_API_VERSION < 21700
2548
+ QNN_ERROR("LoRA adaptors require QNN SDK >= 2.25.1. Please update your libraries");
2549
+ return false;
2550
+ #else
2551
+ // assumption splitNum from 0
2552
+ QNN_DEBUG("QnnApi::applyBinarySection %d ", graphId);
2553
+ if (nullptr == m_qnnInterface.contextApplyBinarySection) {
2554
+ QNN_ERROR("contextApplyBinarySection Interface not suported!!");
2555
+ return false;
2556
+ }
2557
+ if (graphId >= m_graphsCount) {
2558
+ QNN_ERROR(" Passed split %d base Model graphcount %d ", graphId, m_graphsCount);
2559
+ return false;
2560
+ }
2561
+ uint64_t bufferSize{0};
2562
+ std::shared_ptr<uint8_t> buffer{nullptr};
2563
+ bufferSize = getFileSize(binSectionPath);
2564
+ buffer = std::shared_ptr<uint8_t>(new uint8_t[bufferSize]);
2565
+ if (true != readBinaryFromFile(binSectionPath, buffer.get(), bufferSize)) {
2566
+ QNN_ERROR("Failed to read binary data for context index = %d", graphId);
2567
+ return false;
2568
+ }
2569
+
2570
+ QnnContext_Buffer_t qnnBuffer;
2571
+ qnnBuffer.version = QNN_CONTEXT_BUFFER_VERSION_1;
2572
+ qnnBuffer.v1.memType = QNN_CONTEXTMEMTYPE_RAW;
2573
+ qnnBuffer.v1.binaryBuf.dataSize = bufferSize;
2574
+ qnnBuffer.v1.binaryBuf.data = static_cast<void*>(buffer.get());
2575
+ auto graphCountPerContext = getGraphCountPerContext();
2576
+ if (graphCountPerContext <= 0) {
2577
+ QNN_ERROR(" graphCountPerContext is <=0 ");
2578
+ return false;
2579
+ }
2580
+
2581
+ auto contextHandle = m_contextVec[graphId / graphCountPerContext];
2582
+ auto graphHandle = m_graphsInfo[graphId]->graph;
2583
+ if (contextHandle == nullptr || graphHandle == nullptr) {
2584
+ QNN_ERROR(" contexthandle or graph handle is null for patch no = %d ", graphId);
2585
+ return false;
2586
+ }
2587
+
2588
+ auto errorCode = m_qnnInterface.contextApplyBinarySection(
2589
+ contextHandle,
2590
+ graphHandle,
2591
+ QNN_CONTEXT_SECTION_UPDATABLE,
2592
+ &qnnBuffer,
2593
+ nullptr, //profile handle is null
2594
+ nullptr //singal handle is null
2595
+ );
2596
+ if (errorCode != QNN_SUCCESS) {
2597
+ QNN_ERROR("Could not Apply Patch for graph = %d errocode = %zu ", graphId, errorCode);
2598
+ return false;
2599
+ }
2600
+ return true;
2601
+ #endif
2602
+ }
2603
+
2604
  bool QnnApi::applyBinarySection(uint32_t binIndex, std::string binSectionPath,bool useMmap,bool graphSwitch) {
2605
  #if QUALLA_QNN_API_VERSION < 21700
2606
  QNN_ERROR("LoRA adaptors require QNN SDK >= 2.25.1. Please update your libraries");
 
2708
  #endif
2709
  }
2710
 
2711
+ bool QnnApi::updateIOEncodings(std::shared_ptr<uint8_t>& buffer,uint64_t bufferSize,uint32_t graphIndex) {
2712
 
2713
  QNN_DEBUG("Applying adapter Encodings");
2714
  QnnSystemContext_Handle_t sysCtxHandle{nullptr};
 
2737
  QNN_DEBUG(" updateIOEncodings success ");
2738
  return true;
2739
  }
2740
+
2741
+ // This is a light weight function of existing ::createFromBinary, used for
2742
+ // GPU execution to avoid conflicts with HTP use-case and for better readability.
2743
+ bool QnnApi::createFromBinary(
2744
+ std::vector<std::string> cachedBinariesPathVec
2745
+ ) {
2746
+ auto _start = std::chrono::steady_clock::now();
2747
+
2748
+ if (nullptr == m_qnnSystemInterface.systemContextCreate ||
2749
+ nullptr == m_qnnSystemInterface.systemContextGetBinaryInfo ||
2750
+ nullptr == m_qnnSystemInterface.systemContextFree) {
2751
+ QNN_ERROR("QNN System function pointers are not populated.");
2752
+ return false;
2753
+ }
2754
+
2755
+ graphCountPerContext = getGraphCountPerContext();
2756
+
2757
+ for (size_t contextIdx = 0; contextIdx < cachedBinariesPathVec.size(); contextIdx++) {
2758
+ uint64_t bufferSize{0};
2759
+ std::shared_ptr<uint8_t> buffer{nullptr};
2760
+ uint32_t graphsCount;
2761
+
2762
+ // read serialized binary into a byte buffer
2763
+ bufferSize = getFileSize(cachedBinariesPathVec[contextIdx]);
2764
+ if (0 == bufferSize) {
2765
+ QNN_ERROR(
2766
+ "Received path to an empty file for context index = %zu. Nothing to deserialize.",
2767
+ contextIdx
2768
+ );
2769
+ return false;
2770
+ }
2771
+
2772
+ buffer = std::shared_ptr<uint8_t>(
2773
+ new uint8_t[bufferSize], std::default_delete<uint8_t[]>()
2774
+ );
2775
+ if (!buffer) {
2776
+ QNN_ERROR("Failed to allocate memory for context index = %zu", contextIdx);
2777
+ return false;
2778
+ }
2779
+ if (true !=
2780
+ readBinaryFromFile(cachedBinariesPathVec[contextIdx], buffer.get(), bufferSize)) {
2781
+ QNN_ERROR("Failed to read binary data for context index = %zu", contextIdx);
2782
+ return false;
2783
+ }
2784
+
2785
+ // inspect binary info
2786
+ QnnSystemContext_Handle_t sysCtxHandle{nullptr};
2787
+ if (QNN_SUCCESS != m_qnnSystemInterface.systemContextCreate(&sysCtxHandle)) {
2788
+ QNN_ERROR("Could not create system handle for context index = %zu", contextIdx);
2789
+ return false;
2790
+ }
2791
+
2792
+ const QnnSystemContext_BinaryInfo_t* binaryInfo{nullptr};
2793
+ Qnn_ContextBinarySize_t binaryInfoSize{0};
2794
+
2795
+ if (QNN_SUCCESS != m_qnnSystemInterface.systemContextGetBinaryInfo(
2796
+ sysCtxHandle,
2797
+ static_cast<void*>(buffer.get()),
2798
+ bufferSize,
2799
+ &binaryInfo,
2800
+ &binaryInfoSize
2801
+ )) {
2802
+ QNN_ERROR("Failed to get context binary info for context index = %zu", contextIdx);
2803
+ return false;
2804
+ }
2805
+
2806
+ GraphInfo_t** graphsInfo;
2807
+ if (!copyMetadataToGraphsInfo(binaryInfo, graphsInfo, graphsCount)) {
2808
+ QNN_ERROR("Failed to copy metadata for graph index = %zu", contextIdx);
2809
+ freeGraphsInfo(&graphsInfo, graphsCount);
2810
+ if (contextIdx > 0) freeGraphsInfo(&m_graphsInfo, m_graphsCount);
2811
+ return false;
2812
+ }
2813
+
2814
+ if (graphCountPerContext == -1) {
2815
+ graphCountPerContext = graphsCount;
2816
+ m_graphsInfo = (GraphInfo_t**)calloc(
2817
+ graphCountPerContext * cachedBinariesPathVec.size(), sizeof(GraphInfo_t*)
2818
+ );
2819
+ } else if (graphCountPerContext != graphsCount) {
2820
+ QNN_ERROR(
2821
+ "Different len(graphs) found in different context files. Found %u vs %u",
2822
+ graphsCount,
2823
+ graphCountPerContext
2824
+ );
2825
+ freeGraphsInfo(&graphsInfo, graphsCount);
2826
+ if (contextIdx > 0) freeGraphsInfo(&m_graphsInfo, m_graphsCount);
2827
+ return false;
2828
+ }
2829
+ m_qnnSystemInterface.systemContextFree(sysCtxHandle);
2830
+ sysCtxHandle = nullptr;
2831
+
2832
+ if (nullptr == m_qnnInterface.contextCreateFromBinary) {
2833
+ QNN_ERROR(
2834
+ "contextCreateFromBinaryFnHandle is nullptr for context index = %zu", contextIdx
2835
+ );
2836
+ freeGraphsInfo(&graphsInfo, graphsCount);
2837
+ if (contextIdx > 0) freeGraphsInfo(&m_graphsInfo, m_graphsCount);
2838
+ return false;
2839
+ }
2840
+ Qnn_ContextHandle_t contextHandle{nullptr};
2841
+ auto _stop = std::chrono::steady_clock::now();
2842
+ QNN_DEBUG(
2843
+ "Loading contexts[%lu] took: %lld us",
2844
+ contextIdx,
2845
+ std::chrono::duration_cast<std::chrono::microseconds>(_stop - _start).count()
2846
+ );
2847
+
2848
+ auto start = std::chrono::steady_clock::now();
2849
+
2850
+ auto errCode = m_qnnInterface.contextCreateFromBinary(
2851
+ m_backendHandle,
2852
+ m_deviceHandle,
2853
+ nullptr,
2854
+ (const void*)buffer.get(),
2855
+ bufferSize,
2856
+ &contextHandle,
2857
+ nullptr // profile handle
2858
+
2859
+ );
2860
+
2861
+ if (errCode != QNN_SUCCESS) {
2862
+ QNN_ERROR(
2863
+ "Could not create context from binary for context index = %zu : err %d",
2864
+ contextIdx,
2865
+ (int)errCode
2866
+ );
2867
+ freeGraphsInfo(&graphsInfo, graphsCount);
2868
+ if (contextIdx > 0) freeGraphsInfo(&m_graphsInfo, m_graphsCount);
2869
+ return false;
2870
+ }
2871
+
2872
+ auto stop = std::chrono::steady_clock::now();
2873
+ QNN_DEBUG(
2874
+ "Initializing context[%lu] with %u graphs took: %lld us",
2875
+ contextIdx,
2876
+ graphsCount,
2877
+ std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count()
2878
+ );
2879
+
2880
+ for (int n_graph = 0; n_graph < graphsCount; n_graph++) {
2881
+ // Allocate inputTensors and outputTensors
2882
+ GraphInfo_t* cur_graph = graphsInfo[n_graph];
2883
+
2884
+ m_graphsInfo[m_graphsCount++] = cur_graph;
2885
+ m_contextMap[cur_graph] = contextHandle;
2886
+ }
2887
+ m_contextVec.push_back(contextHandle);
2888
+ }
2889
+
2890
+ m_isContextCreated = true;
2891
+
2892
+ QNN_DEBUG(
2893
+ "Initialized %u graphs from %lu contexts", m_graphsCount, cachedBinariesPathVec.size()
2894
+ );
2895
+
2896
+ if (nullptr == m_qnnInterface.graphRetrieve) {
2897
+ QNN_ERROR("graphRetrieveFnHandle is nullptr.");
2898
+ freeGraphsInfo(&m_graphsInfo, m_graphsCount);
2899
+ return false;
2900
+ }
2901
+
2902
+ for (size_t graphIdx = 0; graphIdx < m_graphsCount; graphIdx++) {
2903
+ if (!m_graphsInfo || QNN_SUCCESS != m_qnnInterface.graphRetrieve(
2904
+ m_contextVec[graphIdx / graphCountPerContext],
2905
+ m_graphsInfo[graphIdx]->graphName,
2906
+ &(m_graphsInfo[graphIdx]->graph)
2907
+ )) {
2908
+ QNN_ERROR("Unable to retrieve graph handle for graph index = %zu", graphIdx);
2909
+ freeGraphsInfo(&m_graphsInfo, m_graphsCount);
2910
+ return false;
2911
+ }
2912
+ }
2913
+
2914
+ return true;
2915
+ }
2916
+
2917
+ bool QnnApi::initialize(
2918
+ std::string backendPath,
2919
+ std::vector<std::string> modelPathOrCachedBinaryPath
2920
+ ) {
2921
+ if (modelPathOrCachedBinaryPath.size() != 1) {
2922
+ QNN_ERROR("Multiple Files not supported for now!!");
2923
+ return false;
2924
+ }
2925
+
2926
+ if (false == getQnnInterface(backendPath)) {
2927
+ QNN_ERROR("Qnn getQnnInterface FAILED!");
2928
+ return false;
2929
+ }
2930
+
2931
+ const std::string systemLibraryPath = "libQnnSystem.so";
2932
+ if (false == getQnnSystemInterface(systemLibraryPath)) {
2933
+ QNN_ERROR("Qnn getQnnSystemInterface FAILED!");
2934
+ return false;
2935
+ }
2936
+
2937
+ QnnLog_Level_t logLevel = QNN_LOG_LEVEL_INFO;
2938
+ if (false == initializeLogging(logLevel, false)) {
2939
+ QNN_ERROR("Unable to Initialize logging in backend");
2940
+ return false;
2941
+ }
2942
+
2943
+ // Initialize Backend
2944
+ if (false == initializeBackend()) {
2945
+ QNN_ERROR("Qnn initializeBackend FAILED!");
2946
+ return false;
2947
+ }
2948
+
2949
+ if (false == createFromBinary(modelPathOrCachedBinaryPath)) {
2950
+ QNN_ERROR("Create From Binary FAILED!");
2951
+ return false;
2952
+ }
2953
+
2954
+ for (size_t graphIdx = 0; graphIdx < m_graphsCount; graphIdx++) {
2955
+ m_graphNameToIndex[m_graphsInfo[graphIdx]->graphName] = graphIdx;
2956
+ }
2957
+ QNN_DEBUG("Model Initialized");
2958
+
2959
+ return true;
2960
+ }
Genie/Genie/src/qualla/engines/qnn-api/QnnApi.hpp CHANGED
@@ -370,6 +370,8 @@ class QnnApi {
370
 
371
  bool applyBinarySection(uint32_t binIndex, std::string binSectionPath,bool useMmap,bool graphSwitch);
372
 
 
 
373
  QNN_INTERFACE_VER_TYPE* getQnnInterfaceVer() { return &m_qnnInterface; };
374
  GraphInfo_t**& getGraphsInfo() { return m_graphsInfo; };
375
  uint32_t getGraphsCount() { return m_graphsCount; };
@@ -426,4 +428,11 @@ class QnnApi {
426
  bool updateIOEncodings(std::shared_ptr<uint8_t>& buffer,
427
  uint64_t bufferSize,
428
  uint32_t graphIndex);
 
 
 
 
 
 
 
429
  };
 
370
 
371
  bool applyBinarySection(uint32_t binIndex, std::string binSectionPath,bool useMmap,bool graphSwitch);
372
 
373
+ bool applyBinarySection(uint32_t graphId, std::string binSectionPath);
374
+
375
  QNN_INTERFACE_VER_TYPE* getQnnInterfaceVer() { return &m_qnnInterface; };
376
  GraphInfo_t**& getGraphsInfo() { return m_graphsInfo; };
377
  uint32_t getGraphsCount() { return m_graphsCount; };
 
428
  bool updateIOEncodings(std::shared_ptr<uint8_t>& buffer,
429
  uint64_t bufferSize,
430
  uint32_t graphIndex);
431
+
432
+ bool createFromBinary(std::vector<std::string> cachedBinariesPathVec);
433
+
434
+ bool initialize(
435
+ std::string backendPath,
436
+ std::vector<std::string> modelPathOrCachedBinaryPath
437
+ );
438
  };
Genie/Genie/src/qualla/engines/qnn-api/qnn-utils.hpp CHANGED
@@ -46,14 +46,14 @@ bool writeRawData(void* tensorData, size_t tensorSize, const std::filesystem::pa
46
  bool readRawData(void* tensorData, size_t tensorSize, const std::filesystem::path& path);
47
 
48
  struct Dims {
49
- int32_t batch = 1;
50
- int32_t height, width, channel, bitWidth;
51
  Dims() : height(0), width(0), channel(0), bitWidth(0) {}
52
- Dims(int32_t height, int32_t width, int32_t channel, int32_t bitWidth)
53
  : height(height), width(width), channel(channel), bitWidth(bitWidth) {}
54
  Dims(std::vector<size_t>& tDims)
55
- : height((int32_t)tDims[1]), width((int32_t)tDims[2]), channel((int32_t)tDims[3]),
56
- bitWidth((int32_t)tDims[4]) {
57
  // Hack to mix batch dimension
58
  if (tDims[0] != 1 && tDims[1] == 1) height = tDims[0];
59
  if (tDims[0] > 1 && tDims[1] != 1) batch = tDims[0];
 
46
  bool readRawData(void* tensorData, size_t tensorSize, const std::filesystem::path& path);
47
 
48
  struct Dims {
49
+ uint32_t batch = 1;
50
+ uint32_t height, width, channel, bitWidth;
51
  Dims() : height(0), width(0), channel(0), bitWidth(0) {}
52
+ Dims(uint32_t height, uint32_t width, uint32_t channel, uint32_t bitWidth)
53
  : height(height), width(width), channel(channel), bitWidth(bitWidth) {}
54
  Dims(std::vector<size_t>& tDims)
55
+ : height((uint32_t)tDims[1]), width((uint32_t)tDims[2]), channel((uint32_t)tDims[3]),
56
+ bitWidth((uint32_t)tDims[4]) {
57
  // Hack to mix batch dimension
58
  if (tDims[0] != 1 && tDims[1] == 1) height = tDims[0];
59
  if (tDims[0] > 1 && tDims[1] != 1) batch = tDims[0];
Genie/Genie/src/qualla/engines/qnn-cpu.cpp CHANGED
@@ -55,8 +55,10 @@ class QnnCpuEngine : public Engine {
55
  virtual bool updateKV(size_t n_past) override;
56
  virtual bool updateKV(size_t n_past, const std::vector<bool>& selected) override;
57
  virtual bool save(const std::string& name) override;
58
- virtual size_t restore(const std::string& name) override;
59
  virtual void reset() override;
 
 
60
  };
61
 
62
  namespace fs = std::filesystem;
@@ -98,7 +100,40 @@ QnnCpuEngine::QnnCpuEngine(Context& ctx, const qualla::json& json) : Engine(ctx,
98
  p.use_mmap = conf.optional<bool>("use-mmap", false);
99
  p.ctx_size = _ctx.size();
100
  p.n_vocab_size = _ctx.n_vocab();
101
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  _model = std::make_unique<QnnCpuModel>(_env, p);
103
 
104
  // Load model
@@ -211,7 +246,7 @@ size_t QnnCpuEngine::process(
211
  );
212
  }
213
 
214
- size_t QnnCpuEngine::restore(const std::string& name) {
215
  fs::path cache_path = std::filesystem::path(name) / fmt::format("kv-cache.{}.qnn-cpu", _role);
216
  return _model->loadKVCache(cache_path.string());
217
  }
@@ -226,6 +261,23 @@ void QnnCpuEngine::reset() {
226
  updateKV(0);
227
  }
228
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  // Registrator instance
230
  static OnLoad regy([]() {
231
  Engine::__register("qnn-cpu", [](Context& ctx, const json& conf) {
 
55
  virtual bool updateKV(size_t n_past) override;
56
  virtual bool updateKV(size_t n_past, const std::vector<bool>& selected) override;
57
  virtual bool save(const std::string& name) override;
58
+ virtual size_t restore(const std::string& name, bool chooseHigherVariant) override;
59
  virtual void reset() override;
60
+ virtual bool applyLoraAdapter(std::string lora_adapter_name) override;
61
+ virtual bool applyLoraStrength(std::string tensor_name, float tensor_val) override;
62
  };
63
 
64
  namespace fs = std::filesystem;
 
100
  p.use_mmap = conf.optional<bool>("use-mmap", false);
101
  p.ctx_size = _ctx.size();
102
  p.n_vocab_size = _ctx.n_vocab();
103
+ p.lora_config_type = LoraConfigType::LORA_DISABLE;
104
+ qualla::json lora_conf = conf.optional<qualla::json>("lora", {});
105
+ if (lora_conf.size() != 0) {
106
+ p.lora_config_type = LoraConfigType::LORA_ADAPTER_WEIGHT_ENABLE;
107
+ if (lora_conf.is_array()) {
108
+ for (auto lc : lora_conf) {
109
+ std::string lnm = lc["adapter-name"];
110
+ p.lora_config[lnm].lora_name = lnm;
111
+ p.lora_config[lnm].alpha_tensor_name = lc["alpha-tensor-name"];
112
+ p.lora_config[lnm].alpha_tensor_val = 0.0f;
113
+ if(lc.contains("alpha-tensor-value")){
114
+ p.lora_config[lnm].alpha_tensor_val = lc["alpha-tensor-value"];
115
+ }
116
+ std::string basedir = "";
117
+ if(lc.contains("binsection-basedir")){
118
+ basedir = lc["binsection-basedir"];
119
+ }
120
+ uint32_t n = lc["bin-sections"].size();
121
+ for (uint32_t i = 0; i < n; i++) {
122
+ auto binSec = lc["bin-sections"].get<std::vector<std::string>>();
123
+ fs::path binsection_path = fs::path(binSec[i]);
124
+ if (binsection_path.is_relative()) binsection_path = basedir / fs::path(binSec[i]);
125
+ if (!fs::is_regular_file(binsection_path)) {
126
+ __ERROR("qnn-cpu: Can't access Lora binsection adapter : {}",
127
+ binsection_path.string());
128
+ throw std::runtime_error(
129
+ "qnn-cpu: Can't open adapter file : " + binsection_path.string()
130
+ );
131
+ }
132
+ p.lora_config[lnm].binsection_list.push_back(binsection_path.string());
133
+ }
134
+ }
135
+ }
136
+ }
137
  _model = std::make_unique<QnnCpuModel>(_env, p);
138
 
139
  // Load model
 
246
  );
247
  }
248
 
249
+ size_t QnnCpuEngine::restore(const std::string& name, bool chooseHigherVariant) {
250
  fs::path cache_path = std::filesystem::path(name) / fmt::format("kv-cache.{}.qnn-cpu", _role);
251
  return _model->loadKVCache(cache_path.string());
252
  }
 
261
  updateKV(0);
262
  }
263
 
264
+ // For Lora
265
+ bool QnnCpuEngine::applyLoraAdapter(std::string lora_adapter_name) {
266
+ if (!_model) {
267
+ __ERROR("qnn-cpu: applyLoraAdapter failed, model not initialized");
268
+ return false;
269
+ }
270
+ return _model->applyLoraAdapter(lora_adapter_name);
271
+ }
272
+
273
+ bool QnnCpuEngine::applyLoraStrength(std::string tensor_name, float tensor_val) {
274
+ if (!_model) {
275
+ __ERROR("qnn-cpu: applyLoraStrength failed, model not initialized");
276
+ return false;
277
+ }
278
+ return _model->applyLoraStrength(tensor_name, tensor_val);
279
+ }
280
+
281
  // Registrator instance
282
  static OnLoad regy([]() {
283
  Engine::__register("qnn-cpu", [](Context& ctx, const json& conf) {
Genie/Genie/src/qualla/engines/qnn-cpu/cpu-model.cpp CHANGED
@@ -61,6 +61,12 @@ QnnCpuModel::QnnCpuModel(Env& env, const Params& params)
61
  m_output_dim.push_back(m_numLogits);
62
  m_output_dim.push_back(m_embd);
63
  }
 
 
 
 
 
 
64
  }
65
 
66
  QnnCpuModel::~QnnCpuModel() {
@@ -383,6 +389,7 @@ bool QnnCpuModel::initializeTensorPointers() {
383
  t_input_ids_k_cache = &input_specs["x3"];
384
  t_input_ids_v_cache = &input_specs["x4"];
385
  t_input_ids_n_past = &input_specs["x5"];
 
386
 
387
  auto& output_specs = m_output_specs[model_order.back()];
388
  t_logits = &output_specs["output_genAI"];
@@ -406,6 +413,7 @@ void QnnCpuModel::setupInputTensors(const std::vector<int32_t>& tokens, bool run
406
  uint32_t* input_id_num_token_buffer = (uint32_t*)getBuffer(t_input_ids_num_token);
407
  uint32_t* input_id_reset_kvcache_buffer = (uint32_t*)getBuffer(t_input_ids_reset_kvcache);
408
  uint32_t* input_id_n_past_buffer = (uint32_t*)getBuffer(t_input_ids_n_past);
 
409
 
410
  uint32_t size = 1;
411
  for (auto dim : m_input_dim) {
@@ -420,6 +428,7 @@ void QnnCpuModel::setupInputTensors(const std::vector<int32_t>& tokens, bool run
420
  std::memcpy(input_id_buffer, tokens.data(), tokens.size() * sizeof(uint32_t));
421
  *input_id_num_token_buffer = tokens.size();
422
  *input_id_n_past_buffer = m_nPast;
 
423
 
424
  auto stop = std::chrono::steady_clock::now();
425
  // QnnUtils::logProfile("setupInputTensors (cpp) took", start, stop);
@@ -589,6 +598,48 @@ size_t QnnCpuModel::getDequantLogits(std::vector<float>& dequant_logits, bool lo
589
  return logits_all? prev_run.num_tokens_processed : 1;
590
  }
591
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
592
  // TODO: implement save/restore
593
  size_t QnnCpuModel::loadKVCache(const std::string& load_path) {
594
  //TO read the cache file into KV tensor
 
61
  m_output_dim.push_back(m_numLogits);
62
  m_output_dim.push_back(m_embd);
63
  }
64
+ m_loraConfigType = params.lora_config_type;
65
+ m_lora_alpha_val = 1.0f;
66
+
67
+ if (m_loraConfigType == LoraConfigType::LORA_ADAPTER_WEIGHT_ENABLE) {
68
+ m_loraConfig.insert(params.lora_config.begin(), params.lora_config.end());
69
+ }
70
  }
71
 
72
  QnnCpuModel::~QnnCpuModel() {
 
389
  t_input_ids_k_cache = &input_specs["x3"];
390
  t_input_ids_v_cache = &input_specs["x4"];
391
  t_input_ids_n_past = &input_specs["x5"];
392
+ t_input_lora_alpha = &input_specs["x6"];
393
 
394
  auto& output_specs = m_output_specs[model_order.back()];
395
  t_logits = &output_specs["output_genAI"];
 
413
  uint32_t* input_id_num_token_buffer = (uint32_t*)getBuffer(t_input_ids_num_token);
414
  uint32_t* input_id_reset_kvcache_buffer = (uint32_t*)getBuffer(t_input_ids_reset_kvcache);
415
  uint32_t* input_id_n_past_buffer = (uint32_t*)getBuffer(t_input_ids_n_past);
416
+ float* input_id_lora_alpha = (float*)getBuffer(t_input_lora_alpha);
417
 
418
  uint32_t size = 1;
419
  for (auto dim : m_input_dim) {
 
428
  std::memcpy(input_id_buffer, tokens.data(), tokens.size() * sizeof(uint32_t));
429
  *input_id_num_token_buffer = tokens.size();
430
  *input_id_n_past_buffer = m_nPast;
431
+ *input_id_lora_alpha = m_lora_alpha_val;
432
 
433
  auto stop = std::chrono::steady_clock::now();
434
  // QnnUtils::logProfile("setupInputTensors (cpp) took", start, stop);
 
598
  return logits_all? prev_run.num_tokens_processed : 1;
599
  }
600
 
601
+ bool QnnCpuModel::applyBinarySections(std::vector<std::string>& binsection_list) {
602
+ //apply binary section for lora config
603
+ for (int i = 0; i < binsection_list.size(); i++) {
604
+ __DEBUG("qnn-cpu: applyBinarySections adapters {}", binsection_list.at(i));
605
+ if (!m_qnnApi->applyBinarySection(i, binsection_list.at(i))) {
606
+ __ERROR("qnn-cpu: Error in applyBinarySections {}", i);
607
+ return false;
608
+ }
609
+ }
610
+ return true;
611
+ }
612
+
613
+ bool QnnCpuModel::applyLoraStrength(const std::string& alpha_tensor_name, const float alpha_val) {
614
+ m_lora_alpha_val = alpha_val;
615
+ return true;
616
+ }
617
+
618
+ bool QnnCpuModel::applyLoraAdapter(const std::string& lora_adapter_name) {
619
+ if (m_loraConfigType != LoraConfigType::LORA_ADAPTER_WEIGHT_ENABLE) {
620
+ __ERROR("qnn-cpu: Lora config is not enable for adapters");
621
+ return false;
622
+ }
623
+
624
+ if (!m_loraConfig.contains(lora_adapter_name)) {
625
+ __ERROR("qnn-cpu: Could not find lora adapters config to apply ");
626
+ return false;
627
+ }
628
+ if (!applyLoraStrength(
629
+ m_loraConfig[lora_adapter_name].alpha_tensor_name,
630
+ m_loraConfig[lora_adapter_name].alpha_tensor_val
631
+ )) {
632
+ __ERROR("qnn-cpu: Could not apply Alpha tensor ");
633
+ return false;
634
+ }
635
+
636
+ if (!applyBinarySections(m_loraConfig[lora_adapter_name].binsection_list)) {
637
+ __ERROR("qnn-cpu: Could not apply binary Sections ");
638
+ return false;
639
+ }
640
+ return true;
641
+ }
642
+
643
  // TODO: implement save/restore
644
  size_t QnnCpuModel::loadKVCache(const std::string& load_path) {
645
  //TO read the cache file into KV tensor
Genie/Genie/src/qualla/engines/qnn-cpu/cpu-model.hpp CHANGED
@@ -26,6 +26,12 @@
26
 
27
  namespace qualla {
28
 
 
 
 
 
 
 
29
  class QnnCpuModel {
30
  enum ExecutionMode { AUTODETECT, BERT_KV, KV_ONLY, BERT_ONLY };
31
 
@@ -34,6 +40,13 @@ class QnnCpuModel {
34
  public:
35
  enum ModelOutput { LOGITS = 0x0, EMBEDDINGS= 0x1 };
36
 
 
 
 
 
 
 
 
37
  struct Params {
38
  std::filesystem::path model_basedir;
39
  std::string op_package;
@@ -50,6 +63,8 @@ class QnnCpuModel {
50
  uint32_t n_layer;
51
  uint32_t n_embd;
52
  uint32_t n_heads;
 
 
53
  };
54
 
55
  const std::filesystem::path model_basedir;
@@ -92,6 +107,11 @@ class QnnCpuModel {
92
  std::vector<Qnn_Param_t> m_params;
93
  ExecutionMode m_mode{ExecutionMode::AUTODETECT};
94
 
 
 
 
 
 
95
  // Save some information about the last inference run
96
  struct PreviousRunInfo {
97
  bool was_bert_mode;
@@ -118,6 +138,7 @@ class QnnCpuModel {
118
  QnnUtils::Tensor* t_input_ids_k_cache;
119
  QnnUtils::Tensor* t_input_ids_v_cache;
120
  QnnUtils::Tensor* t_input_ids_n_past;
 
121
  float* dequant_logits_ptr{nullptr};
122
 
123
  // Store pointers for bert
@@ -171,6 +192,10 @@ class QnnCpuModel {
171
  size_t loadKVCache(const std::string& save_path);
172
  bool saveKVCache(const std::string& load_path);
173
 
 
 
 
 
174
  private:
175
  bool m_mmap_context_bins = false; // mmap context binary files instead of reading them in memory
176
  // Internal functions to separate different runInference logic
 
26
 
27
  namespace qualla {
28
 
29
+ enum LoraConfigType {
30
+ LORA_DISABLE = 0,
31
+ LORA_INPUT_WEIGHT_ENABLE = 1,
32
+ LORA_ADAPTER_WEIGHT_ENABLE = 2
33
+ };
34
+
35
  class QnnCpuModel {
36
  enum ExecutionMode { AUTODETECT, BERT_KV, KV_ONLY, BERT_ONLY };
37
 
 
40
  public:
41
  enum ModelOutput { LOGITS = 0x0, EMBEDDINGS= 0x1 };
42
 
43
+ struct LoraConfig {
44
+ std::string lora_name;
45
+ std::vector<std::string> binsection_list; //loRAv2 adapter bins filenames
46
+ std::string alpha_tensor_name; //loRAv2 alpha tensor names
47
+ float alpha_tensor_val; //loRAv2 alpha tensor values
48
+ };
49
+
50
  struct Params {
51
  std::filesystem::path model_basedir;
52
  std::string op_package;
 
63
  uint32_t n_layer;
64
  uint32_t n_embd;
65
  uint32_t n_heads;
66
+ LoraConfigType lora_config_type;
67
+ std::map<std::string, LoraConfig> lora_config;
68
  };
69
 
70
  const std::filesystem::path model_basedir;
 
107
  std::vector<Qnn_Param_t> m_params;
108
  ExecutionMode m_mode{ExecutionMode::AUTODETECT};
109
 
110
+ // LoRA params and configs
111
+ float m_lora_alpha_val;
112
+ LoraConfigType m_loraConfigType;
113
+ std::map<std::string, LoraConfig> m_loraConfig;
114
+
115
  // Save some information about the last inference run
116
  struct PreviousRunInfo {
117
  bool was_bert_mode;
 
138
  QnnUtils::Tensor* t_input_ids_k_cache;
139
  QnnUtils::Tensor* t_input_ids_v_cache;
140
  QnnUtils::Tensor* t_input_ids_n_past;
141
+ QnnUtils::Tensor* t_input_lora_alpha;
142
  float* dequant_logits_ptr{nullptr};
143
 
144
  // Store pointers for bert
 
192
  size_t loadKVCache(const std::string& save_path);
193
  bool saveKVCache(const std::string& load_path);
194
 
195
+ bool applyLoraStrength(const std::string& alpha_tensor_name, const float alpha_val);
196
+ bool applyLoraAdapter(const std::string& lora_adapter_name);
197
+ bool applyBinarySections(std::vector<std::string>& binsection_list);
198
+
199
  private:
200
  bool m_mmap_context_bins = false; // mmap context binary files instead of reading them in memory
201
  // Internal functions to separate different runInference logic
Genie/Genie/src/qualla/engines/qnn-gpu.cpp ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
2
+ // Confidential & Proprietary - Qualcomm Technologies, Inc. ("QTI")
3
+
4
+ #include <vector>
5
+ #include <string>
6
+
7
+ #include <qualla/engine.hpp>
8
+ #include <qualla/detail/config.hpp>
9
+ #include <qualla/detail/timer.hpp>
10
+ #include <qualla/detail/onload.hpp>
11
+
12
+ #include <fmt/format.h>
13
+
14
+ #include "gpu-model.hpp"
15
+
16
+ #define __INFO(__fmt, ...) _env.logger().post(Logger::INFO, fmt::format(__fmt, ##__VA_ARGS__))
17
+ #define __WARN(__fmt, ...) _env.logger().post(Logger::WARN, fmt::format(__fmt, ##__VA_ARGS__))
18
+ #define __ERROR(__fmt, ...) _env.logger().post(Logger::ERROR, fmt::format(__fmt, ##__VA_ARGS__))
19
+ #define __KPIS(__fmt, ...) \
20
+ _env.logger().post(Logger::ENGINE_KPIS, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); })
21
+ #define __DEBUG(__fmt, ...) \
22
+ _env.logger().post(Logger::ENGINE_DEBUG, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); })
23
+ #define __TRACE(__fmt, ...) \
24
+ _env.logger().post(Logger::ENGINE_TRACE, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); })
25
+
26
+ namespace qualla {
27
+
28
+ class GpuEngine : public Engine {
29
+ private:
30
+ QnnGpuModel::Params _params;
31
+ std::unique_ptr<QnnGpuModel> _model;
32
+
33
+ public:
34
+ GpuEngine(Context& ctx, const qualla::json& json);
35
+ ~GpuEngine();
36
+
37
+ virtual size_t process(
38
+ const std::vector<int32_t>& tokens,
39
+ std::vector<float>& logits,
40
+ bool logits_all
41
+ ) override;
42
+
43
+ virtual bool updateKV(size_t n_past) override;
44
+ virtual bool save(const std::string& name) override;
45
+ virtual size_t restore(const std::string& name, bool chooseHigherVariant) override;
46
+ virtual void reset() override;
47
+
48
+ virtual bool load() override;
49
+ virtual bool unload() override;
50
+ };
51
+
52
+ namespace fs = std::filesystem;
53
+
54
+ GpuEngine::GpuEngine(Context& ctx, const qualla::json& json) : Engine(ctx, "qnn-gpu", json) {
55
+ qualla::Timer start;
56
+
57
+ using FF = Feature::Flags;
58
+ _features = FF::OUTPUT_LOGITS | FF::SAVE_RESTORE | FF::DYNAMIC_LOAD;
59
+
60
+ __DEBUG("Qnn-Gpu : init start");
61
+
62
+ qualla::Config conf(json, _type + "-engine:");
63
+
64
+ // Parse config
65
+ _params.model_basedir = conf.optional<std::string>("model-basedir", "");
66
+ if (_params.model_basedir.is_relative()) {
67
+ _params.model_basedir = _env.path().models / _params.model_basedir;
68
+ _params.model_basedir = _params.model_basedir.make_preferred();
69
+ }
70
+ _params.model_list = conf.mandatory<std::vector<std::string>>("model-list");
71
+
72
+ _params.ctx_size = _ctx.size();
73
+ _params.num_heads = conf.optional<int64_t>("num-heads", 32);
74
+ _params.head_dim = conf.optional<int64_t>("head-dim", 128);
75
+
76
+ if (!conf.optional<bool>("dynamic-load", false)) {
77
+ load();
78
+ }
79
+ };
80
+
81
+ GpuEngine::~GpuEngine() {
82
+ unload();
83
+ }
84
+
85
+ bool GpuEngine::load() {
86
+ #ifdef _WIN32
87
+ // QnnGpu Engine does not support Windows.
88
+ return false;
89
+ #endif
90
+ if (_model) return true;
91
+
92
+ qualla::Timer start;
93
+ bool status = true;
94
+
95
+ __INFO("Qnn-Gpu : Loading Model");
96
+
97
+ _model = std::make_unique<QnnGpuModel>(_env, _params);
98
+
99
+ // Load model
100
+ status = _model->initializeModel();
101
+ if (!status) {
102
+ throw std::runtime_error("Qnn-Gpu :Failure to initialize model");
103
+ }
104
+
105
+ // Initialize IO Tensor buffers
106
+ status = _model->initializeIOTensors();
107
+ if (!status) {
108
+ throw std::runtime_error("Qnn-Gpu :Error in setting up IO Tensors");
109
+ }
110
+
111
+ // Initialize IO Tensor Pointers
112
+ if (true != _model->initializeTensorPointers()) {
113
+ throw std::runtime_error("Qnn-Gpu :Could not find I/O tensors in loaded graphs");
114
+ }
115
+
116
+ // Validate the model
117
+ if (true != _model->validateModel()) {
118
+ throw std::runtime_error("Qnn-Gpu :Model Validation Failed");
119
+ }
120
+
121
+ _kpis.load.update(start.elapsed_usec());
122
+ return true;
123
+ }
124
+
125
+ bool GpuEngine::unload() {
126
+ qualla::Timer start;
127
+ __DEBUG("Qnn-Gpu : Unloading Model");
128
+ _model.reset(nullptr);
129
+ _kpis.unload.update(start.elapsed_usec());
130
+ return true;
131
+ }
132
+
133
+ // KV Cache updation after each inference is handled inside QnnGpu Backend
134
+ // GPU Engine uses same memory handle for each KV input/output to the graph and uses
135
+ // Scatter op to update KV after each inference to the same memory handle.
136
+ bool GpuEngine::updateKV(size_t n_past) {
137
+ return true;
138
+ }
139
+
140
+ size_t GpuEngine::process(
141
+ const std::vector<int32_t>& tokens,
142
+ std::vector<float>& logits,
143
+ bool logits_all
144
+ ) {
145
+ if (!_model && !load()) {
146
+ return 0;
147
+ }
148
+ qualla::Timer start;
149
+ size_t n_tok = _model->runInference(tokens, logits, logits_all);
150
+ if (n_tok == 0) {
151
+ State::error("Qnn-Gpu : RunInference Failed!");
152
+ }
153
+ _kpis.process.update(start.elapsed_usec());
154
+ return n_tok;
155
+ }
156
+
157
+ size_t GpuEngine::restore(const std::string& name, bool chooseHigherVariant) {
158
+ if (!_model && !load()) {
159
+ return 0;
160
+ }
161
+
162
+ fs::path cache_path = std::filesystem::path(name) / fmt::format("kv-cache.{}.qnn-gpu", _role);
163
+ return _model->loadKVCache(cache_path.string());
164
+ }
165
+
166
+ bool GpuEngine::save(const std::string& name) {
167
+ if (!_model && !load()) {
168
+ return false;
169
+ }
170
+
171
+ fs::path cache_path = std::filesystem::path(name) / fmt::format("kv-cache.{}.qnn-gpu", _role);
172
+ return _model->saveKVCache(cache_path.string());
173
+ }
174
+
175
+
176
+ // Reset requires clearing of KV caches only
177
+ void GpuEngine::reset() {
178
+ if (!_model && !load()) {
179
+ return;
180
+ }
181
+ _model->reset();
182
+ }
183
+
184
+ // Registrator instance
185
+ static OnLoad regy([]() {
186
+ Engine::__register("qnn-gpu", [](Context& ctx, const json& conf) {
187
+ return (Engine*)new GpuEngine(ctx, conf);
188
+ });
189
+ });
190
+
191
+ void needQnnGpuEngine() {}
192
+
193
+ } // namespace qualla
Genie/Genie/src/qualla/engines/qnn-gpu/gpu-model.cpp ADDED
@@ -0,0 +1,603 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
2
+ // Confidential & Proprietary - Qualcomm Technologies, Inc. ("QTI")
3
+
4
+ #include <cassert>
5
+ #include <cstring>
6
+ #include <fstream>
7
+ #include <set>
8
+ #include <sstream>
9
+
10
+ #include "fmt/format.h"
11
+ #include "fmt/ranges.h"
12
+ #include "fp16/fp16.h"
13
+ #include "gpu-model.hpp"
14
+ #include "qualla/detail/cache-file.hpp"
15
+ #include "qualla/detail/timer.hpp"
16
+ #include "qualla/env.hpp"
17
+
18
+ namespace fs = std::filesystem;
19
+
20
+ static constexpr uint32_t g_magicNum = 0xC0DE;
21
+
22
+ #define __INFO(__fmt, ...) _env.logger().post(Logger::INFO, fmt::format(__fmt, ##__VA_ARGS__));
23
+ #define __WARN(__fmt, ...) _env.logger().post(Logger::WARN, fmt::format(__fmt, ##__VA_ARGS__));
24
+ #define __ERROR(__fmt, ...) _env.logger().post(Logger::ERROR, fmt::format(__fmt, ##__VA_ARGS__));
25
+ #define __KPIS(__fmt, ...) \
26
+ _env.logger().post(Logger::ENGINE_KPIS, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); });
27
+ #define __DEBUG(__fmt, ...) \
28
+ _env.logger().post(Logger::ENGINE_DEBUG, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); });
29
+ #define __TRACE(__fmt, ...) \
30
+ _env.logger().post(Logger::ENGINE_TRACE, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); });
31
+
32
+ namespace qualla {
33
+
34
+ QnnGpuModel::QnnGpuModel(Env& env, const Params& params)
35
+ : _env(env), _modelBaseDir(params.model_basedir) {
36
+ // Initialize _qnnApi
37
+ _qnnApi = std::unique_ptr<QnnApi>(new QnnApi());
38
+
39
+ _ctxSize = params.ctx_size;
40
+ _numHeads = params.num_heads;
41
+ _headDim = params.head_dim;
42
+ #ifdef _WIN32
43
+ _useDmabufIo = false;
44
+ #else
45
+ _useDmabufIo = true;
46
+ #endif
47
+ // Set up filename list for context binaries.
48
+ for (auto& i : params.model_list) {
49
+ fs::path model_path = _modelBaseDir / fs::path(i);
50
+ if (!fs::is_regular_file(model_path)) {
51
+ __ERROR("Qnn-Gpu-Model : Can't access model file : {}", model_path.string());
52
+ throw std::runtime_error("Qnn-Gpu-Model : Can't access model file : " + model_path.string());
53
+ }
54
+ _modelList.push_back(model_path.string());
55
+ }
56
+ }
57
+
58
+ QnnGpuModel::~QnnGpuModel() { __INFO("Qnn-Gpu-Model : model destruct complete"); }
59
+
60
+ // Given a filename, initializeModel load and initializes QNN runtime libraries and the model
61
+ bool QnnGpuModel::initializeModel(void) {
62
+ qualla::Timer start;
63
+
64
+ __INFO("Qnn-Gpu-Model : Model Init Start");
65
+
66
+ const std::string backend = "libQnnGpu.so";
67
+
68
+ __INFO("Backend Library : {}", backend);
69
+ __INFO("Model Files : {}", _modelList);
70
+
71
+ if (!_qnnApi->initialize(backend, _modelList)) {
72
+ __ERROR("Qnn-Api : Initialization Failed!");
73
+ return false;
74
+ }
75
+
76
+ // Initialize QNN IO Tensor
77
+ if (_useDmabufIo) {
78
+ _ioTensor =
79
+ std::unique_ptr<IOTensor>(new IOTensor(BufferAlloc::DMABUF, _qnnApi->getQnnInterfaceVer()));
80
+ } else {
81
+ _ioTensor = std::unique_ptr<IOTensor>(
82
+ new IOTensor(BufferAlloc::DEFAULT, _qnnApi->getQnnInterfaceVer()));
83
+ }
84
+ _numGraphs = _qnnApi->getGraphsCount();
85
+ __INFO("Qnn-Gpu-Model : initialized with {} graph(s)", _numGraphs);
86
+
87
+ GraphInfo_t** graphs_info = _qnnApi->getGraphsInfo();
88
+ for (size_t graphIdx = 0; graphIdx < _numGraphs; graphIdx++) {
89
+ GraphInfo_t* const graphInfo = graphs_info[graphIdx];
90
+ char* graphName = graphInfo->graphName;
91
+ std::string graphStr = std::string(graphName);
92
+
93
+ _modelOrder.push_back(graphStr);
94
+ }
95
+ __INFO("Qnn-Gpu-Model : model init complete: {} usec", start.elapsed_usec());
96
+
97
+ return true;
98
+ }
99
+
100
+ // Once the model has been loaded, initialize IO Tensors
101
+ // _ioTensors is initialized by the context for now
102
+ bool QnnGpuModel::initializeIOTensors() {
103
+ qualla::Timer start;
104
+
105
+ // For QNN-GPU, we have only one context per model.
106
+ bool status = _ioTensor->initialize(_qnnApi->getContexts().back());
107
+ if (!status) {
108
+ __ERROR("Qnn-Gpu-Model : failure to initialize IOTensor");
109
+ return false;
110
+ }
111
+ // Getting graph info, Hardcoding single graph for now.
112
+ GraphInfo_t** const& graphsInfo = _qnnApi->getGraphsInfo();
113
+
114
+ for (size_t graphIdx = 0; graphIdx < _numGraphs; graphIdx++) {
115
+ GraphInfo_t* const& graphInfo = graphsInfo[graphIdx];
116
+ std::string graphName = std::string(graphInfo->graphName);
117
+
118
+ __DEBUG("Qnn-Gpu-Model : numInputTensors {} numOutputTensors {}",
119
+ graphInfo->numInputTensors,
120
+ graphInfo->numOutputTensors);
121
+ // Setup Inputs
122
+ {
123
+ std::unordered_map<std::string, size_t> inputTensorsSize;
124
+ for (size_t tensorIdx = 0; tensorIdx < graphInfo->numInputTensors; tensorIdx++) {
125
+ std::string tensorName;
126
+ std::vector<size_t> tensorDims;
127
+ auto& tensor = graphInfo->inputTensors[tensorIdx];
128
+ _qnnApi->getTensorNameAndShape(tensorName, tensorDims, tensor);
129
+ auto dims = QnnUtils::Dims(tensorDims);
130
+ inputTensorsSize[tensorName] = dims.getSize();
131
+ __DEBUG("Qnn-Gpu-Model : Input Tensor Info {} {} {} {}",
132
+ tensorIdx,
133
+ tensorName,
134
+ tensorDims,
135
+ inputTensorsSize[tensorName]);
136
+ std::vector<QnnUtils::QuantParam> quantParams;
137
+ if (!_qnnApi->getTensorQuantParams(&tensor, quantParams)) {
138
+ quantParams.emplace_back(0, 0);
139
+ }
140
+
141
+ std::shared_ptr<QnnUtils::Tensor> tensorUtil =
142
+ std::shared_ptr<QnnUtils::Tensor>(new (std::nothrow) QnnUtils::Tensor);
143
+ tensorUtil->dims = dims;
144
+ tensorUtil->dtype = QNN_TENSOR_GET_DATA_TYPE(tensor);
145
+ tensorUtil->quantParam = quantParams;
146
+ _inputSpecs[graphName][tensorName] = tensorUtil;
147
+ }
148
+
149
+ Qnn_Tensor_t* tensor_bank = nullptr;
150
+ std::unordered_map<std::string, void*> tensor_ptr_map;
151
+ if (true != _ioTensor->setupInputTensors(&tensor_bank,
152
+ tensor_ptr_map,
153
+ *graphInfo,
154
+ inputTensorsSize,
155
+ _qnnApi->getContexts()[graphIdx],
156
+ false)) {
157
+ QNN_ERROR("Qnn-Gpu-Model : Error in setting up Input Tensors for graph %s",
158
+ graphName.c_str());
159
+ return false;
160
+ }
161
+
162
+ _inputTensors[graphName] = tensor_bank;
163
+ for (auto& [tensorName, tensor_ptr] : tensor_ptr_map) {
164
+ _inputSpecs[graphName][tensorName]->tensor = (Qnn_Tensor_t*)tensor_ptr;
165
+ }
166
+ __DEBUG("Qnn-Gpu-Model : Input Tensor Allocated for {}", graphName);
167
+ }
168
+
169
+ // Setup Outputs
170
+ {
171
+ std::unordered_map<std::string, size_t> outputTensorsSize;
172
+ std::unordered_map<std::string, Qnn_Tensor_t*> sharedTensorMap;
173
+ for (size_t tensorIdx = 0; tensorIdx < graphInfo->numOutputTensors; tensorIdx++) {
174
+ std::string tensorName;
175
+ std::vector<size_t> tensorDims;
176
+
177
+ auto& tensor = graphInfo->outputTensors[tensorIdx];
178
+ _qnnApi->getTensorNameAndShape(tensorName, tensorDims, tensor);
179
+
180
+ if (tensorName.starts_with("past_")) {
181
+ std::string tensorInName = tensorName.substr(0, tensorName.size() - 3) + "in";
182
+ sharedTensorMap[tensorName] = _inputSpecs[graphName][tensorInName]->tensor;
183
+
184
+ // Update Gpu _kvCache
185
+ auto [type, layer_id] = parseKVTensorName(tensorName);
186
+ _kvCache.push_back(
187
+ GpuKVCache((type == 1), layer_id, _inputSpecs[graphName][tensorInName].get()));
188
+ }
189
+ std::vector<QnnUtils::QuantParam> quantParams;
190
+ if (!_qnnApi->getTensorQuantParams(&tensor, quantParams)) {
191
+ quantParams.emplace_back(0, 0);
192
+ }
193
+
194
+ auto dims = QnnUtils::Dims(tensorDims);
195
+ outputTensorsSize[tensorName] = dims.getAlignedSize();
196
+
197
+ __DEBUG("Qnn-Gpu-Model : Output Tensor Info {} {} {} {}",
198
+ tensorIdx,
199
+ tensorName,
200
+ tensorDims,
201
+ outputTensorsSize[tensorName]);
202
+ std::shared_ptr<QnnUtils::Tensor> tensorUtil =
203
+ std::shared_ptr<QnnUtils::Tensor>(new (std::nothrow) QnnUtils::Tensor);
204
+ tensorUtil->dims = dims;
205
+ tensorUtil->dtype = QNN_TENSOR_GET_DATA_TYPE(tensor);
206
+ tensorUtil->quantParam = quantParams;
207
+ _outputSpecs[graphName][tensorName] = tensorUtil;
208
+ }
209
+
210
+ Qnn_Tensor_t* tensor_bank = nullptr;
211
+ std::unordered_map<std::string, void*> tensor_ptr_map;
212
+ if (_ioTensor->getBufferAllocType() == BufferAlloc::DMABUF) {
213
+ if (true != _ioTensor->setupOutputWithSharedTensors(&tensor_bank,
214
+ tensor_ptr_map,
215
+ *graphInfo,
216
+ outputTensorsSize,
217
+ _qnnApi->getContexts()[graphIdx],
218
+ sharedTensorMap)) {
219
+ QNN_ERROR("Qnn-Gpu-Model : Error in setting up Output Tensors for graph %s",
220
+ graphName.c_str());
221
+ return false;
222
+ }
223
+ } else {
224
+ if (true != _ioTensor->setupOutputTensors(&tensor_bank,
225
+ tensor_ptr_map,
226
+ *graphInfo,
227
+ outputTensorsSize,
228
+ _qnnApi->getContexts()[graphIdx],
229
+ false)) {
230
+ QNN_ERROR("Qnn-Gpu-Model : Error in setting up Input Tensors for graph %s",
231
+ graphName.c_str());
232
+ return false;
233
+ }
234
+ }
235
+
236
+ _outputTensors[graphName] = tensor_bank;
237
+ for (auto& [tensorName, tensor_ptr] : tensor_ptr_map) {
238
+ _outputSpecs[graphName][tensorName]->tensor = (Qnn_Tensor_t*)tensor_ptr;
239
+ }
240
+
241
+ __DEBUG("Qnn-Gpu-Model : Output Tensor Allocated {} {}", graphName, _outputTensors.size());
242
+ }
243
+ }
244
+ auto stop = std::chrono::steady_clock::now();
245
+ return true;
246
+ }
247
+
248
+ bool QnnGpuModel::initializeTensorPointers() {
249
+ auto inputSpec = _inputSpecs[_modelOrder.back()];
250
+ auto outputSpec = _outputSpecs[_modelOrder.back()];
251
+
252
+ t_inputIds = inputSpec[INPUT_IDS].get();
253
+ t_attnMask = inputSpec[ATTN_MASK].get();
254
+ t_positionIds = inputSpec[POS_IDS].get();
255
+ t_logits = outputSpec[LOGITS].get();
256
+
257
+ auto status = !(t_inputIds == nullptr || t_attnMask == nullptr || t_positionIds == nullptr ||
258
+ t_logits == nullptr);
259
+
260
+ if (!status) {
261
+ __ERROR("Qnn-Gpu-Model : error in setting up named tensor pointers for llama.");
262
+ return false;
263
+ }
264
+ return true;
265
+ }
266
+
267
+ bool QnnGpuModel::validateModel() {
268
+ // Validating context Size.
269
+ size_t numInputs = t_inputIds->dims.getNumElements();
270
+ size_t dimMask = t_attnMask->dims.getNumElements();
271
+ size_t modelCtxSize = dimMask / numInputs;
272
+
273
+ if (modelCtxSize != _ctxSize) {
274
+ __ERROR("Qnn-Gpu-Model : Invalid Context Size {} {}.", modelCtxSize, _ctxSize);
275
+ return false;
276
+ }
277
+ return true;
278
+ }
279
+
280
+ void QnnGpuModel::setupInputTensors(const std::vector<int32_t>& tokens) {
281
+ auto start = std::chrono::steady_clock::now();
282
+
283
+ if (tokens.size() > _ctxSize) {
284
+ std::string errMsg = "Called inference with more tokens than model supports: ";
285
+ errMsg += std::to_string(tokens.size()) + " vs. " + std::to_string(_ctxSize);
286
+ throw std::runtime_error(errMsg);
287
+ }
288
+
289
+ // Setup 1. input_ids
290
+ // Index of input tokens in the embedding vocabulary
291
+ uint32_t* inputIdBuffer = (uint32_t*)getBuffer(t_inputIds);
292
+ if (inputIdBuffer) {
293
+ if (_useDmabufIo) {
294
+ _ioTensor->beforeWriteToBuffer(t_inputIds->tensor);
295
+ }
296
+ inputIdBuffer[0] = tokens[0];
297
+ if (_useDmabufIo) {
298
+ _ioTensor->afterWriteToBuffer(t_inputIds->tensor);
299
+ }
300
+ }
301
+
302
+ // Setup 2. attention_mask
303
+ // Mask to avoid performing attention of padding.
304
+ uint32_t* attnMaskBuffer = (uint32_t*)getBuffer(t_attnMask);
305
+ if (attnMaskBuffer) {
306
+ if (_useDmabufIo) {
307
+ _ioTensor->beforeWriteToBuffer(t_attnMask->tensor);
308
+ }
309
+ attnMaskBuffer[_numTokensProcessed] = 1;
310
+ if (_useDmabufIo) {
311
+ _ioTensor->afterWriteToBuffer(t_attnMask->tensor);
312
+ }
313
+ }
314
+
315
+ // Setup 3. position_ids
316
+ // Indices of positions of each input tokens in position embeddings.
317
+ uint32_t* positionIdBuffer = (uint32_t*)getBuffer(t_positionIds);
318
+ if (positionIdBuffer) {
319
+ if (_useDmabufIo) {
320
+ _ioTensor->beforeWriteToBuffer(t_positionIds->tensor);
321
+ }
322
+ positionIdBuffer[0] = (uint32_t)(_numTokensProcessed);
323
+ if (_useDmabufIo) {
324
+ _ioTensor->afterWriteToBuffer(t_positionIds->tensor);
325
+ }
326
+ }
327
+
328
+ auto stop = std::chrono::steady_clock::now();
329
+ }
330
+
331
+ template <class T1, class T2>
332
+ inline bool QnnGpuModel::executeModel(T1& input, T2& output, std::string graphName) {
333
+ bool ret = _qnnApi->graphExecute(input, output, graphName, timeLogs);
334
+ if (ret != true) {
335
+ QNN_ERROR("Qnn-Gpu-Model : Error executing inference: %d for graph %s", ret, graphName.c_str());
336
+ return false;
337
+ }
338
+ QNN_DEBUG("Qnn-Gpu-Model : Execute finished for graph %s", graphName.c_str());
339
+ return true;
340
+ }
341
+
342
+ bool QnnGpuModel::runInferenceHelper(std::vector<std::string>& exec_models,
343
+ int32_t* wait_time_total,
344
+ int32_t* exec_time_total,
345
+ bool pipeline_kv_update,
346
+ size_t update_size) {
347
+ int32_t exec_time = 0;
348
+ int32_t wait_time = 0;
349
+ for (auto& graphName : exec_models) {
350
+ {
351
+ auto start_time = std::chrono::steady_clock::now();
352
+ Qnn_Tensor_t* inputTensors;
353
+ Qnn_Tensor_t* outputTensors;
354
+ try {
355
+ inputTensors = _inputTensors[graphName];
356
+ outputTensors = _outputTensors[graphName];
357
+ } catch (std::exception e) {
358
+ __DEBUG("Qnn-Gpu-Model : Could not find tensors %s", graphName.c_str());
359
+ return false;
360
+ }
361
+ bool status = executeModel(inputTensors, outputTensors, graphName);
362
+ if (!status) {
363
+ return false;
364
+ }
365
+ auto end_time = std::chrono::steady_clock::now();
366
+ exec_time += static_cast<int32_t>(
367
+ std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count());
368
+ }
369
+ }
370
+
371
+ *exec_time_total += exec_time;
372
+ *wait_time_total += wait_time;
373
+ return true;
374
+ }
375
+
376
+ size_t QnnGpuModel::runInference(const std::vector<int32_t>& tokens,
377
+ std::vector<float>& logits,
378
+ bool logits_all) {
379
+ auto start = std::chrono::steady_clock::now();
380
+ int32_t totalWaitTime = 0;
381
+ int32_t totalExecTime = 0;
382
+
383
+ // Setup inputs for inference
384
+ auto& execModels = _modelOrder;
385
+ int numIters = tokens.size();
386
+ for (int i = 0; i < numIters; i++) {
387
+ if (numIters > 1) {
388
+ __DEBUG("Qnn-Gpu-Model : Prompt Processing {} of {} tokens", i + 1, numIters);
389
+ } else {
390
+ __DEBUG("Qnn-Gpu-Model : Token Generation {} of {} tokens", i + 1, numIters);
391
+ }
392
+ std::vector<int32_t> curr_tokens;
393
+ curr_tokens.push_back(tokens[i]);
394
+ setupInputTensors(curr_tokens);
395
+ bool status =
396
+ runInferenceHelper(execModels, &totalWaitTime, &totalExecTime, false, tokens.size());
397
+ if (!status) {
398
+ return 0;
399
+ }
400
+ processLogits(logits, logits_all);
401
+
402
+ // Update the numProcessTokens to updated with Accepted Tokens.
403
+ _numTokensProcessed++;
404
+ }
405
+
406
+ auto stop = std::chrono::steady_clock::now();
407
+ timeLogs["Run Inference (cpp) "].first += static_cast<double>(
408
+ std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count());
409
+ timeLogs["Run Inference (cpp) "].second++;
410
+ QNN_DEBUG("[TIME] Wait[%d] Exec[%d]\n", totalWaitTime, totalExecTime);
411
+ if (!logits_all) {
412
+ return 1;
413
+ }
414
+ return tokens.size();
415
+ }
416
+
417
+ // Parse KV$ Tensor names here - supports past_{key,value}_{layer_idx}[_h0]_{in,out}
418
+ std::tuple<int, int> QnnGpuModel::parseKVTensorName(std::string name) {
419
+ if (!name.starts_with("past_")) return {0, 0};
420
+
421
+ const bool is_key = name.starts_with("past_key");
422
+ const size_t pos0 = (is_key) ? 9 : 11; // "past_key_" OR "past_value_"
423
+ const size_t pos1 = name.find('_', pos0);
424
+
425
+ int layer_idx = static_cast<int>(std::stoi(name.substr(pos0, pos1 - pos0)));
426
+
427
+ return std::make_tuple(is_key ? 1 : 2, layer_idx);
428
+ }
429
+
430
+ size_t QnnGpuModel::loadKVCache(const std::string& load_path) {
431
+ std::ifstream fs(load_path, std::ios::in | std::ios::binary);
432
+ if (fs.fail()) {
433
+ __ERROR("Qnn-Gpu-Model : loadKVCache errror reading file {}", load_path);
434
+ return 0;
435
+ }
436
+
437
+ CacheFileSpec spec;
438
+ fs.read((char*)&spec, sizeof(spec));
439
+ if (spec.magic != g_magicNum) {
440
+ __ERROR("Qnn-Gpu-Model : loadKVCache expected {} found {:#x}", g_magicNum, spec.magic);
441
+ return 0;
442
+ }
443
+
444
+ // clang-format off
445
+ __INFO("Qnn-Gpu-Model : loadKVCache {{ num_tensors {}, magic {}, dtype {}, n_heads {}, embed_dim {} update_size {} }}",
446
+ spec.num_tensors, spec.magic, int(spec.dtype), spec.n_heads, spec.embed_dim, spec.update_size); fflush(stdout);
447
+ // clang-format on
448
+
449
+ _numTokensProcessed = static_cast<size_t>(spec.update_size);
450
+ if (_numTokensProcessed > 0) {
451
+ // Loop over _kvCache tensor and read from file
452
+ for (auto cache : _kvCache) {
453
+ if (_useDmabufIo) {
454
+ _ioTensor->beforeWriteToBuffer(t_inputIds->tensor);
455
+ }
456
+ char* buffer = (char*)getBuffer(cache.tensorUtil);
457
+ if (cache.isKey) {
458
+ // Kye Cache Dims [1, num_heads, head_dim, ctx_size]
459
+ // float16 bits equivalent to uint16_t
460
+ const size_t copySize = _numTokensProcessed;
461
+ const size_t skipSize = _ctxSize;
462
+ for (int i = 0; i < _numHeads; i++) {
463
+ for (int j = 0; j < _headDim; j++) {
464
+ fs.read(buffer, copySize * sizeof(uint16_t));
465
+ buffer += skipSize * sizeof(uint16_t);
466
+ }
467
+ }
468
+ } else {
469
+ // Kye Cache Dims [1, num_heads, ctx_size, head_dim]
470
+ // float16 bits equivalent to uint16_t
471
+ const size_t copySize = _numTokensProcessed * _headDim;
472
+ const size_t skipSize = _ctxSize * _headDim;
473
+ for (int i = 0; i < _numHeads; i++) {
474
+ fs.read(buffer, copySize * sizeof(uint16_t));
475
+ buffer += skipSize * sizeof(uint16_t);
476
+ }
477
+ }
478
+ if (_useDmabufIo) {
479
+ _ioTensor->afterWriteToBuffer(t_inputIds->tensor);
480
+ }
481
+ }
482
+ }
483
+ return _numTokensProcessed;
484
+ }
485
+
486
+ bool QnnGpuModel::saveKVCache(const std::string& save_path) {
487
+ std::ofstream fs(save_path, std::ios::out | std::ios::binary);
488
+ if (fs.fail()) {
489
+ __ERROR("Qnn-Gpu-Model : saveKVCache error opening file : {}", save_path);
490
+ throw std::runtime_error("Failed to write to cache file. Please re-check path");
491
+ }
492
+
493
+ const CacheFileSpec::DataType dtype = CacheFileSpec::DataType::FLOAT16_T;
494
+
495
+ uint32_t numKVTensors = _kvCache.size();
496
+
497
+ // Save the cache file metadata
498
+ CacheFileSpec file_spec(
499
+ numKVTensors, g_magicNum, dtype, 0x0, _numHeads, _headDim, _numTokensProcessed);
500
+ fs.write((char*)&file_spec, sizeof(file_spec));
501
+
502
+ // clang-format off
503
+ __INFO("Qnn-Gpu-Model : saveKVCache {{ num_tensors {}, magic {}, dtype {}, n_heads {}, embed_dim {} update_size {} }}",
504
+ numKVTensors, g_magicNum, int(dtype), _numHeads, _headDim, _numTokensProcessed); fflush(stdout);
505
+ // clang-format on
506
+
507
+ if (_numTokensProcessed > 0) {
508
+ // Loop over _kvCache tensor and write to file
509
+ for (auto cache : _kvCache) {
510
+ if (_useDmabufIo) {
511
+ _ioTensor->beforeReadFromBuffer(t_inputIds->tensor);
512
+ }
513
+ char* buffer = (char*)getBuffer(cache.tensorUtil);
514
+ if (cache.isKey) {
515
+ // Kye Cache Dims [1, num_heads, head_dim, ctx_size]
516
+ // float16 bits equivalent to uint16_t
517
+ const size_t copySize = _numTokensProcessed;
518
+ const size_t skipSize = _ctxSize;
519
+ for (int i = 0; i < _numHeads; i++) {
520
+ for (int j = 0; j < _headDim; j++) {
521
+ fs.write((char*)buffer, copySize * sizeof(uint16_t));
522
+ buffer += skipSize * sizeof(uint16_t);
523
+ }
524
+ }
525
+ } else {
526
+ // Kye Cache Dims [1, num_heads, ctx_size, head_dim]
527
+ // float16 bits equivalent to uint16_t
528
+ const size_t copySize = _numTokensProcessed * _headDim;
529
+ const size_t skipSize = _ctxSize * _headDim;
530
+ for (int i = 0; i < _numHeads; i++) {
531
+ fs.write((char*)buffer, copySize * sizeof(uint16_t));
532
+ buffer += skipSize;
533
+ }
534
+ }
535
+ if (_useDmabufIo) {
536
+ _ioTensor->afterReadFromBuffer(t_inputIds->tensor);
537
+ }
538
+ }
539
+ }
540
+ fs.flush();
541
+ fs.close();
542
+
543
+ return true;
544
+ }
545
+
546
+ size_t QnnGpuModel::processLogits(std::vector<float>& logits, bool logits_all) {
547
+ auto logitsSpec = _outputSpecs[_modelOrder.back()][LOGITS].get();
548
+ size_t logitsSize = getNumElements(logitsSpec);
549
+ if (_useDmabufIo) {
550
+ _ioTensor->beforeReadFromBuffer(t_inputIds->tensor);
551
+ }
552
+ uint16_t* logitBuf = (uint16_t*)getBuffer(logitsSpec);
553
+
554
+ if (!logits_all) {
555
+ logits.clear();
556
+ }
557
+ size_t allocateSize = logits.size() + logitsSize;
558
+ logits.reserve(allocateSize);
559
+ for (auto i = 0; i < logitsSize; ++i) {
560
+ logits.push_back(fp16_ieee_to_fp32_value(logitBuf[i]));
561
+ }
562
+ if (_useDmabufIo) {
563
+ _ioTensor->afterReadFromBuffer(t_inputIds->tensor);
564
+ }
565
+
566
+ return logits.size() / logitsSize;
567
+ }
568
+
569
+ bool QnnGpuModel::reset() {
570
+ // Reset Token Counter
571
+ _numTokensProcessed = 0;
572
+
573
+ // Reset Attention Mask
574
+ uint32_t* attnMaskBuffer = (uint32_t*)getBuffer(t_attnMask);
575
+ uint32_t attnMaskSize = getBufferSize(t_attnMask);
576
+ if (attnMaskBuffer) {
577
+ if (_useDmabufIo) {
578
+ _ioTensor->beforeWriteToBuffer(t_attnMask->tensor);
579
+ }
580
+ memset(attnMaskBuffer, 0, attnMaskSize);
581
+ if (_useDmabufIo) {
582
+ _ioTensor->afterWriteToBuffer(t_attnMask->tensor);
583
+ }
584
+ }
585
+
586
+ // Reset KV Cache.
587
+ // TODO : Check if mask_neg -100 is enough to remove
588
+ // effect of KV Cache. Test with mask_neg = -float_inf
589
+ for (auto cache : _kvCache) {
590
+ if (_useDmabufIo) {
591
+ _ioTensor->beforeWriteToBuffer(t_inputIds->tensor);
592
+ }
593
+ char* buffer = (char*)getBuffer(cache.tensorUtil);
594
+ uint32_t bufferSize = getBufferSize(cache.tensorUtil);
595
+ memset(buffer, 0, bufferSize);
596
+ if (_useDmabufIo) {
597
+ _ioTensor->afterWriteToBuffer(t_inputIds->tensor);
598
+ }
599
+ }
600
+ return true;
601
+ }
602
+
603
+ } // namespace qualla
Genie/Genie/src/qualla/engines/qnn-gpu/gpu-model.hpp ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
2
+ // Confidential & Proprietary - Qualcomm Technologies, Inc. ("QTI")
3
+
4
+ #ifndef __QUALLA_QNN_GPU_MODEL_H_
5
+ #define __QUALLA_QNN_GPU_MODEL_H_
6
+
7
+ #include <atomic>
8
+ #include <filesystem>
9
+ #include <string>
10
+ #include <vector>
11
+
12
+ #include "IOTensor.hpp"
13
+ #include "QnnApi.hpp"
14
+ #include "qnn-utils.hpp"
15
+ #include "qualla/env.hpp"
16
+
17
+ namespace qualla {
18
+
19
+ // Maintain a list of named tensors for
20
+ static std::string INPUT_IDS = "input_ids";
21
+ static std::string ATTN_MASK = "attention_mask";
22
+ static std::string LOGITS = "logits";
23
+ static std::string POS_IDS = "position_ids";
24
+
25
+ class QnnGpuModel {
26
+ public:
27
+ struct Params {
28
+ std::filesystem::path model_basedir;
29
+ std::vector<std::string> model_list; // model filenames
30
+ uint32_t ctx_size;
31
+ uint32_t num_heads;
32
+ uint32_t head_dim;
33
+ };
34
+
35
+ struct GpuKVCache {
36
+ bool isKey;
37
+ uint32_t tensorId;
38
+ QnnUtils::Tensor* tensorUtil;
39
+
40
+ GpuKVCache() {
41
+ isKey = false;
42
+ tensorUtil = nullptr;
43
+ tensorId = 0;
44
+ }
45
+ GpuKVCache(bool _isKey, uint32_t _tensorId, QnnUtils::Tensor* _tensorUtil)
46
+ : isKey(_isKey), tensorId(_tensorId), tensorUtil(_tensorUtil) {}
47
+ };
48
+
49
+ // QNN specific variables
50
+ std::unique_ptr<QnnApi> _qnnApi;
51
+ std::unique_ptr<IOTensor> _ioTensor{nullptr};
52
+
53
+ // Model Location Storage
54
+ const std::filesystem::path _modelBaseDir;
55
+ std::vector<std::string> _modelList;
56
+ std::vector<std::string> _modelOrder;
57
+
58
+ bool _useDmabufIo;
59
+
60
+ // Model parameters
61
+ uint32_t _ctxSize{0};
62
+ uint32_t _numHeads{0};
63
+ uint32_t _headDim{0};
64
+
65
+ // Information regarding model execution settings and last inference
66
+
67
+ // Model specific variables
68
+ uint32_t _numGraphs;
69
+ // I/O Tensor Informations
70
+ std::unordered_map<std::string, Qnn_Tensor_t*> _inputTensors;
71
+ std::unordered_map<std::string,
72
+ std::unordered_map<std::string, std::shared_ptr<QnnUtils::Tensor>>>
73
+ _inputSpecs;
74
+
75
+ std::unordered_map<std::string, Qnn_Tensor_t*> _outputTensors;
76
+ std::unordered_map<std::string,
77
+ std::unordered_map<std::string, std::shared_ptr<QnnUtils::Tensor>>>
78
+ _outputSpecs;
79
+
80
+ // Store some pointers for easier access
81
+ QnnUtils::Tensor* t_inputIds{nullptr};
82
+ QnnUtils::Tensor* t_attnMask{nullptr};
83
+ QnnUtils::Tensor* t_positionIds{nullptr};
84
+ QnnUtils::Tensor* t_logits{nullptr};
85
+
86
+ // _numTokensProcessed defines number of population of kvcache
87
+ size_t _numTokensProcessed{0};
88
+
89
+ std::vector<GpuKVCache> _kvCache;
90
+
91
+ std::map<std::string, std::pair<double, uint16_t>> timeLogs;
92
+
93
+ // Model Constructor
94
+ QnnGpuModel(Env& env, const Params& params);
95
+ ~QnnGpuModel();
96
+
97
+ bool initializeModel(void);
98
+ bool initializeIOTensors(void);
99
+ void setupInputTensors(const std::vector<int32_t>& tokens);
100
+ bool initializeTensorPointers();
101
+ bool validateModel();
102
+
103
+ template <class T1, class T2>
104
+ inline bool executeModel(T1& input, T2& output, std::string graph_name);
105
+
106
+ size_t runInference(const std::vector<int32_t>& tokens,
107
+ std::vector<float>& logits,
108
+ bool logits_all = false);
109
+
110
+ size_t loadKVCache(const std::string& save_path);
111
+ bool saveKVCache(const std::string& load_path);
112
+ bool reset();
113
+
114
+ private:
115
+ Env& _env;
116
+ // Internal functions to separate different runInference logic
117
+ bool runInferenceHelper(std::vector<std::string>& exec_models,
118
+ int32_t* wait_time_total,
119
+ int32_t* exec_time_total,
120
+ bool pipeline_kv_update,
121
+ size_t update_size);
122
+ size_t processLogits(std::vector<float>& logits, bool logits_all);
123
+ inline void* getBuffer(QnnUtils::Tensor& spec) { return _ioTensor->getBuffer(spec.tensor); }
124
+ inline void* getBuffer(QnnUtils::Tensor* spec) { return _ioTensor->getBuffer(spec->tensor); }
125
+ inline size_t getBufferSize(QnnUtils::Tensor& spec) { return spec.dims.getSize(); }
126
+ inline size_t getBufferSize(QnnUtils::Tensor* spec) { return spec->dims.getSize(); }
127
+ inline size_t getNumElements(QnnUtils::Tensor& spec) { return spec.dims.getNumElements(); }
128
+ inline size_t getNumElements(QnnUtils::Tensor* spec) { return spec->dims.getNumElements(); }
129
+
130
+ // Parse KV$ Tensor names here - supports past_{key,value}_{layer_idx}[_h0]_{in,out}
131
+ std::tuple<int, int> parseKVTensorName(std::string name);
132
+ };
133
+
134
+ } // namespace qualla
135
+
136
+ #endif
Genie/Genie/src/qualla/engines/qnn-htp.cpp CHANGED
@@ -353,11 +353,11 @@ qualla::InputType NspEngine::getInputType(){
353
  return _model->m_inputType;
354
  }
355
 
356
- size_t NspEngine::restore(const std::string& name) {
357
  if (!_model && !load()) return 0;
358
 
359
  fs::path cache_path = std::filesystem::path(name) / fmt::format("kv-cache.{}.qnn-htp", _role);
360
- return _model->loadKVCache(cache_path.string());
361
  }
362
 
363
  bool NspEngine::save(const std::string& name) {
 
353
  return _model->m_inputType;
354
  }
355
 
356
+ size_t NspEngine::restore(const std::string& name, bool chooseHigherVariant) {
357
  if (!_model && !load()) return 0;
358
 
359
  fs::path cache_path = std::filesystem::path(name) / fmt::format("kv-cache.{}.qnn-htp", _role);
360
+ return _model->loadKVCache(cache_path.string(), chooseHigherVariant);
361
  }
362
 
363
  bool NspEngine::save(const std::string& name) {
Genie/Genie/src/qualla/engines/qnn-htp.hpp CHANGED
@@ -70,7 +70,7 @@ class NspEngine : public Engine {
70
  virtual bool updateKV(size_t n_past) override;
71
  virtual bool updateKV(size_t n_past, const std::vector<bool>& selected) override;
72
  virtual bool save(const std::string& name) override;
73
- virtual size_t restore(const std::string& name) override;
74
  virtual void reset() override;
75
 
76
  virtual bool set(qualla::json data) override;
 
70
  virtual bool updateKV(size_t n_past) override;
71
  virtual bool updateKV(size_t n_past, const std::vector<bool>& selected) override;
72
  virtual bool save(const std::string& name) override;
73
+ virtual size_t restore(const std::string& name, bool chooseHigherVariant) override;
74
  virtual void reset() override;
75
 
76
  virtual bool set(qualla::json data) override;
Genie/Genie/src/qualla/engines/qnn-htp/nsp-kvmanager.cpp CHANGED
@@ -338,7 +338,7 @@ bool NewNSPKVManager::registerPointerOffset() {
338
  return true;
339
  }
340
 
341
- bool NewNSPKVManager::updateState() {
342
  // clang-format off
343
  __TRACE("qnn-kv : graph[{}] updateState to AR-{}(n_past={}, ptr={})", _mgr_idx,
344
  _req_state.variant, _req_state.n_past, _req_state.ptr_offset);
@@ -354,9 +354,15 @@ bool NewNSPKVManager::updateState() {
354
  cache.output_buffer += cache.is_key ? _n_ctx * _bw : _n_ctx * _n_embed * _bw;
355
  }
356
  }
357
-
358
  _cur_state = _req_state;
 
 
359
  _counter = _callback_fn(_mgr_idx);
 
 
 
 
 
360
  return true;
361
  }
362
 
@@ -525,7 +531,7 @@ bool NewNSPKVManager::loadCache(
525
  }
526
 
527
  _req_state = {variant, n_valid, 0};
528
- updateState();
529
 
530
  return true;
531
  }
 
338
  return true;
339
  }
340
 
341
+ void NewNSPKVManager::updateKVCache(){
342
  // clang-format off
343
  __TRACE("qnn-kv : graph[{}] updateState to AR-{}(n_past={}, ptr={})", _mgr_idx,
344
  _req_state.variant, _req_state.n_past, _req_state.ptr_offset);
 
354
  cache.output_buffer += cache.is_key ? _n_ctx * _bw : _n_ctx * _n_embed * _bw;
355
  }
356
  }
 
357
  _cur_state = _req_state;
358
+ }
359
+ void NewNSPKVManager::updateKVDispatcher(){
360
  _counter = _callback_fn(_mgr_idx);
361
+ }
362
+
363
+ bool NewNSPKVManager::updateState() {
364
+ updateKVCache();
365
+ updateKVDispatcher();
366
  return true;
367
  }
368
 
 
531
  }
532
 
533
  _req_state = {variant, n_valid, 0};
534
+ updateKVCache();
535
 
536
  return true;
537
  }
Genie/Genie/src/qualla/engines/qnn-htp/nsp-kvmanager.hpp CHANGED
@@ -134,7 +134,8 @@ class NewNSPKVManager {
134
  int32_t n_heads
135
  );
136
  bool dumpCache(std::ofstream* fs, bool is_key, int32_t n_valid, int32_t n_heads);
137
-
 
138
  bool updateState();
139
  void runKVUpdateJob(int thread_idx); // Worker thread function
140
  void setTensorAllocInfo(std::map<std::string, std::pair<int, size_t>>* alloc_info) {
 
134
  int32_t n_heads
135
  );
136
  bool dumpCache(std::ofstream* fs, bool is_key, int32_t n_valid, int32_t n_heads);
137
+ void updateKVCache();
138
+ void updateKVDispatcher();
139
  bool updateState();
140
  void runKVUpdateJob(int thread_idx); // Worker thread function
141
  void setTensorAllocInfo(std::map<std::string, std::pair<int, size_t>>* alloc_info) {
Genie/Genie/src/qualla/engines/qnn-htp/nsp-model.cpp CHANGED
@@ -1960,6 +1960,9 @@ bool QnnNspModel::calculate_rope_embeddings(void) {
1960
  const size_t nmemb = m_ctx_size * m_pos_dim;
1961
  const int pos_bw = d_pos.bw();
1962
 
 
 
 
1963
  rope_sin = malloc(nmemb * pos_bw);
1964
  rope_cos = malloc(nmemb * pos_bw);
1965
 
@@ -1973,7 +1976,7 @@ bool QnnNspModel::calculate_rope_embeddings(void) {
1973
  std::vector<double> inv_freq(m_pos_dim);
1974
  const double exponent = 1.0 / static_cast<double>(m_pos_dim);
1975
  for (int j = 0; j < m_pos_dim; j++)
1976
- inv_freq[j] = 1.0 / pow(rope_theta, j * exponent);
1977
  double attention_factor = 1.0;
1978
  if (rope_scaling.rope_type == RopeScalingParams::ROPE_LLAMA3) {
1979
  // Implemented from HuggingFace
@@ -1991,7 +1994,7 @@ bool QnnNspModel::calculate_rope_embeddings(void) {
1991
  if (wavelen < high_freq_wavelen) // wavelen < high_freq_wavelen: do nothing
1992
  continue;
1993
  else if (wavelen > low_freq_wavelen) // wavelen > low_freq_wavelen: divide by factor
1994
- inv_freq[j] = 1.0 / static_cast<double>(factor * pow(rope_theta, j * exponent));
1995
  else { // otherwise: interpolate between the two, using a smooth factor
1996
  assert(low_freq_wavelen != high_freq_wavelen);
1997
  const double smooth =
@@ -2266,7 +2269,7 @@ void QnnNspModel::dumpTensorSpecs() {
2266
  }
2267
  }
2268
 
2269
- size_t QnnNspModel::loadKVCache(const std::string& load_path) {
2270
 
2271
  if(m_disableKvCache){
2272
  __ERROR("KV cache is disabled, loading KV cache is not allowed");
@@ -2308,7 +2311,8 @@ size_t QnnNspModel::loadKVCache(const std::string& load_path) {
2308
  // clang-format on
2309
 
2310
  const int32_t n_valid = static_cast<int32_t>(spec.update_size);
2311
- const int32_t variant = nsp_graph_count.begin()->first; // Set KVManager to smallest variant
 
2312
  _kv_dispatcher->setVariant(variant);
2313
 
2314
  // Lock, load KeyCache then ValueCache, unlock
 
1960
  const size_t nmemb = m_ctx_size * m_pos_dim;
1961
  const int pos_bw = d_pos.bw();
1962
 
1963
+ const double theta = m_positional_encoding.rope_params.theta;
1964
+ const RopeScalingParams& rope_scaling = m_positional_encoding.rope_params.rope_scaling;
1965
+
1966
  rope_sin = malloc(nmemb * pos_bw);
1967
  rope_cos = malloc(nmemb * pos_bw);
1968
 
 
1976
  std::vector<double> inv_freq(m_pos_dim);
1977
  const double exponent = 1.0 / static_cast<double>(m_pos_dim);
1978
  for (int j = 0; j < m_pos_dim; j++)
1979
+ inv_freq[j] = 1.0 / pow(theta, j * exponent);
1980
  double attention_factor = 1.0;
1981
  if (rope_scaling.rope_type == RopeScalingParams::ROPE_LLAMA3) {
1982
  // Implemented from HuggingFace
 
1994
  if (wavelen < high_freq_wavelen) // wavelen < high_freq_wavelen: do nothing
1995
  continue;
1996
  else if (wavelen > low_freq_wavelen) // wavelen > low_freq_wavelen: divide by factor
1997
+ inv_freq[j] = 1.0 / static_cast<double>(factor * pow(theta, j * exponent));
1998
  else { // otherwise: interpolate between the two, using a smooth factor
1999
  assert(low_freq_wavelen != high_freq_wavelen);
2000
  const double smooth =
 
2269
  }
2270
  }
2271
 
2272
+ size_t QnnNspModel::loadKVCache(const std::string& load_path, bool chooseHigherVariant) {
2273
 
2274
  if(m_disableKvCache){
2275
  __ERROR("KV cache is disabled, loading KV cache is not allowed");
 
2311
  // clang-format on
2312
 
2313
  const int32_t n_valid = static_cast<int32_t>(spec.update_size);
2314
+ int32_t variant = nsp_graph_count.begin()->first; // Set KVManager to smallest variant
2315
+ if(chooseHigherVariant) variant = nsp_graph_count.rbegin()->first; // Ideal for loading KV prefix cache
2316
  _kv_dispatcher->setVariant(variant);
2317
 
2318
  // Lock, load KeyCache then ValueCache, unlock
Genie/Genie/src/qualla/engines/qnn-htp/nsp-model.hpp CHANGED
@@ -54,14 +54,14 @@ struct RopeScalingParams {
54
  double low_freq_factor;
55
  double high_freq_factor;
56
  int original_max_position_embeddings;
57
- } llama3_params;
58
 
59
  struct {
60
  double factor;
61
  std::vector<double> long_factor;
62
  std::vector<double> short_factor;
63
  int original_max_position_embeddings;
64
- } longrope_params;
65
 
66
  RopeScalingParams() {}
67
  };
@@ -79,7 +79,7 @@ struct PositionalEncoding {
79
  int32_t dims;
80
  double theta;
81
  RopeScalingParams rope_scaling;
82
- } rope_params;
83
 
84
  PositionalEncoding() { type = ROPE; }
85
  };
@@ -265,10 +265,8 @@ class QnnNspModel {
265
  QnnUtils::Tensor* t_position_ids{nullptr};
266
  // PositionalEncodingType::ROPE variables
267
  int32_t m_pos_dim{-1}; // Dimension of positional embedding tensor (incl partial_factor)
268
- double rope_theta{10000.0}; // Base theta parameter for RoPE calculations
269
  void* rope_sin{nullptr}; // Pre-calculated RoPE sin table of size [ctx_size, m_pos_dim]
270
  void* rope_cos{nullptr}; // Pre-calculated RoPE cos table of size [ctx_size, m_pos_dim]
271
- RopeScalingParams rope_scaling; // RoPE scaling parameters
272
 
273
  QnnUtils::Tensor* t_position_ids_sin{nullptr};
274
  QnnUtils::Tensor* t_position_ids_cos{nullptr};
@@ -398,7 +396,7 @@ class QnnNspModel {
398
 
399
  bool debugOutputs(QnnUtils::Tensor* outTensor, std::string& outTensorName);
400
 
401
- size_t loadKVCache(const std::string& load_path);
402
  bool saveKVCache(const std::string& save_path);
403
  bool applyLoraStrength(const std::string& alpha_tensor_name, const float alpha_val);
404
  bool applyLoraAdapter(const std::string& lora_adapter_name);
 
54
  double low_freq_factor;
55
  double high_freq_factor;
56
  int original_max_position_embeddings;
57
+ } llama3_params {0};
58
 
59
  struct {
60
  double factor;
61
  std::vector<double> long_factor;
62
  std::vector<double> short_factor;
63
  int original_max_position_embeddings;
64
+ } longrope_params {0};
65
 
66
  RopeScalingParams() {}
67
  };
 
79
  int32_t dims;
80
  double theta;
81
  RopeScalingParams rope_scaling;
82
+ } rope_params {0};
83
 
84
  PositionalEncoding() { type = ROPE; }
85
  };
 
265
  QnnUtils::Tensor* t_position_ids{nullptr};
266
  // PositionalEncodingType::ROPE variables
267
  int32_t m_pos_dim{-1}; // Dimension of positional embedding tensor (incl partial_factor)
 
268
  void* rope_sin{nullptr}; // Pre-calculated RoPE sin table of size [ctx_size, m_pos_dim]
269
  void* rope_cos{nullptr}; // Pre-calculated RoPE cos table of size [ctx_size, m_pos_dim]
 
270
 
271
  QnnUtils::Tensor* t_position_ids_sin{nullptr};
272
  QnnUtils::Tensor* t_position_ids_cos{nullptr};
 
396
 
397
  bool debugOutputs(QnnUtils::Tensor* outTensor, std::string& outTensorName);
398
 
399
+ size_t loadKVCache(const std::string& load_path, bool chooseHigherVariant=false);
400
  bool saveKVCache(const std::string& save_path);
401
  bool applyLoraStrength(const std::string& alpha_tensor_name, const float alpha_val);
402
  bool applyLoraAdapter(const std::string& lora_adapter_name);
Genie/Genie/src/qualla/include/qualla/detail/basic-sampler.hpp CHANGED
@@ -39,6 +39,7 @@ class BasicSampler : public Sampler {
39
  virtual bool save(const std::string& name) override;
40
  virtual bool restore(const std::string& name) override;
41
  virtual void reset() override;
 
42
 
43
  protected:
44
  int32_t _process(std::span<const float> logits, std::vector<float>* probs_out, bool samp_tok);
 
39
  virtual bool save(const std::string& name) override;
40
  virtual bool restore(const std::string& name) override;
41
  virtual void reset() override;
42
+ virtual void applyConfig(const qualla::json& conf) override;
43
 
44
  protected:
45
  int32_t _process(std::span<const float> logits, std::vector<float>* probs_out, bool samp_tok);
Genie/Genie/src/qualla/include/qualla/dialog.hpp CHANGED
@@ -107,6 +107,7 @@ class Dialog : public State {
107
  Tokenizer& tokenizer() { return *_tokenizer; }
108
  Sampler& sampler(const std::string& role = "primary") { return *_sampler[role]; }
109
  Engine& engine(const std::string& role = "primary") { return *_engine[role]; }
 
110
 
111
  // Get latest KPIs.
112
  // Updates TPS, etc as needed.
 
107
  Tokenizer& tokenizer() { return *_tokenizer; }
108
  Sampler& sampler(const std::string& role = "primary") { return *_sampler[role]; }
109
  Engine& engine(const std::string& role = "primary") { return *_engine[role]; }
110
+ bool isSamplerPresent(std::string role) { return _sampler.find(role) != _sampler.end(); }
111
 
112
  // Get latest KPIs.
113
  // Updates TPS, etc as needed.
Genie/Genie/src/qualla/include/qualla/engine.hpp CHANGED
@@ -86,7 +86,7 @@ class Engine : public State {
86
  QUALLA_API virtual bool updateKV(size_t n_past, const std::vector<bool>& selected);
87
 
88
  QUALLA_API virtual bool save(const std::string& name);
89
- QUALLA_API virtual size_t restore(const std::string& name);
90
  QUALLA_API virtual void reset();
91
 
92
  QUALLA_API virtual bool cacheEosEmbedding(std::vector<uint8_t>& eosEmbedding);
 
86
  QUALLA_API virtual bool updateKV(size_t n_past, const std::vector<bool>& selected);
87
 
88
  QUALLA_API virtual bool save(const std::string& name);
89
+ QUALLA_API virtual size_t restore(const std::string& name, bool chooseHigherVariant=false);
90
  QUALLA_API virtual void reset();
91
 
92
  QUALLA_API virtual bool cacheEosEmbedding(std::vector<uint8_t>& eosEmbedding);
Genie/Genie/src/qualla/include/qualla/sampler.hpp CHANGED
@@ -54,6 +54,7 @@ class Sampler : public State {
54
  QUALLA_API virtual bool save(const std::string& name);
55
  QUALLA_API virtual bool restore(const std::string& name);
56
  QUALLA_API virtual void reset();
 
57
 
58
  // Get sampler type
59
  const std::string& type() const { return _type; }
 
54
  QUALLA_API virtual bool save(const std::string& name);
55
  QUALLA_API virtual bool restore(const std::string& name);
56
  QUALLA_API virtual void reset();
57
+ QUALLA_API virtual void applyConfig(const qualla::json& conf);
58
 
59
  // Get sampler type
60
  const std::string& type() const { return _type; }
Genie/Genie/src/qualla/sampler.cpp CHANGED
@@ -84,6 +84,10 @@ std::vector<int32_t> Sampler::process_multiple(
84
  return {-1};
85
  }
86
 
 
 
 
 
87
  // Sampler registry
88
 
89
  using Registry = std::unordered_map<std::string, Sampler::Creator>;
 
84
  return {-1};
85
  }
86
 
87
+ void Sampler::applyConfig(const qualla::json& conf) {
88
+ _env.logger().warn(fmt::format("Basic sampler supports this for now"));
89
+ }
90
+
91
  // Sampler registry
92
 
93
  using Registry = std::unordered_map<std::string, Sampler::Creator>;
Genie/Genie/src/qualla/samplers/basic.cpp CHANGED
@@ -221,4 +221,12 @@ static OnLoad regy([]() {
221
 
222
  void needBasicSampler() {}
223
 
 
 
 
 
 
 
 
 
224
  } // namespace qualla
 
221
 
222
  void needBasicSampler() {}
223
 
224
+ void BasicSampler::applyConfig(const json& conf) {
225
+ if (conf.contains("seed")) _seed = conf["seed"];
226
+ if (conf.contains("temp")) _temp = conf["temp"];
227
+
228
+ if (conf.contains("top-k")) _top_k = conf["top-k"];
229
+ if (conf.contains("top-p")) _top_p = conf["top-p"];
230
+ }
231
+
232
  } // namespace qualla
Genie/Genie/src/qualla/tokenizers/rust/Cargo.lock CHANGED
@@ -31,9 +31,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
31
 
32
  [[package]]
33
  name = "cc"
34
- version = "1.1.34"
35
  source = "registry+https://github.com/rust-lang/crates.io-index"
36
- checksum = "67b9470d453346108f93a59222a9a1a5724db32d0a4727b7ab7ace4b4d822dc9"
37
  dependencies = [
38
  "shlex",
39
  ]
@@ -190,9 +190,9 @@ dependencies = [
190
 
191
  [[package]]
192
  name = "itoa"
193
- version = "1.0.11"
194
  source = "registry+https://github.com/rust-lang/crates.io-index"
195
- checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
196
 
197
  [[package]]
198
  name = "lazy_static"
@@ -202,9 +202,9 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
202
 
203
  [[package]]
204
  name = "libc"
205
- version = "0.2.161"
206
  source = "registry+https://github.com/rust-lang/crates.io-index"
207
- checksum = "8e9489c2807c139ffd9c1794f4af0ebe86a828db53ecdc7fea2111d0fed085d1"
208
 
209
  [[package]]
210
  name = "log"
@@ -322,9 +322,9 @@ dependencies = [
322
 
323
  [[package]]
324
  name = "proc-macro2"
325
- version = "1.0.89"
326
  source = "registry+https://github.com/rust-lang/crates.io-index"
327
- checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e"
328
  dependencies = [
329
  "unicode-ident",
330
  ]
@@ -413,9 +413,9 @@ dependencies = [
413
 
414
  [[package]]
415
  name = "regex-automata"
416
- version = "0.4.8"
417
  source = "registry+https://github.com/rust-lang/crates.io-index"
418
- checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3"
419
  dependencies = [
420
  "aho-corasick",
421
  "memchr",
@@ -436,18 +436,18 @@ checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
436
 
437
  [[package]]
438
  name = "serde"
439
- version = "1.0.214"
440
  source = "registry+https://github.com/rust-lang/crates.io-index"
441
- checksum = "f55c3193aca71c12ad7890f1785d2b73e1b9f63a0bbc353c08ef26fe03fc56b5"
442
  dependencies = [
443
  "serde_derive",
444
  ]
445
 
446
  [[package]]
447
  name = "serde_derive"
448
- version = "1.0.214"
449
  source = "registry+https://github.com/rust-lang/crates.io-index"
450
- checksum = "de523f781f095e28fa605cdce0f8307e451cc0fd14e2eb4cd2e98a355b147766"
451
  dependencies = [
452
  "proc-macro2",
453
  "quote",
@@ -456,9 +456,9 @@ dependencies = [
456
 
457
  [[package]]
458
  name = "serde_json"
459
- version = "1.0.132"
460
  source = "registry+https://github.com/rust-lang/crates.io-index"
461
- checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03"
462
  dependencies = [
463
  "itoa",
464
  "memchr",
@@ -498,9 +498,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
498
 
499
  [[package]]
500
  name = "syn"
501
- version = "2.0.87"
502
  source = "registry+https://github.com/rust-lang/crates.io-index"
503
- checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d"
504
  dependencies = [
505
  "proc-macro2",
506
  "quote",
@@ -509,18 +509,18 @@ dependencies = [
509
 
510
  [[package]]
511
  name = "thiserror"
512
- version = "1.0.66"
513
  source = "registry+https://github.com/rust-lang/crates.io-index"
514
- checksum = "5d171f59dbaa811dbbb1aee1e73db92ec2b122911a48e1390dfe327a821ddede"
515
  dependencies = [
516
  "thiserror-impl",
517
  ]
518
 
519
  [[package]]
520
  name = "thiserror-impl"
521
- version = "1.0.66"
522
  source = "registry+https://github.com/rust-lang/crates.io-index"
523
- checksum = "b08be0f17bd307950653ce45db00cd31200d82b624b36e181337d9c7d92765b5"
524
  dependencies = [
525
  "proc-macro2",
526
  "quote",
@@ -529,9 +529,9 @@ dependencies = [
529
 
530
  [[package]]
531
  name = "tokenizers"
532
- version = "0.20.1"
533
  source = "registry+https://github.com/rust-lang/crates.io-index"
534
- checksum = "b172ffa9a2e5c31bbddc940cd5725d933ced983a9333bbebc4c7eda3bbce1557"
535
  dependencies = [
536
  "aho-corasick",
537
  "derive_builder",
@@ -569,9 +569,9 @@ dependencies = [
569
 
570
  [[package]]
571
  name = "unicode-ident"
572
- version = "1.0.13"
573
  source = "registry+https://github.com/rust-lang/crates.io-index"
574
- checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe"
575
 
576
  [[package]]
577
  name = "unicode-normalization-alignments"
 
31
 
32
  [[package]]
33
  name = "cc"
34
+ version = "1.2.1"
35
  source = "registry+https://github.com/rust-lang/crates.io-index"
36
+ checksum = "fd9de9f2205d5ef3fd67e685b0df337994ddd4495e2a28d185500d0e1edfea47"
37
  dependencies = [
38
  "shlex",
39
  ]
 
190
 
191
  [[package]]
192
  name = "itoa"
193
+ version = "1.0.13"
194
  source = "registry+https://github.com/rust-lang/crates.io-index"
195
+ checksum = "540654e97a3f4470a492cd30ff187bc95d89557a903a2bbf112e2fae98104ef2"
196
 
197
  [[package]]
198
  name = "lazy_static"
 
202
 
203
  [[package]]
204
  name = "libc"
205
+ version = "0.2.164"
206
  source = "registry+https://github.com/rust-lang/crates.io-index"
207
+ checksum = "433bfe06b8c75da9b2e3fbea6e5329ff87748f0b144ef75306e674c3f6f7c13f"
208
 
209
  [[package]]
210
  name = "log"
 
322
 
323
  [[package]]
324
  name = "proc-macro2"
325
+ version = "1.0.91"
326
  source = "registry+https://github.com/rust-lang/crates.io-index"
327
+ checksum = "307e3004becf10f5a6e0d59d20f3cd28231b0e0827a96cd3e0ce6d14bc1e4bb3"
328
  dependencies = [
329
  "unicode-ident",
330
  ]
 
413
 
414
  [[package]]
415
  name = "regex-automata"
416
+ version = "0.4.9"
417
  source = "registry+https://github.com/rust-lang/crates.io-index"
418
+ checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
419
  dependencies = [
420
  "aho-corasick",
421
  "memchr",
 
436
 
437
  [[package]]
438
  name = "serde"
439
+ version = "1.0.215"
440
  source = "registry+https://github.com/rust-lang/crates.io-index"
441
+ checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f"
442
  dependencies = [
443
  "serde_derive",
444
  ]
445
 
446
  [[package]]
447
  name = "serde_derive"
448
+ version = "1.0.215"
449
  source = "registry+https://github.com/rust-lang/crates.io-index"
450
+ checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0"
451
  dependencies = [
452
  "proc-macro2",
453
  "quote",
 
456
 
457
  [[package]]
458
  name = "serde_json"
459
+ version = "1.0.133"
460
  source = "registry+https://github.com/rust-lang/crates.io-index"
461
+ checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377"
462
  dependencies = [
463
  "itoa",
464
  "memchr",
 
498
 
499
  [[package]]
500
  name = "syn"
501
+ version = "2.0.89"
502
  source = "registry+https://github.com/rust-lang/crates.io-index"
503
+ checksum = "44d46482f1c1c87acd84dea20c1bf5ebff4c757009ed6bf19cfd36fb10e92c4e"
504
  dependencies = [
505
  "proc-macro2",
506
  "quote",
 
509
 
510
  [[package]]
511
  name = "thiserror"
512
+ version = "1.0.69"
513
  source = "registry+https://github.com/rust-lang/crates.io-index"
514
+ checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
515
  dependencies = [
516
  "thiserror-impl",
517
  ]
518
 
519
  [[package]]
520
  name = "thiserror-impl"
521
+ version = "1.0.69"
522
  source = "registry+https://github.com/rust-lang/crates.io-index"
523
+ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
524
  dependencies = [
525
  "proc-macro2",
526
  "quote",
 
529
 
530
  [[package]]
531
  name = "tokenizers"
532
+ version = "0.20.3"
533
  source = "registry+https://github.com/rust-lang/crates.io-index"
534
+ checksum = "67b67c92f6d705e2a1d106fb0b28c696f9074901a9c656ee5d9f5de204c39bf7"
535
  dependencies = [
536
  "aho-corasick",
537
  "derive_builder",
 
569
 
570
  [[package]]
571
  name = "unicode-ident"
572
+ version = "1.0.14"
573
  source = "registry+https://github.com/rust-lang/crates.io-index"
574
+ checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
575
 
576
  [[package]]
577
  name = "unicode-normalization-alignments"
Genie/Model/model.cpp CHANGED
@@ -179,8 +179,29 @@ MODEL_LIB_EXPORT ModelError_t QnnModel_GenAI_composeGraphs(Qnn_BackendHandle_t b
179
  (Qnn_Tensor_t)tin6),
180
  err);
181
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  /* ADDING NODE FOR genAI */
183
- const char* inputs_genAI[] = {"x0", "x1", "x2", "x3", "x4", "x5"};
184
 
185
  Qnn_Tensor_t tout;
186
  tout.version = QNN_TENSOR_VERSION_1;
@@ -224,7 +245,7 @@ MODEL_LIB_EXPORT ModelError_t QnnModel_GenAI_composeGraphs(Qnn_BackendHandle_t b
224
  params, // Node Params
225
  numParams, // Num Node Params
226
  inputs_genAI, // Input Tensor Names
227
- 6, // Num Input Tensor Names
228
  outputs_genAI, // Output Tensors
229
  2 // Num Output Tensors
230
  ),
 
179
  (Qnn_Tensor_t)tin6),
180
  err);
181
 
182
+ uint32_t input6Dim[1] = {1};
183
+ Qnn_Tensor_t tin7;
184
+ tin7.version = QNN_TENSOR_VERSION_1;
185
+ tin7.v1.id = 0;
186
+ tin7.v1.name = "x6";
187
+ tin7.v1.type = QNN_TENSOR_TYPE_APP_WRITE;
188
+ tin7.v1.dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER;
189
+ tin7.v1.dataType = QNN_DATATYPE_FLOAT_32;
190
+ tin7.v1.quantizeParams.encodingDefinition = QNN_DEFINITION_UNDEFINED;
191
+ tin7.v1.quantizeParams.quantizationEncoding = QNN_QUANTIZATION_ENCODING_UNDEFINED;
192
+ tin7.v1.quantizeParams.scaleOffsetEncoding = {.scale = 0.0000000000000000f,
193
+ .offset = 0};
194
+ tin7.v1.rank = 1;
195
+ tin7.v1.dimensions = input6Dim;
196
+ tin7.v1.memType = QNN_TENSORMEMTYPE_RAW;
197
+ tin7.v1.clientBuf = {.data = nullptr, .dataSize = 0};
198
+ VALIDATE(qnn_model.addTensor(
199
+ "x6", // Node Name
200
+ (Qnn_Tensor_t)tin7),
201
+ err);
202
+
203
  /* ADDING NODE FOR genAI */
204
+ const char* inputs_genAI[] = {"x0", "x1", "x2", "x3", "x4", "x5", "x6"};
205
 
206
  Qnn_Tensor_t tout;
207
  tout.version = QNN_TENSOR_VERSION_1;
 
245
  params, // Node Params
246
  numParams, // Num Node Params
247
  inputs_genAI, // Input Tensor Names
248
+ 7, // Num Input Tensor Names
249
  outputs_genAI, // Output Tensors
250
  2 // Num Output Tensors
251
  ),
Genie/configs/llama2-7b/llama2-7b-draft-htp-target-htp-spd.json CHANGED
@@ -43,7 +43,8 @@
43
  "cpu-mask": "0xe0",
44
  "kv-dim": 64,
45
  "kv-update-method": "SHIFT_CONCAT",
46
- "allow-async-init": false
 
47
  },
48
  "extensions": "htp_backend_ext_config.json"
49
  },
 
43
  "cpu-mask": "0xe0",
44
  "kv-dim": 64,
45
  "kv-update-method": "SHIFT_CONCAT",
46
+ "allow-async-init": false,
47
+ "enable-graph-switching": false
48
  },
49
  "extensions": "htp_backend_ext_config.json"
50
  },
Genie/configs/llama2-7b/llama2-7b-genaitransformer-lora.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dialog" : {
3
+ "version" : 1,
4
+ "type" : "basic",
5
+ "stop-sequence" : [""],
6
+ "max-num-tokens" : 200,
7
+ "context" : {
8
+ "version" : 1,
9
+ "size": 512,
10
+ "n-vocab": 32000,
11
+ "bos-token": 1,
12
+ "eos-token": 2
13
+ },
14
+ "sampler" : {
15
+ "version" : 1,
16
+ "seed" : 100,
17
+ "temp" : 1.2,
18
+ "top-k" : 20,
19
+ "top-p" : 0.75,
20
+ "greedy" : false
21
+ },
22
+ "tokenizer" : {
23
+ "version" : 1,
24
+ "path" : "your/path/to/tokenizer_file.json"
25
+ },
26
+ "engine" : {
27
+ "version" : 1,
28
+ "n-threads" : 6,
29
+ "backend" : {
30
+ "version" : 1,
31
+ "type" : "QnnGenAiTransformer",
32
+ "QnnGenAiTransformer" : {
33
+ "version" : 1,
34
+ "n-layer": 32,
35
+ "n-embd": 4096,
36
+ "n-heads": 32
37
+ }
38
+ },
39
+ "model" : {
40
+ "version" : 1,
41
+ "type" : "library",
42
+ "library" : {
43
+ "version" : 1,
44
+ "model-bin" : "your/path/to/model/file.bin",
45
+ "lora": {
46
+ "version": 1,
47
+ "alpha-tensor-name": "alpha",
48
+ "adapters": [
49
+ {
50
+ "version": 1,
51
+ "name": "lora1",
52
+ "bin-sections": [
53
+ "your/path/to/model/lora/file.bin"
54
+ ]
55
+ }
56
+ ]
57
+ }
58
+ }
59
+ }
60
+ }
61
+ }
62
+ }
Genie/configs/llama2-7b/llama2-7b-genaitransformer.json CHANGED
@@ -30,7 +30,10 @@
30
  "version" : 1,
31
  "type" : "QnnGenAiTransformer",
32
  "QnnGenAiTransformer" : {
33
- "version" : 1
 
 
 
34
  }
35
  },
36
  "model" : {
 
30
  "version" : 1,
31
  "type" : "QnnGenAiTransformer",
32
  "QnnGenAiTransformer" : {
33
+ "version" : 1,
34
+ "n-layer": 32,
35
+ "n-embd": 4096,
36
+ "n-heads": 32
37
  }
38
  },
39
  "model" : {
Genie/configs/llama2-7b/llama2-7b-gpu.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dialog" : {
3
+ "version" : 1,
4
+ "type" : "basic",
5
+ "context" : {
6
+ "version" : 1,
7
+ "size": 1024,
8
+ "n-vocab": 32000,
9
+ "bos-token": 1,
10
+ "eos-token": 2
11
+ },
12
+ "sampler" : {
13
+ "version" : 1,
14
+ "seed" : 42,
15
+ "temp" : 1.1,
16
+ "top-k" : 40,
17
+ "top-p" : 0.95,
18
+ "greedy" : false
19
+ },
20
+ "tokenizer" : {
21
+ "version" : 1,
22
+ "path" : "/path/to/tokenizer.json"
23
+ },
24
+ "engine" : {
25
+ "version" : 1,
26
+ "n-threads" : 3,
27
+ "backend" : {
28
+ "version" : 1,
29
+ "type" : "QnnGpu"
30
+ },
31
+ "model" : {
32
+ "version" : 1,
33
+ "type" : "binary",
34
+ "binary" : {
35
+ "version" : 1,
36
+ "ctx-bins" : [
37
+ "/path/to/model.bin"
38
+ ]
39
+ }
40
+ }
41
+ }
42
+ }
43
+ }