diff --git a/Genie/Genie/GenieSymbols.default b/Genie/Genie/GenieSymbols.default
index 4084db46f37f5b3d47b9ed1f7e65938d185786f2..07436dcfdfd8a24241a5ae1c35ef1d06d9612985 100644
--- a/Genie/Genie/GenieSymbols.default
+++ b/Genie/Genie/GenieSymbols.default
@@ -14,6 +14,11 @@
     GenieDialogConfig_free*;
     GenieDialog_create*;
     GenieDialog_query*;
+    GenieDialog_getSampler*;
+    GenieSampler_applyConfig*;
+    GenieSamplerConfig_createFromJson*;
+    GenieSamplerConfig_setParam*;
+    GenieSamplerConfig_free*;
     GenieDialog_tokenQuery*;
     GenieDialog_embeddingQuery*;
     GenieDialog_save*;
diff --git a/Genie/Genie/make/Android.mk b/Genie/Genie/make/Android.mk
index 319f417eb9d4c2be12da0ed39c91719960903ef2..e004c056828db599cc1a9b6bf2362dcd54f79276 100644
--- a/Genie/Genie/make/Android.mk
+++ b/Genie/Genie/make/Android.mk
@@ -29,6 +29,7 @@ PACKAGE_C_INCLUDES += -I $(LOCAL_PATH)/../../../../include/QNN/HTP
 PACKAGE_C_INCLUDES += -I $(LOCAL_PATH)/../src/qualla/tokenizers
 PACKAGE_C_INCLUDES += -I $(LOCAL_PATH)/../src/qualla/engines/qnn-api
 PACKAGE_C_INCLUDES += -I $(LOCAL_PATH)/../src/qualla/engines/qnn-cpu
+PACKAGE_C_INCLUDES += -I $(LOCAL_PATH)/../src/qualla/engines/qnn-gpu
 PACKAGE_C_INCLUDES += -I $(LOCAL_PATH)/../src/qualla/engines/qnn-htp
 
 #========================== Define T2T Lib variables =============================================
@@ -45,6 +46,7 @@ MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/dialogs
 MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/engines/*.cpp)
 MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/engines/qnn-api/*.cpp)
 MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/engines/qnn-cpu/*.cpp)
+MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/engines/qnn-gpu/*.cpp)
 MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/engines/qnn-htp/*.cpp)
 MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/utils/*.cpp)
 MY_SRC_FILES += $(wildcard $(LOCAL_PATH)/../src/qualla/loggers/*.cpp)
diff --git a/Genie/Genie/make/Application.mk b/Genie/Genie/make/Application.mk
index 4e0596f856b93970ac937ab0b6302b74306ae05b..290d39f82615ea4846222d3642b6b5d73730e9d9 100644
--- a/Genie/Genie/make/Application.mk
+++ b/Genie/Genie/make/Application.mk
@@ -10,5 +10,5 @@ APP_ABI := arm64-v8a
 APP_STL := c++_shared
 APP_PLATFORM := android-21
 APP_MODULES := Genie
-APP_CPPFLAGS += -std=c++2a -O3 -Wall -frtti -fexceptions -fvisibility=hidden -DGENIE_API="__attribute__((visibility(\"default\")))" -DSPILLFILL -DQUALLA_ENGINE_QNN_HTP=TRUE -DQUALLA_ENGINE_QNN_CPU=TRUE -DQUALLA_APPS=OFF -DFMT_HEADER_ONLY -DGENIE_SAMPLE -DQUALLA_INTERNAL_QNN_SDK -DGENIE_SSD_FEATURE -DGENIE_SPD_FEATURE -DGENIE_LADE_FEATURE -DGENIE_MULTISTREAM_FEATURE -DGENIE_LORA_FEATURE -DGENIE_E2T_FEATURE
+APP_CPPFLAGS += -std=c++2a -O3 -Wall -frtti -fexceptions -fvisibility=hidden -DGENIE_API="__attribute__((visibility(\"default\")))" -DSPILLFILL -DQUALLA_ENGINE_QNN_HTP=TRUE -DQUALLA_ENGINE_QNN_CPU=TRUE -DQUALLA_ENGINE_QNN_GPU=TRUE -DQUALLA_APPS=OFF -DFMT_HEADER_ONLY -DGENIE_SAMPLE -DQUALLA_INTERNAL_QNN_SDK -DGENIE_SSD_FEATURE -DGENIE_SPD_FEATURE -DGENIE_LADE_FEATURE -DGENIE_MULTISTREAM_FEATURE -DGENIE_LORA_FEATURE -DGENIE_E2T_FEATURE
 APP_LDFLAGS += -lc -lm -ldl -Wl,--version-script=GenieSymbols.default -Wl,--strip-all
diff --git a/Genie/Genie/make/Makefile.linux-x86_64 b/Genie/Genie/make/Makefile.linux-x86_64
index 98d4d4a9657c72a5ea359a475eed0b9103339d68..f43a1eb32dd0732f92f086b6adbd4fe9551d8a01 100644
--- a/Genie/Genie/make/Makefile.linux-x86_64
+++ b/Genie/Genie/make/Makefile.linux-x86_64
@@ -17,6 +17,7 @@ SRC_DIR_SAMPLE_DIALOGS := src/qualla/dialogs
 SRC_DIR_GENIE_ENGINES := src/qualla/engines
 SRC_DIR_GENIE_QNN_API := src/qualla/engines/qnn-api
 SRC_DIR_GENIE_ENGINES_CPU := src/qualla/engines/qnn-cpu
+SRC_DIR_GENIE_ENGINES_GPU := src/qualla/engines/qnn-gpu
 SRC_DIR_GENIE_UTILS := src/qualla/utils
 
 # SRC_DIR_GENIE_LOGGERS := src/qualla/loggers
@@ -29,6 +30,7 @@ SRC_DIR_GENIE := src
 
 # Includes
 GENIE_ENGINES_CPU_INCLUDE := src/qualla/engines/qnn-cpu
+GENIE_ENGINES_GPU_INCLUDE := src/qualla/engines/qnn-gpu
 GENIE_ENGINES_API_INCLUDE := src/qualla/engines/qnn-api
 GENIE_ENGINES_HTP_INCLUDE := src/qualla/engines/qnn-htp
 GENIE_TOKENIZER_INCLUDE := src/qualla/tokenizers
@@ -62,7 +64,7 @@ endif
 GENIE_all: $(libGenie)
 
 # Include paths
-INCLUDES += -I$(GENIE_INCLUDE) -I$(QUALLA_INCLUDE) -I$(SRC_DIR_GENIE_TOKENIZERS) -I$(QNN_API_INCLUDE) -I$(GENIE_ENGINES_CPU_INCLUDE) -I$(QNN_API_HTP_INCLUDE) -I$(GENIE_ENGINES_API_INCLUDE) -I$(GENIE_TOKENIZER_INCLUDE) -I$(GENIE_C_API_HEADERS_INCLUDE)
+INCLUDES += -I$(GENIE_INCLUDE) -I$(QUALLA_INCLUDE) -I$(SRC_DIR_GENIE_TOKENIZERS) -I$(QNN_API_INCLUDE) -I$(GENIE_ENGINES_CPU_INCLUDE) -I$(GENIE_ENGINES_GPU_INCLUDE) -I$(QNN_API_HTP_INCLUDE) -I$(GENIE_ENGINES_API_INCLUDE) -I$(GENIE_TOKENIZER_INCLUDE) -I$(GENIE_C_API_HEADERS_INCLUDE)
 
 # set compiler flags
 COMMON_CXXFLAGS = -std=c++2a -frtti -fPIC -Wall -pg -pthread -nostdinc++ -stdlib=libc++ -idirafter /usr/lib/llvm-14/include/c++/v1 -nostdinc -idirafter /usr/lib/llvm-14/lib/clang/14.0.0/include/ -idirafter /usr/include $(INCLUDES)
@@ -71,11 +73,11 @@ COMMON_LDFLAGS = -shared -s -fPIC -pthread -L/usr/lib/x86_64-linux-gnu -L./src/
 COMMON_CFLAGS = -nostdinc -idirafter /usr/lib/llvm-14/lib/clang/14.0.0/include/ -idirafter /usr/include
 
 ifdef QNN_DEBUG_ENABLE
-CXXFLAGS += $(COMMON_CXXFLAGS) -march=x86-64 -O0 -g -DQNN_API="" -DSPILLFILL -DQUALLA_ENGINE_QNN_CPU=TRUE -DQUALLA_APPS=OFF -DFMT_HEADER_ONLY -DGENIE_SAMPLE -DQUALLA_INTERNAL_QNN_SDK -DGENIE_SSD_FEATURE -DGENIE_SPD_FEATURE -DGENIE_LADE_FEATURE -DGENIE_MULTISTREAM_FEATURE -DGENIE_LORA_FEATURE -DGENIE_E2T_FEATURE
+CXXFLAGS += $(COMMON_CXXFLAGS) -march=x86-64 -O0 -g -DQNN_API="" -DSPILLFILL -DQUALLA_ENGINE_QNN_CPU=TRUE -DQUALLA_ENGINE_QNN_GPU=TRUE -DQUALLA_APPS=OFF -DFMT_HEADER_ONLY -DGENIE_SAMPLE -DQUALLA_INTERNAL_QNN_SDK -DGENIE_SSD_FEATURE -DGENIE_SPD_FEATURE -DGENIE_LADE_FEATURE -DGENIE_MULTISTREAM_FEATURE -DGENIE_LORA_FEATURE -DGENIE_E2T_FEATURE
 CFLAGS += $(COMMON_CFLAGS)
 LDFLAGS += $(COMMON_LDFLAGS)
 else
-CXXFLAGS += $(COMMON_CXXFLAGS) -march=x86-64 -O3 -Wno-write-strings -fvisibility=hidden -DGENIE_API="__attribute__((visibility(\"default\")))" -DSPILLFILL -DQUALLA_ENGINE_QNN_CPU=TRUE -DQUALLA_APPS=OFF -DFMT_HEADER_ONLY -DGENIE_SAMPLE -DQUALLA_INTERNAL_QNN_SDK -DGENIE_SSD_FEATURE -DGENIE_SPD_FEATURE -DGENIE_LADE_FEATURE -DGENIE_MULTISTREAM_FEATURE -DGENIE_LORA_FEATURE -DGENIE_E2T_FEATURE
+CXXFLAGS += $(COMMON_CXXFLAGS) -march=x86-64 -O3 -Wno-write-strings -fvisibility=hidden -DGENIE_API="__attribute__((visibility(\"default\")))" -DSPILLFILL -DQUALLA_ENGINE_QNN_CPU=TRUE -DQUALLA_ENGINE_QNN_GPU=TRUE -DQUALLA_APPS=OFF -DFMT_HEADER_ONLY -DGENIE_SAMPLE -DQUALLA_INTERNAL_QNN_SDK -DGENIE_SSD_FEATURE -DGENIE_SPD_FEATURE -DGENIE_LADE_FEATURE -DGENIE_MULTISTREAM_FEATURE -DGENIE_LORA_FEATURE -DGENIE_E2T_FEATURE
 CFLAGS += $(COMMON_CFLAGS)
 LDFLAGS += $(COMMON_LDFLAGS) -fvisibility=hidden -flto
 endif
@@ -89,6 +91,7 @@ SOURCES_GENIE_QNN_API_CPP := $(wildcard $(SRC_DIR_GENIE_QNN_API)/*.cpp)
 SOURCES_GENIE_ENGINES_CPP := $(filter-out $(SRC_DIR_GENIE_ENGINES)/qnn-htp.cpp, $(wildcard $(SRC_DIR_GENIE_ENGINES)/*.cpp))
 SOURCES_GENIE_DIALOGS_CPP := $(wildcard $(SRC_DIR_SAMPLE_DIALOGS)/*.cpp)
 SOURCES_GENIE_ENGINES_CPU_CPP := $(wildcard $(SRC_DIR_GENIE_ENGINES_CPU)/*.cpp)
+SOURCES_GENIE_ENGINES_GPU_CPP := $(wildcard $(SRC_DIR_GENIE_ENGINES_GPU)/*.cpp)
 SOURCES_GENIE_UTILS_CPP := $(wildcard $(SRC_DIR_GENIE_UTILS)/*.cpp)
 
@@ -108,6 +111,8 @@ OBJ_DIR_GENIE_ENGINES := $(OBJ_DIR_QUALLA)/engines
 OBJ_DIR_GENIE_UTILS := $(OBJ_DIR_QUALLA)/utils
 OBJ_DIR_GENIE_ENGINES_CPU := $(OBJ_DIR_QUALLA)/engines/qnn-cpu
 $(shell mkdir -p $(OBJ_DIR_GENIE_ENGINES_CPU))
+OBJ_DIR_GENIE_ENGINES_GPU := $(OBJ_DIR_QUALLA)/engines/qnn-gpu
+$(shell mkdir -p $(OBJ_DIR_GENIE_ENGINES_GPU))
 OBJ_DIR_GENIE_LOGGERS := obj/$(QNN_TARGET)/qualla/loggers
 OBJ_DIR_GENIE_SAMPLERS := obj/$(QNN_TARGET)/qualla/samplers
 
@@ -125,6 +130,7 @@ OBJECTS_GENIE_ENGINES := $(patsubst %.cpp,$(OBJ_DIR_GENIE_ENGINES)/%.o,$(foreach
 OBJECTS_GENIE_DIALOGS := $(patsubst %.cpp,$(OBJ_DIR_GENIE_DIALOGS)/%.o,$(foreach x,$(SOURCES_GENIE_DIALOGS_CPP),$(notdir $(x))))
 OBJECTS_GENIE_UTILS := $(patsubst %.cpp,$(OBJ_DIR_GENIE_UTILS)/%.o,$(foreach x,$(SOURCES_GENIE_UTILS_CPP),$(notdir $(x))))
 OBJECTS_GENIE_ENGINES_CPU := $(patsubst %.cpp,$(OBJ_DIR_GENIE_ENGINES_CPU)/%.o,$(foreach x,$(SOURCES_GENIE_ENGINES_CPU_CPP),$(notdir $(x))))
+OBJECTS_GENIE_ENGINES_GPU := $(patsubst %.cpp,$(OBJ_DIR_GENIE_ENGINES_GPU)/%.o,$(foreach x,$(SOURCES_GENIE_ENGINES_GPU_CPP),$(notdir $(x))))
 OBJECTS_GENIE_LOGGERS := $(patsubst %.cpp,$(OBJ_DIR_GENIE_LOGGERS)/%.o,$(foreach x,$(SOURCES_GENIE_LOGGERS_CPP),$(notdir $(x))))
 OBJECTS_GENIE_SAMPLERS := $(patsubst %.cpp,$(OBJ_DIR_GENIE_SAMPLERS)/%.o,$(foreach x,$(SOURCES_GENIE_SAMPLERS_CPP),$(notdir $(x))))
@@ -157,16 +163,18 @@ $(OBJ_DIR_GENIE_UTILS)/%.o: $(SRC_DIR_GENIE_UTILS)/%.cpp
 	$(CXX) $(CXXFLAGS) -c $^ -o $@
 
 $(OBJ_DIR_GENIE_ENGINES_CPU)/%.o: $(SRC_DIR_GENIE_ENGINES_CPU)/%.cpp
 	$(CXX) $(CXXFLAGS) -c $^ -o $@
 
+$(OBJ_DIR_GENIE_ENGINES_GPU)/%.o: $(SRC_DIR_GENIE_ENGINES_GPU)/%.cpp
+	$(CXX) $(CXXFLAGS) -c $^ -o $@
+
 $(OBJ_DIR_GENIE_LOGGERS)/%.o: $(SRC_DIR_GENIE_LOGGERS)/%.cpp
 	$(CXX) $(CXXFLAGS) -c $^ -o $@
 
 $(OBJ_DIR_GENIE_SAMPLERS)/%.o: $(SRC_DIR_GENIE_SAMPLERS)/%.cpp
 	$(CXX) $(CXXFLAGS) -c $^ -o $@
 
 # set up resources
-directories := $(TARGET_DIR) $(OBJ_DIR_GENIE) $(OBJ_DIR_GENIE_QNN_API) $(OBJ_DIR_QUALLA) $(OBJ_DIR_GENIE_TOKENIZERS) $(OBJ_DIR_GENIE_ENGINES) $(OBJ_DIR_GENIE_DIALOGS) $(OBJ_DIR_GENIE_UTILS) $(OBJ_DIR_GENIE_ENGINES_CPU) $(OBJ_DIR_GENIE_LOGGERS) $(OBJ_DIR_GENIE_SAMPLERS)
+directories := $(TARGET_DIR) $(OBJ_DIR_GENIE) $(OBJ_DIR_GENIE_QNN_API) $(OBJ_DIR_QUALLA) $(OBJ_DIR_GENIE_TOKENIZERS) $(OBJ_DIR_GENIE_ENGINES) $(OBJ_DIR_GENIE_DIALOGS) $(OBJ_DIR_GENIE_UTILS) $(OBJ_DIR_GENIE_ENGINES_CPU) $(OBJ_DIR_GENIE_ENGINES_GPU) $(OBJ_DIR_GENIE_LOGGERS) $(OBJ_DIR_GENIE_SAMPLERS)
 
 # Compile
-$(libGenie): $(OBJECTS_GENIE) $(OBJECTS_QUALLA) $(OBJECTS_GENIE_QNN_API) $(OBJECTS_GENIE_TOKENIZERS) $(OBJECTS_GENIE_ENGINES) $(OBJECTS_GENIE_DIALOGS) $(OBJECTS_GENIE_UTILS) $(OBJECTS_GENIE_ENGINES_CPU) $(OBJECTS_GENIE_LOGGERS) $(OBJECTS_GENIE_SAMPLERS) | $(directories)
+$(libGenie): $(OBJECTS_GENIE) $(OBJECTS_QUALLA) $(OBJECTS_GENIE_QNN_API) $(OBJECTS_GENIE_TOKENIZERS) $(OBJECTS_GENIE_ENGINES) $(OBJECTS_GENIE_DIALOGS) $(OBJECTS_GENIE_UTILS) $(OBJECTS_GENIE_ENGINES_CPU) $(OBJECTS_GENIE_ENGINES_GPU) $(OBJECTS_GENIE_LOGGERS) $(OBJECTS_GENIE_SAMPLERS) | $(directories)
 	$(CXX) $(CXXFLAGS) -shared -o $@ $^ $(LIBS) $(libtokenizers)
@@ -179,6 +187,7 @@ $(OBJECTS_GENIE_ENGINES): | $(OBJ_DIR_GENIE_ENGINES)
 $(OBJECTS_GENIE_DIALOGS): | $(OBJ_DIR_GENIE_DIALOGS)
 $(OBJECTS_GENIE_UTILS): | $(OBJ_DIR_GENIE_UTILS)
 $(OBJECTS_GENIE_ENGINES_CPU): | $(OBJ_DIR_GENIE_ENGINES_CPU)
+$(OBJECTS_GENIE_ENGINES_GPU): | $(OBJ_DIR_GENIE_ENGINES_GPU)
 $(OBJECTS_GENIE_LOGGERS): | $(OBJ_DIR_GENIE_LOGGERS)
 $(OBJECTS_GENIE_SAMPLERS): | $(OBJ_DIR_GENIE_SAMPLERS)
diff --git a/Genie/Genie/src/Dialog.cpp b/Genie/Genie/src/Dialog.cpp
index e3812e81362e3ccde2a9e96473339b3b760b883c..0f004bd6a5b05788b3006650b228a343c5030fd9 100644
--- a/Genie/Genie/src/Dialog.cpp
+++ b/Genie/Genie/src/Dialog.cpp
@@ -95,81 +95,6 @@ static void translateContextConfig(const qualla::json& genieConfig, qualla::json
   }
 }
 
-//=============================================================================
-// Sampler::Config functions
-//=============================================================================
-
-static void validateSamplerConfig(const qualla::json& config) {
-  if (!config.is_object()) {
-    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "sampler config is not an object");
-  }
-
-  std::set<std::string> mandatoryFields{"version"};
-  for (const auto& field : mandatoryFields) {
-    if (!config.contains(field)) {
-      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing sampler field: " + field);
-    }
-  }
-
-  // component is used in the "ENFORCE" macros
-  std::string component = "sampler";
-
-  for (auto& item : config.items()) {
-    if (item.key() == "version") {
-      JSON_ENFORCE_NUMERIC();
-      if (item.value().get<int>() != 1) {
-        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
-                        "Invalid sampler config: unsupported version: " + item.value().dump());
-      }
-    } else if (item.key() == "seed") {
-      JSON_ENFORCE_NUMERIC();
-    } else if (item.key() == "temp") {
-      JSON_ENFORCE_NUMERIC();
-    } else if (item.key() == "top-k") {
-      JSON_ENFORCE_NUMERIC();
-    } else if (item.key() == "top-p") {
-      JSON_ENFORCE_NUMERIC();
-    } else if (item.key() == "greedy") {
-      JSON_ENFORCE_BOOLEAN();
-    } else {
-      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown sampler config key: " + item.key());
-    }
-  }
-}
-
-static void translateSamplerConfig(const qualla::json& genieConfig, qualla::json& quallaConfig) {
-  if (genieConfig["dialog"].contains("sampler")) {
-    quallaConfig["sampler"]["type"] = "basic";
-
-    if (genieConfig["dialog"]["sampler"].contains("seed")) {
-      quallaConfig["sampler"]["seed"] = genieConfig["dialog"]["sampler"]["seed"];
-    }
-    if (genieConfig["dialog"]["sampler"].contains("temp")) {
-      quallaConfig["sampler"]["temp"] = genieConfig["dialog"]["sampler"]["temp"];
-    }
-
-    quallaConfig["sampler"]["role"] = "primary";
-#if defined(GENIE_SPD_FEATURE)
-    if (genieConfig["dialog"]["type"] == "spd") {
-      quallaConfig["sampler"]["role"] = "target";
-    }
-#endif
-
-    if (genieConfig["dialog"]["sampler"].contains("top-k")) {
-      quallaConfig["sampler"]["top-k"] = genieConfig["dialog"]["sampler"]["top-k"];
-    }
-    if (genieConfig["dialog"]["sampler"].contains("top-p")) {
-      quallaConfig["sampler"]["top-p"] = genieConfig["dialog"]["sampler"]["top-p"];
-    }
-    if (genieConfig["dialog"]["sampler"].contains("greedy")) {
-      quallaConfig["sampler"]["greedy"] = genieConfig["dialog"]["sampler"]["greedy"];
-    }
-    if (genieConfig["dialog"]["sampler"].contains("seed")) {
-      quallaConfig["sampler"]["seed"] = genieConfig["dialog"]["sampler"]["seed"];
-    }
-  }
-}
-
 //=============================================================================
 // Tokenizer::Config functions
 //=============================================================================
@@ -322,6 +247,8 @@
     } else if (item.key() == "rope-theta") {
       rope_theta_set = true;
       JSON_ENFORCE_NUMERIC();
+    } else if (item.key() == "enable-graph-switching") {
+      JSON_ENFORCE_BOOLEAN();
     } else {
       throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown QnnHtp config key: " + item.key());
     }
@@ -410,7 +337,7 @@
         htp = true;
       } else if (type == "QnnGenAiTransformer") {
         genai = true;
-      } else {
+      } else if (type != "QnnGpu") {
         throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
                         "Invalid backend config: unsupported type: " + item.value().dump());
       }
@@ -629,6 +556,9 @@
       }
     } else if (item.key() == "model-bin") {
       JSON_ENFORCE_STRING();
+    } else if (item.key() == "lora") {
+      JSON_ENFORCE_OBJECT();
+      validateLoraConfig(item.value());
     } else {
       throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown library config key: " + item.key());
     }
@@ -956,6 +886,10 @@ static void translateEngineConfig(const qualla::json& genieEngineConfig,
         quallaEngineConfig["use-async-Init"] =
             genieEngineConfig["backend"]["QnnHtp"]["allow-async-init"];
       }
+      if (genieEngineConfig["backend"]["QnnHtp"].contains("enable-graph-switching")) {
+        quallaEngineConfig["enable-graph-switching"] =
+            genieEngineConfig["backend"]["QnnHtp"]["enable-graph-switching"];
+      }
     } else if (genieEngineConfig["backend"]["type"] == "QnnGenAiTransformer") {
       quallaEngineConfig["type"] = "qnn-cpu";
       quallaEngineConfig["backend-lib"] = getLibName("QnnGenAiTransformer");
@@ -979,6 +913,8 @@
         quallaEngineConfig["n_heads"] =
             genieEngineConfig["backend"]["QnnGenAiTransformer"]["n-heads"];
       }
+    } else if (genieEngineConfig["backend"]["type"] == "QnnGpu") {
+      quallaEngineConfig["type"] = "qnn-gpu";
     }
 
     if (genieEngineConfig["backend"].contains("extensions")) {
       quallaEngineConfig["backend-ext-conf"] = genieEngineConfig["backend"]["extensions"];
     }
@@ -1020,6 +956,21 @@
       quallaEngineConfig["model-bin-path"] = genieEngineConfig["model"]["library"]["model-bin"];
       quallaEngineConfig["op-package"] =
           getLibName("QnnGenAiTransformerCpuOpPkg") + ":QnnOpPackage_interfaceProvider";
+      if (genieEngineConfig["model"]["library"].contains("lora")) {
+        for (int i = 0; i < genieEngineConfig["model"]["library"]["lora"]["adapters"].size(); i++) {
+          quallaEngineConfig["lora"][i]["adapter-name"] =
+              genieEngineConfig["model"]["library"]["lora"]["adapters"][i]["name"];
+          if (genieEngineConfig["model"]["library"]["lora"].contains("alpha-tensor-name")) {
+            quallaEngineConfig["lora"][i]["alpha-tensor-name"] =
+                genieEngineConfig["model"]["library"]["lora"]
+                                 ["alpha-tensor-name"];
+          }
+          quallaEngineConfig["lora"][i]["alpha-tensor-value"] = 1.0f;
+          quallaEngineConfig["lora"][i]["binsection-basedir"] = "";
+          quallaEngineConfig["lora"][i]["bin-sections"] =
+              genieEngineConfig["model"]["library"]["lora"]["adapters"][i]["bin-sections"];
+        }
+      }
     }
     if (genieEngineConfig["model"].contains("positional-encoding")) {
       quallaEngineConfig["positional-encoding"]["type"] =
@@ -1424,7 +1375,7 @@
       validateTokenizerConfig(item.value());
     } else if (item.key() == "sampler") {
       JSON_ENFORCE_OBJECT();
-      validateSamplerConfig(item.value());
+      Sampler::SamplerConfig::validateSamplerConfig(item.value());
     } else if (item.key() == "engine") {
       JSON_ENFORCE_ARRAY_OR_OBJECT();
     } else if (item.key() == "embedding") {
@@ -1550,7 +1501,7 @@ static void translateDialogConfig(const qualla::json& genieConfig, qualla::json&
   translateContextConfig(genieConfig, quallaConfig);
   translateTokenizerConfig(genieConfig, quallaConfig);
-  translateSamplerConfig(genieConfig, quallaConfig);
+  Sampler::SamplerConfig::translateSamplerConfig(genieConfig, quallaConfig);
   translateMultiEngineConfig(genieConfig, quallaConfig);
   translateEmbeddingConfig(genieConfig, quallaConfig);
 }
@@ -1611,7 +1562,7 @@ Dialog::Config::Config(const char* configStr) {
   m_config = config;
 }
 
-qualla::json Dialog::Config::getJson() const { return m_config; }
+qualla::json& Dialog::Config::getJson() { return m_config; }
 
 //=============================================================================
 // Dialog functions
@@ -1640,6 +1591,27 @@ Dialog::Dialog(std::shared_ptr<Config> config) {
   if (!m_quallaDialog) {
     throw Exception(GENIE_STATUS_ERROR_MEM_ALLOC, "Could not create a dialog object");
   }
+  /*
+   * spec-dec has a mandatory "target" sampler and an optional "draft" sampler
+   * Check their availability and pass their references to Dialog Sampler to update with
+   * applyConfig()
+   */
+  std::shared_ptr<Sampler> sampler;
+  std::vector<std::reference_wrapper<qualla::Sampler>> quallaSamplers;
+  if (quallaConfig["type"] == "spec-dec") {
+    quallaSamplers.push_back(m_quallaDialog->sampler("target"));
+    if (m_quallaDialog->isSamplerPresent("draft"))
+      quallaSamplers.push_back(m_quallaDialog->sampler("draft"));
+    sampler = std::make_shared<Sampler>(config->getJson()["dialog"], quallaSamplers);
+  } else {
+    quallaSamplers.push_back(m_quallaDialog->sampler());  // Default role is "primary"
+    sampler = std::make_shared<Sampler>(config->getJson()["dialog"], quallaSamplers);
+  }
+  m_samplerHandle = Sampler::add(sampler);
+}
+
+GenieSampler_Handle_t Dialog::getSamplerHandle(std::shared_ptr<Dialog> dialog) {
+  return dialog->m_samplerHandle;
 }
 
 static_assert(qualla::Sentence::Code::COMPLETE ==
@@ -1801,4 +1773,6 @@ int32_t Dialog::tokenQuery(const uint32_t* tokens,
                            kpis.generate.last_usec,
                            kpis.tps.generate);
   return (status) ? (GENIE_STATUS_SUCCESS) : (GENIE_STATUS_ERROR_QUERY_FAILED);
-}
\ No newline at end of file
+}
+
+Dialog::~Dialog() { Sampler::remove(m_samplerHandle); }
diff --git a/Genie/Genie/src/Dialog.hpp b/Genie/Genie/src/Dialog.hpp
index c62690c358732c8159dbc365b0c5ccda2e083de7..6a8206fef3bd414eeaed4d04b692b99577f106c2 100644
--- a/Genie/Genie/src/Dialog.hpp
+++ b/Genie/Genie/src/Dialog.hpp
@@ -10,11 +10,13 @@
 
 #include 
 #include 
+#include 
 
 #include "GenieDialog.h"
 #include "Util/HandleManager.hpp"
 #include "qualla/dialog.hpp"
 #include "qualla/DialogCallback.hpp"
+#include "Sampler.hpp"
 
 namespace genie {
@@ -33,7 +35,7 @@ class Dialog {
     static void remove(GenieDialogConfig_Handle_t handle);
 
     Config(const char* configStr);
-    qualla::json getJson() const;
+    qualla::json& getJson();
 
    private:
     static qnn::util::HandleManager<Config> s_manager;
@@ -43,10 +45,12 @@ class Dialog {
   static GenieDialog_Handle_t add(std::shared_ptr<Dialog> dialog);
   static std::shared_ptr<Dialog> get(GenieDialog_Handle_t handle);
   static void remove(GenieDialog_Handle_t handle);
+  static GenieSampler_Handle_t getSamplerHandle(std::shared_ptr<Dialog> dialog);
 
   qualla::DialogCallback dialogCallback;
 
   Dialog(std::shared_ptr<Config> config);
+  ~Dialog();
 
   Dialog(const Dialog&) = delete;
   Dialog& operator=(const Dialog&) = delete;
@@ -91,5 +95,6 @@ class Dialog {
   uint32_t m_tokenLimit{UINT32_MAX};
   static qnn::util::HandleManager<Dialog> s_manager;
   static std::atomic<uint32_t> s_nameCounter;
+  GenieSampler_Handle_t m_samplerHandle;
 };
 }  // namespace genie
diff --git a/Genie/Genie/src/Embedding.cpp b/Genie/Genie/src/Embedding.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bd89f1699b1d8edf8343b1988f15f80a6ec30876
--- /dev/null
+++ b/Genie/Genie/src/Embedding.cpp
@@ -0,0 +1,740 @@
+//==============================================================================
+//
+//  Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+//  All Rights Reserved.
+//  Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#include 
+#include 
+#include 
+
+#include "Embedding.hpp"
+#include "Exception.hpp"
+#include "Macro.hpp"
+#include "qualla/detail/json.hpp"
+#include "qualla/env.hpp"
+
+using namespace genie;
+
+#ifdef _WIN32
+inline std::string libPrefix = "";
+inline std::string libSuffix = ".dll";
+#else
+inline std::string libPrefix = "lib";
+inline std::string libSuffix = ".so";
+#endif
+
+inline std::string getLibName(std::string baseName) { return libPrefix + baseName + libSuffix; }
+
+//=============================================================================
+// Context::Config functions
+//=============================================================================
+
+static void validateContextConfig(const qualla::json& config) {
+  if (!config.is_object()) {
+    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "context config is not an object");
+  }
+
+  std::set<std::string> mandatoryFields{
+      "version", "n-vocab", "ctx-size", "embed-size", "pad-token"};
+  for (const auto& field : mandatoryFields) {
+    if (!config.contains(field)) {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing context field: " + field);
+    }
+  }
+
+  // component is used in the "ENFORCE" macros
+  std::string component = "context";
+
+  for (auto& item : config.items()) {
+    if (item.key() == "version") {
+      JSON_ENFORCE_NUMERIC();
+      if (item.value().get<int>() != 1) {
+        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
+                        "Invalid context config: unsupported version: " + item.value().dump());
+      }
+    } else if (item.key() == "n-vocab") {
+      JSON_ENFORCE_NUMERIC();
+    } else if (item.key() == "ctx-size") {
+      JSON_ENFORCE_NUMERIC();
+    } else if (item.key() == "embed-size") {
+      JSON_ENFORCE_NUMERIC();
+    } else if (item.key() == "pad-token") {
+      JSON_ENFORCE_NUMERIC();
+    } else {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown context config key: " + item.key());
+    }
+  }
+}
+
+static void translateContextConfig(const qualla::json& genieConfig, qualla::json& quallaConfig) {
+  quallaConfig["n-vocab"] = genieConfig["n-vocab"];
+  quallaConfig["size"] = genieConfig["ctx-size"];
+  quallaConfig["n-embd"] = genieConfig["embed-size"];
+  quallaConfig["pad-token"] = genieConfig["pad-token"];
+}
+
+//=============================================================================
+// Tokenizer::Config functions
+//=============================================================================
+
+static void validateTokenizerConfig(const qualla::json& config) {
+  if (!config.is_object()) {
+    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "tokenizer config is not an object");
+  }
+
+  std::set<std::string> mandatoryFields{"version", "path"};
+  for (const auto& field : mandatoryFields) {
+    if (!config.contains(field)) {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing tokenizer field: " + field);
+    }
+  }
+
+  // component is used in the "ENFORCE" macros
+  std::string component = "tokenizer";
+
+  for (auto& item : config.items()) {
+    if (item.key() == "version") {
+      JSON_ENFORCE_NUMERIC();
+      if (item.value().get<int>() != 1) {
+        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
+                        "Invalid tokenizer config: unsupported version: " + item.value().dump());
+      }
+    } else if (item.key() == "path") {
+      JSON_ENFORCE_STRING();
+      // Note: the existence of this file is checked by qualla
+    } else {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
+                      "Unknown tokenizer config key: " + item.key());
+    }
+  }
+}
+
+static void translateTokenizerConfig(const qualla::json& genieConfig, qualla::json& quallaConfig) {
+  quallaConfig["tokenizer"] = genieConfig["path"];
+}
+
+//=============================================================================
+// Backend::Config functions
+//=============================================================================
+
+static void validateBackendHtpConfig(const qualla::json& config) {
+  if (!config.is_object()) {
+    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "QnnHtp config is not an object");
+  }
+
+  std::set<std::string> mandatoryFields{
+      "version", "spill-fill-bufsize", "use-mmap", "pooled-output", "allow-async-init"};
+  for (const auto& field : mandatoryFields) {
+    if (!config.contains(field)) {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing QnnHtp field: " + field);
+    }
+  }
+
+  // component is used in the "ENFORCE" macros
+  std::string component = "QnnHtp";
+
+  for (auto& item : config.items()) {
+    if (item.key() == "version") {
+      JSON_ENFORCE_NUMERIC();
+      if (item.value().get<int>() != 1) {
+        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
+                        "Invalid QnnHtp config: unsupported version: " + item.value().dump());
+      }
+    } else if (item.key() == "spill-fill-bufsize") {
+      JSON_ENFORCE_NUMERIC();
+    } else if (item.key() == "use-mmap") {
+      JSON_ENFORCE_BOOLEAN();
+    } else if (item.key() == "pooled-output") {
+      JSON_ENFORCE_BOOLEAN();
+    } else if (item.key() == "allow-async-init") {
+      JSON_ENFORCE_BOOLEAN();
+    } else if (item.key() == "disable-kv-cache") {
+      JSON_ENFORCE_BOOLEAN();
+    } else {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown QnnHtp config key: " + item.key());
+    }
+  }
+}
+
+static void validateBackendGenaiConfig(const qualla::json& config) {
+  if (!config.is_object()) {
+    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "QnnGenAiTransformer config is not an object");
+  }
+
+  std::set<std::string> mandatoryFields{"version"};
+  for (const auto& field : mandatoryFields) {
+    if (!config.contains(field)) {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
+                      "Missing QnnGenAiTransformer field: " + field);
+    }
+  }
+
+  // component is used in the "ENFORCE" macros
+  std::string component = "QnnGenAiTransformer";
+
+  for (auto& item : config.items()) {
+    if (item.key() == "version") {
+      JSON_ENFORCE_NUMERIC();
+      if (item.value().get<int>() != 1) {
+        throw Exception(
+            GENIE_STATUS_ERROR_JSON_VALUE,
+            "Invalid QnnGenAiTransformer config: unsupported version: " + item.value().dump());
+      }
+    } else if (item.key() == "n-logits") {
+      JSON_ENFORCE_NUMERIC();
+    } else if (item.key() == "n-layer") {
+      JSON_ENFORCE_NUMERIC();
+    } else if (item.key() == "n-embd") {
+      JSON_ENFORCE_NUMERIC();
+    } else if (item.key() == "n-heads") {
+      JSON_ENFORCE_NUMERIC();
+    } else {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
+                      "Unknown QnnGenAiTransformer config key: " + item.key());
+    }
+  }
+}
+
+static void validateBackendConfig(const qualla::json& config) {
+  if (!config.is_object()) {
+    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "backend config is not an object");
+  }
+
+  std::set<std::string> mandatoryFields{"version", "type"};
+  for (const auto& field : mandatoryFields) {
+    if (!config.contains(field)) {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing backend field: " + field);
+    }
+  }
+
+  // component is used in the "ENFORCE" macros
+  std::string component = "backend";
+
+  std::string type;
+  bool htp = false;
+  qualla::json htpConfig;
+  bool genai = false;
+  qualla::json genaiConfig;
+
+  for (auto& item : config.items()) {
+    if (item.key() == "version") {
+      JSON_ENFORCE_NUMERIC();
+      if (item.value().get<int>() != 1) {
+        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
+                        "Invalid backend config: unsupported version: " + item.value().dump());
+      }
+    } else if (item.key() == "type") {
+      JSON_ENFORCE_STRING();
+      type = item.value().get<std::string>();
+      if (type == "QnnHtp") {
+        htp = true;
+      } else if (type == "QnnGenAiTransformer") {
+        genai = true;
+      } else {
+        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
+                        "Invalid backend config: unsupported type: " + item.value().dump());
+      }
+    } else if (item.key() == "extensions") {
+      JSON_ENFORCE_STRING();
+    } else if (item.key() == "QnnHtp") {
+      JSON_ENFORCE_OBJECT();
+      htpConfig = item.value();
+    } else if (item.key() == "QnnGenAiTransformer") {
+      JSON_ENFORCE_OBJECT();
+      genaiConfig = item.value();
+    } else {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown backend config key: " + item.key());
+    }
+  }
+
+  if (htp) {
+    if (!htpConfig.is_object()) {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing QnnHtp embedding config");
+    }
+    validateBackendHtpConfig(htpConfig);
+  } else {
+    if (htpConfig.is_object()) {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
+                      "QnnHtp backend config for incorrect backend type: " + type);
+    }
+  }
+
+  if (genai) {
+    if (!genaiConfig.is_object()) {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
+                      "Missing QnnGenAiTransformer embedding config");
+    }
+    validateBackendGenaiConfig(genaiConfig);
+  } else {
+    if (genaiConfig.is_object()) {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
+                      "QnnGenAiTransformer backend config for incorrect backend type: " + type);
+    }
+  }
+}
+
+//=============================================================================
+// Model::Config functions
+//=============================================================================
+
+static void validateModelBinaryConfig(const qualla::json& config) {
+  if (!config.is_object()) {
+    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "binary config is not an object");
+  }
+
+  std::set<std::string> mandatoryFields{"version", "ctx-bins"};
+  for (const auto& field : mandatoryFields) {
+    if (!config.contains(field)) {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing binary field: " + field);
+    }
+  }
+
+  // component is used in the "ENFORCE" macros
+  std::string component = "binary";
+
+  for (auto& item : config.items()) {
+    if (item.key() == "version") {
+      JSON_ENFORCE_NUMERIC();
+      if (item.value().get<int>() != 1) {
+        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
+                        "Invalid binary config: unsupported version: " + item.value().dump());
+      }
+    } else if (item.key() == "ctx-bins") {
+      JSON_ENFORCE_ARRAY();
+      for (auto& elem : item.value()) {
+        if (!elem.is_string()) {
+          throw Exception(GENIE_STATUS_ERROR_JSON_VALUE, "ctx-bins must be an array of strings");
+        }
+      }
+    } else {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown binary config key: " + item.key());
+    }
+  }
+}
+
+static void validateModelLibraryConfig(const qualla::json& config) {
+  if (!config.is_object()) {
+    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "library config is not an object");
+  }
+
+  std::set<std::string> mandatoryFields{"version", "model-bin"};
+  for (const auto& field : mandatoryFields) {
+    if (!config.contains(field)) {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing library field: " + field);
+    }
+  }
+
+  // component is used in the "ENFORCE" macros
+  std::string component = "library";
+
+  for (auto& item : config.items()) {
+    if (item.key() == "version") {
+      JSON_ENFORCE_NUMERIC();
+      if (item.value().get<int>() != 1) {
+        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
+                        "Invalid library config: unsupported version: " + item.value().dump());
+      }
+    } else if (item.key() == "model-bin") {
+      JSON_ENFORCE_STRING();
+    } else {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown library config key: " + item.key());
+    }
+  }
+}
+
+static void validateModelConfig(const qualla::json& config) {
+  if (!config.is_object()) {
+    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "model config is not an object");
+  }
+
+  std::set<std::string> mandatoryFields{"version", "type"};
+  for (const auto& field : mandatoryFields) {
+    if (!config.contains(field)) {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing model field: " + field);
+    }
+  }
+
+  // component is used in the "ENFORCE" macros
+  std::string component = "model";
+
+  std::string type;
+  bool binary = false;
+  qualla::json binaryConfig;
+  bool library = false;
+  qualla::json libraryConfig;
+
+  for (auto& item : config.items()) {
+    if (item.key() == "version") {
+      JSON_ENFORCE_NUMERIC();
+      if (item.value().get<int>() != 1) {
+        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
+                        "Invalid model config: unsupported version: " + item.value().dump());
+      }
+    } else if (item.key() == "type") {
+      JSON_ENFORCE_STRING();
+      type = item.value().get<std::string>();
+      if (type == "binary") {
+        binary = true;
+      } else if (type == "library") {
+        library = true;
+      } else {
+        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
+                        "Invalid model config: unsupported type: " + item.value().dump());
+      }
+    } else if (item.key() == "binary") {
+      JSON_ENFORCE_OBJECT();
+      binaryConfig = item.value();
+    } else if (item.key() == "library") {
+      JSON_ENFORCE_OBJECT();
+      libraryConfig = item.value();
+    } else {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown model config key: " + item.key());
+    }
+  }
+
+  if (binary) {
+    if (!binaryConfig.is_object()) {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing binary model config");
+    }
+    validateModelBinaryConfig(binaryConfig);
+  } else {
+    if (binaryConfig.is_object()) {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
+                      "binary model config for incorrect model type: " + type);
+    }
+  }
+
+  if (library) {
+    if (!libraryConfig.is_object()) {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing library model config");
+    }
+    validateModelLibraryConfig(libraryConfig);
+  } else {
+    if (libraryConfig.is_object()) {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
+                      "library model config for incorrect model type: " + type);
+    }
+  }
+}
+
+//=============================================================================
+// Engine::Config functions
+//=============================================================================
+
+static void validateEngineConfig(const qualla::json& config) {
+  if (!config.is_object()) {
+    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "engine config is not an object");
+  }
+
+  std::set<std::string> mandatoryFields{"version", "backend", "model"};
+  for (const auto& field : mandatoryFields) {
+    if (!config.contains(field)) {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing engine field: " + field);
+    }
+  }
+
+  // component is used in the "ENFORCE" macros
+  std::string component = "engine";
+
+  for (auto& item : config.items()) {
+    if (item.key() == "version") {
+      JSON_ENFORCE_NUMERIC();
+      if (item.value().get<int>() != 1) {
+        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
+                        "Invalid engine config: unsupported version: " + item.value().dump());
+      }
+    } else if (item.key() == "backend") {
+      JSON_ENFORCE_OBJECT();
+      validateBackendConfig(item.value());
+    } else if (item.key() == "model") {
+      JSON_ENFORCE_OBJECT();
+      validateModelConfig(item.value());
+    } else if (item.key() == "n-threads") {
+      JSON_ENFORCE_NUMERIC();
+    } else {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown engine config key: " + item.key());
+    }
+  }
+}
+
+static void translateEngineConfig(const qualla::json& genieEngineConfig,
+                                  qualla::json& quallaEngineConfig) {
+  if (genieEngineConfig["version"] == 1) {
+    if (genieEngineConfig.contains("n-threads"))
+      quallaEngineConfig["n-threads"] = genieEngineConfig["n-threads"];
+
+    if (genieEngineConfig["backend"]["type"] == "QnnHtp") {
+      quallaEngineConfig["type"] = "qnn-htp";
+      quallaEngineConfig["model-architecture-type"] = "encoder";
+      quallaEngineConfig["backend-lib"] = getLibName("QnnHtp");
+      quallaEngineConfig["use-mmap"] = genieEngineConfig["backend"]["QnnHtp"]["use-mmap"];
+      quallaEngineConfig["spill-fill-bufsize"] =
+          genieEngineConfig["backend"]["QnnHtp"]["spill-fill-bufsize"];
+      quallaEngineConfig["pooled-output"] = genieEngineConfig["backend"]["QnnHtp"]["pooled-output"];
+      if (genieEngineConfig["backend"]["QnnHtp"].contains("disable-kv-cache")) {
+        quallaEngineConfig["disable-kv-cache"] =
+            genieEngineConfig["backend"]["QnnHtp"]["disable-kv-cache"];
+      }
+      // By default, Qualla will default to the async init path.
+      // For now, we are forcing async init off unless explicitly
+      // specified in the Genie config. It is HTP specific feature only.
+      quallaEngineConfig["use-async-Init"] = false;
+      if (genieEngineConfig["backend"]["QnnHtp"].contains("allow-async-init")) {
+        quallaEngineConfig["use-async-Init"] =
+            genieEngineConfig["backend"]["QnnHtp"]["allow-async-init"];
+      }
+    } else if (genieEngineConfig["backend"]["type"] == "QnnGenAiTransformer") {
+      quallaEngineConfig["type"] = "qnn-cpu";
+      quallaEngineConfig["model-output"] = "embeddings";
+      quallaEngineConfig["backend-lib"] = getLibName("QnnGenAiTransformer");
+      if (genieEngineConfig["backend"]["QnnGenAiTransformer"].contains("n-logits")) {
+        quallaEngineConfig["n_logits"] =
+            genieEngineConfig["backend"]["QnnGenAiTransformer"]["n-logits"];
+      }
+      if (genieEngineConfig["backend"]["QnnGenAiTransformer"].contains("n-layer")) {
+        quallaEngineConfig["n_layer"] =
+            genieEngineConfig["backend"]["QnnGenAiTransformer"]["n-layer"];
+      }
+      if (genieEngineConfig["backend"]["QnnGenAiTransformer"].contains("n-embd")) {
+        quallaEngineConfig["n_embd"] =
+            genieEngineConfig["backend"]["QnnGenAiTransformer"]["n-embd"];
+      }
+      if (genieEngineConfig["backend"]["QnnGenAiTransformer"].contains("n-heads")) {
+        quallaEngineConfig["n_heads"] =
+            genieEngineConfig["backend"]["QnnGenAiTransformer"]["n-heads"];
+      }
+    }
+
+    if (genieEngineConfig["backend"].contains("extensions")) {
+      quallaEngineConfig["backend-ext-conf"] = genieEngineConfig["backend"]["extensions"];
+    }
+
+    if (genieEngineConfig["model"]["type"] == "binary") {
+      quallaEngineConfig["model-list"] = genieEngineConfig["model"]["binary"]["ctx-bins"];
+    } else if (genieEngineConfig["model"]["type"] == "library") {
+      quallaEngineConfig["model"] = getLibName("QnnGenAiTransformerModel");
+      quallaEngineConfig["model-bin-path"] = genieEngineConfig["model"]["library"]["model-bin"];
+      quallaEngineConfig["op-package"] =
+          getLibName("QnnGenAiTransformerCpuOpPkg") + ":QnnOpPackage_interfaceProvider";
+    }
+  }
+}
+
+//=============================================================================
+// Prompt::Config functions
+//=============================================================================
+
+static void validatePromptConfig(const qualla::json& config) {
+  if (!config.is_object()) {
+    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "prompt config is not an object");
+  }
+
+  std::set<std::string> mandatoryFields{"version", "prompt-template"};
+  for (const auto& field : mandatoryFields) {
+    if (!config.contains(field)) {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing prompt field: " + field);
+    }
+  }
+
+  // component is used in the "ENFORCE" macros
+  std::string component = "prompt";
+
+  for (auto& item : config.items()) {
+    if (item.key() == "version") {
+      JSON_ENFORCE_NUMERIC();
+      if (item.value().get<int>() != 1) {
+        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
+                        "Invalid prompt config: unsupported version: " + item.value().dump());
+      }
+    } else if (item.key() == "prompt-template") {
+      JSON_ENFORCE_ARRAY();
+      for (auto& elem : item.value()) {
+        if (!elem.is_string()) {
+          throw Exception(GENIE_STATUS_ERROR_JSON_VALUE, "prompt tags must be an array of strings");
+        }
+      }
+    } else {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown prompt config key: " + item.key());
+    }
+  }
+}
+
+static void translatePromptConfig(const qualla::json& genieConfig, qualla::json& quallaConfig) {
+  quallaConfig["tags"] = genieConfig["prompt-template"];
+}
+
+//=============================================================================
+// Embedding::Config functions
+//=============================================================================
+
+qnn::util::HandleManager<Embedding::Config> Embedding::Config::s_manager;
+
+GenieEmbeddingConfig_Handle_t Embedding::Config::add(std::shared_ptr<Config> config) {
+  return (GenieEmbeddingConfig_Handle_t)s_manager.add(config);
+}
+
+std::shared_ptr<Embedding::Config> Embedding::Config::get(GenieEmbeddingConfig_Handle_t handle) {
+  return s_manager.get((qnn::util::Handle_t)handle);
+}
+
+void Embedding::Config::remove(GenieEmbeddingConfig_Handle_t handle) {
+  s_manager.remove((qnn::util::Handle_t)handle);
+}
+
+static void validateEmbeddingConfig(const qualla::json& config) {
+  if (!config.is_object()) {
+    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Embedding config is not an object");
+  }
+
+  std::set<std::string> mandatoryFields{"version", "context", "tokenizer", "engine"};
+  for (const auto& field : mandatoryFields) {
+    if (!config.contains(field)) {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing embedding field: " + field);
+    }
+  }
+
+  // component is used in the "ENFORCE" macros
+  std::string component = "embedding";
+
+  for (auto& item : config.items()) {
+    if (item.key() == "version") {
+      JSON_ENFORCE_NUMERIC();
+      if (item.value().get<int>() != 1) {
+        throw Exception(GENIE_STATUS_ERROR_JSON_VALUE,
+                        "Invalid embedding config: unsupported version: " + item.value().dump());
+      }
+    } else if (item.key() == "context") {
+      JSON_ENFORCE_OBJECT();
+      validateContextConfig(item.value());
+    } else if (item.key() == "tokenizer") {
+      JSON_ENFORCE_OBJECT();
+      validateTokenizerConfig(item.value());
+    } else if (item.key() == "prompt") {  // optional parameter
+      JSON_ENFORCE_OBJECT();
+      validatePromptConfig(item.value());
+    } else if (item.key() == "truncate-input") {  // optional parameter
+      JSON_ENFORCE_BOOLEAN();
+    } else if (item.key() == "engine") {
+      JSON_ENFORCE_OBJECT();
+      validateEngineConfig(config["engine"]);
+    } else {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
+                      "Unknown embedding config key: " + item.key());
+    }
+  }
+}
+
+static void translateEmbeddingConfig(const qualla::json& genieConfig, qualla::json& quallaConfig) {
+  translateContextConfig(genieConfig["context"], quallaConfig["context"]);
+  translatePromptConfig(genieConfig["prompt"], quallaConfig["prompt"]);
+  translateTokenizerConfig(genieConfig["tokenizer"], quallaConfig);
+  translateEngineConfig(genieConfig["engine"], quallaConfig["engine"]);
+
+  if (genieConfig.contains(
+          "truncate-input")) {  // to allow truncation of input in case it exceeds the context.
+    quallaConfig["truncate-input"] = genieConfig["truncate-input"];
+  }
+}
+
+Embedding::Config::Config(const char* configStr) {
+  qualla::json config;
+
+  {
+    std::set<std::string> keys;
+
+    auto callback = [&keys](int depth, qualla::json::parse_event_t event, qualla::json& parsed) {
+      if ((depth == 1) && (event == qualla::json::parse_event_t::key)) {
+        if (keys.count(parsed) > 0) {
+          throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
+                          "Multiple embedding config key: " + parsed.dump());
+        }
+        keys.insert(parsed);
+      }
+      return true;
+    };
+
+    config = qualla::json::parse(configStr, callback);
+  }
+
+  if (!config.is_object()) {
+    throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Embedding config is not an object");
+  }
+
+  std::set<std::string> mandatoryFields{"embedding"};
+  for (const auto& field : mandatoryFields) {
+    if (!config.contains(field)) {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing embedding field: " + field);
+    }
+  }
+
+  // component is used in the "ENFORCE" macros
+  std::string component = "embedding";
+
+  for (auto& item : config.items()) {
+    if (item.key() == "embedding") {
+      JSON_ENFORCE_OBJECT();
+      validateEmbeddingConfig(item.value());
+    } else {
+      throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA,
+                      "Unknown embedding config key: " + item.key());
+    }
+  }
+  m_config = config;
+}
+
+qualla::json Embedding::Config::getJson() const { return m_config; }
+
+//=============================================================================
+// Embedding functions
+//=============================================================================
+
+qnn::util::HandleManager<Embedding> Embedding::s_manager;
+std::atomic<uint32_t> Embedding::s_nameCounter{0u};
+
+GenieEmbedding_Handle_t Embedding::add(std::shared_ptr<Embedding> embedding) {
+  return (GenieEmbedding_Handle_t)s_manager.add(embedding);
+}
+
+std::shared_ptr<Embedding> Embedding::get(GenieEmbedding_Handle_t handle) {
+  return s_manager.get((qnn::util::Handle_t)handle);
+}
+
+void Embedding::remove(GenieEmbedding_Handle_t handle) {
+  s_manager.remove((qnn::util::Handle_t)handle);
+}
+
+Embedding::Embedding(std::shared_ptr<Config> config) {
+  auto env = qualla::Env::create(qualla::json{});
+  qualla::json quallaConfig;
+  translateEmbeddingConfig(config->getJson()["embedding"], quallaConfig);
+  m_quallaEmbedding = qualla::Embedding::create(
+      env, "embedding" + std::to_string(s_nameCounter.fetch_add(1u)), quallaConfig);
+  if (!m_quallaEmbedding) {
+    throw Exception(GENIE_STATUS_ERROR_MEM_ALLOC, "Could not create an embedding object");
+  }
+}
+
+int32_t Embedding::generate(const char* queryStr,
+                            GenieEmbedding_GenerateCallback_t callback,
+                            const void* userData) {
+  std::string query(queryStr);
+  std::vector<float> outputEmbedding;
+  bool status = false;
+  status = m_quallaEmbedding->query(query, outputEmbedding);
+  if (status) {
+    std::vector<uint32_t> dimensions;
+    m_quallaEmbedding->output_dimensions(dimensions);
+    callback(dimensions.data(), dimensions.size(), outputEmbedding.data(), userData);
+    qualla::Embedding::KPIs kpis = m_quallaEmbedding->kpis();
+    printf(
+        "\n\n[KPIS]:\nInit Time: %zu us\nPrompt Processing Time: %zu us, Prompt Processing Rate : "
+        "%f toks/sec\n",
+        kpis.init.total_usec,
+        kpis.prompt.last_usec,
+        kpis.tps.prompt);
+  }
+  return (status) ? (GENIE_STATUS_SUCCESS) : (GENIE_STATUS_ERROR_GENERATE_FAILED);
+}
diff --git a/Genie/Genie/src/Embedding.hpp b/Genie/Genie/src/Embedding.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..afbeb9986916b2e46bd7647223c0d540b31f8d18
--- /dev/null
+++ b/Genie/Genie/src/Embedding.hpp
@@ -0,0 +1,56 @@
+//==============================================================================
+//
+//  Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+//  All Rights Reserved.
+//  Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#pragma once
+
+#include 
+#include 
+
+#include "GenieEmbedding.h"
+#include "Util/HandleManager.hpp"
+#include "qualla/embedding.hpp"
+
+namespace genie {
+
+class Embedding {
+ public:
+  class Config {
+   public:
+    static GenieEmbeddingConfig_Handle_t add(std::shared_ptr<Config> config);
+    static std::shared_ptr<Config> get(GenieEmbeddingConfig_Handle_t handle);
+    static void remove(GenieEmbeddingConfig_Handle_t handle);
+
+    Config(const char* configStr);
+    qualla::json getJson() const;
+
+   private:
+    static qnn::util::HandleManager<Config> s_manager;
+    qualla::json m_config;
+  };
+
+  static GenieEmbedding_Handle_t add(std::shared_ptr<Embedding> embedding);
+  static std::shared_ptr<Embedding> get(GenieEmbedding_Handle_t handle);
+  static void remove(GenieEmbedding_Handle_t handle);
+
+  Embedding(std::shared_ptr<Config> config);
+
+  Embedding(const Embedding&) = delete;
+  Embedding& operator=(const Embedding&) = delete;
+  Embedding(Embedding&&) = delete;
+  Embedding& operator=(Embedding&&) = delete;
+
+  int32_t generate(const char* queryStr,
+                   GenieEmbedding_GenerateCallback_t callback,
+                   const void* userData);
+
+ private:
+  std::unique_ptr<qualla::Embedding> m_quallaEmbedding;
+  static qnn::util::HandleManager<Embedding> s_manager;
+  static std::atomic<uint32_t> s_nameCounter;
+};
+}  // namespace genie
diff --git a/Genie/Genie/src/Exception.hpp b/Genie/Genie/src/Exception.hpp
index 956c935caecb25696b823d093dee0ee9b8e85405..b6ccdca1aa97c0858d306cf67bfe8592513adf80 100644
--- a/Genie/Genie/src/Exception.hpp
+++ b/Genie/Genie/src/Exception.hpp
@@ -9,6 +9,7 @@
 #pragma once
 
 #include 
+#include 
 #include 
 
 #include "GenieCommon.h"
diff --git a/Genie/Genie/src/GenieDialog.cpp b/Genie/Genie/src/GenieDialog.cpp
index 6f6f3116de4a0261b15aed2194cfc17b8b3bcda8..c4d3a6c4777c1da45d0106b6fad83b5f12da23a1 100644
--- a/Genie/Genie/src/GenieDialog.cpp
+++ b/Genie/Genie/src/GenieDialog.cpp
@@ -232,6 +232,24 @@ Genie_Status_t GenieDialog_tokenQuery(const GenieDialog_Handle_t dialogHandle,
   return status;
 }
 
+GENIE_API
+Genie_Status_t GenieDialog_getSampler(const GenieDialog_Handle_t dialogHandle,
+                                      GenieSampler_Handle_t* dialogSamplerHandle) {
+  try {
+    GENIE_ENSURE(dialogHandle, GENIE_STATUS_ERROR_INVALID_HANDLE);
+    auto dialog = genie::Dialog::get(dialogHandle);
+    GENIE_ENSURE(dialog, GENIE_STATUS_ERROR_INVALID_HANDLE);
+    GENIE_ENSURE(dialogSamplerHandle, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
+    *dialogSamplerHandle = genie::Dialog::getSamplerHandle(dialog);
+    GENIE_ENSURE(*dialogSamplerHandle, GENIE_STATUS_ERROR_INVALID_HANDLE);
+  } catch (const std::exception& e) {
+    std::cerr << e.what() << std::endl;
+    return GENIE_STATUS_ERROR_GET_HANDLE_FAILED;
+  }
+
+  return GENIE_STATUS_SUCCESS;
+}
+
 GENIE_API
 Genie_Status_t GenieDialog_free(const GenieDialog_Handle_t dialogHandle) {
   try {
@@ -246,4 +264,4 @@ Genie_Status_t GenieDialog_free(const GenieDialog_Handle_t dialogHandle) {
     return GENIE_STATUS_ERROR_GENERAL;
   }
   return GENIE_STATUS_SUCCESS;
-}
+}
\ No newline at end of file
diff --git a/Genie/Genie/src/GenieEmbedding.cpp b/Genie/Genie/src/GenieEmbedding.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2d4ee6361d53a96eb9c85ded8459f6ea32327279
--- /dev/null
+++ b/Genie/Genie/src/GenieEmbedding.cpp
@@ -0,0 +1,118 @@
+//=============================================================================
+//
+//  Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+//  All Rights Reserved.
+//  Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================
+
+#include "Embedding.hpp"
+#include "Exception.hpp"
+#include "GenieEmbedding.h"
+#include "Macro.hpp"
+#include "Util/HandleManager.hpp"
+#include "qualla/detail/json.hpp"
+
+using namespace genie;
+
+GENIE_API
+Genie_Status_t GenieEmbeddingConfig_createFromJson(const char* str,
+                                                   GenieEmbeddingConfig_Handle_t* configHandle) {
+  try {
+    GENIE_ENSURE(str, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
+    GENIE_ENSURE(configHandle, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
+    auto config = std::make_shared<genie::Embedding::Config>(str);
+    GENIE_ENSURE(config, GENIE_STATUS_ERROR_MEM_ALLOC);
+    *configHandle = genie::Embedding::Config::add(config);
+  } catch (const qualla::json::parse_error& e) {
+    std::cerr << e.what() << std::endl;
+    return GENIE_STATUS_ERROR_JSON_FORMAT;
+  } catch (const Exception& e) {
+    std::cerr << e.what() << std::endl;
+    return e.status();
+  } catch (const std::exception& e) {
+    std::cerr << e.what() << std::endl;
+    return GENIE_STATUS_ERROR_GENERAL;
+  }
+  return GENIE_STATUS_SUCCESS;
+}
+
+GENIE_API
+Genie_Status_t GenieEmbeddingConfig_free(const GenieEmbeddingConfig_Handle_t configHandle) {
+  try {
+    GENIE_ENSURE(configHandle, GENIE_STATUS_ERROR_INVALID_HANDLE);
+    {
+      // Check if the embedding actually exists
+      auto configObj = genie::Embedding::Config::get(configHandle);
+      GENIE_ENSURE(configObj, GENIE_STATUS_ERROR_INVALID_HANDLE);
+    }
+    genie::Embedding::Config::remove(configHandle);
+  } catch (const std::exception& e) {
+    return GENIE_STATUS_ERROR_GENERAL;
+  }
+  return GENIE_STATUS_SUCCESS;
+}
+
+GENIE_API
+Genie_Status_t GenieEmbedding_create(const GenieEmbeddingConfig_Handle_t configHandle,
+                                     GenieEmbedding_Handle_t* embeddingHandle) {
+  try {
+    GENIE_ENSURE(embeddingHandle, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
+
+    // Get config object
+    auto configObj = genie::Embedding::Config::get(configHandle);
+    GENIE_ENSURE(configObj, GENIE_STATUS_ERROR_INVALID_HANDLE);
+
+    // Create embedding
+    auto embedding = std::make_shared<genie::Embedding>(configObj);
+    GENIE_ENSURE(embedding, GENIE_STATUS_ERROR_MEM_ALLOC);
+
+    // Create Handle
+    *embeddingHandle = genie::Embedding::add(embedding);
+  } catch (const std::exception& e) {
+    std::cerr << e.what() << std::endl;
+    return GENIE_STATUS_ERROR_GENERAL;
+  }
+
+  // Return SUCCESS
+  return GENIE_STATUS_SUCCESS;
+}
+
+GENIE_API
+Genie_Status_t GenieEmbedding_generate(const GenieEmbedding_Handle_t embeddingHandle,
+                                       const char* queryStr,
+                                       const GenieEmbedding_GenerateCallback_t callback,
+                                       const void* userData) {
+  int32_t status;
+
+  try {
+    GENIE_ENSURE(embeddingHandle, GENIE_STATUS_ERROR_INVALID_HANDLE);
+    auto embedding = genie::Embedding::get(embeddingHandle);
+    GENIE_ENSURE(embedding, GENIE_STATUS_ERROR_INVALID_HANDLE);
+    GENIE_ENSURE(queryStr, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
+    GENIE_ENSURE(callback, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
+
+    status = embedding->generate(queryStr, callback, userData);
+  } catch (const std::exception& e) {
+    std::cerr << e.what() << std::endl;
+    return GENIE_STATUS_ERROR_GENERAL;
+  }
+
+  return status;
+}
+
+GENIE_API
+Genie_Status_t GenieEmbedding_free(const GenieEmbedding_Handle_t embeddingHandle) {
+  try {
+    GENIE_ENSURE(embeddingHandle, GENIE_STATUS_ERROR_INVALID_HANDLE);
+    {
+      // Check if the embedding actually exists
+      auto embedding = genie::Embedding::get(embeddingHandle);
+      GENIE_ENSURE(embedding, GENIE_STATUS_ERROR_INVALID_HANDLE);
+    }
+    genie::Embedding::remove(embeddingHandle);
+  } catch (const std::exception& e) {
+    return GENIE_STATUS_ERROR_GENERAL;
+  }
+  return GENIE_STATUS_SUCCESS;
+}
diff --git a/Genie/Genie/src/GenieSampler.cpp b/Genie/Genie/src/GenieSampler.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b3b0d42757b8f7ee08d85023c1143523bd2fef00
--- /dev/null
+++ b/Genie/Genie/src/GenieSampler.cpp
@@ -0,0 +1,93 @@
+//=============================================================================
+//
+//  Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+//  All Rights Reserved.
+//  Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================
+
+#include 
+
+#include "Exception.hpp"
+#include "GenieSampler.h"
+#include "Macro.hpp"
+#include "Sampler.hpp"
+#include "Util/HandleManager.hpp"
+#include "qualla/detail/json.hpp"
+
+using namespace genie;
+GENIE_API
+Genie_Status_t GenieSamplerConfig_createFromJson(const char* str,
+                                                 GenieSamplerConfig_Handle_t* configHandle) {
+  try {
+    GENIE_ENSURE(str, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
+    GENIE_ENSURE(configHandle, GENIE_STATUS_ERROR_INVALID_ARGUMENT);
+    auto config = std::make_shared<Sampler::SamplerConfig>(str);
+    GENIE_ENSURE(config, GENIE_STATUS_ERROR_MEM_ALLOC);
+    *configHandle = Sampler::SamplerConfig::add(config);
+  } catch (const qualla::json::parse_error& e) {
+    std::cerr << e.what() << std::endl;
+    return GENIE_STATUS_ERROR_JSON_FORMAT;
+  } catch (const Exception& e) {
+    std::cerr << e.what() << std::endl;
+    return e.status();
+  } catch (const std::exception& e) {
+    std::cerr << e.what() << std::endl;
+    return GENIE_STATUS_ERROR_GENERAL;
+  }
+  return GENIE_STATUS_SUCCESS;
+}
+
+GENIE_API
+Genie_Status_t GenieSamplerConfig_free(const GenieSamplerConfig_Handle_t configHandle) {
+  try {
+    GENIE_ENSURE(configHandle, GENIE_STATUS_ERROR_INVALID_HANDLE);
+    {
+      // Check if the dialog actually exists
+      auto configObj = Sampler::SamplerConfig::get(configHandle);
+      GENIE_ENSURE(configObj, GENIE_STATUS_ERROR_INVALID_HANDLE);
+    }
+    Sampler::SamplerConfig::remove(configHandle);
+  } catch (const std::exception& e) {
+    return GENIE_STATUS_ERROR_GENERAL;
+  }
+  return GENIE_STATUS_SUCCESS;
+}
+
+GENIE_API
+Genie_Status_t GenieSamplerConfig_setParam(const GenieSamplerConfig_Handle_t configHandle,
+                                           const char* keyStr,
+                                           const char* valueStr) {
+  try {
+    GENIE_ENSURE(configHandle, GENIE_STATUS_ERROR_INVALID_HANDLE);
+    auto samplerConfig = Sampler::SamplerConfig::get(configHandle);
+    GENIE_ENSURE(samplerConfig, GENIE_STATUS_ERROR_INVALID_HANDLE);
+    samplerConfig->setParam(keyStr, valueStr);
+  } catch (const std::exception& e) {
+    std::cerr << e.what() << std::endl;
+    return GENIE_STATUS_ERROR_SET_PARAMS_FAILED;
+  }
+  return GENIE_STATUS_SUCCESS;
+}
+
+GENIE_API
+Genie_Status_t GenieSampler_applyConfig(const GenieSampler_Handle_t samplerHandle,
+                                        const GenieSamplerConfig_Handle_t configHandle) {
+  try {
+    GENIE_ENSURE(samplerHandle, GENIE_STATUS_ERROR_INVALID_HANDLE);
+    GENIE_ENSURE(configHandle, GENIE_STATUS_ERROR_INVALID_HANDLE);
+
+    auto sampler = Sampler::get(samplerHandle);
Sampler::get(samplerHandle); + GENIE_ENSURE(sampler, GENIE_STATUS_ERROR_INVALID_HANDLE); + + auto samplerConfig = Sampler::SamplerConfig::get(configHandle); + GENIE_ENSURE(samplerConfig, GENIE_STATUS_ERROR_INVALID_HANDLE); + + sampler->applyConfig(samplerConfig->getJson()); + + } catch (const std::exception& e) { + std::cerr << e.what() << std::endl; + return GENIE_STATUS_ERROR_APPLY_CONFIG_FAILED; + } + return GENIE_STATUS_SUCCESS; +} \ No newline at end of file diff --git a/Genie/Genie/src/Macro.hpp b/Genie/Genie/src/Macro.hpp index c50b1585e4cee424b3b744415b4d74233d7a8c31..b0129ff11a353ec44fce465f441e0b5689c12c7f 100644 --- a/Genie/Genie/src/Macro.hpp +++ b/Genie/Genie/src/Macro.hpp @@ -8,6 +8,8 @@ #pragma once +#define ENABLE_DEBUG_LOGS 0 + //====================================================================================================================== // Error generation macros //====================================================================================================================== diff --git a/Genie/Genie/src/Sampler.cpp b/Genie/Genie/src/Sampler.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a7eda21996c7934ffd51d10d92e09608cca736bf --- /dev/null +++ b/Genie/Genie/src/Sampler.cpp @@ -0,0 +1,275 @@ +//============================================================================== +// +// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== +#include +#include + +#include "Exception.hpp" +#include "Macro.hpp" +#include "Sampler.hpp" +#include "qualla/detail/json.hpp" + +using namespace genie; + +//============================================================================= +// Sampler functions +//============================================================================= + +qnn::util::HandleManager Sampler::s_manager; + +GenieSampler_Handle_t Sampler::add(std::shared_ptr config) { + return (GenieSampler_Handle_t)s_manager.add(config); +} + +std::shared_ptr Sampler::get(GenieSampler_Handle_t handle) { + return s_manager.get((qnn::util::Handle_t)handle); +} + +void Sampler::remove(GenieSampler_Handle_t handle) { + s_manager.remove((qnn::util::Handle_t)handle); +} + +Sampler::Sampler(qualla::json& origJson, + std::vector>& quallaSamplers) + : m_origJson(origJson), m_quallaSamplers(quallaSamplers) {} + +void Sampler::applyConfig(qualla::json samplerConfigJson) { + m_origJson["sampler"]["seed"] = qualla::Config::optional( + samplerConfigJson["sampler"], "seed", m_origJson["sampler"]["seed"]); + m_origJson["sampler"]["temp"] = qualla::Config::optional( + samplerConfigJson["sampler"], "temp", m_origJson["sampler"]["temp"]); + m_origJson["sampler"]["top-k"] = qualla::Config::optional( + samplerConfigJson["sampler"], "top-k", m_origJson["sampler"]["top-k"]); + m_origJson["sampler"]["top-p"] = qualla::Config::optional( + samplerConfigJson["sampler"], "top-p", m_origJson["sampler"]["top-p"]); + m_origJson["sampler"]["version"] = + qualla::Config::optional(samplerConfigJson["sampler"], "version", 1); + m_origJson["sampler"]["type"] = "basic"; + +#if ENABLE_DEBUG_LOGS + std::cout << "Updated sampler config: " << std::endl; + std::cout << "temp: " << m_origJson["sampler"]["temp"].get() << std::endl; + std::cout << "top-k: " << m_origJson["sampler"]["top-k"] << std::endl; + std::cout << "top-p: " << m_origJson["sampler"]["top-p"].get() << std::endl; + std::cout << 
"seed: " << m_origJson["sampler"]["seed"] << std::endl; +#endif + // Loop through the live qualla sampler instances and update the parameters + for (auto& quallaSampler : m_quallaSamplers) { + quallaSampler.get().applyConfig(m_origJson["sampler"]); + } +} + +//============================================================================= +// Sampler::SamplerConfig functions +//============================================================================= + +qnn::util::HandleManager Sampler::SamplerConfig::s_manager; + +GenieSamplerConfig_Handle_t Sampler::SamplerConfig::add( + std::shared_ptr config) { + return (GenieSamplerConfig_Handle_t)s_manager.add(config); +} + +std::shared_ptr Sampler::SamplerConfig::get( + GenieSamplerConfig_Handle_t handle) { + return s_manager.get((qnn::util::Handle_t)handle); +} + +void Sampler::SamplerConfig::remove(GenieSamplerConfig_Handle_t handle) { + s_manager.remove((qnn::util::Handle_t)handle); +} + +Sampler::SamplerConfig::SamplerConfig(const char* configStr) { + qualla::json quallaConfig; + qualla::json config; + { + std::set keys; + + auto callback = [&keys](int depth, qualla::json::parse_event_t event, qualla::json& parsed) { + if ((depth == 1) && (event == qualla::json::parse_event_t::key)) { + if (keys.count(parsed) > 0) { + throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, + "Multiple sampler config key: " + parsed.dump()); + } + keys.insert(parsed); + } + return true; + }; + + config = qualla::json::parse(configStr, callback); + } + + if (!config.is_object()) { + throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Sampler config is not an object"); + } + + if (!config.contains("sampler")) { + throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing field: sampler"); + } + + // component is used in the "ENFORCE" macros + const std::string component = "sampler"; + for (auto& item : config.items()) { + if (item.key() == "sampler") { + JSON_ENFORCE_OBJECT(); + validateSamplerConfig(item.value()); + } else { + throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown sampler config key: " + item.key()); + } + } + + if (config["sampler"].contains("seed")) + quallaConfig["sampler"]["seed"] = config["sampler"]["seed"]; + if (config["sampler"].contains("temp")) + quallaConfig["sampler"]["temp"] = config["sampler"]["temp"]; + if (config["sampler"].contains("top-k")) + quallaConfig["sampler"]["top-k"] = config["sampler"]["top-k"]; + if (config["sampler"].contains("top-p")) + quallaConfig["sampler"]["top-p"] = config["sampler"]["top-p"]; + if (config["sampler"].contains("greedy")) + quallaConfig["sampler"]["greedy"] = config["sampler"]["greedy"]; + if (config["sampler"].contains("version")) + quallaConfig["sampler"]["version"] = config["sampler"]["version"]; + else + quallaConfig["sampler"]["version"] = 1; + + quallaConfig["sampler"]["type"] = "basic"; + + m_config = quallaConfig; +} + +void Sampler::SamplerConfig::setParam(const std::string& keyStr, const std::string& valueStr) { + if (!keyStr.empty()) { + // Case 1: Only the parameter mentioned in keyStr is to be updated by valueStr + std::set validParams = {"seed", "top-p", "top-k", "temp"}; + if (std::find(validParams.begin(), validParams.end(), keyStr) == validParams.end()) { + throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Invalid key obtained: " + keyStr); + } + try { + if (keyStr == "seed") + m_config["sampler"]["seed"] = std::stoi(valueStr); + else if (keyStr == "top-p") + m_config["sampler"]["top-p"] = std::stof(valueStr); + else if (keyStr == "top-k") + m_config["sampler"]["top-k"] = 
std::stof(valueStr); + else if (keyStr == "temp") + m_config["sampler"]["temp"] = std::stof(valueStr); + } catch (const std::invalid_argument& e) { + throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, + "Invalid value obtained: " + valueStr + " for key: " + keyStr); + } + } else { + // Case 2: User has passed entire json as a string in valueStr + + if (valueStr.empty()) + throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Both keyStr and valueStr cannot be empty"); + + qualla::json config = qualla::json::parse(valueStr); + if (!config.contains("sampler")) { + throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing field: sampler"); + } + + // component is used in the "ENFORCE" macros + const std::string component = "sampler"; + for (auto& item : config.items()) { + if (item.key() == "sampler") { + JSON_ENFORCE_OBJECT(); + validateSamplerConfig(item.value()); + } else { + throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, + "Unknown sampler config key: " + item.key()); + } + } + + m_config["sampler"]["seed"] = + qualla::Config::optional(config["sampler"], "seed", m_config["sampler"]["seed"]); + m_config["sampler"]["temp"] = + qualla::Config::optional(config["sampler"], "temp", m_config["sampler"]["temp"]); + m_config["sampler"]["top-k"] = + qualla::Config::optional(config["sampler"], "top-k", m_config["sampler"]["top-k"]); + m_config["sampler"]["top-p"] = + qualla::Config::optional(config["sampler"], "top-p", m_config["sampler"]["top-p"]); + m_config["sampler"]["version"] = qualla::Config::optional( + config["sampler"], "version", m_config["sampler"]["version"]); + + m_config["sampler"]["type"] = "basic"; + } +} + +void Sampler::SamplerConfig::validateSamplerConfig(const qualla::json& config) { + if (!config.is_object()) { + throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "sampler config is not an object"); + } + + const std::set mandatoryFields{"version"}; + for (const auto& field : mandatoryFields) { + if (!config.contains(field)) { + throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Missing sampler field: " + field); + } + } + + // component is used in the "ENFORCE" macros + const std::string component = "sampler"; + + for (auto& item : config.items()) { + if (item.key() == "version") { + JSON_ENFORCE_NUMERIC(); + if (item.value().get() != 1) { + throw Exception(GENIE_STATUS_ERROR_JSON_VALUE, + "Invalid sampler config: unsupported version: " + item.value().dump()); + } + } else if (item.key() == "seed") { + JSON_ENFORCE_NUMERIC(); + } else if (item.key() == "temp") { + JSON_ENFORCE_NUMERIC(); + } else if (item.key() == "top-k") { + JSON_ENFORCE_NUMERIC(); + } else if (item.key() == "top-p") { + JSON_ENFORCE_NUMERIC(); + } else if (item.key() == "greedy") { + JSON_ENFORCE_BOOLEAN(); + } else { + throw Exception(GENIE_STATUS_ERROR_JSON_SCHEMA, "Unknown sampler config key: " + item.key()); + } + } +} + +void Sampler::SamplerConfig::translateSamplerConfig(const qualla::json& genieConfig, + qualla::json& quallaConfig) { + if (genieConfig["dialog"].contains("sampler")) { + quallaConfig["sampler"]["type"] = "basic"; + + if (genieConfig["dialog"]["sampler"].contains("seed")) { + quallaConfig["sampler"]["seed"] = genieConfig["dialog"]["sampler"]["seed"]; + } + if (genieConfig["dialog"]["sampler"].contains("temp")) { + quallaConfig["sampler"]["temp"] = genieConfig["dialog"]["sampler"]["temp"]; + } + + quallaConfig["sampler"]["role"] = "primary"; +#if defined(GENIE_SPD_FEATURE) + if (genieConfig["dialog"]["type"] == "spd") { + quallaConfig["sampler"]["role"] = "target"; + } +#endif + + if 
(genieConfig["dialog"]["sampler"].contains("top-k")) { + quallaConfig["sampler"]["top-k"] = genieConfig["dialog"]["sampler"]["top-k"]; + } + if (genieConfig["dialog"]["sampler"].contains("top-p")) { + quallaConfig["sampler"]["top-p"] = genieConfig["dialog"]["sampler"]["top-p"]; + } + if (genieConfig["dialog"]["sampler"].contains("greedy")) { + quallaConfig["sampler"]["greedy"] = genieConfig["dialog"]["sampler"]["greedy"]; + } + if (genieConfig["dialog"]["sampler"].contains("seed")) { + quallaConfig["sampler"]["seed"] = genieConfig["dialog"]["sampler"]["seed"]; + } + } +} + +qualla::json Sampler::SamplerConfig::getJson() const { return m_config; } diff --git a/Genie/Genie/src/Sampler.hpp b/Genie/Genie/src/Sampler.hpp new file mode 100644 index 0000000000000000000000000000000000000000..212e20acce13f8accc57bf07d527cc608f550b00 --- /dev/null +++ b/Genie/Genie/src/Sampler.hpp @@ -0,0 +1,60 @@ +//============================================================================== +// +// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#pragma once +#include <memory> + +#include "GenieSampler.h" +#include "Util/HandleManager.hpp" +#include "qualla/env.hpp" +#include "qualla/sampler.hpp" + +namespace genie { +class Sampler { + public: + class SamplerConfig { + public: + static GenieSamplerConfig_Handle_t add(std::shared_ptr<SamplerConfig> config); + + static std::shared_ptr<SamplerConfig> get(GenieSamplerConfig_Handle_t handle); + + static void remove(GenieSamplerConfig_Handle_t handle); + + static void validateSamplerConfig(const qualla::json& config); + + static void translateSamplerConfig(const qualla::json& genieConfig, qualla::json& quallaConfig); + + SamplerConfig(const char* configStr); + + void setParam(const std::string& keyStr, const std::string& valueStr); + + qualla::json getJson() const; + + private: + static qnn::util::HandleManager<SamplerConfig> s_manager; + qualla::json m_config; + }; + + static GenieSampler_Handle_t add(std::shared_ptr<Sampler> sampler); + static std::shared_ptr<Sampler> get(GenieSampler_Handle_t handle); + static void remove(GenieSampler_Handle_t handle); + + Sampler(qualla::json& origJson, + std::vector<std::reference_wrapper<qualla::Sampler>>& quallaSamplers); + + void applyConfig(qualla::json samplerConfigJson); + + const qualla::json& getJson(); + + private: + qualla::json m_origJson; + static qnn::util::HandleManager<Sampler> s_manager; + std::vector<std::reference_wrapper<qualla::Sampler>> m_quallaSamplers; +}; + +} // namespace genie diff --git a/Genie/Genie/src/qualla/context.cpp b/Genie/Genie/src/qualla/context.cpp index 9a71ce9c1754ea9bdee044d01fab6a99ef79c543..8ca76f99bde3a4b5cd6b1e3db833263bd600db1b 100644 --- a/Genie/Genie/src/qualla/context.cpp +++ b/Genie/Genie/src/qualla/context.cpp @@ -93,6 +93,10 @@ extern void needQnnHtpEngine(); extern void needQnnCpuEngine(); #endif + #ifdef QUALLA_ENGINE_QNN_GPU +extern void needQnnGpuEngine(); + #endif + static OnLoad needs([]() { needStdoutLogger(); needFileLogger(); @@ -111,6 +115,10 @@ #ifdef QUALLA_ENGINE_QNN_CPU needQnnCpuEngine(); #endif + + #ifdef QUALLA_ENGINE_QNN_GPU + needQnnGpuEngine(); + #endif }); #endif diff --git a/Genie/Genie/src/qualla/dialogs/ssd-q1.cpp b/Genie/Genie/src/qualla/dialogs/ssd-q1.cpp index 1e0048f8125451ad27f546e9f0f4210f79e09468..a306dee78c0867240fc57bc0876aa5a285807cde 100644 --- a/Genie/Genie/src/qualla/dialogs/ssd-q1.cpp +++ b/Genie/Genie/src/qualla/dialogs/ssd-q1.cpp @@ -161,7 +161,7 @@ 
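+// Note on the two hunks below: Engine::restore() now takes a second boolean,
+// named chooseHigherVariant in engine.cpp later in this patch. Passing true
+// here presumably asks the engine to prefer the higher/larger graph variant
+// when reloading the saved KV-cache prefix; that reading is an assumption,
+// since only the call sites and the base-class stub appear in this diff:
+//
+//   virtual size_t restore(const std::string& name, bool chooseHigherVariant);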
SelfSpecDecDialog::SelfSpecDecDialog( m_inputType = _engine["primary"]->getInputType(); // Load KV prefix Timer timer; - size_t n_restored_prefix = _engine["primary"]->restore(_kv_prefix_name); + size_t n_restored_prefix = _engine["primary"]->restore(_kv_prefix_name, true); if (n_restored_prefix != _forecast_prefix) { // clang-format off throw std::runtime_error( fmt::format( "SSD : Loaded {} KV$ from {} but expected {} KV$", @@ -1001,7 +1001,7 @@ bool SelfSpecDecDialog::process(std::vector& tokens, Dialog::Callback c void SelfSpecDecDialog::reset() { Dialog::reset(); _n_past = _forecast_prefix; - size_t n_restored_prefix = _engine["primary"]->restore(_kv_prefix_name); + size_t n_restored_prefix = _engine["primary"]->restore(_kv_prefix_name, true); if (n_restored_prefix != _forecast_prefix) { // clang-format off throw std::runtime_error( fmt::format( "SSD : Loaded {} KV$ from {} but expected {} KV$", diff --git a/Genie/Genie/src/qualla/engine.cpp b/Genie/Genie/src/qualla/engine.cpp index 2e2d3ce3db40f1e230259d7128b3c05790cb8543..c4d507c02d811569b02639e7aa78706b2f48b0ee 100644 --- a/Genie/Genie/src/qualla/engine.cpp +++ b/Genie/Genie/src/qualla/engine.cpp @@ -69,7 +69,7 @@ bool Engine::updateKV(size_t n_past, const std::vector& selected) { return false; } -size_t Engine::restore(const std::string& name) { +size_t Engine::restore(const std::string& name, bool chooseHigherVariant) { _env.logger().error(fmt::format("{}-engine does not support restore", _type)); return 0; } diff --git a/Genie/Genie/src/qualla/engines/qnn-api/DmaBufAllocator.cpp b/Genie/Genie/src/qualla/engines/qnn-api/DmaBufAllocator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4b3350cf33a4e91084d7846ca2ec2db4b2d3117b --- /dev/null +++ b/Genie/Genie/src/qualla/engines/qnn-api/DmaBufAllocator.cpp @@ -0,0 +1,317 @@ +//============================================================================== +// +// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#include "QnnMem.h" +#include "DmaBufAllocator.hpp" +#include "QnnTypeMacros.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + + +DmaBufferAllocator::DmaBufferAllocator(Qnn_ContextHandle_t contextHandle, QNN_INTERFACE_VER_TYPE* qnnInterface) + : m_libDmaBufHeapHandle(nullptr), + m_dmaBufCreate(nullptr), + m_dmaBufAlloc(nullptr), + m_dmaBufDeinit(nullptr), + m_qnnInterface(qnnInterface), + m_contextHandle(contextHandle) {} + +bool DmaBufferAllocator::initialize() { + // On Android, 32-bit and 64-bit libdmaBufheap.so can be found at /system/lib and /system/lib64 + // respectively. + m_libDmaBufHeapHandle = dlopen("libdmabufheap.so", RTLD_NOW | RTLD_LOCAL); + if (nullptr == m_libDmaBufHeapHandle) { + QNN_ERROR("Unable to load backend. dlerror(): %s", dlerror()); + return false; + } + m_dmaBufCreate = (DmaBufCreateFn_t)dlsym( + m_libDmaBufHeapHandle, "CreateDmabufHeapBufferAllocator"); + m_dmaBufAlloc = + (DmaBufAllocFn_t)dlsym(m_libDmaBufHeapHandle, "DmabufHeapAlloc"); + m_dmaBufDeinit = (DmaBufDeinitFn_t)dlsym( + m_libDmaBufHeapHandle, "FreeDmabufHeapBufferAllocator"); + if (nullptr == m_dmaBufCreate || nullptr == m_dmaBufAlloc || nullptr == m_dmaBufDeinit) { + QNN_ERROR("Unable to access symbols in libdmaBufheap. 
dlerror(): %s", dlerror()); + return false; + } + return true; +} + +DmaBufferAllocator::~DmaBufferAllocator() { + if (m_libDmaBufHeapHandle) { + dlclose(m_libDmaBufHeapHandle); + m_libDmaBufHeapHandle = nullptr; + } +} + +DmaBufferData* DmaBufferAllocator::getDmaBufTensorData(Qnn_Tensor_t* tensor) { + if (tensor == nullptr) return nullptr; + Qnn_MemHandle_t mem_handle = QNN_TENSOR_GET_MEM_HANDLE(tensor); + if (mem_handle == nullptr) return nullptr; + return &m_memHandleToDmaBufMem.at(mem_handle); +} + +void* DmaBufferAllocator::getBuffer(Qnn_Tensor_t* tensor) { + if (!tensor) { + QNN_WARN("DmaBufferAllocator: getBuffer: received a null pointer to a tensor"); + return nullptr; + } + if (m_tensorToDmaBufferData.find(tensor) == m_tensorToDmaBufferData.end()) { + QNN_ERROR("DmaBufferAllocator: Tensor not found with address = %p", tensor); + return nullptr; + } + DmaBufferData dmaBufferData = m_tensorToDmaBufferData[tensor]; + return dmaBufferData.memPointer; +} + + + +int DmaBufferAllocator::getFd(Qnn_Tensor_t* tensor) { + DmaBufferData* data = getDmaBufTensorData(tensor); + if (data == nullptr) { + QNN_ERROR("DmaBufferAllocator: getFd : Couldn't find tensor %p", tensor); + return -1; + } + return data->fd; +} + +size_t DmaBufferAllocator::getOffset(Qnn_Tensor_t* tensor) { + DmaBufferData* data = getDmaBufTensorData(tensor); + if (data == nullptr) { + QNN_ERROR("DmaBufferAllocator: getOffset : Couldn't find tensor %p", tensor); + return 0; + } + return data->offset; +} + +size_t DmaBufferAllocator::getBufferSize(Qnn_Tensor_t* tensor) { + DmaBufferData* data = getDmaBufTensorData(tensor); + if (data == nullptr) { + QNN_ERROR("DmaBufferAllocator: getBufferSize : Couldn't find tensor %p", tensor); + return 0; + } + return data->totalBufferSize; +} + +size_t DmaBufferAllocator::getTotalBufferSize(Qnn_Tensor_t* tensor) { + DmaBufferData* data = getDmaBufTensorData(tensor); + if (data == nullptr) { + QNN_ERROR("DmaBufferAllocator: getTotalBufferSize : Couldn't find tensor %p", tensor); + return 0; + } + return data->totalBufferSize; +} + +bool DmaBufferAllocator::allocateTensorBuffer(Qnn_Tensor_t* tensor, size_t tensorDataSize) { + if (m_libDmaBufHeapHandle == nullptr) { + QNN_ERROR("DmaBufferAllocator not initialized"); + return false; + } + + if (!tensor) { + QNN_ERROR("DmaBufferAllocator: Received nullptr for tensor"); + return false; + } + + if (m_tensorToDmaBufferData.find(tensor) != m_tensorToDmaBufferData.end()) { + QNN_ERROR("DmaBufferAllocator: Tensor already allocated"); + return false; + } + + void* dmaBufferAllocator = m_dmaBufCreate(); + if (dmaBufferAllocator == nullptr) { + QNN_ERROR("DmaBufferAllocator: nullptr returned for CreateDmabufHeapBufferAllocator()."); + return false; + } + + int fd = m_dmaBufAlloc(dmaBufferAllocator, "qcom,system", tensorDataSize, 0, 0); + if (fd < 0) { + QNN_ERROR("DmaBufAlloc returned an invalid file descriptor = %d", fd); + return false; + } + + void* memPointer = mmap(nullptr, tensorDataSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (MAP_FAILED == memPointer) { + QNN_ERROR("DmaBufferAllocator: Unable to mmap the buffer returned by DmaBufAlloc"); + return false; + } + + Qnn_MemDescriptor_t memDescriptor = { + {QNN_TENSOR_GET_RANK(tensor), QNN_TENSOR_GET_DIMENSIONS(tensor), nullptr}, + QNN_TENSOR_GET_DATA_TYPE(tensor), + QNN_MEM_TYPE_DMA_BUF, + {.dmaBufInfo = {fd, memPointer}}}; + QNN_TENSOR_SET_MEM_TYPE(tensor, QNN_TENSORMEMTYPE_MEMHANDLE); + QNN_TENSOR_SET_MEM_HANDLE(tensor, nullptr); + Qnn_MemHandle_t memHandle = 
QNN_TENSOR_GET_MEM_HANDLE(tensor); + + if (QNN_SUCCESS != + m_qnnInterface->memRegister(m_contextHandle, &memDescriptor, 1, &(memHandle))) { + QNN_ERROR("DmaBufferAllocator: Failure to register ion memory with the backend"); + return false; + } + QNN_DEBUG("DmaBufferAllocator: Memregister successful with handle %p for DMA buffer with size: %zu and fd %d", + memHandle, + tensorDataSize, + fd); + QNN_TENSOR_SET_MEM_HANDLE(tensor, memHandle); + m_tensorToDmaBufferData.insert( + {tensor, DmaBufferData(dmaBufferAllocator, fd, memPointer, tensorDataSize)}); + + return true; +} + +bool DmaBufferAllocator::freeTensorBuffer(Qnn_Tensor_t* tensor) { + if (!tensor) { + QNN_ERROR("DmaBufferAllocator: Received nullptr for tensor"); + return false; + } + auto memHandle = QNN_TENSOR_GET_MEM_HANDLE(tensor); + if (QNN_SUCCESS != m_qnnInterface->memDeRegister(&memHandle, 1)) { + QNN_ERROR("DmaBufferAllocator: Failed to deregister custom memory handle with the backend"); + return false; + } + if (m_tensorToDmaBufferData.find(tensor) == m_tensorToDmaBufferData.end()) { + QNN_ERROR("DmaBufferAllocator: Tensor not found with address = %p", tensor); + return false; + } + DmaBufferData dmaBufferData = m_tensorToDmaBufferData[tensor]; + if (!m_dmaBufDeinit) { + QNN_ERROR("DmaBufferAllocator: DmaBuf Deinit function pointer is null"); + return false; + } + munmap(dmaBufferData.memPointer, dmaBufferData.totalBufferSize); + m_dmaBufDeinit(dmaBufferData.dmaBufferAllocator); + m_tensorToDmaBufferData.erase(tensor); + return true; +} + +bool DmaBufferAllocator::useSameMemory(Qnn_Tensor_t* dest, Qnn_Tensor_t* src) { + if (nullptr == dest || nullptr == src) { + QNN_ERROR("DmaBufferAllocator: Received nullptr"); + return false; + } + if (m_tensorToDmaBufferData.find(src) == m_tensorToDmaBufferData.end()) { + QNN_ERROR("DmaBufferAllocator: Src Tensor not found"); + return false; + } + + QNN_TENSOR_SET_MEM_TYPE(dest, QNN_TENSOR_GET_MEM_TYPE(src)); + QNN_TENSOR_SET_MEM_HANDLE(dest, QNN_TENSOR_GET_MEM_HANDLE(src)); + m_tensorToDmaBufferData.insert({dest, m_tensorToDmaBufferData[src]}); + m_sameMemoryFreeTensors.insert(dest); + return true; +} + + + +bool DmaBufferAllocator::beforeWriteToBuffer(Qnn_Tensor_t* tensor) { + if (!tensor) { + QNN_WARN("beforeWriteToBuffer: received a null pointer to a tensor"); + return false; + } + if (m_tensorToDmaBufferData.find(tensor) == m_tensorToDmaBufferData.end()) { + QNN_ERROR("beforeWriteToBuffer: Tensor not found with address = %p", tensor); + return false; + } + DmaBufferData dmaBufferData = m_tensorToDmaBufferData[tensor]; + struct dma_buf_sync buf_sync = {}; + buf_sync.flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE; + auto ioctlReturnValue = ioctl(dmaBufferData.fd, DMA_BUF_IOCTL_SYNC, &buf_sync); + if (ioctlReturnValue) { + QNN_ERROR( + "beforeWriteToBuffer: Error preparing the cache for buffer writes." 
+ "The DMA_BUF_IOCTL_SYNC operation returned %d", + ioctlReturnValue); + return false; + } + return true; +} + +bool DmaBufferAllocator::afterWriteToBuffer(Qnn_Tensor_t* tensor) { + if (!tensor) { + QNN_WARN("afterWriteToBuffer: received a null pointer to a tensor"); + return false; + } + if (m_tensorToDmaBufferData.find(tensor) == m_tensorToDmaBufferData.end()) { + QNN_ERROR("afterWriteToBuffer: Tensor not found with address = %p", tensor); + return false; + } + DmaBufferData dmaBufferData = m_tensorToDmaBufferData[tensor]; + struct dma_buf_sync buf_sync = {}; + buf_sync.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE; + auto ioctlReturnValue = ioctl(dmaBufferData.fd, DMA_BUF_IOCTL_SYNC, &buf_sync); + if (ioctlReturnValue) { + QNN_ERROR( + "afterWriteToBuffer: Error close the cache after buffer writing." + "The DMA_BUF_IOCTL_SYNC operation returned %d", + ioctlReturnValue); + return false; + } + return true; +} + +bool DmaBufferAllocator::beforeReadFromBuffer(Qnn_Tensor_t* tensor) { + if (!tensor) { + QNN_WARN("beforeReadFromBuffer: received a null pointer to a tensor"); + return false; + } + if (m_tensorToDmaBufferData.find(tensor) == m_tensorToDmaBufferData.end()) { + QNN_ERROR("beforeReadFromBuffer: Tensor not found with address = %p", tensor); + return false; + } + DmaBufferData dmaBufferData = m_tensorToDmaBufferData[tensor]; + struct dma_buf_sync buf_sync = {}; + buf_sync.flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_READ; + auto ioctlReturnValue = ioctl(dmaBufferData.fd, DMA_BUF_IOCTL_SYNC, &buf_sync); + if (ioctlReturnValue) { + QNN_ERROR( + "beforeReadFromBuffer: Error preparing the cache for buffer reading." + "The DMA_BUF_IOCTL_SYNC operation returned %d", + ioctlReturnValue); + return false; + } + return true; +} + +bool DmaBufferAllocator::afterReadFromBuffer(Qnn_Tensor_t* tensor) { + if (!tensor) { + QNN_WARN("afterReadFromBuffer: received a null pointer to a tensor"); + return false; + } + if (m_tensorToDmaBufferData.find(tensor) == m_tensorToDmaBufferData.end()) { + QNN_ERROR("afterReadFromBuffer: Tensor not found with address = %p", tensor); + return false; + } + DmaBufferData dmaBufferData = m_tensorToDmaBufferData[tensor]; + struct dma_buf_sync buf_sync = {}; + buf_sync.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_READ; + auto ioctlReturnValue = ioctl(dmaBufferData.fd, DMA_BUF_IOCTL_SYNC, &buf_sync); + if (ioctlReturnValue) { + QNN_ERROR( + "afterReadFromBuffer: Error closing the cache after buffer reading." + "The DMA_BUF_IOCTL_SYNC operation returned %d", + ioctlReturnValue); + return false; + } + return true; +} diff --git a/Genie/Genie/src/qualla/engines/qnn-api/DmaBufAllocator.hpp b/Genie/Genie/src/qualla/engines/qnn-api/DmaBufAllocator.hpp new file mode 100644 index 0000000000000000000000000000000000000000..cc8b2309befa23106d9df66327788793abb79dbc --- /dev/null +++ b/Genie/Genie/src/qualla/engines/qnn-api/DmaBufAllocator.hpp @@ -0,0 +1,128 @@ +//============================================================================== +// +// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. 
+// +//============================================================================== +#pragma once + +#include +#include +#include +#include + +#include "IBufferAlloc.hpp" +#include "QnnInterface.h" +#include "Log.hpp" + +typedef void *(*DmaBufCreateFn_t)(); +typedef int (*DmaBufAllocFn_t)(void *, const char *, size_t, unsigned int, size_t); +typedef void (*DmaBufDeinitFn_t)(void *); + +struct DmaBufferData { + void *dmaBufferAllocator; + int fd; + void* memPointer; + size_t totalBufferSize; + int offset{0}; + DmaBufferData() : dmaBufferAllocator(nullptr), fd(-1), memPointer(nullptr), totalBufferSize(0) {} + DmaBufferData(void *bufferAllocator, int fdIn, void* memPointerIn, size_t sizeIn) + : dmaBufferAllocator(bufferAllocator), fd(fdIn), memPointer(memPointerIn), totalBufferSize(sizeIn) {} +}; + +class DmaBufferAllocator final : public IBufferAlloc { + public: + DmaBufferAllocator(Qnn_ContextHandle_t contextHandle, QNN_INTERFACE_VER_TYPE* qnnInterface); + // Disable copy constructors, r-value referencing, etc + DmaBufferAllocator(const DmaBufferAllocator&) = delete; + DmaBufferAllocator& operator=(const DmaBufferAllocator&) = delete; + DmaBufferAllocator(DmaBufferAllocator&&) = delete; + DmaBufferAllocator& operator=(DmaBufferAllocator&&) = delete; + + bool initialize() override; + void* getBuffer(Qnn_Tensor_t* tensor) override; + int getFd(Qnn_Tensor_t* tensor) override; + size_t getOffset(Qnn_Tensor_t* tensor) override; + size_t getBufferSize(Qnn_Tensor_t* tensor) override; + size_t getTotalBufferSize(Qnn_Tensor_t* tensor) override; + + bool freeTensorBuffer(Qnn_Tensor_t* tensor) override; + + bool allocateTensorBuffer(Qnn_Tensor_t* tensor, size_t tensorDataSize) override; + bool useSameMemory(Qnn_Tensor_t* dest, Qnn_Tensor_t* src) override; + + virtual ~DmaBufferAllocator(); + + bool beforeWriteToBuffer(Qnn_Tensor_t *tensor) override; + bool afterWriteToBuffer(Qnn_Tensor_t *tensor) override; + bool beforeReadFromBuffer(Qnn_Tensor_t *tensor) override; + bool afterReadFromBuffer(Qnn_Tensor_t *tensor) override; + + + bool useSameMemory(Qnn_Tensor_t* dest, Qnn_Tensor_t* src, int offset) override { + QNN_WARN("Offset based tensors not supported!!"); + return false;; + } + bool useExternalMemory(Qnn_Tensor_t* dest, void* extMem) override { + QNN_WARN("External Memory not supported!!"); + return false;; + } + void* allocateTensorFusedBuffer(uint64_t bufferSize, int32_t* fd) override { + QNN_WARN("Fused Buffers not supported\n"); + return nullptr; + }; + bool allocateBuffers( + const std::map>& allocs_per_chunk, + std::map>& tensor_offsets + ) override { + QNN_WARN("Fused Buffers not supported\n"); + return false; + }; + bool mapFusedBufferOffset( + Qnn_Tensor_t* tensor, + size_t tensorDataSize, + int32_t fd, + uint32_t offset, + uint64_t totalBufferSize, + void* memPointer, + Qnn_ContextHandle_t contextHandle + ) override { + QNN_WARN("Fused Buffers not supported\n"); + return false; + }; + bool deregisterTensorFusedBuffer(Qnn_Tensor_t* tensor) override { + QNN_WARN("Fused Buffers not supported\n"); + return false; + }; + void freeFusedBuffers() override { + return; + }; + bool mapFusedBufferOffset( + Qnn_Tensor_t* tensor, + int alloc_idx, + size_t offset, + Qnn_ContextHandle_t ctx, + size_t size + ) override { + QNN_WARN("Fused Buffers not supported\n"); + return false; + }; + + private: + DmaBufferData * getDmaBufTensorData(Qnn_Tensor_t* tensor); + + // Pointer to the dlopen'd libdmabufheap.so shared library which contains + // dmaBufCreate, dmaBufAlloc, dmaBufDeinit + void 
*m_libDmaBufHeapHandle; + DmaBufCreateFn_t m_dmaBufCreate; + DmaBufAllocFn_t m_dmaBufAlloc; + DmaBufDeinitFn_t m_dmaBufDeinit; + + QNN_INTERFACE_VER_TYPE* m_qnnInterface; + Qnn_ContextHandle_t m_contextHandle; + + std::unordered_map m_tensorToDmaBufferData; + std::unordered_set m_sameMemoryFreeTensors; + std::unordered_map m_memHandleToDmaBufMem; +}; diff --git a/Genie/Genie/src/qualla/engines/qnn-api/IBufferAlloc.hpp b/Genie/Genie/src/qualla/engines/qnn-api/IBufferAlloc.hpp index 6eac516d96a69ce8d07984b7a1ba2899c55997ae..23edb47c62c79ba0751e1e2d226d0d3e6ccef095 100644 --- a/Genie/Genie/src/qualla/engines/qnn-api/IBufferAlloc.hpp +++ b/Genie/Genie/src/qualla/engines/qnn-api/IBufferAlloc.hpp @@ -53,4 +53,18 @@ class IBufferAlloc { virtual bool deregisterTensorFusedBuffer(Qnn_Tensor_t* tensor) = 0; virtual void freeFusedBuffers() = 0; -}; \ No newline at end of file + + // Functions to sync memory buffers for Read/Write using DmaBuf. + virtual bool beforeWriteToBuffer(Qnn_Tensor_t *tensor) { + return false; + }; + virtual bool afterWriteToBuffer(Qnn_Tensor_t *tensor) { + return false; + }; + virtual bool beforeReadFromBuffer(Qnn_Tensor_t *tensor) { + return false; + }; + virtual bool afterReadFromBuffer(Qnn_Tensor_t *tensor) { + return false; + }; +}; diff --git a/Genie/Genie/src/qualla/engines/qnn-api/IOTensor.cpp b/Genie/Genie/src/qualla/engines/qnn-api/IOTensor.cpp index 9ad51cb4a6b8ab777cf92c928fa19217f653a9a2..305c3b6960cbcb4117f42e71dfb8dd130d11de41 100644 --- a/Genie/Genie/src/qualla/engines/qnn-api/IOTensor.cpp +++ b/Genie/Genie/src/qualla/engines/qnn-api/IOTensor.cpp @@ -10,6 +10,9 @@ #include #include "ClientBuffer.hpp" +#ifndef _WIN32 +#include "DmaBufAllocator.hpp" +#endif #include "IBufferAlloc.hpp" #include "IOTensor.hpp" #include "RpcMem.hpp" @@ -28,6 +31,14 @@ IOTensor::IOTensor(BufferAlloc bufferAllocIn, QNN_INTERFACE_VER_TYPE* qnnInterfa bool IOTensor::initialize(Qnn_ContextHandle_t contextHandle) { if (m_bufferAlloc == BufferAlloc::SHARED_BUFFER) { m_bufferManager = std::unique_ptr(new RpcMem(contextHandle, m_qnnInterface)); + } else if (m_bufferAlloc == BufferAlloc::DMABUF) { +#ifdef _WIN32 + return false; +#else + m_bufferManager = + std::unique_ptr(new DmaBufferAllocator(contextHandle, m_qnnInterface) + ); +#endif } if (true != m_bufferManager->initialize()) { @@ -39,7 +50,7 @@ bool IOTensor::initialize(Qnn_ContextHandle_t contextHandle) { } IOTensor::~IOTensor() { - if (m_bufferAlloc == BufferAlloc::SHARED_BUFFER) { + if (m_bufferAlloc == BufferAlloc::SHARED_BUFFER || m_bufferAlloc == BufferAlloc::DMABUF) { m_bufferManager->freeFusedBuffers(); } } @@ -215,6 +226,70 @@ bool IOTensor::setupOutputTensors( return true; } +// Setup details for Qnn_Tensor_t for execution. +// Reuse same memory handle for KV input and output tensor. +bool IOTensor::setupOutputWithSharedTensors( + Qnn_Tensor_t** tensors, + std::unordered_map& tensorNameToTensorPointer, + const GraphInfo_t& graphInfo, + std::unordered_map& tensorsSize, + Qnn_ContextHandle_t contextHandle, + std::unordered_map sharedTensorMap +) { + uint32_t tensorCount = graphInfo.numOutputTensors; + TensorWrapper* tensorWrappers = graphInfo.outputTensors; + if (nullptr == tensorWrappers) { + QNN_ERROR("tensorWrappers is nullptr"); + return false; + } + + if (0 == tensorCount) { + QNN_DEBUG("tensor count is 0. 
Nothing to setup."); + return true; + } + + *tensors = (Qnn_Tensor_t*)calloc(1, tensorCount * sizeof(Qnn_Tensor_t)); + if (nullptr == *tensors) { + QNN_ERROR("mem alloc failed for *tensors"); + return false; + } + + bool returnStatus = true; + for (size_t tensorIdx = 0; tensorIdx < tensorCount; tensorIdx++) { + Qnn_Tensor_t wrapperTensor = GET_TENSOR_WRAPPER_TENSOR(tensorWrappers[tensorIdx]); + auto wrapperTensorName = std::string(GET_TENSOR_WRAPPER_NAME(tensorWrappers[tensorIdx])); + if (true == returnStatus) { + (*tensors)[tensorIdx] = QNN_TENSOR_INIT; + returnStatus = deepCopyQnnTensorInfo(((*tensors) + tensorIdx), &wrapperTensor); + } + if (true == returnStatus) { + if (sharedTensorMap.find(wrapperTensorName) == sharedTensorMap.end()) { + QNN_DEBUG("IoTensor :: Create Buffer for Tensor %s", wrapperTensorName.c_str()); + size_t tensorDataSize = tensorsSize[wrapperTensorName]; + returnStatus = m_bufferManager->allocateTensorBuffer( + ((*tensors) + tensorIdx), tensorDataSize + ); + } else { + std::string inputName = QNN_TENSOR_GET_NAME(sharedTensorMap[wrapperTensorName]); + QNN_DEBUG("IoTensor :: Reuse Buffer %s for Tensor %s", inputName.c_str(), wrapperTensorName.c_str()); + returnStatus = m_bufferManager->useSameMemory( + ((*tensors) + tensorIdx), sharedTensorMap[wrapperTensorName] + ); + } + } + if (true != returnStatus) { + QNN_ERROR("Failure in setupTensors, cleaning up resources"); + tearDownTensors(*tensors, tensorIdx); + *tensors = nullptr; + QNN_ERROR("Failure in setupTensors, done cleaning up resources"); + break; + } else { + tensorNameToTensorPointer.insert({wrapperTensorName, ((*tensors) + tensorIdx)}); + } + } + return returnStatus; +} + bool IOTensor::mapFusedBufferOffset( GraphInfo_t* graph_info, Qnn_ContextHandle_t context_handle, diff --git a/Genie/Genie/src/qualla/engines/qnn-api/IOTensor.hpp b/Genie/Genie/src/qualla/engines/qnn-api/IOTensor.hpp index 4212bd8af669b9f27bc8a2197f6a7735cc76f066..a5c93676bbf05cd5fce80d27e92b066707e65953 100644 --- a/Genie/Genie/src/qualla/engines/qnn-api/IOTensor.hpp +++ b/Genie/Genie/src/qualla/engines/qnn-api/IOTensor.hpp @@ -28,6 +28,7 @@ enum class BufferAlloc { DEFAULT, // malloc based allocator SHARED_BUFFER, // shared buffer allocator; actual allocator depends on the platform + DMABUF, // dma buffer allocator INVALID }; class IBufferAlloc; @@ -60,6 +61,16 @@ class IOTensor { bool skipBufferAllocation = false ); + bool setupOutputWithSharedTensors( + Qnn_Tensor_t** outputs, + std::unordered_map& tensorNameToTensorPointer, + const GraphInfo_t& graphInfo, + std::unordered_map& outputTensorsSize, + Qnn_ContextHandle_t contextHandle, + std::unordered_map sharedTensorMap + ); + + bool tearDownTensors(Qnn_Tensor_t* tensors, uint32_t tensorCount); bool tearDownTensors(std::vector& tensors, uint32_t tensorCount); @@ -146,6 +157,20 @@ class IOTensor { std::unordered_set& getFreeTensorsPointerSet() { return m_freeTensorsPointerSet; } + // Functions to sync memory buffers for Read/Write using DmaBuf. 
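+ // A minimal usage sketch of the sync bracketing these wrappers provide; the
+ // names mappedPtr/src/dst/n are hypothetical, and the START/END pairing
+ // mirrors the DMA_BUF_IOCTL_SYNC flags set in DmaBufAllocator.cpp:
+ //
+ //   ioTensor.beforeWriteToBuffer(tensor);   // DMA_BUF_SYNC_START | SYNC_WRITE
+ //   std::memcpy(mappedPtr, src, n);         // CPU fills the mapped buffer
+ //   ioTensor.afterWriteToBuffer(tensor);    // DMA_BUF_SYNC_END | SYNC_WRITE
+ //
+ //   ioTensor.beforeReadFromBuffer(tensor);  // DMA_BUF_SYNC_START | SYNC_READ
+ //   std::memcpy(dst, mappedPtr, n);         // CPU reads back results
+ //   ioTensor.afterReadFromBuffer(tensor);   // DMA_BUF_SYNC_END | SYNC_READ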
+ bool beforeWriteToBuffer(Qnn_Tensor_t *tensor) { + return m_bufferManager->beforeWriteToBuffer(tensor); + } + bool afterWriteToBuffer(Qnn_Tensor_t *tensor){ + return m_bufferManager->afterWriteToBuffer(tensor); + } + bool beforeReadFromBuffer(Qnn_Tensor_t *tensor){ + return m_bufferManager->beforeReadFromBuffer(tensor); + } + bool afterReadFromBuffer(Qnn_Tensor_t *tensor){ + return m_bufferManager->afterReadFromBuffer(tensor); + } + private: BufferAlloc m_bufferAlloc; QNN_INTERFACE_VER_TYPE* m_qnnInterface; diff --git a/Genie/Genie/src/qualla/engines/qnn-api/QnnApi.cpp b/Genie/Genie/src/qualla/engines/qnn-api/QnnApi.cpp index e7abc2b8db5d23ce164b037e9be2a1cfd0597415..6d15502963e5d7eb85df0461a5f275f94a13a02b 100644 --- a/Genie/Genie/src/qualla/engines/qnn-api/QnnApi.cpp +++ b/Genie/Genie/src/qualla/engines/qnn-api/QnnApi.cpp @@ -106,11 +106,17 @@ bool QnnApi::getContextConfigs( ) { std::vector contextConfigPtrsVec; - if (contextPriority != QNN_PRIORITY_DEFAULT) { - contextConfigPtrsVec.push_back((QnnContext_Config_t*)malloc(sizeof(QnnContext_Config_t))); + if (contextPriority == QNN_PRIORITY_UNDEFINED) { + contextConfigPtrsVec.push_back((QnnContext_Config_t *) malloc(sizeof(QnnContext_Config_t))); contextConfigPtrsVec.back()->option = - QnnContext_ConfigOption_t::QNN_CONTEXT_CONFIG_OPTION_PRIORITY; - contextConfigPtrsVec.back()->priority = contextPriority; + QnnContext_ConfigOption_t::QNN_CONTEXT_CONFIG_UNDEFINED; + } else { + if (contextPriority != QNN_PRIORITY_DEFAULT) { + contextConfigPtrsVec.push_back((QnnContext_Config_t *) malloc(sizeof(QnnContext_Config_t))); + contextConfigPtrsVec.back()->option = + QnnContext_ConfigOption_t::QNN_CONTEXT_CONFIG_OPTION_PRIORITY; + contextConfigPtrsVec.back()->priority = contextPriority; + } } const char** graphNames = nullptr; @@ -891,6 +897,8 @@ bool QnnApi::composeGraphs( QnnLog_Level_t::QNN_LOG_LEVEL_VERBOSE ); + graphCountPerContext = m_graphsCount; + if (status == MODEL_NO_ERROR) { return true; } @@ -1163,33 +1171,6 @@ bool QnnApi::createFromBinary( } } - QnnContext_Config_t** contextConfigs = nullptr; - uint32_t contextConfigCount = 0; - if (true != getContextConfigs( - &contextConfigs, - contextConfigCount, - contextConfig.priority, - graphSwitching, - execSelectGraphs, - loadSelectGraphs - )) { - QNN_ERROR("Couldn't populate context configs"); - return false; - } - - // Merge BE specific and agnostic configs - QnnContext_Config_t** allContextConfigs{nullptr}; - if (true != mergeAllContextConfigs( - &allContextConfigs, - customConfigs, - contextConfigs, - customConfigCount, - contextConfigCount - )) { - QNN_ERROR("Error merging custom and context configs"); - return false; - } - if (nullptr == m_qnnSystemInterface.systemContextCreate || nullptr == m_qnnSystemInterface.systemContextGetBinaryInfo || nullptr == m_qnnSystemInterface.systemContextFree) { @@ -1299,9 +1280,36 @@ bool QnnApi::createFromBinary( } bool isIOBufferMgrInitialized = false; - for (size_t contextIdx = 0; contextIdx < cachedBinariesPathVec.size(); contextIdx++) { + // Create context configs for each context + QnnContext_Config_t** contextConfigs = nullptr; + uint32_t contextConfigCount = 0; + if (true != getContextConfigs( + &contextConfigs, + contextConfigCount, + contextConfig.priority, + graphSwitching, + execSelectGraphs, + loadSelectGraphs + )) { + QNN_ERROR("Couldn't populate context configs"); + return false; + } + + // Merge BE specific and agnostic configs + QnnContext_Config_t** allContextConfigs{nullptr}; + if (true != mergeAllContextConfigs( + &allContextConfigs, 
+ customConfigs, + contextConfigs, + customConfigCount, + contextConfigCount + )) { + QNN_ERROR("Error merging custom and context configs"); + return false; + } + if (nullptr == m_qnnInterface.contextCreateFromBinary) { QNN_ERROR( "contextCreateFromBinaryFnHandle is nullptr for context index = %zu", contextIdx @@ -1498,7 +1506,13 @@ bool QnnApi::createFromBinary( first_contextHandle = contextHandle; } #endif - + if (true != freeContextConfigs(contextConfigs, contextConfigCount)) { + QNN_ERROR("Couldn't free context configs"); + return false; + } + if (allContextConfigs) { + free(allContextConfigs); + } } m_isContextCreated = true; @@ -1507,14 +1521,6 @@ bool QnnApi::createFromBinary( "Initialized %u graphs from %lu contexts", m_graphsCount, cachedBinariesPathVec.size() ); - if (true != freeContextConfigs(contextConfigs, contextConfigCount)) { - QNN_ERROR("Couldn't free context configs"); - return false; - } - if (allContextConfigs) { - free(allContextConfigs); - } - if (nullptr != m_backendExtensions && m_backendExtensions->interface()) { if (!m_backendExtensions->interface()->afterCreateFromBinary()) { QNN_ERROR("Extensions Failure in afterCreateFromBinary()"); @@ -1599,34 +1605,6 @@ bool QnnApi::createFromBinaryListAsync( } } - - QnnContext_Config_t** contextConfigs = nullptr; - uint32_t contextConfigCount = 0; - if (true != getContextConfigs( - &contextConfigs, - contextConfigCount, - contextConfig.priority, - graphSwitching, - execSelectGraphs, - loadSelectGraphs - )) { - QNN_ERROR("Couldn't populate context configs"); - return false; - } - - // Merge BE specific and agnostic configs - QnnContext_Config_t** allContextConfigs{nullptr}; - if (true != mergeAllContextConfigs( - &allContextConfigs, - customConfigs, - contextConfigs, - customConfigCount, - contextConfigCount - )) { - QNN_ERROR("Error merging custom and context configs"); - return false; - } - if (nullptr == m_qnnSystemInterface.systemContextCreate || nullptr == m_qnnSystemInterface.systemContextGetBinaryInfo || nullptr == m_qnnSystemInterface.systemContextFree) { @@ -1642,6 +1620,8 @@ bool QnnApi::createFromBinaryListAsync( GraphInfo_t*** graphsInfo = (GraphInfo_t***)calloc(cachedBinariesPathVec.size(), sizeof(GraphInfo_t**)); uint32_t graphsTotalNum = 0; + std::vector allContextConfigs{(unsigned int)cachedBinariesPathVec.size(), nullptr}; + std::vector allContextConfigsSize{(unsigned int)cachedBinariesPathVec.size()}; for (size_t contextIdx = 0; contextIdx < cachedBinariesPathVec.size(); contextIdx++) { auto _startPerContext = std::chrono::steady_clock::now(); @@ -1710,17 +1690,41 @@ bool QnnApi::createFromBinaryListAsync( m_qnnSystemInterface.systemContextFree(sysCtxHandle); sysCtxHandle = nullptr; - uint32_t customConfigCountSF = 0; + uint32_t contextConfigCount = 0; + if (true != getContextConfigs( + &allContextConfigs[contextIdx], + contextConfigCount, + contextConfig.priority, + graphSwitching, + execSelectGraphs, + loadSelectGraphs + )) { + QNN_ERROR("Couldn't populate context configs"); + return false; + } + allContextConfigsSize[contextIdx] = contextConfigCount; + // Merge BE specific and agnostic configs + if (true != mergeAllContextConfigs( + &allContextConfigs[contextIdx], + customConfigs, + allContextConfigs[contextIdx], + customConfigCount, + contextConfigCount + )) { + QNN_ERROR("Error merging custom and context configs"); + return false; + } + allContextConfigsSize[contextIdx] += customConfigCount; + + uint32_t customConfigCountSF = 0; if (mmap_budget > 0) { QnnHtpContext_CustomConfig_t 
customConfigReadBudget; customConfigReadBudget.option = QNN_HTP_CONTEXT_CONFIG_OPTION_FILE_READ_MEMORY_BUDGET; customConfigReadBudget.fileReadMemoryBudgetInMb = mmap_budget; QnnContext_Config_t** cfgs{nullptr}; - uint32_t customConfigCountReadBudget = 1; - cfgs = (QnnContext_Config_t**)malloc( customConfigCountReadBudget * sizeof(QnnContext_Config_t*) ); @@ -1729,15 +1733,16 @@ bool QnnApi::createFromBinaryListAsync( cfgs[0]->customConfig = reinterpret_cast(&customConfigReadBudget); if (true != mergeAllContextConfigs( - &allContextConfigs, + &allContextConfigs[contextIdx], cfgs, - allContextConfigs, + allContextConfigs[contextIdx], customConfigCountReadBudget, contextConfigCount + customConfigCount + customConfigCountSF )) { QNN_ERROR("Error merging custom and context configs"); return false; } + allContextConfigsSize[contextIdx] += customConfigCountReadBudget; } if (m_profileBackendHandle) { @@ -1751,7 +1756,7 @@ bool QnnApi::createFromBinaryListAsync( .version = QNN_CONTEXT_PARAMS_VERSION_1, .v1 = QnnContext_ParamsV1_t{ - (const QnnContext_Config_t**)allContextConfigs, + (const QnnContext_Config_t**)allContextConfigs[contextIdx], (const void*)buffer.get(), bufferSize, nullptr, @@ -1778,18 +1783,15 @@ bool QnnApi::createFromBinaryListAsync( } auto start = std::chrono::steady_clock::now(); - - auto errCode = m_qnnInterface.contextCreateFromBinaryListAsync( m_backendHandle, m_deviceHandle, const_cast(context_params_list.data()), - (const QnnContext_Config_t**)allContextConfigs, + (const QnnContext_Config_t**)customConfigs, nullptr ); - - auto stop = std::chrono::steady_clock::now(); + QNN_DEBUG( "Initializing %lu context with %u graphs took: %lld us", cachedBinariesPathVec.size(), @@ -1824,26 +1826,24 @@ bool QnnApi::createFromBinaryListAsync( m_isContextCreated = true; - if (true != freeContextConfigs(contextConfigs, contextConfigCount)) { - QNN_ERROR("Couldn't free context configs"); - return false; - } - if (true != freeContextParams(context_params_list.data(), cachedBinariesPathVec.size())) { QNN_ERROR("Couldn't free context params list"); return false; } - if (allContextConfigs) { - free(allContextConfigs); - } - if (nullptr != m_backendExtensions && m_backendExtensions->interface()) { if (!m_backendExtensions->interface()->afterCreateContextsFromBinaryList()) { QNN_ERROR("Extensions Failure in afterCreateContextsFromBinaryList()"); return false; } } + + for (size_t contextIdx = 0; contextIdx < cachedBinariesPathVec.size(); contextIdx++) { + if (true != freeContextConfigs(allContextConfigs[contextIdx], allContextConfigsSize[contextIdx])) { + QNN_ERROR("Couldn't free context configs"); + return false; + } + } return true; } #endif @@ -2543,6 +2543,64 @@ bool QnnApi::extractProfilingEvent(QnnProfile_EventId_t profileEventId) { return true; } +bool QnnApi::applyBinarySection(uint32_t graphId, std::string binSectionPath) { +#if QUALLA_QNN_API_VERSION < 21700 + QNN_ERROR("LoRA adaptors require QNN SDK >= 2.25.1. 
Please update your libraries"); + return false; +#else + // Assumption: split/graph numbering starts from 0 + QNN_DEBUG("QnnApi::applyBinarySection %d ", graphId); + if (nullptr == m_qnnInterface.contextApplyBinarySection) { + QNN_ERROR("contextApplyBinarySection interface not supported!!"); + return false; + } + if (graphId >= m_graphsCount) { + QNN_ERROR(" Passed split %d exceeds base model graph count %d ", graphId, m_graphsCount); + return false; + } + uint64_t bufferSize{0}; + std::shared_ptr<uint8_t> buffer{nullptr}; + bufferSize = getFileSize(binSectionPath); + buffer = std::shared_ptr<uint8_t>(new uint8_t[bufferSize], std::default_delete<uint8_t[]>()); + if (true != readBinaryFromFile(binSectionPath, buffer.get(), bufferSize)) { + QNN_ERROR("Failed to read binary section data for graph = %d", graphId); + return false; + } + + QnnContext_Buffer_t qnnBuffer; + qnnBuffer.version = QNN_CONTEXT_BUFFER_VERSION_1; + qnnBuffer.v1.memType = QNN_CONTEXTMEMTYPE_RAW; + qnnBuffer.v1.binaryBuf.dataSize = bufferSize; + qnnBuffer.v1.binaryBuf.data = static_cast<void*>(buffer.get()); + auto graphCountPerContext = getGraphCountPerContext(); + if (graphCountPerContext <= 0) { + QNN_ERROR(" graphCountPerContext is <=0 "); + return false; + } + + auto contextHandle = m_contextVec[graphId / graphCountPerContext]; + auto graphHandle = m_graphsInfo[graphId]->graph; + if (contextHandle == nullptr || graphHandle == nullptr) { + QNN_ERROR(" context handle or graph handle is null for patch no = %d ", graphId); + return false; + } + + auto errorCode = m_qnnInterface.contextApplyBinarySection( + contextHandle, + graphHandle, + QNN_CONTEXT_SECTION_UPDATABLE, + &qnnBuffer, + nullptr, // profile handle is null + nullptr // signal handle is null + ); + if (errorCode != QNN_SUCCESS) { + QNN_ERROR("Could not apply patch for graph = %d, error code = %zu ", graphId, errorCode); + return false; + } + return true; +#endif +} + bool QnnApi::applyBinarySection(uint32_t binIndex, std::string binSectionPath,bool useMmap,bool graphSwitch) { #if QUALLA_QNN_API_VERSION < 21700 QNN_ERROR("LoRA adaptors require QNN SDK >= 2.25.1. Please update your libraries"); @@ -2650,7 +2708,7 @@ bool QnnApi::applyBinarySection(uint32_t binIndex, std::string binSectionPath,bo #endif } -bool QnnApi::updateIOEncodings(std::shared_ptr<uint8_t>& buffer,uint64_t bufferSize,uint32_t graphIndex){ +bool QnnApi::updateIOEncodings(std::shared_ptr<uint8_t>& buffer,uint64_t bufferSize,uint32_t graphIndex) { QNN_DEBUG("Applying adapter Encodings"); QnnSystemContext_Handle_t sysCtxHandle{nullptr}; @@ -2679,3 +2737,224 @@ QNN_DEBUG(" updateIOEncodings success "); return true; } + +// This is a lightweight variant of the existing ::createFromBinary, used for +// GPU execution to avoid conflicts with the HTP use-case and for better readability. 
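+// A hypothetical end-to-end sketch of this GPU loading path (the backend
+// library name and binary path are placeholders, not part of this patch):
+//
+//   QnnApi api;
+//   std::vector<std::string> bins = {"/path/to/model_ctx.bin"};
+//   if (api.initialize("libQnnGpu.so", bins)) {
+//     // initialize() resolves the QNN and System interfaces, sets up logging
+//     // and the backend, then calls this createFromBinary(bins) and fills
+//     // m_graphNameToIndex for name-based graph lookup.
+//   }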
+bool QnnApi::createFromBinary( + std::vector cachedBinariesPathVec +) { + auto _start = std::chrono::steady_clock::now(); + + if (nullptr == m_qnnSystemInterface.systemContextCreate || + nullptr == m_qnnSystemInterface.systemContextGetBinaryInfo || + nullptr == m_qnnSystemInterface.systemContextFree) { + QNN_ERROR("QNN System function pointers are not populated."); + return false; + } + + graphCountPerContext = getGraphCountPerContext(); + + for (size_t contextIdx = 0; contextIdx < cachedBinariesPathVec.size(); contextIdx++) { + uint64_t bufferSize{0}; + std::shared_ptr buffer{nullptr}; + uint32_t graphsCount; + + // read serialized binary into a byte buffer + bufferSize = getFileSize(cachedBinariesPathVec[contextIdx]); + if (0 == bufferSize) { + QNN_ERROR( + "Received path to an empty file for context index = %zu. Nothing to deserialize.", + contextIdx + ); + return false; + } + + buffer = std::shared_ptr( + new uint8_t[bufferSize], std::default_delete() + ); + if (!buffer) { + QNN_ERROR("Failed to allocate memory for context index = %zu", contextIdx); + return false; + } + if (true != + readBinaryFromFile(cachedBinariesPathVec[contextIdx], buffer.get(), bufferSize)) { + QNN_ERROR("Failed to read binary data for context index = %zu", contextIdx); + return false; + } + + // inspect binary info + QnnSystemContext_Handle_t sysCtxHandle{nullptr}; + if (QNN_SUCCESS != m_qnnSystemInterface.systemContextCreate(&sysCtxHandle)) { + QNN_ERROR("Could not create system handle for context index = %zu", contextIdx); + return false; + } + + const QnnSystemContext_BinaryInfo_t* binaryInfo{nullptr}; + Qnn_ContextBinarySize_t binaryInfoSize{0}; + + if (QNN_SUCCESS != m_qnnSystemInterface.systemContextGetBinaryInfo( + sysCtxHandle, + static_cast(buffer.get()), + bufferSize, + &binaryInfo, + &binaryInfoSize + )) { + QNN_ERROR("Failed to get context binary info for context index = %zu", contextIdx); + return false; + } + + GraphInfo_t** graphsInfo; + if (!copyMetadataToGraphsInfo(binaryInfo, graphsInfo, graphsCount)) { + QNN_ERROR("Failed to copy metadata for graph index = %zu", contextIdx); + freeGraphsInfo(&graphsInfo, graphsCount); + if (contextIdx > 0) freeGraphsInfo(&m_graphsInfo, m_graphsCount); + return false; + } + + if (graphCountPerContext == -1) { + graphCountPerContext = graphsCount; + m_graphsInfo = (GraphInfo_t**)calloc( + graphCountPerContext * cachedBinariesPathVec.size(), sizeof(GraphInfo_t*) + ); + } else if (graphCountPerContext != graphsCount) { + QNN_ERROR( + "Different len(graphs) found in different context files. 
Found %u vs %u", + graphsCount, + graphCountPerContext + ); + freeGraphsInfo(&graphsInfo, graphsCount); + if (contextIdx > 0) freeGraphsInfo(&m_graphsInfo, m_graphsCount); + return false; + } + m_qnnSystemInterface.systemContextFree(sysCtxHandle); + sysCtxHandle = nullptr; + + if (nullptr == m_qnnInterface.contextCreateFromBinary) { + QNN_ERROR( + "contextCreateFromBinaryFnHandle is nullptr for context index = %zu", contextIdx + ); + freeGraphsInfo(&graphsInfo, graphsCount); + if (contextIdx > 0) freeGraphsInfo(&m_graphsInfo, m_graphsCount); + return false; + } + Qnn_ContextHandle_t contextHandle{nullptr}; + auto _stop = std::chrono::steady_clock::now(); + QNN_DEBUG( + "Loading contexts[%lu] took: %lld us", + contextIdx, + std::chrono::duration_cast(_stop - _start).count() + ); + + auto start = std::chrono::steady_clock::now(); + + auto errCode = m_qnnInterface.contextCreateFromBinary( + m_backendHandle, + m_deviceHandle, + nullptr, + (const void*)buffer.get(), + bufferSize, + &contextHandle, + nullptr // profile handle + + ); + + if (errCode != QNN_SUCCESS) { + QNN_ERROR( + "Could not create context from binary for context index = %zu : err %d", + contextIdx, + (int)errCode + ); + freeGraphsInfo(&graphsInfo, graphsCount); + if (contextIdx > 0) freeGraphsInfo(&m_graphsInfo, m_graphsCount); + return false; + } + + auto stop = std::chrono::steady_clock::now(); + QNN_DEBUG( + "Initializing context[%lu] with %u graphs took: %lld us", + contextIdx, + graphsCount, + std::chrono::duration_cast(stop - start).count() + ); + + for (int n_graph = 0; n_graph < graphsCount; n_graph++) { + // Allocate inputTensors and outputTensors + GraphInfo_t* cur_graph = graphsInfo[n_graph]; + + m_graphsInfo[m_graphsCount++] = cur_graph; + m_contextMap[cur_graph] = contextHandle; + } + m_contextVec.push_back(contextHandle); + } + + m_isContextCreated = true; + + QNN_DEBUG( + "Initialized %u graphs from %lu contexts", m_graphsCount, cachedBinariesPathVec.size() + ); + + if (nullptr == m_qnnInterface.graphRetrieve) { + QNN_ERROR("graphRetrieveFnHandle is nullptr."); + freeGraphsInfo(&m_graphsInfo, m_graphsCount); + return false; + } + + for (size_t graphIdx = 0; graphIdx < m_graphsCount; graphIdx++) { + if (!m_graphsInfo || QNN_SUCCESS != m_qnnInterface.graphRetrieve( + m_contextVec[graphIdx / graphCountPerContext], + m_graphsInfo[graphIdx]->graphName, + &(m_graphsInfo[graphIdx]->graph) + )) { + QNN_ERROR("Unable to retrieve graph handle for graph index = %zu", graphIdx); + freeGraphsInfo(&m_graphsInfo, m_graphsCount); + return false; + } + } + + return true; +} + +bool QnnApi::initialize( + std::string backendPath, + std::vector modelPathOrCachedBinaryPath +) { + if (modelPathOrCachedBinaryPath.size() != 1) { + QNN_ERROR("Multiple Files not supported for now!!"); + return false; + } + + if (false == getQnnInterface(backendPath)) { + QNN_ERROR("Qnn getQnnInterface FAILED!"); + return false; + } + + const std::string systemLibraryPath = "libQnnSystem.so"; + if (false == getQnnSystemInterface(systemLibraryPath)) { + QNN_ERROR("Qnn getQnnSystemInterface FAILED!"); + return false; + } + + QnnLog_Level_t logLevel = QNN_LOG_LEVEL_INFO; + if (false == initializeLogging(logLevel, false)) { + QNN_ERROR("Unable to Initialize logging in backend"); + return false; + } + + // Initialize Backend + if (false == initializeBackend()) { + QNN_ERROR("Qnn initializeBackend FAILED!"); + return false; + } + + if (false == createFromBinary(modelPathOrCachedBinaryPath)) { + QNN_ERROR("Create From Binary FAILED!"); + return false; + } + + for 
(size_t graphIdx = 0; graphIdx < m_graphsCount; graphIdx++) { + m_graphNameToIndex[m_graphsInfo[graphIdx]->graphName] = graphIdx; + } + QNN_DEBUG("Model Initialized"); + + return true; +} diff --git a/Genie/Genie/src/qualla/engines/qnn-api/QnnApi.hpp b/Genie/Genie/src/qualla/engines/qnn-api/QnnApi.hpp index 8392265aa950a95015dce233bbeb5c0456ea0665..6481aea2cc34deeb58a7d5810fc87287142c6035 100644 --- a/Genie/Genie/src/qualla/engines/qnn-api/QnnApi.hpp +++ b/Genie/Genie/src/qualla/engines/qnn-api/QnnApi.hpp @@ -370,6 +370,8 @@ class QnnApi { bool applyBinarySection(uint32_t binIndex, std::string binSectionPath,bool useMmap,bool graphSwitch); + bool applyBinarySection(uint32_t graphId, std::string binSectionPath); + QNN_INTERFACE_VER_TYPE* getQnnInterfaceVer() { return &m_qnnInterface; }; GraphInfo_t**& getGraphsInfo() { return m_graphsInfo; }; uint32_t getGraphsCount() { return m_graphsCount; }; @@ -426,4 +428,11 @@ class QnnApi { bool updateIOEncodings(std::shared_ptr& buffer, uint64_t bufferSize, uint32_t graphIndex); + + bool createFromBinary(std::vector cachedBinariesPathVec); + + bool initialize( + std::string backendPath, + std::vector modelPathOrCachedBinaryPath + ); }; diff --git a/Genie/Genie/src/qualla/engines/qnn-api/qnn-utils.hpp b/Genie/Genie/src/qualla/engines/qnn-api/qnn-utils.hpp index ca6efa2441cbf8548697412436ed7474ceb65cb6..124ba2f10a3e4b803218868feaf8a9631ad1eac5 100644 --- a/Genie/Genie/src/qualla/engines/qnn-api/qnn-utils.hpp +++ b/Genie/Genie/src/qualla/engines/qnn-api/qnn-utils.hpp @@ -46,14 +46,14 @@ bool writeRawData(void* tensorData, size_t tensorSize, const std::filesystem::pa bool readRawData(void* tensorData, size_t tensorSize, const std::filesystem::path& path); struct Dims { - int32_t batch = 1; - int32_t height, width, channel, bitWidth; + uint32_t batch = 1; + uint32_t height, width, channel, bitWidth; Dims() : height(0), width(0), channel(0), bitWidth(0) {} - Dims(int32_t height, int32_t width, int32_t channel, int32_t bitWidth) + Dims(uint32_t height, uint32_t width, uint32_t channel, uint32_t bitWidth) : height(height), width(width), channel(channel), bitWidth(bitWidth) {} Dims(std::vector& tDims) - : height((int32_t)tDims[1]), width((int32_t)tDims[2]), channel((int32_t)tDims[3]), - bitWidth((int32_t)tDims[4]) { + : height((uint32_t)tDims[1]), width((uint32_t)tDims[2]), channel((uint32_t)tDims[3]), + bitWidth((uint32_t)tDims[4]) { // Hack to mix batch dimension if (tDims[0] != 1 && tDims[1] == 1) height = tDims[0]; if (tDims[0] > 1 && tDims[1] != 1) batch = tDims[0]; diff --git a/Genie/Genie/src/qualla/engines/qnn-cpu.cpp b/Genie/Genie/src/qualla/engines/qnn-cpu.cpp index 48b1ab1aa4d820acd7581dbd6b6afa09aed08b83..64f3ea25cc60d36b5dc8480f2b2264c590c4ae29 100644 --- a/Genie/Genie/src/qualla/engines/qnn-cpu.cpp +++ b/Genie/Genie/src/qualla/engines/qnn-cpu.cpp @@ -55,8 +55,10 @@ class QnnCpuEngine : public Engine { virtual bool updateKV(size_t n_past) override; virtual bool updateKV(size_t n_past, const std::vector& selected) override; virtual bool save(const std::string& name) override; - virtual size_t restore(const std::string& name) override; + virtual size_t restore(const std::string& name, bool chooseHigherVariant) override; virtual void reset() override; + virtual bool applyLoraAdapter(std::string lora_adapter_name) override; + virtual bool applyLoraStrength(std::string tensor_name, float tensor_val) override; }; namespace fs = std::filesystem; @@ -98,7 +100,40 @@ QnnCpuEngine::QnnCpuEngine(Context& ctx, const qualla::json& json) : Engine(ctx, 
     p.use_mmap     = conf.optional("use-mmap", false);
     p.ctx_size     = _ctx.size();
     p.n_vocab_size = _ctx.n_vocab();
-
+    p.lora_config_type = LoraConfigType::LORA_DISABLE;
+    qualla::json lora_conf = conf.optional("lora", {});
+    if (lora_conf.size() != 0) {
+        p.lora_config_type = LoraConfigType::LORA_ADAPTER_WEIGHT_ENABLE;
+        if (lora_conf.is_array()) {
+            for (auto lc : lora_conf) {
+                std::string lnm = lc["adapter-name"];
+                p.lora_config[lnm].lora_name         = lnm;
+                p.lora_config[lnm].alpha_tensor_name = lc["alpha-tensor-name"];
+                p.lora_config[lnm].alpha_tensor_val  = 0.0f;
+                if (lc.contains("alpha-tensor-value")) {
+                    p.lora_config[lnm].alpha_tensor_val = lc["alpha-tensor-value"];
+                }
+                std::string basedir = "";
+                if (lc.contains("binsection-basedir")) {
+                    basedir = lc["binsection-basedir"];
+                }
+                auto binSec = lc["bin-sections"].get<std::vector<std::string>>();
+                for (uint32_t i = 0; i < binSec.size(); i++) {
+                    fs::path binsection_path = fs::path(binSec[i]);
+                    if (binsection_path.is_relative()) binsection_path = basedir / fs::path(binSec[i]);
+                    if (!fs::is_regular_file(binsection_path)) {
+                        __ERROR("qnn-cpu: Can't access LoRA binsection adapter : {}",
+                                binsection_path.string());
+                        throw std::runtime_error(
+                            "qnn-cpu: Can't open adapter file : " + binsection_path.string()
+                        );
+                    }
+                    p.lora_config[lnm].binsection_list.push_back(binsection_path.string());
+                }
+            }
+        }
+    }
     _model = std::make_unique<QnnCpuModel>(_env, p);
 
     // Load model
@@ -211,7 +246,7 @@ size_t QnnCpuEngine::process(
     );
 }
 
-size_t QnnCpuEngine::restore(const std::string& name) {
+size_t QnnCpuEngine::restore(const std::string& name, bool chooseHigherVariant) {
     fs::path cache_path = std::filesystem::path(name) / fmt::format("kv-cache.{}.qnn-cpu", _role);
     return _model->loadKVCache(cache_path.string());
 }
@@ -226,6 +261,23 @@ void QnnCpuEngine::reset() {
     updateKV(0);
 }
 
+// For LoRA
+bool QnnCpuEngine::applyLoraAdapter(std::string lora_adapter_name) {
+    if (!_model) {
+        __ERROR("qnn-cpu: applyLoraAdapter failed, model not initialized");
+        return false;
+    }
+    return _model->applyLoraAdapter(lora_adapter_name);
+}
+
+bool QnnCpuEngine::applyLoraStrength(std::string tensor_name, float tensor_val) {
+    if (!_model) {
+        __ERROR("qnn-cpu: applyLoraStrength failed, model not initialized");
+        return false;
+    }
+    return _model->applyLoraStrength(tensor_name, tensor_val);
+}
+
 // Registrator instance
 static OnLoad regy([]() {
     Engine::__register("qnn-cpu", [](Context& ctx, const json& conf) {
diff --git a/Genie/Genie/src/qualla/engines/qnn-cpu/cpu-model.cpp b/Genie/Genie/src/qualla/engines/qnn-cpu/cpu-model.cpp
index 4e71728021596f2ad9d3d4679b5719f38079000c..45efd84227d0db07df70ab14aa4c6a0507ab5847 100644
--- a/Genie/Genie/src/qualla/engines/qnn-cpu/cpu-model.cpp
+++ b/Genie/Genie/src/qualla/engines/qnn-cpu/cpu-model.cpp
@@ -61,6 +61,12 @@ QnnCpuModel::QnnCpuModel(Env& env, const Params& params)
         m_output_dim.push_back(m_numLogits);
         m_output_dim.push_back(m_embd);
     }
+    m_loraConfigType = params.lora_config_type;
+    m_lora_alpha_val = 1.0f;
+
+    if (m_loraConfigType == LoraConfigType::LORA_ADAPTER_WEIGHT_ENABLE) {
+        m_loraConfig.insert(params.lora_config.begin(), params.lora_config.end());
+    }
 }
 
 QnnCpuModel::~QnnCpuModel() {
@@ -383,6 +389,7 @@ bool QnnCpuModel::initializeTensorPointers() {
     t_input_ids_k_cache = &input_specs["x3"];
     t_input_ids_v_cache = &input_specs["x4"];
     t_input_ids_n_past  = &input_specs["x5"];
+    t_input_lora_alpha  = &input_specs["x6"];
 
     auto& output_specs = m_output_specs[model_order.back()];
     t_logits = &output_specs["output_genAI"];
@@ -406,6 +413,7 @@ void QnnCpuModel::setupInputTensors(const std::vector& tokens, bool run
     uint32_t* input_id_num_token_buffer     = (uint32_t*)getBuffer(t_input_ids_num_token);
     uint32_t* input_id_reset_kvcache_buffer = (uint32_t*)getBuffer(t_input_ids_reset_kvcache);
     uint32_t* input_id_n_past_buffer        = (uint32_t*)getBuffer(t_input_ids_n_past);
+    float* input_id_lora_alpha              = (float*)getBuffer(t_input_lora_alpha);
 
     uint32_t size = 1;
     for (auto dim : m_input_dim) {
@@ -420,6 +428,7 @@ void QnnCpuModel::setupInputTensors(const std::vector& tokens, bool run
     std::memcpy(input_id_buffer, tokens.data(), tokens.size() * sizeof(uint32_t));
     *input_id_num_token_buffer = tokens.size();
     *input_id_n_past_buffer    = m_nPast;
+    *input_id_lora_alpha       = m_lora_alpha_val;
 
     auto stop = std::chrono::steady_clock::now();
     // QnnUtils::logProfile("setupInputTensors (cpp) took", start, stop);
@@ -589,6 +598,48 @@ size_t QnnCpuModel::getDequantLogits(std::vector& dequant_logits, bool lo
     return logits_all ? prev_run.num_tokens_processed : 1;
 }
 
+bool QnnCpuModel::applyBinarySections(std::vector<std::string>& binsection_list) {
+    // Apply each LoRA binary section in order
+    for (uint32_t i = 0; i < binsection_list.size(); i++) {
+        __DEBUG("qnn-cpu: applyBinarySections adapters {}", binsection_list.at(i));
+        if (!m_qnnApi->applyBinarySection(i, binsection_list.at(i))) {
+            __ERROR("qnn-cpu: Error in applyBinarySections {}", i);
+            return false;
+        }
+    }
+    return true;
+}
+
+bool QnnCpuModel::applyLoraStrength(const std::string& alpha_tensor_name, const float alpha_val) {
+    m_lora_alpha_val = alpha_val;
+    return true;
+}
+
+bool QnnCpuModel::applyLoraAdapter(const std::string& lora_adapter_name) {
+    if (m_loraConfigType != LoraConfigType::LORA_ADAPTER_WEIGHT_ENABLE) {
+        __ERROR("qnn-cpu: LoRA config is not enabled for adapters");
+        return false;
+    }
+
+    if (!m_loraConfig.contains(lora_adapter_name)) {
+        __ERROR("qnn-cpu: Could not find a LoRA adapter config to apply");
+        return false;
+    }
+    if (!applyLoraStrength(
+            m_loraConfig[lora_adapter_name].alpha_tensor_name,
+            m_loraConfig[lora_adapter_name].alpha_tensor_val
+        )) {
+        __ERROR("qnn-cpu: Could not apply the alpha tensor");
+        return false;
+    }
+
+    if (!applyBinarySections(m_loraConfig[lora_adapter_name].binsection_list)) {
+        __ERROR("qnn-cpu: Could not apply the binary sections");
+        return false;
+    }
+    return true;
+}
+
 // TODO: implement save/restore
 size_t QnnCpuModel::loadKVCache(const std::string& load_path) {
     // To read the cache file into the KV tensor
diff --git a/Genie/Genie/src/qualla/engines/qnn-cpu/cpu-model.hpp b/Genie/Genie/src/qualla/engines/qnn-cpu/cpu-model.hpp
index 5d6b606acb4e357752df896fe65d8ae4c0afbe26..6bfd7aaad0c62e6f61757de15b2aca1787777154 100644
--- a/Genie/Genie/src/qualla/engines/qnn-cpu/cpu-model.hpp
+++ b/Genie/Genie/src/qualla/engines/qnn-cpu/cpu-model.hpp
@@ -26,6 +26,12 @@
 namespace qualla {
 
+enum LoraConfigType {
+    LORA_DISABLE               = 0,
+    LORA_INPUT_WEIGHT_ENABLE   = 1,
+    LORA_ADAPTER_WEIGHT_ENABLE = 2
+};
+
 class QnnCpuModel {
     enum ExecutionMode { AUTODETECT, BERT_KV, KV_ONLY, BERT_ONLY };
 
@@ -34,6 +40,13 @@ class QnnCpuModel {
   public:
     enum ModelOutput { LOGITS = 0x0, EMBEDDINGS = 0x1 };
 
+    struct LoraConfig {
+        std::string lora_name;
+        std::vector<std::string> binsection_list;  // LoRAv2 adapter bin filenames
+        std::string alpha_tensor_name;             // LoRAv2 alpha tensor name
+        float alpha_tensor_val;                    // LoRAv2 alpha tensor value
+    };
+
     struct Params {
         std::filesystem::path model_basedir;
         std::string op_package;
@@ -50,6 +63,8 @@ class QnnCpuModel {
         uint32_t n_layer;
         uint32_t n_embd;
         uint32_t n_heads;
+        LoraConfigType lora_config_type;
+        std::map<std::string, LoraConfig> lora_config;
     };
 
     const std::filesystem::path model_basedir;
@@ -92,6 +107,11 @@ class QnnCpuModel {
     std::vector m_params;
     ExecutionMode m_mode{ExecutionMode::AUTODETECT};
 
+    // LoRA params and configs
+    float m_lora_alpha_val;
+    LoraConfigType m_loraConfigType;
+    std::map<std::string, LoraConfig> m_loraConfig;
+
     // Save some information about the last inference run
     struct PreviousRunInfo {
         bool was_bert_mode;
@@ -118,6 +138,7 @@ class QnnCpuModel {
     QnnUtils::Tensor* t_input_ids_k_cache;
     QnnUtils::Tensor* t_input_ids_v_cache;
     QnnUtils::Tensor* t_input_ids_n_past;
+    QnnUtils::Tensor* t_input_lora_alpha;
 
     float* dequant_logits_ptr{nullptr};
 
     // Store pointers for bert
@@ -171,6 +192,10 @@ class QnnCpuModel {
     size_t loadKVCache(const std::string& load_path);
     bool saveKVCache(const std::string& save_path);
 
+    bool applyLoraStrength(const std::string& alpha_tensor_name, const float alpha_val);
+    bool applyLoraAdapter(const std::string& lora_adapter_name);
+    bool applyBinarySections(std::vector<std::string>& binsection_list);
+
   private:
     bool m_mmap_context_bins = false;  // mmap context binary files instead of reading them in memory
     // Internal functions to separate different runInference logic
diff --git a/Genie/Genie/src/qualla/engines/qnn-gpu.cpp b/Genie/Genie/src/qualla/engines/qnn-gpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fb0a6f0157b69ca6b2fc8c107560c88e6c210c0e
--- /dev/null
+++ b/Genie/Genie/src/qualla/engines/qnn-gpu.cpp
@@ -0,0 +1,193 @@
+// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+// Confidential & Proprietary - Qualcomm Technologies, Inc. ("QTI")
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+
+#include "gpu-model.hpp"
+
+#define __INFO(__fmt, ...)  _env.logger().post(Logger::INFO, fmt::format(__fmt, ##__VA_ARGS__))
+#define __WARN(__fmt, ...)  _env.logger().post(Logger::WARN, fmt::format(__fmt, ##__VA_ARGS__))
+#define __ERROR(__fmt, ...) _env.logger().post(Logger::ERROR, fmt::format(__fmt, ##__VA_ARGS__))
+#define __KPIS(__fmt, ...) \
+    _env.logger().post(Logger::ENGINE_KPIS, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); })
+#define __DEBUG(__fmt, ...) \
+    _env.logger().post(Logger::ENGINE_DEBUG, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); })
+#define __TRACE(__fmt, ...) \
+    _env.logger().post(Logger::ENGINE_TRACE, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); })
+
+namespace qualla {
+
+class GpuEngine : public Engine {
+  private:
+    QnnGpuModel::Params _params;
+    std::unique_ptr<QnnGpuModel> _model;
+
+  public:
+    GpuEngine(Context& ctx, const qualla::json& json);
+    ~GpuEngine();
+
+    virtual size_t process(
+        const std::vector<uint32_t>& tokens,
+        std::vector<float>& logits,
+        bool logits_all
+    ) override;
+
+    virtual bool updateKV(size_t n_past) override;
+    virtual bool save(const std::string& name) override;
+    virtual size_t restore(const std::string& name, bool chooseHigherVariant) override;
+    virtual void reset() override;
+
+    virtual bool load() override;
+    virtual bool unload() override;
+};
+
+namespace fs = std::filesystem;
+
+GpuEngine::GpuEngine(Context& ctx, const qualla::json& json) : Engine(ctx, "qnn-gpu", json) {
+    qualla::Timer start;
+
+    using FF  = Feature::Flags;
+    _features = FF::OUTPUT_LOGITS | FF::SAVE_RESTORE | FF::DYNAMIC_LOAD;
+
+    __DEBUG("Qnn-Gpu : init start");
+
+    qualla::Config conf(json, _type + "-engine:");
+
+    // Parse config
+    _params.model_basedir = conf.optional("model-basedir", "");
+    if (_params.model_basedir.is_relative()) {
+        _params.model_basedir = _env.path().models / _params.model_basedir;
+        _params.model_basedir = _params.model_basedir.make_preferred();
+    }
+    _params.model_list = conf.mandatory<std::vector<std::string>>("model-list");
+
+    _params.ctx_size  = _ctx.size();
+    _params.num_heads = conf.optional("num-heads", 32);
+    _params.head_dim  = conf.optional("head-dim", 128);
+
+    if (!conf.optional("dynamic-load", false)) {
+        load();
+    }
+}
+
+GpuEngine::~GpuEngine() {
+    unload();
+}
+
+bool GpuEngine::load() {
+#ifdef _WIN32
+    // The QnnGpu engine does not support Windows.
+    return false;
+#endif
+    if (_model) return true;
+
+    qualla::Timer start;
+    bool status = true;
+
+    __INFO("Qnn-Gpu : Loading Model");
+
+    _model = std::make_unique<QnnGpuModel>(_env, _params);
+
+    // Load model
+    status = _model->initializeModel();
+    if (!status) {
+        throw std::runtime_error("Qnn-Gpu : Failed to initialize model");
+    }
+
+    // Initialize IO Tensor buffers
+    status = _model->initializeIOTensors();
+    if (!status) {
+        throw std::runtime_error("Qnn-Gpu : Error in setting up IO Tensors");
+    }
+
+    // Initialize IO Tensor Pointers
+    if (true != _model->initializeTensorPointers()) {
+        throw std::runtime_error("Qnn-Gpu : Could not find I/O tensors in loaded graphs");
+    }
+
+    // Validate the model
+    if (true != _model->validateModel()) {
+        throw std::runtime_error("Qnn-Gpu : Model Validation Failed");
+    }
+
+    _kpis.load.update(start.elapsed_usec());
+    return true;
+}
+
+bool GpuEngine::unload() {
+    qualla::Timer start;
+    __DEBUG("Qnn-Gpu : Unloading Model");
+    _model.reset(nullptr);
+    _kpis.unload.update(start.elapsed_usec());
+    return true;
+}
+
+// KV cache updates after each inference are handled inside the QnnGpu backend.
+// The GPU engine uses the same memory handle for each KV input/output to the graph and uses a
+// Scatter op to update the KV cache in that same memory handle after each inference.
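+// updateKV() is therefore a no-op for this engine; the number of cached tokens is tracked by
+// the model itself (see QnnGpuModel::_numTokensProcessed).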
+bool GpuEngine::updateKV(size_t n_past) {
+    return true;
+}
+
+size_t GpuEngine::process(
+    const std::vector<uint32_t>& tokens,
+    std::vector<float>& logits,
+    bool logits_all
+) {
+    if (!_model && !load()) {
+        return 0;
+    }
+    qualla::Timer start;
+    size_t n_tok = _model->runInference(tokens, logits, logits_all);
+    if (n_tok == 0) {
+        State::error("Qnn-Gpu : RunInference Failed!");
+    }
+    _kpis.process.update(start.elapsed_usec());
+    return n_tok;
+}
+
+size_t GpuEngine::restore(const std::string& name, bool chooseHigherVariant) {
+    if (!_model && !load()) {
+        return 0;
+    }
+
+    fs::path cache_path = std::filesystem::path(name) / fmt::format("kv-cache.{}.qnn-gpu", _role);
+    return _model->loadKVCache(cache_path.string());
+}
+
+bool GpuEngine::save(const std::string& name) {
+    if (!_model && !load()) {
+        return false;
+    }
+
+    fs::path cache_path = std::filesystem::path(name) / fmt::format("kv-cache.{}.qnn-gpu", _role);
+    return _model->saveKVCache(cache_path.string());
+}
+
+// Reset requires clearing of KV caches only
+void GpuEngine::reset() {
+    if (!_model && !load()) {
+        return;
+    }
+    _model->reset();
+}
+
+// Registrator instance
+static OnLoad regy([]() {
+    Engine::__register("qnn-gpu", [](Context& ctx, const json& conf) {
+        return (Engine*)new GpuEngine(ctx, conf);
+    });
+});
+
+void needQnnGpuEngine() {}
+
+} // namespace qualla
diff --git a/Genie/Genie/src/qualla/engines/qnn-gpu/gpu-model.cpp b/Genie/Genie/src/qualla/engines/qnn-gpu/gpu-model.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e070fb4b42d51c181c0ef5c45bd0d1d2df957e2e
--- /dev/null
+++ b/Genie/Genie/src/qualla/engines/qnn-gpu/gpu-model.cpp
@@ -0,0 +1,603 @@
+// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+// Confidential & Proprietary - Qualcomm Technologies, Inc. ("QTI")
+
+#include
+#include
+#include
+#include
+#include
+
+#include "fmt/format.h"
+#include "fmt/ranges.h"
+#include "fp16/fp16.h"
+#include "gpu-model.hpp"
+#include "qualla/detail/cache-file.hpp"
+#include "qualla/detail/timer.hpp"
+#include "qualla/env.hpp"
+
+namespace fs = std::filesystem;
+
+static constexpr uint32_t g_magicNum = 0xC0DE;
+
+#define __INFO(__fmt, ...)  _env.logger().post(Logger::INFO, fmt::format(__fmt, ##__VA_ARGS__))
+#define __WARN(__fmt, ...)  _env.logger().post(Logger::WARN, fmt::format(__fmt, ##__VA_ARGS__))
+#define __ERROR(__fmt, ...) _env.logger().post(Logger::ERROR, fmt::format(__fmt, ##__VA_ARGS__))
+#define __KPIS(__fmt, ...) \
+    _env.logger().post(Logger::ENGINE_KPIS, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); })
+#define __DEBUG(__fmt, ...) \
+    _env.logger().post(Logger::ENGINE_DEBUG, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); })
+#define __TRACE(__fmt, ...) \
+    _env.logger().post(Logger::ENGINE_TRACE, [&]() { return fmt::format(__fmt, ##__VA_ARGS__); })
+
+namespace qualla {
+
+QnnGpuModel::QnnGpuModel(Env& env, const Params& params)
+    : _env(env), _modelBaseDir(params.model_basedir) {
+    // Initialize _qnnApi
+    _qnnApi = std::unique_ptr<QnnApi>(new QnnApi());
+
+    _ctxSize  = params.ctx_size;
+    _numHeads = params.num_heads;
+    _headDim  = params.head_dim;
+#ifdef _WIN32
+    _useDmabufIo = false;
+#else
+    _useDmabufIo = true;
+#endif
+    // Set up filename list for context binaries.
+    for (auto& i : params.model_list) {
+        fs::path model_path = _modelBaseDir / fs::path(i);
+        if (!fs::is_regular_file(model_path)) {
+            __ERROR("Qnn-Gpu-Model : Can't access model file : {}", model_path.string());
+            throw std::runtime_error("Qnn-Gpu-Model : Can't access model file : " + model_path.string());
+        }
+        _modelList.push_back(model_path.string());
+    }
+}
+
+QnnGpuModel::~QnnGpuModel() { __INFO("Qnn-Gpu-Model : model destruct complete"); }
+
+// Given a filename, initializeModel loads and initializes the QNN runtime libraries and the model
+bool QnnGpuModel::initializeModel(void) {
+    qualla::Timer start;
+
+    __INFO("Qnn-Gpu-Model : Model Init Start");
+
+    const std::string backend = "libQnnGpu.so";
+
+    __INFO("Backend Library : {}", backend);
+    __INFO("Model Files : {}", _modelList);
+
+    if (!_qnnApi->initialize(backend, _modelList)) {
+        __ERROR("Qnn-Api : Initialization Failed!");
+        return false;
+    }
+
+    // Initialize QNN IO Tensor
+    if (_useDmabufIo) {
+        _ioTensor =
+            std::unique_ptr<IOTensor>(new IOTensor(BufferAlloc::DMABUF, _qnnApi->getQnnInterfaceVer()));
+    } else {
+        _ioTensor = std::unique_ptr<IOTensor>(
+            new IOTensor(BufferAlloc::DEFAULT, _qnnApi->getQnnInterfaceVer()));
+    }
+    _numGraphs = _qnnApi->getGraphsCount();
+    __INFO("Qnn-Gpu-Model : initialized with {} graph(s)", _numGraphs);
+
+    GraphInfo_t** graphs_info = _qnnApi->getGraphsInfo();
+    for (size_t graphIdx = 0; graphIdx < _numGraphs; graphIdx++) {
+        GraphInfo_t* const graphInfo = graphs_info[graphIdx];
+        char* graphName      = graphInfo->graphName;
+        std::string graphStr = std::string(graphName);
+
+        _modelOrder.push_back(graphStr);
+    }
+    __INFO("Qnn-Gpu-Model : model init complete: {} usec", start.elapsed_usec());
+
+    return true;
+}
+
+// Once the model has been loaded, initialize IO Tensors
+// _ioTensors is initialized by the context for now
+bool QnnGpuModel::initializeIOTensors() {
+    qualla::Timer start;
+
+    // For QNN-GPU, we have only one context per model.
+    bool status = _ioTensor->initialize(_qnnApi->getContexts().back());
+    if (!status) {
+        __ERROR("Qnn-Gpu-Model : failure to initialize IOTensor");
+        return false;
+    }
+    // Getting graph info, hardcoding a single graph for now.
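+    // Graphs and contexts are indexed together below; with a single context per model,
+    // _qnnApi->getContexts()[graphIdx] resolves to the same context handle for every graph.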
+    GraphInfo_t** const& graphsInfo = _qnnApi->getGraphsInfo();
+
+    for (size_t graphIdx = 0; graphIdx < _numGraphs; graphIdx++) {
+        GraphInfo_t* const& graphInfo = graphsInfo[graphIdx];
+        std::string graphName         = std::string(graphInfo->graphName);
+
+        __DEBUG("Qnn-Gpu-Model : numInputTensors {} numOutputTensors {}",
+                graphInfo->numInputTensors,
+                graphInfo->numOutputTensors);
+        // Setup Inputs
+        {
+            std::unordered_map<std::string, size_t> inputTensorsSize;
+            for (size_t tensorIdx = 0; tensorIdx < graphInfo->numInputTensors; tensorIdx++) {
+                std::string tensorName;
+                std::vector tensorDims;
+                auto& tensor = graphInfo->inputTensors[tensorIdx];
+                _qnnApi->getTensorNameAndShape(tensorName, tensorDims, tensor);
+                auto dims                    = QnnUtils::Dims(tensorDims);
+                inputTensorsSize[tensorName] = dims.getSize();
+                __DEBUG("Qnn-Gpu-Model : Input Tensor Info {} {} {} {}",
+                        tensorIdx,
+                        tensorName,
+                        tensorDims,
+                        inputTensorsSize[tensorName]);
+                std::vector quantParams;
+                if (!_qnnApi->getTensorQuantParams(&tensor, quantParams)) {
+                    quantParams.emplace_back(0, 0);
+                }
+
+                std::shared_ptr<QnnUtils::Tensor> tensorUtil =
+                    std::shared_ptr<QnnUtils::Tensor>(new (std::nothrow) QnnUtils::Tensor);
+                tensorUtil->dims       = dims;
+                tensorUtil->dtype      = QNN_TENSOR_GET_DATA_TYPE(tensor);
+                tensorUtil->quantParam = quantParams;
+                _inputSpecs[graphName][tensorName] = tensorUtil;
+            }
+
+            Qnn_Tensor_t* tensor_bank = nullptr;
+            std::unordered_map<std::string, void*> tensor_ptr_map;
+            if (true != _ioTensor->setupInputTensors(&tensor_bank,
+                                                     tensor_ptr_map,
+                                                     *graphInfo,
+                                                     inputTensorsSize,
+                                                     _qnnApi->getContexts()[graphIdx],
+                                                     false)) {
+                QNN_ERROR("Qnn-Gpu-Model : Error in setting up Input Tensors for graph %s",
+                          graphName.c_str());
+                return false;
+            }
+
+            _inputTensors[graphName] = tensor_bank;
+            for (auto& [tensorName, tensor_ptr] : tensor_ptr_map) {
+                _inputSpecs[graphName][tensorName]->tensor = (Qnn_Tensor_t*)tensor_ptr;
+            }
+            __DEBUG("Qnn-Gpu-Model : Input Tensor Allocated for {}", graphName);
+        }
+
+        // Setup Outputs
+        {
+            std::unordered_map<std::string, size_t> outputTensorsSize;
+            std::unordered_map<std::string, Qnn_Tensor_t*> sharedTensorMap;
+            for (size_t tensorIdx = 0; tensorIdx < graphInfo->numOutputTensors; tensorIdx++) {
+                std::string tensorName;
+                std::vector tensorDims;
+
+                auto& tensor = graphInfo->outputTensors[tensorIdx];
+                _qnnApi->getTensorNameAndShape(tensorName, tensorDims, tensor);
+
+                if (tensorName.starts_with("past_")) {
+                    // Each "past_*_out" output shares the buffer of its "past_*_in" input
+                    std::string tensorInName = tensorName.substr(0, tensorName.size() - 3) + "in";
+                    sharedTensorMap[tensorName] = _inputSpecs[graphName][tensorInName]->tensor;
+
+                    // Update the GPU _kvCache
+                    auto [type, layer_id] = parseKVTensorName(tensorName);
+                    _kvCache.push_back(
+                        GpuKVCache((type == 1), layer_id, _inputSpecs[graphName][tensorInName].get()));
+                }
+                std::vector quantParams;
+                if (!_qnnApi->getTensorQuantParams(&tensor, quantParams)) {
+                    quantParams.emplace_back(0, 0);
+                }
+
+                auto dims                     = QnnUtils::Dims(tensorDims);
+                outputTensorsSize[tensorName] = dims.getAlignedSize();
+
+                __DEBUG("Qnn-Gpu-Model : Output Tensor Info {} {} {} {}",
+                        tensorIdx,
+                        tensorName,
+                        tensorDims,
+                        outputTensorsSize[tensorName]);
+                std::shared_ptr<QnnUtils::Tensor> tensorUtil =
+                    std::shared_ptr<QnnUtils::Tensor>(new (std::nothrow) QnnUtils::Tensor);
+                tensorUtil->dims       = dims;
+                tensorUtil->dtype      = QNN_TENSOR_GET_DATA_TYPE(tensor);
+                tensorUtil->quantParam = quantParams;
+                _outputSpecs[graphName][tensorName] = tensorUtil;
+            }
+
+            Qnn_Tensor_t* tensor_bank = nullptr;
+            std::unordered_map<std::string, void*> tensor_ptr_map;
+            if (_ioTensor->getBufferAllocType() == BufferAlloc::DMABUF) {
+                if (true != _ioTensor->setupOutputWithSharedTensors(&tensor_bank,
+                                                                    tensor_ptr_map,
+                                                                    *graphInfo,
+                                                                    outputTensorsSize,
+                                                                    _qnnApi->getContexts()[graphIdx],
+                                                                    sharedTensorMap)) {
+                    QNN_ERROR("Qnn-Gpu-Model : Error in setting up Output Tensors for graph %s",
+                              graphName.c_str());
+                    return false;
+                }
+            } else {
+                if (true != _ioTensor->setupOutputTensors(&tensor_bank,
+                                                          tensor_ptr_map,
+                                                          *graphInfo,
+                                                          outputTensorsSize,
+                                                          _qnnApi->getContexts()[graphIdx],
+                                                          false)) {
+                    QNN_ERROR("Qnn-Gpu-Model : Error in setting up Output Tensors for graph %s",
+                              graphName.c_str());
+                    return false;
+                }
+            }
+
+            _outputTensors[graphName] = tensor_bank;
+            for (auto& [tensorName, tensor_ptr] : tensor_ptr_map) {
+                _outputSpecs[graphName][tensorName]->tensor = (Qnn_Tensor_t*)tensor_ptr;
+            }
+
+            __DEBUG("Qnn-Gpu-Model : Output Tensor Allocated {} {}", graphName, _outputTensors.size());
+        }
+    }
+    auto stop = std::chrono::steady_clock::now();
+    return true;
+}
+
+bool QnnGpuModel::initializeTensorPointers() {
+    auto inputSpec  = _inputSpecs[_modelOrder.back()];
+    auto outputSpec = _outputSpecs[_modelOrder.back()];
+
+    t_inputIds    = inputSpec[INPUT_IDS].get();
+    t_attnMask    = inputSpec[ATTN_MASK].get();
+    t_positionIds = inputSpec[POS_IDS].get();
+    t_logits      = outputSpec[LOGITS].get();
+
+    auto status = !(t_inputIds == nullptr || t_attnMask == nullptr || t_positionIds == nullptr ||
+                    t_logits == nullptr);
+
+    if (!status) {
+        __ERROR("Qnn-Gpu-Model : error in setting up named tensor pointers for llama.");
+        return false;
+    }
+    return true;
+}
+
+bool QnnGpuModel::validateModel() {
+    // Validate the context size: the attention mask is [num_inputs, ctx_size], so the
+    // model's context size is the mask element count divided by the input count.
+    size_t numInputs    = t_inputIds->dims.getNumElements();
+    size_t dimMask      = t_attnMask->dims.getNumElements();
+    size_t modelCtxSize = dimMask / numInputs;
+
+    if (modelCtxSize != _ctxSize) {
+        __ERROR("Qnn-Gpu-Model : Invalid Context Size {} {}.", modelCtxSize, _ctxSize);
+        return false;
+    }
+    return true;
+}
+
+void QnnGpuModel::setupInputTensors(const std::vector<uint32_t>& tokens) {
+    auto start = std::chrono::steady_clock::now();
+
+    if (tokens.size() > _ctxSize) {
+        std::string errMsg = "Called inference with more tokens than the model supports: ";
+        errMsg += std::to_string(tokens.size()) + " vs. " + std::to_string(_ctxSize);
+        throw std::runtime_error(errMsg);
+    }
+
+    // Setup 1. input_ids
+    // Index of input tokens in the embedding vocabulary
+    uint32_t* inputIdBuffer = (uint32_t*)getBuffer(t_inputIds);
+    if (inputIdBuffer) {
+        if (_useDmabufIo) {
+            _ioTensor->beforeWriteToBuffer(t_inputIds->tensor);
+        }
+        inputIdBuffer[0] = tokens[0];
+        if (_useDmabufIo) {
+            _ioTensor->afterWriteToBuffer(t_inputIds->tensor);
+        }
+    }
+
+    // Setup 2. attention_mask
+    // Mask to avoid attending to padding positions.
+    uint32_t* attnMaskBuffer = (uint32_t*)getBuffer(t_attnMask);
+    if (attnMaskBuffer) {
+        if (_useDmabufIo) {
+            _ioTensor->beforeWriteToBuffer(t_attnMask->tensor);
+        }
+        attnMaskBuffer[_numTokensProcessed] = 1;
+        if (_useDmabufIo) {
+            _ioTensor->afterWriteToBuffer(t_attnMask->tensor);
+        }
+    }
+
+    // Setup 3. position_ids
+    // Indices of the positions of each input token in the position embeddings.
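+    // Only one position is written per call: the GPU path feeds a single token per inference
+    // step (see runInference), so index 0 always holds the current token's position.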
+    uint32_t* positionIdBuffer = (uint32_t*)getBuffer(t_positionIds);
+    if (positionIdBuffer) {
+        if (_useDmabufIo) {
+            _ioTensor->beforeWriteToBuffer(t_positionIds->tensor);
+        }
+        positionIdBuffer[0] = (uint32_t)(_numTokensProcessed);
+        if (_useDmabufIo) {
+            _ioTensor->afterWriteToBuffer(t_positionIds->tensor);
+        }
+    }
+
+    auto stop = std::chrono::steady_clock::now();
+}
+
+template <typename T1, typename T2>
+inline bool QnnGpuModel::executeModel(T1& input, T2& output, std::string graphName) {
+    bool ret = _qnnApi->graphExecute(input, output, graphName, timeLogs);
+    if (ret != true) {
+        QNN_ERROR("Qnn-Gpu-Model : Error executing inference: %d for graph %s", ret, graphName.c_str());
+        return false;
+    }
+    QNN_DEBUG("Qnn-Gpu-Model : Execute finished for graph %s", graphName.c_str());
+    return true;
+}
+
+bool QnnGpuModel::runInferenceHelper(std::vector<std::string>& exec_models,
+                                     int32_t* wait_time_total,
+                                     int32_t* exec_time_total,
+                                     bool pipeline_kv_update,
+                                     size_t update_size) {
+    int32_t exec_time = 0;
+    int32_t wait_time = 0;
+    for (auto& graphName : exec_models) {
+        {
+            auto start_time = std::chrono::steady_clock::now();
+            Qnn_Tensor_t* inputTensors;
+            Qnn_Tensor_t* outputTensors;
+            try {
+                inputTensors  = _inputTensors.at(graphName);
+                outputTensors = _outputTensors.at(graphName);
+            } catch (const std::exception& e) {
+                __DEBUG("Qnn-Gpu-Model : Could not find tensors for graph {}: {}", graphName, e.what());
+                return false;
+            }
+            bool status = executeModel(inputTensors, outputTensors, graphName);
+            if (!status) {
+                return false;
+            }
+            auto end_time = std::chrono::steady_clock::now();
+            exec_time += static_cast<int32_t>(
+                std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count());
+        }
+    }
+
+    *exec_time_total += exec_time;
+    *wait_time_total += wait_time;
+    return true;
+}
+
+size_t QnnGpuModel::runInference(const std::vector<uint32_t>& tokens,
+                                 std::vector<float>& logits,
+                                 bool logits_all) {
+    auto start = std::chrono::steady_clock::now();
+    int32_t totalWaitTime = 0;
+    int32_t totalExecTime = 0;
+
+    // Setup inputs for inference
+    auto& execModels = _modelOrder;
+    int numIters     = tokens.size();
+    for (int i = 0; i < numIters; i++) {
+        if (numIters > 1) {
+            __DEBUG("Qnn-Gpu-Model : Prompt Processing {} of {} tokens", i + 1, numIters);
+        } else {
+            __DEBUG("Qnn-Gpu-Model : Token Generation {} of {} tokens", i + 1, numIters);
+        }
+        std::vector<uint32_t> curr_tokens;
+        curr_tokens.push_back(tokens[i]);
+        setupInputTensors(curr_tokens);
+        bool status =
+            runInferenceHelper(execModels, &totalWaitTime, &totalExecTime, false, tokens.size());
+        if (!status) {
+            return 0;
+        }
+        processLogits(logits, logits_all);
+
+        // Update _numTokensProcessed as each token is accepted.
+        _numTokensProcessed++;
+    }
+
+    auto stop = std::chrono::steady_clock::now();
+    timeLogs["Run Inference (cpp) "].first += static_cast<int64_t>(
+        std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count());
+    timeLogs["Run Inference (cpp) "].second++;
+    QNN_DEBUG("[TIME] Wait[%d] Exec[%d]\n", totalWaitTime, totalExecTime);
+    if (!logits_all) {
+        return 1;
+    }
+    return tokens.size();
+}
+
+// Parse KV$ tensor names here - supports past_{key,value}_{layer_idx}[_h0]_{in,out}
+std::tuple<int, int> QnnGpuModel::parseKVTensorName(std::string name) {
+    if (!name.starts_with("past_")) return {0, 0};
+
+    const bool is_key = name.starts_with("past_key");
+    const size_t pos0 = (is_key) ? 9 : 11;  // "past_key_" OR "past_value_"
+    const size_t pos1 = name.find('_', pos0);
+
+    int layer_idx = static_cast<int>(std::stoi(name.substr(pos0, pos1 - pos0)));
+
+    return std::make_tuple(is_key ? 1 : 2, layer_idx);
+}
+
+size_t QnnGpuModel::loadKVCache(const std::string& load_path) {
+    std::ifstream fs(load_path, std::ios::in | std::ios::binary);
+    if (fs.fail()) {
+        __ERROR("Qnn-Gpu-Model : loadKVCache error reading file {}", load_path);
+        return 0;
+    }
+
+    CacheFileSpec spec;
+    fs.read((char*)&spec, sizeof(spec));
+    if (spec.magic != g_magicNum) {
+        __ERROR("Qnn-Gpu-Model : loadKVCache expected {} found {:#x}", g_magicNum, spec.magic);
+        return 0;
+    }
+
+    // clang-format off
+    __INFO("Qnn-Gpu-Model : loadKVCache {{ num_tensors {}, magic {}, dtype {}, n_heads {}, embed_dim {} update_size {} }}",
+           spec.num_tensors, spec.magic, int(spec.dtype), spec.n_heads, spec.embed_dim, spec.update_size); fflush(stdout);
+    // clang-format on
+
+    _numTokensProcessed = static_cast<size_t>(spec.update_size);
+    if (_numTokensProcessed > 0) {
+        // Loop over each _kvCache tensor and read it from the file
+        for (auto cache : _kvCache) {
+            if (_useDmabufIo) {
+                _ioTensor->beforeWriteToBuffer(cache.tensorUtil->tensor);
+            }
+            char* buffer = (char*)getBuffer(cache.tensorUtil);
+            if (cache.isKey) {
+                // Key cache dims [1, num_heads, head_dim, ctx_size]
+                // float16 bits are equivalent to uint16_t
+                const size_t copySize = _numTokensProcessed;
+                const size_t skipSize = _ctxSize;
+                for (uint32_t i = 0; i < _numHeads; i++) {
+                    for (uint32_t j = 0; j < _headDim; j++) {
+                        fs.read(buffer, copySize * sizeof(uint16_t));
+                        buffer += skipSize * sizeof(uint16_t);
+                    }
+                }
+            } else {
+                // Value cache dims [1, num_heads, ctx_size, head_dim]
+                // float16 bits are equivalent to uint16_t
+                const size_t copySize = _numTokensProcessed * _headDim;
+                const size_t skipSize = _ctxSize * _headDim;
+                for (uint32_t i = 0; i < _numHeads; i++) {
+                    fs.read(buffer, copySize * sizeof(uint16_t));
+                    buffer += skipSize * sizeof(uint16_t);
+                }
+            }
+            if (_useDmabufIo) {
+                _ioTensor->afterWriteToBuffer(cache.tensorUtil->tensor);
+            }
+        }
+    }
+    return _numTokensProcessed;
+}
+
+bool QnnGpuModel::saveKVCache(const std::string& save_path) {
+    std::ofstream fs(save_path, std::ios::out | std::ios::binary);
+    if (fs.fail()) {
+        __ERROR("Qnn-Gpu-Model : saveKVCache error opening file : {}", save_path);
+        throw std::runtime_error("Failed to write to cache file. Please re-check path");
+    }
+
+    const CacheFileSpec::DataType dtype = CacheFileSpec::DataType::FLOAT16_T;
+
+    uint32_t numKVTensors = _kvCache.size();
+
+    // Save the cache file metadata
+    CacheFileSpec file_spec(
+        numKVTensors, g_magicNum, dtype, 0x0, _numHeads, _headDim, _numTokensProcessed);
+    fs.write((char*)&file_spec, sizeof(file_spec));
+
+    // clang-format off
+    __INFO("Qnn-Gpu-Model : saveKVCache {{ num_tensors {}, magic {}, dtype {}, n_heads {}, embed_dim {} update_size {} }}",
+           numKVTensors, g_magicNum, int(dtype), _numHeads, _headDim, _numTokensProcessed); fflush(stdout);
+    // clang-format on
+
+    if (_numTokensProcessed > 0) {
+        // Loop over each _kvCache tensor and write it to the file
+        for (auto cache : _kvCache) {
+            if (_useDmabufIo) {
+                _ioTensor->beforeReadFromBuffer(cache.tensorUtil->tensor);
+            }
+            char* buffer = (char*)getBuffer(cache.tensorUtil);
+            if (cache.isKey) {
+                // Key cache dims [1, num_heads, head_dim, ctx_size]
+                // float16 bits are equivalent to uint16_t
+                const size_t copySize = _numTokensProcessed;
+                const size_t skipSize = _ctxSize;
+                for (uint32_t i = 0; i < _numHeads; i++) {
+                    for (uint32_t j = 0; j < _headDim; j++) {
+                        fs.write((char*)buffer, copySize * sizeof(uint16_t));
+                        buffer += skipSize * sizeof(uint16_t);
+                    }
+                }
+            } else {
+                // Value cache dims [1, num_heads, ctx_size, head_dim]
+                // float16 bits are equivalent to uint16_t
+                const size_t copySize = _numTokensProcessed * _headDim;
+                const size_t skipSize = _ctxSize * _headDim;
+                for (uint32_t i = 0; i < _numHeads; i++) {
+                    fs.write((char*)buffer, copySize * sizeof(uint16_t));
+                    buffer += skipSize * sizeof(uint16_t);  // advance in bytes, matching loadKVCache
+                }
+            }
+            if (_useDmabufIo) {
+                _ioTensor->afterReadFromBuffer(cache.tensorUtil->tensor);
+            }
+        }
+    }
+    fs.flush();
+    fs.close();
+
+    return true;
+}
+
+size_t QnnGpuModel::processLogits(std::vector<float>& logits, bool logits_all) {
+    auto logitsSpec   = _outputSpecs[_modelOrder.back()][LOGITS].get();
+    size_t logitsSize = getNumElements(logitsSpec);
+    if (_useDmabufIo) {
+        _ioTensor->beforeReadFromBuffer(logitsSpec->tensor);
+    }
+    uint16_t* logitBuf = (uint16_t*)getBuffer(logitsSpec);
+
+    if (!logits_all) {
+        logits.clear();
+    }
+    size_t allocateSize = logits.size() + logitsSize;
+    logits.reserve(allocateSize);
+    for (size_t i = 0; i < logitsSize; ++i) {
+        logits.push_back(fp16_ieee_to_fp32_value(logitBuf[i]));
+    }
+    if (_useDmabufIo) {
+        _ioTensor->afterReadFromBuffer(logitsSpec->tensor);
+    }
+
+    return logits.size() / logitsSize;
+}
+
+bool QnnGpuModel::reset() {
+    // Reset Token Counter
+    _numTokensProcessed = 0;
+
+    // Reset Attention Mask
+    uint32_t* attnMaskBuffer = (uint32_t*)getBuffer(t_attnMask);
+    uint32_t attnMaskSize    = getBufferSize(t_attnMask);
+    if (attnMaskBuffer) {
+        if (_useDmabufIo) {
+            _ioTensor->beforeWriteToBuffer(t_attnMask->tensor);
+        }
+        memset(attnMaskBuffer, 0, attnMaskSize);
+        if (_useDmabufIo) {
+            _ioTensor->afterWriteToBuffer(t_attnMask->tensor);
+        }
+    }
+
+    // Reset KV Cache.
+    // TODO : Check whether mask_neg -100 is enough to remove the
+    // effect of the KV cache. Test with mask_neg = -float_inf
+    for (auto cache : _kvCache) {
+        if (_useDmabufIo) {
+            _ioTensor->beforeWriteToBuffer(cache.tensorUtil->tensor);
+        }
+        char* buffer        = (char*)getBuffer(cache.tensorUtil);
+        uint32_t bufferSize = getBufferSize(cache.tensorUtil);
+        memset(buffer, 0, bufferSize);
+        if (_useDmabufIo) {
+            _ioTensor->afterWriteToBuffer(cache.tensorUtil->tensor);
+        }
+    }
+    return true;
+}
+
+} // namespace qualla
diff --git a/Genie/Genie/src/qualla/engines/qnn-gpu/gpu-model.hpp b/Genie/Genie/src/qualla/engines/qnn-gpu/gpu-model.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..10c1158e8b9038f4695bf24a9bea85ed361bbdc3
--- /dev/null
+++ b/Genie/Genie/src/qualla/engines/qnn-gpu/gpu-model.hpp
@@ -0,0 +1,136 @@
+// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+// Confidential & Proprietary - Qualcomm Technologies, Inc. ("QTI")
+
+#ifndef __QUALLA_QNN_GPU_MODEL_H_
+#define __QUALLA_QNN_GPU_MODEL_H_
+
+#include
+#include
+#include
+#include
+
+#include "IOTensor.hpp"
+#include "QnnApi.hpp"
+#include "qnn-utils.hpp"
+#include "qualla/env.hpp"
+
+namespace qualla {
+
+// Well-known tensor names used by the model's graphs
+static std::string INPUT_IDS = "input_ids";
+static std::string ATTN_MASK = "attention_mask";
+static std::string LOGITS    = "logits";
+static std::string POS_IDS   = "position_ids";
+
+class QnnGpuModel {
+  public:
+    struct Params {
+        std::filesystem::path model_basedir;
+        std::vector<std::string> model_list;  // model filenames
+        uint32_t ctx_size;
+        uint32_t num_heads;
+        uint32_t head_dim;
+    };
+
+    struct GpuKVCache {
+        bool isKey;
+        uint32_t tensorId;
+        QnnUtils::Tensor* tensorUtil;
+
+        GpuKVCache() {
+            isKey      = false;
+            tensorUtil = nullptr;
+            tensorId   = 0;
+        }
+        GpuKVCache(bool _isKey, uint32_t _tensorId, QnnUtils::Tensor* _tensorUtil)
+            : isKey(_isKey), tensorId(_tensorId), tensorUtil(_tensorUtil) {}
+    };
+
+    // QNN specific variables
+    std::unique_ptr<QnnApi> _qnnApi;
+    std::unique_ptr<IOTensor> _ioTensor{nullptr};
+
+    // Model Location Storage
+    const std::filesystem::path _modelBaseDir;
+    std::vector<std::string> _modelList;
+    std::vector<std::string> _modelOrder;
+
+    bool _useDmabufIo;
+
+    // Model parameters
+    uint32_t _ctxSize{0};
+    uint32_t _numHeads{0};
+    uint32_t _headDim{0};
+
+    // Information regarding model execution settings and last inference
+
+    // Model specific variables
+    uint32_t _numGraphs;
+    // I/O tensor information
+    std::unordered_map<std::string, Qnn_Tensor_t*> _inputTensors;
+    std::unordered_map<std::string, std::unordered_map<std::string, std::shared_ptr<QnnUtils::Tensor>>>
+        _inputSpecs;
+
+    std::unordered_map<std::string, Qnn_Tensor_t*> _outputTensors;
+    std::unordered_map<std::string, std::unordered_map<std::string, std::shared_ptr<QnnUtils::Tensor>>>
+        _outputSpecs;
+
+    // Store some pointers for easier access
+    QnnUtils::Tensor* t_inputIds{nullptr};
+    QnnUtils::Tensor* t_attnMask{nullptr};
+    QnnUtils::Tensor* t_positionIds{nullptr};
+    QnnUtils::Tensor* t_logits{nullptr};
+
+    // _numTokensProcessed tracks how much of the KV cache has been populated
+    size_t _numTokensProcessed{0};
+
+    std::vector<GpuKVCache> _kvCache;
+
+    std::map<std::string, std::pair<int64_t, int32_t>> timeLogs;
+
+    // Model Constructor
+    QnnGpuModel(Env& env, const Params& params);
+    ~QnnGpuModel();
+
+    bool initializeModel(void);
+    bool initializeIOTensors(void);
+    void setupInputTensors(const std::vector<uint32_t>& tokens);
+    bool initializeTensorPointers();
+    bool validateModel();
+
+    template <typename T1, typename T2>
+    inline bool executeModel(T1& input, T2& output, std::string graph_name);
+
+    size_t runInference(const std::vector<uint32_t>& tokens,
+                        std::vector<float>& logits,
+                        bool logits_all = false);
+
+    size_t loadKVCache(const std::string& load_path);
+    bool saveKVCache(const std::string& save_path);
+    bool reset();
+
+  private:
+    Env& _env;
+    // Internal functions to separate different runInference logic
+    bool runInferenceHelper(std::vector<std::string>& exec_models,
+                            int32_t* wait_time_total,
+                            int32_t* exec_time_total,
+                            bool pipeline_kv_update,
+                            size_t update_size);
+    size_t processLogits(std::vector<float>& logits, bool logits_all);
+    inline void* getBuffer(QnnUtils::Tensor& spec) { return _ioTensor->getBuffer(spec.tensor); }
+    inline void* getBuffer(QnnUtils::Tensor* spec) { return _ioTensor->getBuffer(spec->tensor); }
+    inline size_t getBufferSize(QnnUtils::Tensor& spec) { return spec.dims.getSize(); }
+    inline size_t getBufferSize(QnnUtils::Tensor* spec) { return spec->dims.getSize(); }
+    inline size_t getNumElements(QnnUtils::Tensor& spec) { return spec.dims.getNumElements(); }
+    inline size_t getNumElements(QnnUtils::Tensor* spec) { return spec->dims.getNumElements(); }
+
+    // Parse KV$ tensor names here - supports past_{key,value}_{layer_idx}[_h0]_{in,out}
+    std::tuple<int, int> parseKVTensorName(std::string name);
+};
+
+} // namespace qualla
+
+#endif
diff --git a/Genie/Genie/src/qualla/engines/qnn-htp.cpp b/Genie/Genie/src/qualla/engines/qnn-htp.cpp
index 5e825d009be6464474ca9c25781a5f7837c4d70a..4a716af766b5e2404bbc998d8487bc8efb1c03af 100644
--- a/Genie/Genie/src/qualla/engines/qnn-htp.cpp
+++ b/Genie/Genie/src/qualla/engines/qnn-htp.cpp
@@ -353,11 +353,11 @@ qualla::InputType NspEngine::getInputType(){
     return _model->m_inputType;
 }
 
-size_t NspEngine::restore(const std::string& name) {
+size_t NspEngine::restore(const std::string& name, bool chooseHigherVariant) {
     if (!_model && !load()) return 0;
 
     fs::path cache_path = std::filesystem::path(name) / fmt::format("kv-cache.{}.qnn-htp", _role);
-    return _model->loadKVCache(cache_path.string());
+    return _model->loadKVCache(cache_path.string(), chooseHigherVariant);
 }
 
 bool NspEngine::save(const std::string& name) {
diff --git a/Genie/Genie/src/qualla/engines/qnn-htp.hpp b/Genie/Genie/src/qualla/engines/qnn-htp.hpp
index 4b04bb1911cea93c67b2d6c9831837baee2b9e5e..d274f56495b6026acdafed09d7dd6a760e41b6c7 100644
--- a/Genie/Genie/src/qualla/engines/qnn-htp.hpp
+++ b/Genie/Genie/src/qualla/engines/qnn-htp.hpp
@@ -70,7 +70,7 @@ class NspEngine : public Engine {
     virtual bool updateKV(size_t n_past) override;
     virtual bool updateKV(size_t n_past, const std::vector& selected) override;
     virtual bool save(const std::string& name) override;
-    virtual size_t restore(const std::string& name) override;
+    virtual size_t restore(const std::string& name, bool chooseHigherVariant) override;
     virtual void reset() override;
 
     virtual bool set(qualla::json data) override;
diff --git a/Genie/Genie/src/qualla/engines/qnn-htp/nsp-kvmanager.cpp b/Genie/Genie/src/qualla/engines/qnn-htp/nsp-kvmanager.cpp
index 7046ba06c94b1c3e12233b496da152c8ae1366b4..51c4f6420dc00368b86255fb1753b4b7b61c8d34 100644
--- a/Genie/Genie/src/qualla/engines/qnn-htp/nsp-kvmanager.cpp
+++ b/Genie/Genie/src/qualla/engines/qnn-htp/nsp-kvmanager.cpp
@@ -338,7 +338,7 @@ bool NewNSPKVManager::registerPointerOffset() {
     return true;
 }
 
-bool NewNSPKVManager::updateState() {
+void NewNSPKVManager::updateKVCache() {
     // clang-format off
     __TRACE("qnn-kv : graph[{}] updateState to AR-{}(n_past={}, ptr={})",
             _mgr_idx, _req_state.variant, _req_state.n_past, _req_state.ptr_offset);
@@ -354,9 +354,15 @@ bool NewNSPKVManager::updateState() {
         cache.output_buffer += cache.is_key ?
_n_ctx * _bw : _n_ctx * _n_embed * _bw; } } - _cur_state = _req_state; +} +void NewNSPKVManager::updateKVDispatcher(){ _counter = _callback_fn(_mgr_idx); +} + +bool NewNSPKVManager::updateState() { + updateKVCache(); + updateKVDispatcher(); return true; } @@ -525,7 +531,7 @@ bool NewNSPKVManager::loadCache( } _req_state = {variant, n_valid, 0}; - updateState(); + updateKVCache(); return true; } diff --git a/Genie/Genie/src/qualla/engines/qnn-htp/nsp-kvmanager.hpp b/Genie/Genie/src/qualla/engines/qnn-htp/nsp-kvmanager.hpp index f1208aa5364193f472c3bd88e6cbd00a7a813854..fd5e6de2e6faddee960e046eec741d3f6458a72e 100644 --- a/Genie/Genie/src/qualla/engines/qnn-htp/nsp-kvmanager.hpp +++ b/Genie/Genie/src/qualla/engines/qnn-htp/nsp-kvmanager.hpp @@ -134,7 +134,8 @@ class NewNSPKVManager { int32_t n_heads ); bool dumpCache(std::ofstream* fs, bool is_key, int32_t n_valid, int32_t n_heads); - + void updateKVCache(); + void updateKVDispatcher(); bool updateState(); void runKVUpdateJob(int thread_idx); // Worker thread function void setTensorAllocInfo(std::map>* alloc_info) { diff --git a/Genie/Genie/src/qualla/engines/qnn-htp/nsp-model.cpp b/Genie/Genie/src/qualla/engines/qnn-htp/nsp-model.cpp index 24da5f8473e41e9fb61bf91b2d4940977873c5d8..e5bc6ffd03b687f38292c341b09a79c4f24481ce 100644 --- a/Genie/Genie/src/qualla/engines/qnn-htp/nsp-model.cpp +++ b/Genie/Genie/src/qualla/engines/qnn-htp/nsp-model.cpp @@ -1960,6 +1960,9 @@ bool QnnNspModel::calculate_rope_embeddings(void) { const size_t nmemb = m_ctx_size * m_pos_dim; const int pos_bw = d_pos.bw(); + const double theta = m_positional_encoding.rope_params.theta; + const RopeScalingParams& rope_scaling = m_positional_encoding.rope_params.rope_scaling; + rope_sin = malloc(nmemb * pos_bw); rope_cos = malloc(nmemb * pos_bw); @@ -1973,7 +1976,7 @@ bool QnnNspModel::calculate_rope_embeddings(void) { std::vector inv_freq(m_pos_dim); const double exponent = 1.0 / static_cast(m_pos_dim); for (int j = 0; j < m_pos_dim; j++) - inv_freq[j] = 1.0 / pow(rope_theta, j * exponent); + inv_freq[j] = 1.0 / pow(theta, j * exponent); double attention_factor = 1.0; if (rope_scaling.rope_type == RopeScalingParams::ROPE_LLAMA3) { // Implemented from HuggingFace @@ -1991,7 +1994,7 @@ bool QnnNspModel::calculate_rope_embeddings(void) { if (wavelen < high_freq_wavelen) // wavelen < high_freq_wavelen: do nothing continue; else if (wavelen > low_freq_wavelen) // wavelen > low_freq_wavelen: divide by factor - inv_freq[j] = 1.0 / static_cast(factor * pow(rope_theta, j * exponent)); + inv_freq[j] = 1.0 / static_cast(factor * pow(theta, j * exponent)); else { // otherwise: interpolate between the two, using a smooth factor assert(low_freq_wavelen != high_freq_wavelen); const double smooth = @@ -2266,7 +2269,7 @@ void QnnNspModel::dumpTensorSpecs() { } } -size_t QnnNspModel::loadKVCache(const std::string& load_path) { +size_t QnnNspModel::loadKVCache(const std::string& load_path, bool chooseHigherVariant) { if(m_disableKvCache){ __ERROR("KV cache is disabled, loading KV cache is not allowed"); @@ -2308,7 +2311,8 @@ size_t QnnNspModel::loadKVCache(const std::string& load_path) { // clang-format on const int32_t n_valid = static_cast(spec.update_size); - const int32_t variant = nsp_graph_count.begin()->first; // Set KVManager to smallest variant + int32_t variant = nsp_graph_count.begin()->first; // Set KVManager to smallest variant + if(chooseHigherVariant) variant = nsp_graph_count.rbegin()->first; // Ideal for loading KV prefix cache _kv_dispatcher->setVariant(variant); // Lock, 
load KeyCache then ValueCache, unlock diff --git a/Genie/Genie/src/qualla/engines/qnn-htp/nsp-model.hpp b/Genie/Genie/src/qualla/engines/qnn-htp/nsp-model.hpp index 67eee738ed09a690f14ad73a6921499314040415..100eed93067d1dd7db3a9be369c5db78270f4515 100644 --- a/Genie/Genie/src/qualla/engines/qnn-htp/nsp-model.hpp +++ b/Genie/Genie/src/qualla/engines/qnn-htp/nsp-model.hpp @@ -54,14 +54,14 @@ struct RopeScalingParams { double low_freq_factor; double high_freq_factor; int original_max_position_embeddings; - } llama3_params; + } llama3_params {0}; struct { double factor; std::vector long_factor; std::vector short_factor; int original_max_position_embeddings; - } longrope_params; + } longrope_params {0}; RopeScalingParams() {} }; @@ -79,7 +79,7 @@ struct PositionalEncoding { int32_t dims; double theta; RopeScalingParams rope_scaling; - } rope_params; + } rope_params {0}; PositionalEncoding() { type = ROPE; } }; @@ -265,10 +265,8 @@ class QnnNspModel { QnnUtils::Tensor* t_position_ids{nullptr}; // PositionalEncodingType::ROPE variables int32_t m_pos_dim{-1}; // Dimension of positional embedding tensor (incl partial_factor) - double rope_theta{10000.0}; // Base theta parameter for RoPE calculations void* rope_sin{nullptr}; // Pre-calculated RoPE sin table of size [ctx_size, m_pos_dim] void* rope_cos{nullptr}; // Pre-calculated RoPE cos table of size [ctx_size, m_pos_dim] - RopeScalingParams rope_scaling; // RoPE scaling parameters QnnUtils::Tensor* t_position_ids_sin{nullptr}; QnnUtils::Tensor* t_position_ids_cos{nullptr}; @@ -398,7 +396,7 @@ class QnnNspModel { bool debugOutputs(QnnUtils::Tensor* outTensor, std::string& outTensorName); - size_t loadKVCache(const std::string& load_path); + size_t loadKVCache(const std::string& load_path, bool chooseHigherVariant=false); bool saveKVCache(const std::string& save_path); bool applyLoraStrength(const std::string& alpha_tensor_name, const float alpha_val); bool applyLoraAdapter(const std::string& lora_adapter_name); diff --git a/Genie/Genie/src/qualla/include/qualla/detail/basic-sampler.hpp b/Genie/Genie/src/qualla/include/qualla/detail/basic-sampler.hpp index 0cee18fd02e535816518bbbcc1e82d1bf89d6c12..841c82787f32091c3de3381ceb2bd2a760234e2c 100644 --- a/Genie/Genie/src/qualla/include/qualla/detail/basic-sampler.hpp +++ b/Genie/Genie/src/qualla/include/qualla/detail/basic-sampler.hpp @@ -39,6 +39,7 @@ class BasicSampler : public Sampler { virtual bool save(const std::string& name) override; virtual bool restore(const std::string& name) override; virtual void reset() override; + virtual void applyConfig(const qualla::json& conf) override; protected: int32_t _process(std::span logits, std::vector* probs_out, bool samp_tok); diff --git a/Genie/Genie/src/qualla/include/qualla/dialog.hpp b/Genie/Genie/src/qualla/include/qualla/dialog.hpp index 47953b70c8bee23091aafdc7543a7a8c9ecfc1c1..c7e75933b9b2f901dd3a7860f3e233be9b2abdc7 100644 --- a/Genie/Genie/src/qualla/include/qualla/dialog.hpp +++ b/Genie/Genie/src/qualla/include/qualla/dialog.hpp @@ -107,6 +107,7 @@ class Dialog : public State { Tokenizer& tokenizer() { return *_tokenizer; } Sampler& sampler(const std::string& role = "primary") { return *_sampler[role]; } Engine& engine(const std::string& role = "primary") { return *_engine[role]; } + bool isSamplerPresent(std::string role) { return _sampler.find(role) != _sampler.end(); } // Get latest KPIs. // Updates TPS, etc as needed. 
diff --git a/Genie/Genie/src/qualla/include/qualla/engine.hpp b/Genie/Genie/src/qualla/include/qualla/engine.hpp index 73c9bfb2950c3dbea1f97abdf2b9d33e5f26aae6..a3c10b8da7f60084d8f0e7ff72627f30e1918735 100644 --- a/Genie/Genie/src/qualla/include/qualla/engine.hpp +++ b/Genie/Genie/src/qualla/include/qualla/engine.hpp @@ -86,7 +86,7 @@ class Engine : public State { QUALLA_API virtual bool updateKV(size_t n_past, const std::vector& selected); QUALLA_API virtual bool save(const std::string& name); - QUALLA_API virtual size_t restore(const std::string& name); + QUALLA_API virtual size_t restore(const std::string& name, bool chooseHigherVariant=false); QUALLA_API virtual void reset(); QUALLA_API virtual bool cacheEosEmbedding(std::vector& eosEmbedding); diff --git a/Genie/Genie/src/qualla/include/qualla/sampler.hpp b/Genie/Genie/src/qualla/include/qualla/sampler.hpp index e1f0f5249cde4a54f9316dde0b35c9bfb1b2c659..7841fa4415268e0f1e6ce263edebe8b5fcdaa727 100644 --- a/Genie/Genie/src/qualla/include/qualla/sampler.hpp +++ b/Genie/Genie/src/qualla/include/qualla/sampler.hpp @@ -54,6 +54,7 @@ class Sampler : public State { QUALLA_API virtual bool save(const std::string& name); QUALLA_API virtual bool restore(const std::string& name); QUALLA_API virtual void reset(); + QUALLA_API virtual void applyConfig(const qualla::json& conf); // Get sampler type const std::string& type() const { return _type; } diff --git a/Genie/Genie/src/qualla/sampler.cpp b/Genie/Genie/src/qualla/sampler.cpp index e8398b5f6e00e8fba89f2671af2a800d44280bd6..1bd4e0cffdd62981ff037faddc7598e85fed4f83 100644 --- a/Genie/Genie/src/qualla/sampler.cpp +++ b/Genie/Genie/src/qualla/sampler.cpp @@ -84,6 +84,10 @@ std::vector Sampler::process_multiple( return {-1}; } +void Sampler::applyConfig(const qualla::json& conf) { + _env.logger().warn(fmt::format("Basic sampler supports this for now")); +} + // Sampler registry using Registry = std::unordered_map; diff --git a/Genie/Genie/src/qualla/samplers/basic.cpp b/Genie/Genie/src/qualla/samplers/basic.cpp index 52084a336ce4e416aa2c1619f7308f213f891e3c..044efa27bd55490481529588388e9d1365ab514c 100644 --- a/Genie/Genie/src/qualla/samplers/basic.cpp +++ b/Genie/Genie/src/qualla/samplers/basic.cpp @@ -221,4 +221,12 @@ static OnLoad regy([]() { void needBasicSampler() {} +void BasicSampler::applyConfig(const json& conf) { + if (conf.contains("seed")) _seed = conf["seed"]; + if (conf.contains("temp")) _temp = conf["temp"]; + + if (conf.contains("top-k")) _top_k = conf["top-k"]; + if (conf.contains("top-p")) _top_p = conf["top-p"]; +} + } // namespace qualla diff --git a/Genie/Genie/src/qualla/tokenizers/rust/Cargo.lock b/Genie/Genie/src/qualla/tokenizers/rust/Cargo.lock index f48eb7a896e34b6d820eedccb0b6cbad3dbbee3d..5c65879523bc20ed14f0a4408c5cee1c0be83760 100644 --- a/Genie/Genie/src/qualla/tokenizers/rust/Cargo.lock +++ b/Genie/Genie/src/qualla/tokenizers/rust/Cargo.lock @@ -31,9 +31,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "cc" -version = "1.1.34" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67b9470d453346108f93a59222a9a1a5724db32d0a4727b7ab7ace4b4d822dc9" +checksum = "fd9de9f2205d5ef3fd67e685b0df337994ddd4495e2a28d185500d0e1edfea47" dependencies = [ "shlex", ] @@ -190,9 +190,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.11" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +checksum = "540654e97a3f4470a492cd30ff187bc95d89557a903a2bbf112e2fae98104ef2" [[package]] name = "lazy_static" @@ -202,9 +202,9 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "libc" -version = "0.2.161" +version = "0.2.164" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e9489c2807c139ffd9c1794f4af0ebe86a828db53ecdc7fea2111d0fed085d1" +checksum = "433bfe06b8c75da9b2e3fbea6e5329ff87748f0b144ef75306e674c3f6f7c13f" [[package]] name = "log" @@ -322,9 +322,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.89" +version = "1.0.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" +checksum = "307e3004becf10f5a6e0d59d20f3cd28231b0e0827a96cd3e0ce6d14bc1e4bb3" dependencies = [ "unicode-ident", ] @@ -413,9 +413,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.8" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", @@ -436,18 +436,18 @@ checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" [[package]] name = "serde" -version = "1.0.214" +version = "1.0.215" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f55c3193aca71c12ad7890f1785d2b73e1b9f63a0bbc353c08ef26fe03fc56b5" +checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.214" +version = "1.0.215" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de523f781f095e28fa605cdce0f8307e451cc0fd14e2eb4cd2e98a355b147766" +checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" dependencies = [ "proc-macro2", "quote", @@ -456,9 +456,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.132" +version = "1.0.133" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03" +checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377" dependencies = [ "itoa", "memchr", @@ -498,9 +498,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" -version = "2.0.87" +version = "2.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" +checksum = "44d46482f1c1c87acd84dea20c1bf5ebff4c757009ed6bf19cfd36fb10e92c4e" dependencies = [ "proc-macro2", "quote", @@ -509,18 +509,18 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.66" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d171f59dbaa811dbbb1aee1e73db92ec2b122911a48e1390dfe327a821ddede" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.66" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b08be0f17bd307950653ce45db00cd31200d82b624b36e181337d9c7d92765b5" +checksum = 
"4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", @@ -529,9 +529,9 @@ dependencies = [ [[package]] name = "tokenizers" -version = "0.20.1" +version = "0.20.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b172ffa9a2e5c31bbddc940cd5725d933ced983a9333bbebc4c7eda3bbce1557" +checksum = "67b67c92f6d705e2a1d106fb0b28c696f9074901a9c656ee5d9f5de204c39bf7" dependencies = [ "aho-corasick", "derive_builder", @@ -569,9 +569,9 @@ dependencies = [ [[package]] name = "unicode-ident" -version = "1.0.13" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" +checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" [[package]] name = "unicode-normalization-alignments" diff --git a/Genie/Model/model.cpp b/Genie/Model/model.cpp index 05b4347399fd36a0a8087c3740fea934a18863d6..ff1809a348105b921b43cece3e209fa5fe4dbda7 100644 --- a/Genie/Model/model.cpp +++ b/Genie/Model/model.cpp @@ -179,8 +179,29 @@ MODEL_LIB_EXPORT ModelError_t QnnModel_GenAI_composeGraphs(Qnn_BackendHandle_t b (Qnn_Tensor_t)tin6), err); + uint32_t input6Dim[1] = {1}; + Qnn_Tensor_t tin7; + tin7.version = QNN_TENSOR_VERSION_1; + tin7.v1.id = 0; + tin7.v1.name = "x6"; + tin7.v1.type = QNN_TENSOR_TYPE_APP_WRITE; + tin7.v1.dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; + tin7.v1.dataType = QNN_DATATYPE_FLOAT_32; + tin7.v1.quantizeParams.encodingDefinition = QNN_DEFINITION_UNDEFINED; + tin7.v1.quantizeParams.quantizationEncoding = QNN_QUANTIZATION_ENCODING_UNDEFINED; + tin7.v1.quantizeParams.scaleOffsetEncoding = {.scale = 0.0000000000000000f, + .offset = 0}; + tin7.v1.rank = 1; + tin7.v1.dimensions = input6Dim; + tin7.v1.memType = QNN_TENSORMEMTYPE_RAW; + tin7.v1.clientBuf = {.data = nullptr, .dataSize = 0}; + VALIDATE(qnn_model.addTensor( + "x6", // Node Name + (Qnn_Tensor_t)tin7), + err); + /* ADDING NODE FOR genAI */ - const char* inputs_genAI[] = {"x0", "x1", "x2", "x3", "x4", "x5"}; + const char* inputs_genAI[] = {"x0", "x1", "x2", "x3", "x4", "x5", "x6"}; Qnn_Tensor_t tout; tout.version = QNN_TENSOR_VERSION_1; @@ -224,7 +245,7 @@ MODEL_LIB_EXPORT ModelError_t QnnModel_GenAI_composeGraphs(Qnn_BackendHandle_t b params, // Node Params numParams, // Num Node Params inputs_genAI, // Input Tensor Names - 6, // Num Input Tensor Names + 7, // Num Input Tensor Names outputs_genAI, // Output Tensors 2 // Num Output Tensors ), diff --git a/Genie/configs/llama2-7b/llama2-7b-draft-htp-target-htp-spd.json b/Genie/configs/llama2-7b/llama2-7b-draft-htp-target-htp-spd.json index deba8e660921e50d0240a412a8a7bfaaec2b4ea1..c3e3e3eaea15b58f5128e91358604b6c5e926299 100644 --- a/Genie/configs/llama2-7b/llama2-7b-draft-htp-target-htp-spd.json +++ b/Genie/configs/llama2-7b/llama2-7b-draft-htp-target-htp-spd.json @@ -43,7 +43,8 @@ "cpu-mask": "0xe0", "kv-dim": 64, "kv-update-method": "SHIFT_CONCAT", - "allow-async-init": false + "allow-async-init": false, + "enable-graph-switching": false }, "extensions": "htp_backend_ext_config.json" }, diff --git a/Genie/configs/llama2-7b/llama2-7b-genaitransformer-lora.json b/Genie/configs/llama2-7b/llama2-7b-genaitransformer-lora.json new file mode 100644 index 0000000000000000000000000000000000000000..786421057caa818ad7a2f15c0a353869cbd42975 --- /dev/null +++ b/Genie/configs/llama2-7b/llama2-7b-genaitransformer-lora.json @@ -0,0 +1,62 @@ +{ + "dialog" : { + "version" : 1, + "type" : "basic", + "stop-sequence" 
: [""], + "max-num-tokens" : 200, + "context" : { + "version" : 1, + "size": 512, + "n-vocab": 32000, + "bos-token": 1, + "eos-token": 2 + }, + "sampler" : { + "version" : 1, + "seed" : 100, + "temp" : 1.2, + "top-k" : 20, + "top-p" : 0.75, + "greedy" : false + }, + "tokenizer" : { + "version" : 1, + "path" : "your/path/to/tokenizer_file.json" + }, + "engine" : { + "version" : 1, + "n-threads" : 6, + "backend" : { + "version" : 1, + "type" : "QnnGenAiTransformer", + "QnnGenAiTransformer" : { + "version" : 1, + "n-layer": 32, + "n-embd": 4096, + "n-heads": 32 + } + }, + "model" : { + "version" : 1, + "type" : "library", + "library" : { + "version" : 1, + "model-bin" : "your/path/to/model/file.bin", + "lora": { + "version": 1, + "alpha-tensor-name": "alpha", + "adapters": [ + { + "version": 1, + "name": "lora1", + "bin-sections": [ + "your/path/to/model/lora/file.bin" + ] + } + ] + } + } + } + } + } +} \ No newline at end of file diff --git a/Genie/configs/llama2-7b/llama2-7b-genaitransformer.json b/Genie/configs/llama2-7b/llama2-7b-genaitransformer.json index 482bc6315c28a4d1b388d6cc9181f0067ba1ed86..0c49d20bb57773551332953b1210b44d1eff1d64 100644 --- a/Genie/configs/llama2-7b/llama2-7b-genaitransformer.json +++ b/Genie/configs/llama2-7b/llama2-7b-genaitransformer.json @@ -30,7 +30,10 @@ "version" : 1, "type" : "QnnGenAiTransformer", "QnnGenAiTransformer" : { - "version" : 1 + "version" : 1, + "n-layer": 32, + "n-embd": 4096, + "n-heads": 32 } }, "model" : { diff --git a/Genie/configs/llama2-7b/llama2-7b-gpu.json b/Genie/configs/llama2-7b/llama2-7b-gpu.json new file mode 100644 index 0000000000000000000000000000000000000000..162a635d2fe701d47c48ab1d8cd1a4de528b7234 --- /dev/null +++ b/Genie/configs/llama2-7b/llama2-7b-gpu.json @@ -0,0 +1,43 @@ +{ + "dialog" : { + "version" : 1, + "type" : "basic", + "context" : { + "version" : 1, + "size": 1024, + "n-vocab": 32000, + "bos-token": 1, + "eos-token": 2 + }, + "sampler" : { + "version" : 1, + "seed" : 42, + "temp" : 1.1, + "top-k" : 40, + "top-p" : 0.95, + "greedy" : false + }, + "tokenizer" : { + "version" : 1, + "path" : "/path/to/tokenizer.json" + }, + "engine" : { + "version" : 1, + "n-threads" : 3, + "backend" : { + "version" : 1, + "type" : "QnnGpu" + }, + "model" : { + "version" : 1, + "type" : "binary", + "binary" : { + "version" : 1, + "ctx-bins" : [ + "/path/to/model.bin" + ] + } + } + } + } +} diff --git a/Genie/configs/llama2-7b/llama2-7b-htp-lade.json b/Genie/configs/llama2-7b/llama2-7b-htp-lade.json index 5f58c7f2f1858f645e61a9e224114cef587b881b..10c2ae0934e6cd5ab71bcb699773c32acdeee9f0 100644 --- a/Genie/configs/llama2-7b/llama2-7b-htp-lade.json +++ b/Genie/configs/llama2-7b/llama2-7b-htp-lade.json @@ -43,7 +43,8 @@ "pos-id-dim" : 64, "cpu-mask" : "0xe0", "kv-dim" : 128, - "allow-async-init": false + "allow-async-init": false, + "enable-graph-switching": false }, "extensions" : "htp_backend_ext_config.json" }, diff --git a/Genie/configs/llama2-7b/llama2-7b-htp-lora.json b/Genie/configs/llama2-7b/llama2-7b-htp-lora.json index dca87b77308ed8685e2a962a4d4774d14365e5aa..b9d2b770adc7bf0daf342d19b5909b5a45cc10b0 100644 --- a/Genie/configs/llama2-7b/llama2-7b-htp-lora.json +++ b/Genie/configs/llama2-7b/llama2-7b-htp-lora.json @@ -36,7 +36,8 @@ "pos-id-dim" : 64, "cpu-mask" : "0xe0", "kv-dim" : 128, - "allow-async-init": false + "allow-async-init": false, + "enable-graph-switching": false }, "extensions" : "htp_backend_ext_config.json" }, diff --git a/Genie/configs/llama2-7b/llama2-7b-htp-multistream.json 
diff --git a/Genie/configs/llama2-7b/llama2-7b-htp-multistream.json b/Genie/configs/llama2-7b/llama2-7b-htp-multistream.json
index 4b92aec9ac2639fbf1d4da98f09d4e39406c8bec..346e4fd6f6011f6adcbd9d1180c9cb98b26e8cc7 100644
--- a/Genie/configs/llama2-7b/llama2-7b-htp-multistream.json
+++ b/Genie/configs/llama2-7b/llama2-7b-htp-multistream.json
@@ -39,7 +39,8 @@
             "poll" : true,
             "pos-id-dim" : 64,
             "cpu-mask" : "0xe0",
-            "kv-dim" : 128
+            "kv-dim" : 128,
+            "enable-graph-switching": false
         },
         "extensions" : "htp_backend_ext_config.json"
     },
diff --git a/Genie/configs/llama2-7b/llama2-7b-htp-ssd.json b/Genie/configs/llama2-7b/llama2-7b-htp-ssd.json
index 4b73c2880cdec9a676995b0a4ed4f5a4ad87c36a..3961fd62bfebd39ca40fa08765743ef084b6dfd1 100644
--- a/Genie/configs/llama2-7b/llama2-7b-htp-ssd.json
+++ b/Genie/configs/llama2-7b/llama2-7b-htp-ssd.json
@@ -46,7 +46,8 @@
             "pos-id-dim" : 64,
             "cpu-mask" : "0xe0",
             "kv-dim" : 128,
-            "allow-async-init": false
+            "allow-async-init": false,
+            "enable-graph-switching": false
         },
         "extensions" : "htp_backend_ext_config.json"
     },
diff --git a/Genie/configs/llama2-7b/llama2-7b-htp-windows.json b/Genie/configs/llama2-7b/llama2-7b-htp-windows.json
index de0c403d346090f8e1bed4314cf58c8e28a6e282..e5394f0c5beaec3dc483d54ae508d6e747bd1c5e 100644
--- a/Genie/configs/llama2-7b/llama2-7b-htp-windows.json
+++ b/Genie/configs/llama2-7b/llama2-7b-htp-windows.json
@@ -36,7 +36,8 @@
             "pos-id-dim" : 64,
             "cpu-mask" : "0xe0",
             "kv-dim" : 128,
-            "allow-async-init": false
+            "allow-async-init": false,
+            "enable-graph-switching": false
         },
         "extensions" : "htp_backend_ext_config.json"
     },
diff --git a/Genie/configs/llama2-7b/llama2-7b-htp.json b/Genie/configs/llama2-7b/llama2-7b-htp.json
index 3dd67a5bcab5529396de2bfda2c989930acba375..f348facb0b25d4decee6db4bca7cd96ef0f03642 100644
--- a/Genie/configs/llama2-7b/llama2-7b-htp.json
+++ b/Genie/configs/llama2-7b/llama2-7b-htp.json
@@ -36,7 +36,8 @@
             "poll" : true,
             "cpu-mask" : "0xe0",
             "kv-dim" : 128,
-            "allow-async-init": false
+            "allow-async-init": false,
+            "enable-graph-switching": false
         },
         "extensions" : "htp_backend_ext_config.json"
     },
diff --git a/Genie/configs/llama3-8b/llama3-8b-htp.json b/Genie/configs/llama3-8b/llama3-8b-htp.json
index d9061158aebac38cb920821e0269ad2e159da1a5..f6cb57296a3095d7eba47860f23901ef1dc0573b 100644
--- a/Genie/configs/llama3-8b/llama3-8b-htp.json
+++ b/Genie/configs/llama3-8b/llama3-8b-htp.json
@@ -36,7 +36,8 @@
             "pos-id-dim" : 64,
             "cpu-mask" : "0xe0",
             "kv-dim" : 128,
-            "rope-theta": 10000
+            "rope-theta": 10000,
+            "enable-graph-switching": false
         },
         "extensions" : "htp_backend_ext_config.json"
     },
diff --git a/Genie/configs/llava-e2t/llava-e2t-htp.json b/Genie/configs/llava-e2t/llava-e2t-htp.json
index 9b246870ef0e0da9fa1bd566892c6eb855a0ddb5..acc7120a91755919e5355591ac57ad0a5992a37c 100644
--- a/Genie/configs/llava-e2t/llava-e2t-htp.json
+++ b/Genie/configs/llava-e2t/llava-e2t-htp.json
@@ -43,7 +43,8 @@
             "cpu-mask" : "0xe0",
             "kv-dim" : 128,
             "allow-async-init" : false,
-            "rope-theta" : 10000
+            "rope-theta" : 10000,
+            "enable-graph-switching": false
         },
         "extensions" : "htp_backend_ext_config.json"
     },
diff --git a/Genie/configs/sampler.json b/Genie/configs/sampler.json
new file mode 100644
index 0000000000000000000000000000000000000000..a5bd72be89249a9c4c5a2c11fdc41214e063e2d9
--- /dev/null
+++ b/Genie/configs/sampler.json
@@ -0,0 +1,10 @@
+{
+    "sampler" : {
+        "version" : 1,
+        "seed" : 100,
+        "temp" : 1.2,
+        "top-k" : 25,
+        "top-p" : 0.8,
+        "greedy" : false
+    }
+}
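The standalone configs/sampler.json carries only a "sampler" section; that is the shape of document the new GenieSamplerConfig_createFromJson (used below in genie-t2t-run) expects. A short sketch of creating a config handle from it and overriding one field; the helper name and path are placeholders:

    #include <fstream>
    #include <iterator>
    #include <stdexcept>
    #include <string>

    #include "GenieCommon.h"
    #include "GenieSampler.h"

    // Sketch: build a sampler config from configs/sampler.json and tweak one knob.
    GenieSamplerConfig_Handle_t loadSamplerConfig(const std::string& path) {
      std::ifstream in(path);
      std::string json((std::istreambuf_iterator<char>(in)),
                       std::istreambuf_iterator<char>());

      GenieSamplerConfig_Handle_t handle = nullptr;
      if (GENIE_STATUS_SUCCESS != GenieSamplerConfig_createFromJson(json.c_str(), &handle)) {
        throw std::runtime_error("Failed to create sampler config.");
      }
      // Values are passed as strings; keys mirror the JSON ("seed", "temp", "top-k", ...).
      if (GENIE_STATUS_SUCCESS != GenieSamplerConfig_setParam(handle, "temp", "0.9")) {
        GenieSamplerConfig_free(handle);
        throw std::runtime_error("Failed to set sampler parameter.");
      }
      return handle;  // caller releases it with GenieSamplerConfig_free
    }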
diff --git a/Genie/genie-t2t-run/main.cpp b/Genie/genie-t2t-run/main.cpp
index aea8d2b9b2906c88c7fbfd7db4ba725441e586ca..f7156b59eb71b5f0060a11402526e2cac5ae2ec4 100644
--- a/Genie/genie-t2t-run/main.cpp
+++ b/Genie/genie-t2t-run/main.cpp
@@ -21,6 +21,7 @@
 
 #include "GenieCommon.h"
 #include "GenieDialog.h"
+#include "GenieSampler.h"
 
 std::string config{};
 std::string prompt{};
@@ -146,6 +147,15 @@ void printUsage(const char* program) {
   std::cout << "Input tokens provided as a file. Mutually exclusive with --prompt, --prompt_file "
                "and --embedding_file."
             << std::endl;
+  std::cout << std::endl;
+  std::cout << std::setw(width) << " -s PATH or --save PATH";
+  std::cout << "Saves the dialog state after the dialog is queried. PATH must be an existing path."
+            << std::endl;
+  std::cout << std::endl;
+  std::cout << std::setw(width) << " -r PATH or --restore PATH";
+  std::cout << "Restores the dialog state before the dialog is queried. PATH must contain a "
+               "previous save state."
+            << std::endl;
 }
 
 std::vector<std::string> split(const std::string& str) {
@@ -495,6 +505,49 @@ void tokenToTokenCallback(const uint32_t* token,
     }
   }
 }
+/*
+ * This class can be used to update sampler parameters in between queries.
+ * Usage:
+ *   SamplerConfig sc = SamplerConfig();
+ *   sc.createSamplerConfig(configPath);
+ *   sc.setParam("top-p", "0.8"); // Refer to sampler.json for the parameters that can be updated
+ *   dialog.getSampler();
+ *   dialog.applyConfig(sc());
+ */
+class SamplerConfig {
+ public:
+  void createSamplerConfig(const std::string& configPath) {
+    std::ifstream confStream(configPath);
+    std::string config;
+    std::getline(confStream, config, '\0');
+    m_config = config;
+    const int32_t status = GenieSamplerConfig_createFromJson(config.c_str(), &m_handle);
+    if (GENIE_STATUS_SUCCESS != status) {
+      throw std::runtime_error("Failed to create sampler config.");
+    }
+  }
+
+  std::string getConfigString() { return m_config; }
+
+  void setParam(const std::string keyStr, const std::string valueStr) {
+    const int32_t status = GenieSamplerConfig_setParam(m_handle, keyStr.c_str(), valueStr.c_str());
+    if (GENIE_STATUS_SUCCESS != status) {
+      throw std::runtime_error("Failed to setParam");
+    }
+  }
+
+  ~SamplerConfig() {
+    const int32_t status = GenieSamplerConfig_free(m_handle);
+    if (GENIE_STATUS_SUCCESS != status) {
+      std::cerr << "Failed to free the sampler config." << std::endl;
+    }
+  }
+
+  GenieSamplerConfig_Handle_t operator()() const { return m_handle; }
+
+ private:
+  GenieSamplerConfig_Handle_t m_handle = NULL;
+  std::string m_config;
+};
 
 class Dialog {
  public:
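Read together with the Dialog methods added in the next hunk, the intended between-queries flow is the one the class comment describes. Spelled out as a sketch (the config path and the two prompt variables are placeholders, and 'dialog' is an already-created Dialog):

    // Hypothetical flow: change sampling parameters between two queries.
    SamplerConfig sc;
    sc.createSamplerConfig("configs/sampler.json");  // placeholder path
    dialog.getSampler();            // caches the dialog's sampler handle (added below)
    dialog.query(firstPrompt);
    sc.setParam("top-p", "0.8");    // any key from sampler.json, value as a string
    dialog.applyConfig(sc());       // operator() yields the underlying config handle
    dialog.query(secondPrompt);     // subsequent sampling uses the updated parameters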
@@ -559,6 +612,19 @@ class Dialog {
     }
   }
 
+  void getSampler() {
+    const int32_t status = GenieDialog_getSampler(m_handle, &m_samplerHandle);
+    if (GENIE_STATUS_SUCCESS != status) {
+      throw std::runtime_error("Failed to get sampler.");
+    }
+  }
+
+  void applyConfig(GenieSamplerConfig_Handle_t samplerConfigHandle) {
+    const int32_t status = GenieSampler_applyConfig(m_samplerHandle, samplerConfigHandle);
+    if (GENIE_STATUS_SUCCESS != status) {
+      throw std::runtime_error("Failed to apply sampler config.");
+    }
+  }
 #if defined(GENIE_E2T_FEATURE)
   void embeddingQuery(const void* embeddings, const uint32_t embeddingsSize) {
     GenieDialog_TokenToEmbeddingCallback_t t2eCallback{nullptr};
@@ -632,7 +698,8 @@ class Dialog {
 #endif
 
  private:
-  GenieDialog_Handle_t m_handle = NULL;
+  GenieDialog_Handle_t m_handle         = NULL;
+  GenieSampler_Handle_t m_samplerHandle = NULL;
 };
 
 int main(int argc, char** argv) {
@@ -655,6 +722,9 @@ int main(int argc, char** argv) {
     dialog.setLoraStrength("primary", loraAlphaName, loraAlphaValue);
   }
 #endif
+  if (!restorePath.empty()) {
+    dialog.restore(restorePath);
+  }
 
 #if defined(GENIE_E2T_FEATURE)
   if (embeddingBufferSize != 0) {
@@ -675,13 +745,10 @@ int main(int argc, char** argv) {
     } else {
       std::cout << "[PROMPT]: " << prompt.c_str() << std::endl;
       std::cout << std::endl;
-      if (!restorePath.empty()) {
-        dialog.restore(restorePath);
-      }
       dialog.query(prompt);
-      if (!savePath.empty()) {
-        dialog.save(savePath);
-      }
+    }
+    if (!savePath.empty()) {
+      dialog.save(savePath);
     }
   } catch (const std::exception& e) {
     std::cerr << e.what() << std::endl;
diff --git a/Genie/genie-t2t-run/make/Makefile.linux-x86_64 b/Genie/genie-t2t-run/make/Makefile.linux-x86_64
index c1425fda9406b0dc4c7680c9ba9ee747a81386ed..1d919fe0d49f6507f4687e56fdea81a66fd39646 100644
--- a/Genie/genie-t2t-run/make/Makefile.linux-x86_64
+++ b/Genie/genie-t2t-run/make/Makefile.linux-x86_64
@@ -18,7 +18,7 @@ endif
 QNN_TARGET ?= x86_64-linux-clang
 export TARGET_DIR := ./bin/$(QNN_TARGET)
 
-genie-t2t-run := $(TARGET_DIR)/genie-t2t-run
+genie-t2t-run := $(TARGET_DIR)/genie-t2t-run-new
 
 # define target architecture if not previously defined, default is x86
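Two behavioral notes on the main() and Makefile changes: restore now runs once before any query path rather than only inside the plain-prompt branch, and save now runs after the query regardless of which branch produced it; on linux-x86_64 the output binary is additionally renamed to genie-t2t-run-new. A hypothetical invocation pair exercising save then restore (the -s/-r and --save/--restore spellings come from the usage text above; the config flag and paths are assumptions, not taken from this patch):

    ./genie-t2t-run-new --config configs/llama2-7b/llama2-7b-htp.json --prompt "First question" --save /tmp/dialog-state
    ./genie-t2t-run-new --config configs/llama2-7b/llama2-7b-htp.json --prompt "Follow-up" --restore /tmp/dialog-state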