/** Copyright (c) 2023 NVIDIA CORPORATION. All rights reserved. * NVIDIA CORPORATION and its licensors retain all intellectual property * and proprietary rights in and to this software, related documentation * and any modifications thereto. Any use, reproduction, disclosure or * distribution of this software and related documentation without an express * license agreement from NVIDIA CORPORATION is strictly prohibited. */ #include "../native/crt.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(_WIN64) extern "C" void __chkstk(); #elif defined(__APPLE__) extern "C" void __bzero(void*, size_t); extern "C" __double2 __sincos_stret(double); extern "C" __float2 __sincosf_stret(float); #endif extern "C" { // GDB and LLDB support debugging of JIT-compiled code by observing calls to __jit_debug_register_code() // by putting a breakpoint on it, and retrieving the debug info through __jit_debug_descriptor. // On Linux it suffices for these symbols not to be stripped out, while for Windows a .pdb has to contain // their information. LLVM defines them, but we don't want a huge .pdb with all LLVM source code's debug // info. By forward-declaring them here it suffices to compile this file with /Zi. extern struct jit_descriptor __jit_debug_descriptor; extern void __jit_debug_register_code(); } namespace wp { #if defined (_WIN32) // Windows defaults to using the COFF binary format (aka. "msvc" in the target triple). // Override it to use the ELF format to support DWARF debug info, but keep using the // Microsoft calling convention (see also https://llvm.org/docs/DebuggingJITedCode.html). static const char* target_triple = "x86_64-pc-windows-elf"; #else static const char* target_triple = LLVM_DEFAULT_TARGET_TRIPLE; #endif static void initialize_llvm() { llvm::InitializeAllTargetInfos(); llvm::InitializeAllTargets(); llvm::InitializeAllTargetMCs(); llvm::InitializeAllAsmPrinters(); } static std::unique_ptr cpp_to_llvm(const std::string& input_file, const char* cpp_src, const char* include_dir, bool debug, llvm::LLVMContext& context) { // Compilation arguments std::vector args; args.push_back(input_file.c_str()); args.push_back("-I"); args.push_back(include_dir); args.push_back(debug ? "-O0" : "-O2"); args.push_back("-triple"); args.push_back(target_triple); #if defined(__x86_64__) || defined(_M_X64) args.push_back("-target-feature"); args.push_back("+f16c"); // Enables support for _Float16 #endif clang::IntrusiveRefCntPtr diagnostic_options = new clang::DiagnosticOptions(); std::unique_ptr text_diagnostic_printer = std::make_unique(llvm::errs(), &*diagnostic_options); clang::IntrusiveRefCntPtr diagnostic_ids; std::unique_ptr diagnostic_engine = std::make_unique(diagnostic_ids, &*diagnostic_options, text_diagnostic_printer.release()); clang::CompilerInstance compiler_instance; auto& compiler_invocation = compiler_instance.getInvocation(); clang::CompilerInvocation::CreateFromArgs(compiler_invocation, args, *diagnostic_engine.release()); if(debug) { compiler_invocation.getCodeGenOpts().setDebugInfo(clang::codegenoptions::FullDebugInfo); } // Map code to a MemoryBuffer std::unique_ptr buffer = llvm::MemoryBuffer::getMemBufferCopy(cpp_src); compiler_invocation.getPreprocessorOpts().addRemappedFile(input_file.c_str(), buffer.get()); if(!debug) { compiler_instance.getPreprocessorOpts().addMacroDef("NDEBUG"); } compiler_instance.getLangOpts().MicrosoftExt = 1; // __forceinline / __int64 compiler_instance.getLangOpts().DeclSpecKeyword = 1; // __declspec compiler_instance.createDiagnostics(text_diagnostic_printer.get(), false); clang::EmitLLVMOnlyAction emit_llvm_only_action(&context); bool success = compiler_instance.ExecuteAction(emit_llvm_only_action); buffer.release(); return success ? std::move(emit_llvm_only_action.takeModule()) : nullptr; } static std::unique_ptr cuda_to_llvm(const std::string& input_file, const char* cpp_src, const char* include_dir, bool debug, llvm::LLVMContext& context) { // Compilation arguments std::vector args; args.push_back(input_file.c_str()); args.push_back("-I"); args.push_back(include_dir); args.push_back(debug ? "-O0" : "-O2"); args.push_back("-triple"); args.push_back("nvptx64-nvidia-cuda"); args.push_back("-target-cpu"); args.push_back("sm_70"); clang::IntrusiveRefCntPtr diagnostic_options = new clang::DiagnosticOptions(); std::unique_ptr text_diagnostic_printer = std::make_unique(llvm::errs(), &*diagnostic_options); clang::IntrusiveRefCntPtr diagnostic_ids; std::unique_ptr diagnostic_engine = std::make_unique(diagnostic_ids, &*diagnostic_options, text_diagnostic_printer.release()); clang::CompilerInstance compiler_instance; auto& compiler_invocation = compiler_instance.getInvocation(); clang::CompilerInvocation::CreateFromArgs(compiler_invocation, args, *diagnostic_engine.release()); if(debug) { compiler_invocation.getCodeGenOpts().setDebugInfo(clang::codegenoptions::FullDebugInfo); } // Map code to a MemoryBuffer std::unique_ptr buffer = llvm::MemoryBuffer::getMemBufferCopy(cpp_src); compiler_invocation.getPreprocessorOpts().addRemappedFile(input_file.c_str(), buffer.get()); // According to https://llvm.org/docs/CompileCudaWithLLVM.html, "Both clang and nvcc define `__CUDACC__` during CUDA compilation." // But this normally happens in the __clang_cuda_runtime_wrapper.h header, which we don't include. // The __CUDA__ and __CUDA_ARCH__ macros are internally defined by llvm-project/clang/lib/Frontend/InitPreprocessor.cpp compiler_instance.getPreprocessorOpts().addMacroDef("__CUDACC__"); if(!debug) { compiler_instance.getPreprocessorOpts().addMacroDef("NDEBUG"); } compiler_instance.getLangOpts().CUDA = 1; compiler_instance.getLangOpts().CUDAIsDevice = 1; compiler_instance.getLangOpts().CUDAAllowVariadicFunctions = 1; compiler_instance.createDiagnostics(text_diagnostic_printer.get(), false); clang::EmitLLVMOnlyAction emit_llvm_only_action(&context); bool success = compiler_instance.ExecuteAction(emit_llvm_only_action); buffer.release(); return success ? std::move(emit_llvm_only_action.takeModule()) : nullptr; } extern "C" { WP_API int compile_cpp(const char* cpp_src, const char *input_file, const char* include_dir, const char* output_file, bool debug) { initialize_llvm(); llvm::LLVMContext context; std::unique_ptr module = cpp_to_llvm(input_file, cpp_src, include_dir, debug, context); if(!module) { return -1; } std::string error; const llvm::Target* target = llvm::TargetRegistry::lookupTarget(target_triple, error); const char* CPU = "generic"; const char* features = ""; llvm::TargetOptions target_options; llvm::Reloc::Model relocation_model = llvm::Reloc::PIC_; // Position Independent Code llvm::CodeModel::Model code_model = llvm::CodeModel::Large; // Don't make assumptions about displacement sizes llvm::TargetMachine* target_machine = target->createTargetMachine(target_triple, CPU, features, target_options, relocation_model, code_model); module->setDataLayout(target_machine->createDataLayout()); std::error_code error_code; llvm::raw_fd_ostream output(output_file, error_code, llvm::sys::fs::OF_None); llvm::legacy::PassManager pass_manager; llvm::CodeGenFileType file_type = llvm::CGFT_ObjectFile; target_machine->addPassesToEmitFile(pass_manager, output, nullptr, file_type); pass_manager.run(*module); output.flush(); delete target_machine; return 0; } WP_API int compile_cuda(const char* cpp_src, const char *input_file, const char* include_dir, const char* output_file, bool debug) { initialize_llvm(); llvm::LLVMContext context; std::unique_ptr module = cuda_to_llvm(input_file, cpp_src, include_dir, debug, context); if(!module) { return -1; } std::string error; const llvm::Target* target = llvm::TargetRegistry::lookupTarget("nvptx64-nvidia-cuda", error); const char* CPU = "sm_70"; const char* features = "+ptx75"; // Warp requires CUDA 11.5, which supports PTX ISA 7.5 llvm::TargetOptions target_options; llvm::Reloc::Model relocation_model = llvm::Reloc::PIC_; llvm::TargetMachine* target_machine = target->createTargetMachine("nvptx64-nvidia-cuda", CPU, features, target_options, relocation_model); module->setDataLayout(target_machine->createDataLayout()); // Link libdevice llvm::SMDiagnostic diagnostic; std::string libdevice_path = std::string(include_dir) + "/libdevice/libdevice.10.bc"; std::unique_ptr libdevice(llvm::parseIRFile(libdevice_path, diagnostic, context)); if(!libdevice) { return -1; } llvm::Linker linker(*module.get()); if(linker.linkInModule(std::move(libdevice), llvm::Linker::Flags::LinkOnlyNeeded) == true) { return -1; } std::error_code error_code; llvm::raw_fd_ostream output(output_file, error_code, llvm::sys::fs::OF_None); llvm::legacy::PassManager pass_manager; llvm::CodeGenFileType file_type = llvm::CGFT_AssemblyFile; target_machine->addPassesToEmitFile(pass_manager, output, nullptr, file_type); pass_manager.run(*module); output.flush(); delete target_machine; return 0; } // Global JIT instance static llvm::orc::LLJIT* jit = nullptr; // Load an object file into an in-memory DLL named `module_name` WP_API int load_obj(const char* object_file, const char* module_name) { if(!jit) { initialize_llvm(); auto jit_expected = llvm::orc::LLJITBuilder() .setObjectLinkingLayerCreator( [&](llvm::orc::ExecutionSession &session, const llvm::Triple &triple) { auto get_memory_manager = []() { return std::make_unique(); }; auto obj_linking_layer = std::make_unique(session, std::move(get_memory_manager)); // Register the event listener. obj_linking_layer->registerJITEventListener(*llvm::JITEventListener::createGDBRegistrationListener()); // Make sure the debug info sections aren't stripped. obj_linking_layer->setProcessAllSections(true); return obj_linking_layer; }) .create(); if(!jit_expected) { std::cerr << "Failed to create JIT instance: " << toString(jit_expected.takeError()) << std::endl; return -1; } jit = (*jit_expected).release(); } auto dll = jit->createJITDylib(module_name); if(!dll) { std::cerr << "Failed to create JITDylib: " << toString(dll.takeError()) << std::endl; return -1; } // Define symbols for Warp's CRT functions subset { #if defined(__APPLE__) #define MANGLING_PREFIX "_" #else #define MANGLING_PREFIX "" #endif const auto flags = llvm::JITSymbolFlags::Exported | llvm::JITSymbolFlags::Absolute; #define SYMBOL(sym) { jit->getExecutionSession().intern(MANGLING_PREFIX #sym), { llvm::pointerToJITTargetAddress(&::sym), flags} } #define SYMBOL_T(sym, T) { jit->getExecutionSession().intern(MANGLING_PREFIX #sym), { llvm::pointerToJITTargetAddress(static_cast(&::sym)), flags} } auto error = dll->define(llvm::orc::absoluteSymbols({ SYMBOL(printf), SYMBOL(puts), SYMBOL(putchar), SYMBOL_T(abs, int(*)(int)), SYMBOL(llabs), SYMBOL(fmodf), SYMBOL_T(fmod, double(*)(double, double)), SYMBOL(logf), SYMBOL_T(log, double(*)(double)), SYMBOL(log2f), SYMBOL_T(log2, double(*)(double)), SYMBOL(log10f), SYMBOL_T(log10, double(*)(double)), SYMBOL(expf), SYMBOL_T(exp, double(*)(double)), SYMBOL(sqrtf), SYMBOL_T(sqrt, double(*)(double)), SYMBOL(cbrtf), SYMBOL_T(cbrt, double(*)(double)), SYMBOL(powf), SYMBOL_T(pow, double(*)(double, double)), SYMBOL(floorf), SYMBOL_T(floor, double(*)(double)), SYMBOL(ceilf), SYMBOL_T(ceil, double(*)(double)), SYMBOL(fabsf), SYMBOL_T(fabs, double(*)(double)), SYMBOL(roundf), SYMBOL_T(round, double(*)(double)), SYMBOL(truncf), SYMBOL_T(trunc, double(*)(double)), SYMBOL(rintf), SYMBOL_T(rint, double(*)(double)), SYMBOL(acosf), SYMBOL_T(acos, double(*)(double)), SYMBOL(asinf), SYMBOL_T(asin, double(*)(double)), SYMBOL(atanf), SYMBOL_T(atan, double(*)(double)), SYMBOL(atan2f), SYMBOL_T(atan2, double(*)(double, double)), SYMBOL(cosf), SYMBOL_T(cos, double(*)(double)), SYMBOL(sinf), SYMBOL_T(sin, double(*)(double)), SYMBOL(tanf), SYMBOL_T(tan, double(*)(double)), SYMBOL(sinhf), SYMBOL_T(sinh, double(*)(double)), SYMBOL(coshf), SYMBOL_T(cosh, double(*)(double)), SYMBOL(tanhf), SYMBOL_T(tanh, double(*)(double)), SYMBOL(fmaf), SYMBOL(memcpy), SYMBOL(memset), SYMBOL(memmove), SYMBOL(_wp_assert), SYMBOL(_wp_isfinite), #if defined(_WIN64) // For functions with large stack frames the compiler will emit a call to // __chkstk() to linearly touch each memory page. This grows the stack without // triggering the stack overflow guards. SYMBOL(__chkstk), #elif defined(__APPLE__) SYMBOL(__bzero), SYMBOL(__sincos_stret), SYMBOL(__sincosf_stret), #else SYMBOL(sincosf), SYMBOL_T(sincos, void(*)(double,double*,double*)), #endif })); if(error) { std::cerr << "Failed to define symbols: " << llvm::toString(std::move(error)) << std::endl; return -1; } } // Load the object file into a memory buffer auto buffer = llvm::MemoryBuffer::getFile(object_file); if(!buffer) { std::cerr << "Failed to load object file: " << buffer.getError().message() << std::endl; return -1; } auto err = jit->addObjectFile(*dll, std::move(*buffer)); if(err) { std::cerr << "Failed to add object file: " << llvm::toString(std::move(err)) << std::endl; return -1; } return 0; } WP_API int unload_obj(const char* module_name) { if(!jit) // If there's no JIT instance there are no object files loaded { return 0; } auto* dll = jit->getJITDylibByName(module_name); llvm::Error error = jit->getExecutionSession().removeJITDylib(*dll); if(error) { std::cerr << "Failed to unload: " << llvm::toString(std::move(error)) << std::endl; return -1; } return 0; } WP_API uint64_t lookup(const char* dll_name, const char* function_name) { auto* dll = jit->getJITDylibByName(dll_name); auto func = jit->lookup(*dll, function_name); if(!func) { std::cerr << "Failed to lookup symbol: " << llvm::toString(func.takeError()) << std::endl; return 0; } return func->getValue(); } } // extern "C" } // namespace wp