Spaces:
Sleeping
Sleeping
| /** Copyright (c) 2023 NVIDIA CORPORATION. All rights reserved. | |
| * NVIDIA CORPORATION and its licensors retain all intellectual property | |
| * and proprietary rights in and to this software, related documentation | |
| * and any modifications thereto. Any use, reproduction, disclosure or | |
| * distribution of this software and related documentation without an express | |
| * license agreement from NVIDIA CORPORATION is strictly prohibited. | |
| */ | |
// Forward declarations of platform-specific runtime helpers that are exported to JIT-compiled code.
// Each symbol only exists on one platform, so the declarations must be guarded: __chkstk is the
// Windows stack probe, while __bzero and the __sincos*_stret helpers (and their __double2/__float2
// return types) are provided by Apple's system libraries only.
#if defined(_WIN64)
extern "C" void __chkstk();
#elif defined(__APPLE__)
extern "C" void __bzero(void*, size_t);
extern "C" __double2 __sincos_stret(double);
extern "C" __float2 __sincosf_stret(float);
#endif
| extern "C" { | |
| // GDB and LLDB support debugging of JIT-compiled code by observing calls to __jit_debug_register_code() | |
| // by putting a breakpoint on it, and retrieving the debug info through __jit_debug_descriptor. | |
| // On Linux it suffices for these symbols not to be stripped out, while for Windows a .pdb has to contain | |
| // their information. LLVM defines them, but we don't want a huge .pdb with all LLVM source code's debug | |
| // info. By forward-declaring them here it suffices to compile this file with /Zi. | |
| extern struct jit_descriptor __jit_debug_descriptor; | |
| extern void __jit_debug_register_code(); | |
| } | |
| namespace wp { | |
| // Windows defaults to using the COFF binary format (aka. "msvc" in the target triple). | |
| // Override it to use the ELF format to support DWARF debug info, but keep using the | |
| // Microsoft calling convention (see also https://llvm.org/docs/DebuggingJITedCode.html). | |
| static const char* target_triple = "x86_64-pc-windows-elf"; | |
| static const char* target_triple = LLVM_DEFAULT_TARGET_TRIPLE; | |
| static void initialize_llvm() | |
| { | |
| llvm::InitializeAllTargetInfos(); | |
| llvm::InitializeAllTargets(); | |
| llvm::InitializeAllTargetMCs(); | |
| llvm::InitializeAllAsmPrinters(); | |
| } | |
| static std::unique_ptr<llvm::Module> cpp_to_llvm(const std::string& input_file, const char* cpp_src, const char* include_dir, bool debug, llvm::LLVMContext& context) | |
| { | |
| // Compilation arguments | |
| std::vector<const char*> args; | |
| args.push_back(input_file.c_str()); | |
| args.push_back("-I"); | |
| args.push_back(include_dir); | |
| args.push_back(debug ? "-O0" : "-O2"); | |
| args.push_back("-triple"); | |
| args.push_back(target_triple); | |
| args.push_back("-target-feature"); | |
| args.push_back("+f16c"); // Enables support for _Float16 | |
| clang::IntrusiveRefCntPtr<clang::DiagnosticOptions> diagnostic_options = new clang::DiagnosticOptions(); | |
| std::unique_ptr<clang::TextDiagnosticPrinter> text_diagnostic_printer = | |
| std::make_unique<clang::TextDiagnosticPrinter>(llvm::errs(), &*diagnostic_options); | |
| clang::IntrusiveRefCntPtr<clang::DiagnosticIDs> diagnostic_ids; | |
| std::unique_ptr<clang::DiagnosticsEngine> diagnostic_engine = | |
| std::make_unique<clang::DiagnosticsEngine>(diagnostic_ids, &*diagnostic_options, text_diagnostic_printer.release()); | |
| clang::CompilerInstance compiler_instance; | |
| auto& compiler_invocation = compiler_instance.getInvocation(); | |
| clang::CompilerInvocation::CreateFromArgs(compiler_invocation, args, *diagnostic_engine.release()); | |
| if(debug) | |
| { | |
| compiler_invocation.getCodeGenOpts().setDebugInfo(clang::codegenoptions::FullDebugInfo); | |
| } | |
| // Map code to a MemoryBuffer | |
| std::unique_ptr<llvm::MemoryBuffer> buffer = llvm::MemoryBuffer::getMemBufferCopy(cpp_src); | |
| compiler_invocation.getPreprocessorOpts().addRemappedFile(input_file.c_str(), buffer.get()); | |
| if(!debug) | |
| { | |
| compiler_instance.getPreprocessorOpts().addMacroDef("NDEBUG"); | |
| } | |
| compiler_instance.getLangOpts().MicrosoftExt = 1; // __forceinline / __int64 | |
| compiler_instance.getLangOpts().DeclSpecKeyword = 1; // __declspec | |
| compiler_instance.createDiagnostics(text_diagnostic_printer.get(), false); | |
| clang::EmitLLVMOnlyAction emit_llvm_only_action(&context); | |
| bool success = compiler_instance.ExecuteAction(emit_llvm_only_action); | |
| buffer.release(); | |
| return success ? std::move(emit_llvm_only_action.takeModule()) : nullptr; | |
| } | |
| static std::unique_ptr<llvm::Module> cuda_to_llvm(const std::string& input_file, const char* cpp_src, const char* include_dir, bool debug, llvm::LLVMContext& context) | |
| { | |
| // Compilation arguments | |
| std::vector<const char*> args; | |
| args.push_back(input_file.c_str()); | |
| args.push_back("-I"); | |
| args.push_back(include_dir); | |
| args.push_back(debug ? "-O0" : "-O2"); | |
| args.push_back("-triple"); | |
| args.push_back("nvptx64-nvidia-cuda"); | |
| args.push_back("-target-cpu"); | |
| args.push_back("sm_70"); | |
| clang::IntrusiveRefCntPtr<clang::DiagnosticOptions> diagnostic_options = new clang::DiagnosticOptions(); | |
| std::unique_ptr<clang::TextDiagnosticPrinter> text_diagnostic_printer = | |
| std::make_unique<clang::TextDiagnosticPrinter>(llvm::errs(), &*diagnostic_options); | |
| clang::IntrusiveRefCntPtr<clang::DiagnosticIDs> diagnostic_ids; | |
| std::unique_ptr<clang::DiagnosticsEngine> diagnostic_engine = | |
| std::make_unique<clang::DiagnosticsEngine>(diagnostic_ids, &*diagnostic_options, text_diagnostic_printer.release()); | |
| clang::CompilerInstance compiler_instance; | |
| auto& compiler_invocation = compiler_instance.getInvocation(); | |
| clang::CompilerInvocation::CreateFromArgs(compiler_invocation, args, *diagnostic_engine.release()); | |
| if(debug) | |
| { | |
| compiler_invocation.getCodeGenOpts().setDebugInfo(clang::codegenoptions::FullDebugInfo); | |
| } | |
| // Map code to a MemoryBuffer | |
| std::unique_ptr<llvm::MemoryBuffer> buffer = llvm::MemoryBuffer::getMemBufferCopy(cpp_src); | |
| compiler_invocation.getPreprocessorOpts().addRemappedFile(input_file.c_str(), buffer.get()); | |
| // According to https://llvm.org/docs/CompileCudaWithLLVM.html, "Both clang and nvcc define `__CUDACC__` during CUDA compilation." | |
| // But this normally happens in the __clang_cuda_runtime_wrapper.h header, which we don't include. | |
| // The __CUDA__ and __CUDA_ARCH__ macros are internally defined by llvm-project/clang/lib/Frontend/InitPreprocessor.cpp | |
| compiler_instance.getPreprocessorOpts().addMacroDef("__CUDACC__"); | |
| if(!debug) | |
| { | |
| compiler_instance.getPreprocessorOpts().addMacroDef("NDEBUG"); | |
| } | |
| compiler_instance.getLangOpts().CUDA = 1; | |
| compiler_instance.getLangOpts().CUDAIsDevice = 1; | |
| compiler_instance.getLangOpts().CUDAAllowVariadicFunctions = 1; | |
| compiler_instance.createDiagnostics(text_diagnostic_printer.get(), false); | |
| clang::EmitLLVMOnlyAction emit_llvm_only_action(&context); | |
| bool success = compiler_instance.ExecuteAction(emit_llvm_only_action); | |
| buffer.release(); | |
| return success ? std::move(emit_llvm_only_action.takeModule()) : nullptr; | |
| } | |
| extern "C" { | |
| WP_API int compile_cpp(const char* cpp_src, const char *input_file, const char* include_dir, const char* output_file, bool debug) | |
| { | |
| initialize_llvm(); | |
| llvm::LLVMContext context; | |
| std::unique_ptr<llvm::Module> module = cpp_to_llvm(input_file, cpp_src, include_dir, debug, context); | |
| if(!module) | |
| { | |
| return -1; | |
| } | |
| std::string error; | |
| const llvm::Target* target = llvm::TargetRegistry::lookupTarget(target_triple, error); | |
| const char* CPU = "generic"; | |
| const char* features = ""; | |
| llvm::TargetOptions target_options; | |
| llvm::Reloc::Model relocation_model = llvm::Reloc::PIC_; // Position Independent Code | |
| llvm::CodeModel::Model code_model = llvm::CodeModel::Large; // Don't make assumptions about displacement sizes | |
| llvm::TargetMachine* target_machine = target->createTargetMachine(target_triple, CPU, features, target_options, relocation_model, code_model); | |
| module->setDataLayout(target_machine->createDataLayout()); | |
| std::error_code error_code; | |
| llvm::raw_fd_ostream output(output_file, error_code, llvm::sys::fs::OF_None); | |
| llvm::legacy::PassManager pass_manager; | |
| llvm::CodeGenFileType file_type = llvm::CGFT_ObjectFile; | |
| target_machine->addPassesToEmitFile(pass_manager, output, nullptr, file_type); | |
| pass_manager.run(*module); | |
| output.flush(); | |
| delete target_machine; | |
| return 0; | |
| } | |
| WP_API int compile_cuda(const char* cpp_src, const char *input_file, const char* include_dir, const char* output_file, bool debug) | |
| { | |
| initialize_llvm(); | |
| llvm::LLVMContext context; | |
| std::unique_ptr<llvm::Module> module = cuda_to_llvm(input_file, cpp_src, include_dir, debug, context); | |
| if(!module) | |
| { | |
| return -1; | |
| } | |
| std::string error; | |
| const llvm::Target* target = llvm::TargetRegistry::lookupTarget("nvptx64-nvidia-cuda", error); | |
| const char* CPU = "sm_70"; | |
| const char* features = "+ptx75"; // Warp requires CUDA 11.5, which supports PTX ISA 7.5 | |
| llvm::TargetOptions target_options; | |
| llvm::Reloc::Model relocation_model = llvm::Reloc::PIC_; | |
| llvm::TargetMachine* target_machine = target->createTargetMachine("nvptx64-nvidia-cuda", CPU, features, target_options, relocation_model); | |
| module->setDataLayout(target_machine->createDataLayout()); | |
| // Link libdevice | |
| llvm::SMDiagnostic diagnostic; | |
| std::string libdevice_path = std::string(include_dir) + "/libdevice/libdevice.10.bc"; | |
| std::unique_ptr<llvm::Module> libdevice(llvm::parseIRFile(libdevice_path, diagnostic, context)); | |
| if(!libdevice) | |
| { | |
| return -1; | |
| } | |
| llvm::Linker linker(*module.get()); | |
| if(linker.linkInModule(std::move(libdevice), llvm::Linker::Flags::LinkOnlyNeeded) == true) | |
| { | |
| return -1; | |
| } | |
| std::error_code error_code; | |
| llvm::raw_fd_ostream output(output_file, error_code, llvm::sys::fs::OF_None); | |
| llvm::legacy::PassManager pass_manager; | |
| llvm::CodeGenFileType file_type = llvm::CGFT_AssemblyFile; | |
| target_machine->addPassesToEmitFile(pass_manager, output, nullptr, file_type); | |
| pass_manager.run(*module); | |
| output.flush(); | |
| delete target_machine; | |
| return 0; | |
| } | |
| // Global JIT instance | |
| static llvm::orc::LLJIT* jit = nullptr; | |
| // Load an object file into an in-memory DLL named `module_name` | |
| WP_API int load_obj(const char* object_file, const char* module_name) | |
| { | |
| if(!jit) | |
| { | |
| initialize_llvm(); | |
| auto jit_expected = llvm::orc::LLJITBuilder() | |
| .setObjectLinkingLayerCreator( | |
| [&](llvm::orc::ExecutionSession &session, const llvm::Triple &triple) { | |
| auto get_memory_manager = []() { | |
| return std::make_unique<llvm::SectionMemoryManager>(); | |
| }; | |
| auto obj_linking_layer = std::make_unique<llvm::orc::RTDyldObjectLinkingLayer>(session, std::move(get_memory_manager)); | |
| // Register the event listener. | |
| obj_linking_layer->registerJITEventListener(*llvm::JITEventListener::createGDBRegistrationListener()); | |
| // Make sure the debug info sections aren't stripped. | |
| obj_linking_layer->setProcessAllSections(true); | |
| return obj_linking_layer; | |
| }) | |
| .create(); | |
| if(!jit_expected) | |
| { | |
| std::cerr << "Failed to create JIT instance: " << toString(jit_expected.takeError()) << std::endl; | |
| return -1; | |
| } | |
| jit = (*jit_expected).release(); | |
| } | |
| auto dll = jit->createJITDylib(module_name); | |
| if(!dll) | |
| { | |
| std::cerr << "Failed to create JITDylib: " << toString(dll.takeError()) << std::endl; | |
| return -1; | |
| } | |
| // Define symbols for Warp's CRT functions subset | |
| { | |
| const auto flags = llvm::JITSymbolFlags::Exported | llvm::JITSymbolFlags::Absolute; | |
| auto error = dll->define(llvm::orc::absoluteSymbols({ | |
| SYMBOL(printf), SYMBOL(puts), SYMBOL(putchar), | |
| SYMBOL_T(abs, int(*)(int)), SYMBOL(llabs), | |
| SYMBOL(fmodf), SYMBOL_T(fmod, double(*)(double, double)), | |
| SYMBOL(logf), SYMBOL_T(log, double(*)(double)), | |
| SYMBOL(log2f), SYMBOL_T(log2, double(*)(double)), | |
| SYMBOL(log10f), SYMBOL_T(log10, double(*)(double)), | |
| SYMBOL(expf), SYMBOL_T(exp, double(*)(double)), | |
| SYMBOL(sqrtf), SYMBOL_T(sqrt, double(*)(double)), | |
| SYMBOL(cbrtf), SYMBOL_T(cbrt, double(*)(double)), | |
| SYMBOL(powf), SYMBOL_T(pow, double(*)(double, double)), | |
| SYMBOL(floorf), SYMBOL_T(floor, double(*)(double)), | |
| SYMBOL(ceilf), SYMBOL_T(ceil, double(*)(double)), | |
| SYMBOL(fabsf), SYMBOL_T(fabs, double(*)(double)), | |
| SYMBOL(roundf), SYMBOL_T(round, double(*)(double)), | |
| SYMBOL(truncf), SYMBOL_T(trunc, double(*)(double)), | |
| SYMBOL(rintf), SYMBOL_T(rint, double(*)(double)), | |
| SYMBOL(acosf), SYMBOL_T(acos, double(*)(double)), | |
| SYMBOL(asinf), SYMBOL_T(asin, double(*)(double)), | |
| SYMBOL(atanf), SYMBOL_T(atan, double(*)(double)), | |
| SYMBOL(atan2f), SYMBOL_T(atan2, double(*)(double, double)), | |
| SYMBOL(cosf), SYMBOL_T(cos, double(*)(double)), | |
| SYMBOL(sinf), SYMBOL_T(sin, double(*)(double)), | |
| SYMBOL(tanf), SYMBOL_T(tan, double(*)(double)), | |
| SYMBOL(sinhf), SYMBOL_T(sinh, double(*)(double)), | |
| SYMBOL(coshf), SYMBOL_T(cosh, double(*)(double)), | |
| SYMBOL(tanhf), SYMBOL_T(tanh, double(*)(double)), | |
| SYMBOL(fmaf), | |
| SYMBOL(memcpy), SYMBOL(memset), SYMBOL(memmove), | |
| SYMBOL(_wp_assert), | |
| SYMBOL(_wp_isfinite), | |
| // For functions with large stack frames the compiler will emit a call to | |
| // __chkstk() to linearly touch each memory page. This grows the stack without | |
| // triggering the stack overflow guards. | |
| SYMBOL(__chkstk), | |
| SYMBOL(__bzero), | |
| SYMBOL(__sincos_stret), SYMBOL(__sincosf_stret), | |
| SYMBOL(sincosf), SYMBOL_T(sincos, void(*)(double,double*,double*)), | |
| })); | |
| if(error) | |
| { | |
| std::cerr << "Failed to define symbols: " << llvm::toString(std::move(error)) << std::endl; | |
| return -1; | |
| } | |
| } | |
| // Load the object file into a memory buffer | |
| auto buffer = llvm::MemoryBuffer::getFile(object_file); | |
| if(!buffer) | |
| { | |
| std::cerr << "Failed to load object file: " << buffer.getError().message() << std::endl; | |
| return -1; | |
| } | |
| auto err = jit->addObjectFile(*dll, std::move(*buffer)); | |
| if(err) | |
| { | |
| std::cerr << "Failed to add object file: " << llvm::toString(std::move(err)) << std::endl; | |
| return -1; | |
| } | |
| return 0; | |
| } | |
| WP_API int unload_obj(const char* module_name) | |
| { | |
| if(!jit) // If there's no JIT instance there are no object files loaded | |
| { | |
| return 0; | |
| } | |
| auto* dll = jit->getJITDylibByName(module_name); | |
| llvm::Error error = jit->getExecutionSession().removeJITDylib(*dll); | |
| if(error) | |
| { | |
| std::cerr << "Failed to unload: " << llvm::toString(std::move(error)) << std::endl; | |
| return -1; | |
| } | |
| return 0; | |
| } | |
| WP_API uint64_t lookup(const char* dll_name, const char* function_name) | |
| { | |
| auto* dll = jit->getJITDylibByName(dll_name); | |
| auto func = jit->lookup(*dll, function_name); | |
| if(!func) | |
| { | |
| std::cerr << "Failed to lookup symbol: " << llvm::toString(func.takeError()) << std::endl; | |
| return 0; | |
| } | |
| return func->getValue(); | |
| } | |
| } // extern "C" | |
| } // namespace wp | |