qbhf2's picture
added NvidiaWarp and GarmentCode repos
66c9c8a
/** Copyright (c) 2023 NVIDIA CORPORATION. All rights reserved.
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/
#include "../native/crt.h"
#include <clang/Frontend/CompilerInstance.h>
#include <clang/Basic/DiagnosticOptions.h>
#include <clang/Frontend/TextDiagnosticPrinter.h>
#include <clang/CodeGen/CodeGenAction.h>
#include <clang/Basic/TargetInfo.h>
#include <clang/Lex/PreprocessorOptions.h>
#include <llvm/Support/TargetSelect.h>
#include <llvm/IR/Module.h>
#include <llvm/IR/LLVMContext.h>
#include <llvm/ExecutionEngine/GenericValue.h>
#include <llvm/Target/TargetMachine.h>
#include <llvm/MC/TargetRegistry.h>
#include <llvm/Support/Host.h>
#include <llvm/PassRegistry.h>
#include <llvm/InitializePasses.h>
#include <llvm/IR/LegacyPassManager.h>
#include <llvm/IRReader/IRReader.h>
#include <llvm/Linker/Linker.h>
#include <llvm/ExecutionEngine/Orc/LLJIT.h>
#include <llvm/ExecutionEngine/JITEventListener.h>
#include <llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h>
#include <llvm/ExecutionEngine/Orc/ExecutionUtils.h>
#include <llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h>
#include <llvm/ExecutionEngine/Orc/TargetProcess/TargetExecutionUtils.h>
#include <llvm/ExecutionEngine/SectionMemoryManager.h>
#include <cmath>
#include <vector>
#include <iostream>
#include <string>
#include <cstring>
// Platform-specific runtime helpers that load_obj() exposes to JIT-compiled code through its
// symbol table. They are declared here by hand so that their addresses can be taken below.
#if defined(_WIN64)
extern "C" {
    void __chkstk();
}
#elif defined(__APPLE__)
extern "C" {
    void __bzero(void*, size_t);
    __double2 __sincos_stret(double);
    __float2 __sincosf_stret(float);
}
#endif
// JIT debugging interface used by GDB and LLDB: the debugger sets a breakpoint on
// __jit_debug_register_code() and reads the debug info through __jit_debug_descriptor.
// On Linux it is enough that these symbols are not stripped. On Windows their information
// has to be present in a .pdb. LLVM provides the definitions, but building all of LLVM with
// debug info would produce a huge .pdb; forward-declaring the symbols in this file means
// that compiling just this file with /Zi suffices.
extern "C" struct jit_descriptor __jit_debug_descriptor;
extern "C" void __jit_debug_register_code();
namespace wp {
// Target triple used for CPU builds: passed to Clang via "-triple" in cpp_to_llvm() and used
// for the target lookup and target machine creation in compile_cpp().
#ifdef _WIN32
    // On Windows the default triple (the "msvc" environment) selects the COFF binary format.
    // Ask for ELF instead so that DWARF debug info is supported, while the Microsoft calling
    // convention is still used (see also https://llvm.org/docs/DebuggingJITedCode.html).
    static const char* target_triple = "x86_64-pc-windows-elf";
#else
    // Everywhere else, use the triple LLVM was configured with.
    static const char* target_triple = LLVM_DEFAULT_TARGET_TRIPLE;
#endif
// Runs LLVM's "InitializeAll*" registration routines. Called at the start of every compile
// entry point and before the JIT instance is created.
static void initialize_llvm()
{
    llvm::InitializeAllTargetInfos();   // target descriptions
    llvm::InitializeAllTargets();       // target back ends
    llvm::InitializeAllTargetMCs();     // machine-code layers
    llvm::InitializeAllAsmPrinters();   // assembly/object emission
}
static std::unique_ptr<llvm::Module> cpp_to_llvm(const std::string& input_file, const char* cpp_src, const char* include_dir, bool debug, llvm::LLVMContext& context)
{
// Compilation arguments
std::vector<const char*> args;
args.push_back(input_file.c_str());
args.push_back("-I");
args.push_back(include_dir);
args.push_back(debug ? "-O0" : "-O2");
args.push_back("-triple");
args.push_back(target_triple);
#if defined(__x86_64__) || defined(_M_X64)
args.push_back("-target-feature");
args.push_back("+f16c"); // Enables support for _Float16
#endif
clang::IntrusiveRefCntPtr<clang::DiagnosticOptions> diagnostic_options = new clang::DiagnosticOptions();
std::unique_ptr<clang::TextDiagnosticPrinter> text_diagnostic_printer =
std::make_unique<clang::TextDiagnosticPrinter>(llvm::errs(), &*diagnostic_options);
clang::IntrusiveRefCntPtr<clang::DiagnosticIDs> diagnostic_ids;
std::unique_ptr<clang::DiagnosticsEngine> diagnostic_engine =
std::make_unique<clang::DiagnosticsEngine>(diagnostic_ids, &*diagnostic_options, text_diagnostic_printer.release());
clang::CompilerInstance compiler_instance;
auto& compiler_invocation = compiler_instance.getInvocation();
clang::CompilerInvocation::CreateFromArgs(compiler_invocation, args, *diagnostic_engine.release());
if(debug)
{
compiler_invocation.getCodeGenOpts().setDebugInfo(clang::codegenoptions::FullDebugInfo);
}
// Map code to a MemoryBuffer
std::unique_ptr<llvm::MemoryBuffer> buffer = llvm::MemoryBuffer::getMemBufferCopy(cpp_src);
compiler_invocation.getPreprocessorOpts().addRemappedFile(input_file.c_str(), buffer.get());
if(!debug)
{
compiler_instance.getPreprocessorOpts().addMacroDef("NDEBUG");
}
compiler_instance.getLangOpts().MicrosoftExt = 1; // __forceinline / __int64
compiler_instance.getLangOpts().DeclSpecKeyword = 1; // __declspec
compiler_instance.createDiagnostics(text_diagnostic_printer.get(), false);
clang::EmitLLVMOnlyAction emit_llvm_only_action(&context);
bool success = compiler_instance.ExecuteAction(emit_llvm_only_action);
buffer.release();
return success ? std::move(emit_llvm_only_action.takeModule()) : nullptr;
}
static std::unique_ptr<llvm::Module> cuda_to_llvm(const std::string& input_file, const char* cpp_src, const char* include_dir, bool debug, llvm::LLVMContext& context)
{
// Compilation arguments
std::vector<const char*> args;
args.push_back(input_file.c_str());
args.push_back("-I");
args.push_back(include_dir);
args.push_back(debug ? "-O0" : "-O2");
args.push_back("-triple");
args.push_back("nvptx64-nvidia-cuda");
args.push_back("-target-cpu");
args.push_back("sm_70");
clang::IntrusiveRefCntPtr<clang::DiagnosticOptions> diagnostic_options = new clang::DiagnosticOptions();
std::unique_ptr<clang::TextDiagnosticPrinter> text_diagnostic_printer =
std::make_unique<clang::TextDiagnosticPrinter>(llvm::errs(), &*diagnostic_options);
clang::IntrusiveRefCntPtr<clang::DiagnosticIDs> diagnostic_ids;
std::unique_ptr<clang::DiagnosticsEngine> diagnostic_engine =
std::make_unique<clang::DiagnosticsEngine>(diagnostic_ids, &*diagnostic_options, text_diagnostic_printer.release());
clang::CompilerInstance compiler_instance;
auto& compiler_invocation = compiler_instance.getInvocation();
clang::CompilerInvocation::CreateFromArgs(compiler_invocation, args, *diagnostic_engine.release());
if(debug)
{
compiler_invocation.getCodeGenOpts().setDebugInfo(clang::codegenoptions::FullDebugInfo);
}
// Map code to a MemoryBuffer
std::unique_ptr<llvm::MemoryBuffer> buffer = llvm::MemoryBuffer::getMemBufferCopy(cpp_src);
compiler_invocation.getPreprocessorOpts().addRemappedFile(input_file.c_str(), buffer.get());
// According to https://llvm.org/docs/CompileCudaWithLLVM.html, "Both clang and nvcc define `__CUDACC__` during CUDA compilation."
// But this normally happens in the __clang_cuda_runtime_wrapper.h header, which we don't include.
// The __CUDA__ and __CUDA_ARCH__ macros are internally defined by llvm-project/clang/lib/Frontend/InitPreprocessor.cpp
compiler_instance.getPreprocessorOpts().addMacroDef("__CUDACC__");
if(!debug)
{
compiler_instance.getPreprocessorOpts().addMacroDef("NDEBUG");
}
compiler_instance.getLangOpts().CUDA = 1;
compiler_instance.getLangOpts().CUDAIsDevice = 1;
compiler_instance.getLangOpts().CUDAAllowVariadicFunctions = 1;
compiler_instance.createDiagnostics(text_diagnostic_printer.get(), false);
clang::EmitLLVMOnlyAction emit_llvm_only_action(&context);
bool success = compiler_instance.ExecuteAction(emit_llvm_only_action);
buffer.release();
return success ? std::move(emit_llvm_only_action.takeModule()) : nullptr;
}
extern "C" {
// Compiles C++ source to a native object file.
//
//   cpp_src      Null-terminated source text.
//   input_file   Name under which the source is presented to the compiler.
//   include_dir  Directory added to the include search path.
//   output_file  Path of the object file to write.
//   debug        Selects the unoptimized build with debug info instead of the optimized one.
//
// Returns 0 on success and -1 on failure (a message is printed to stderr).
WP_API int compile_cpp(const char* cpp_src, const char *input_file, const char* include_dir, const char* output_file, bool debug)
{
    initialize_llvm();

    llvm::LLVMContext context;
    std::unique_ptr<llvm::Module> module = cpp_to_llvm(input_file, cpp_src, include_dir, debug, context);

    if(!module)
    {
        return -1;
    }

    std::string error;
    const llvm::Target* target = llvm::TargetRegistry::lookupTarget(target_triple, error);
    if(!target)
    {
        std::cerr << "Failed to look up target '" << target_triple << "': " << error << std::endl;
        return -1;
    }

    const char* CPU = "generic";
    const char* features = "";
    llvm::TargetOptions target_options;
    llvm::Reloc::Model relocation_model = llvm::Reloc::PIC_;  // Position Independent Code
    llvm::CodeModel::Model code_model = llvm::CodeModel::Large;  // Don't make assumptions about displacement sizes

    // Owned by a smart pointer so that every return path below frees it.
    std::unique_ptr<llvm::TargetMachine> target_machine(
        target->createTargetMachine(target_triple, CPU, features, target_options, relocation_model, code_model));
    if(!target_machine)
    {
        std::cerr << "Failed to create target machine for '" << target_triple << "'" << std::endl;
        return -1;
    }

    module->setDataLayout(target_machine->createDataLayout());

    std::error_code error_code;
    llvm::raw_fd_ostream output(output_file, error_code, llvm::sys::fs::OF_None);
    if(error_code)
    {
        std::cerr << "Failed to open output file: " << error_code.message() << std::endl;
        return -1;
    }

    llvm::legacy::PassManager pass_manager;
    llvm::CodeGenFileType file_type = llvm::CGFT_ObjectFile;
    // addPassesToEmitFile() returns true when the target cannot emit this file type.
    if(target_machine->addPassesToEmitFile(pass_manager, output, nullptr, file_type))
    {
        std::cerr << "Failed to set up object file emission" << std::endl;
        return -1;
    }

    pass_manager.run(*module);
    output.flush();

    if(output.has_error())
    {
        std::cerr << "Failed to write output file: " << output.error().message() << std::endl;
        // Acknowledge the error here; otherwise the stream's destructor treats it as fatal.
        output.clear_error();
        return -1;
    }

    return 0;
}
// Compiles CUDA device source to a PTX assembly file, linking in the parts of libdevice that
// the module needs.
//
//   cpp_src      Null-terminated source text.
//   input_file   Name under which the source is presented to the compiler.
//   include_dir  Directory added to the include search path; libdevice is loaded from
//                "<include_dir>/libdevice/libdevice.10.bc".
//   output_file  Path of the assembly file to write.
//   debug        Selects the unoptimized build with debug info instead of the optimized one.
//
// Returns 0 on success and -1 on failure (a message is printed to stderr).
WP_API int compile_cuda(const char* cpp_src, const char *input_file, const char* include_dir, const char* output_file, bool debug)
{
    initialize_llvm();

    llvm::LLVMContext context;
    std::unique_ptr<llvm::Module> module = cuda_to_llvm(input_file, cpp_src, include_dir, debug, context);

    if(!module)
    {
        return -1;
    }

    std::string error;
    const llvm::Target* target = llvm::TargetRegistry::lookupTarget("nvptx64-nvidia-cuda", error);
    if(!target)
    {
        std::cerr << "Failed to look up target 'nvptx64-nvidia-cuda': " << error << std::endl;
        return -1;
    }

    const char* CPU = "sm_70";
    const char* features = "+ptx75";  // Warp requires CUDA 11.5, which supports PTX ISA 7.5
    llvm::TargetOptions target_options;
    llvm::Reloc::Model relocation_model = llvm::Reloc::PIC_;

    // Owned by a smart pointer so that the early returns below no longer leak it.
    std::unique_ptr<llvm::TargetMachine> target_machine(
        target->createTargetMachine("nvptx64-nvidia-cuda", CPU, features, target_options, relocation_model));
    if(!target_machine)
    {
        std::cerr << "Failed to create target machine for 'nvptx64-nvidia-cuda'" << std::endl;
        return -1;
    }

    module->setDataLayout(target_machine->createDataLayout());

    // Link libdevice
    llvm::SMDiagnostic diagnostic;
    std::string libdevice_path = std::string(include_dir) + "/libdevice/libdevice.10.bc";
    std::unique_ptr<llvm::Module> libdevice(llvm::parseIRFile(libdevice_path, diagnostic, context));
    if(!libdevice)
    {
        std::cerr << "Failed to load libdevice from '" << libdevice_path << "': " << diagnostic.getMessage().str() << std::endl;
        return -1;
    }

    llvm::Linker linker(*module);
    // linkInModule() returns true on error.
    if(linker.linkInModule(std::move(libdevice), llvm::Linker::Flags::LinkOnlyNeeded))
    {
        std::cerr << "Failed to link libdevice" << std::endl;
        return -1;
    }

    std::error_code error_code;
    llvm::raw_fd_ostream output(output_file, error_code, llvm::sys::fs::OF_None);
    if(error_code)
    {
        std::cerr << "Failed to open output file: " << error_code.message() << std::endl;
        return -1;
    }

    llvm::legacy::PassManager pass_manager;
    llvm::CodeGenFileType file_type = llvm::CGFT_AssemblyFile;
    // addPassesToEmitFile() returns true when the target cannot emit this file type.
    if(target_machine->addPassesToEmitFile(pass_manager, output, nullptr, file_type))
    {
        std::cerr << "Failed to set up assembly file emission" << std::endl;
        return -1;
    }

    pass_manager.run(*module);
    output.flush();

    if(output.has_error())
    {
        std::cerr << "Failed to write output file: " << output.error().message() << std::endl;
        // Acknowledge the error here; otherwise the stream's destructor treats it as fatal.
        output.clear_error();
        return -1;
    }

    return 0;
}
// Global JIT instance. Created lazily by the first load_obj() call and shared with
// unload_obj() and lookup(); nothing in this file destroys it.
static llvm::orc::LLJIT* jit = nullptr;

// Load an object file into an in-memory DLL named `module_name`.
//
// Creates the global JIT on first use, creates a JITDylib called `module_name`, defines the
// subset of C runtime symbols that Warp-generated code may reference, and adds the object
// file at `object_file` to it.
//
// Returns 0 on success and -1 on failure (a message is printed to stderr). When a step fails
// after the JITDylib was created, the JITDylib is removed again so that no half-initialized
// module stays registered under `module_name`.
WP_API int load_obj(const char* object_file, const char* module_name)
{
    if(!jit)
    {
        initialize_llvm();

        auto jit_expected = llvm::orc::LLJITBuilder()
            .setObjectLinkingLayerCreator(
                // The creator is stored by the builder, so it must not capture locals by reference.
                [](llvm::orc::ExecutionSession &session, const llvm::Triple &triple) {
                    auto get_memory_manager = []() {
                        return std::make_unique<llvm::SectionMemoryManager>();
                    };
                    auto obj_linking_layer = std::make_unique<llvm::orc::RTDyldObjectLinkingLayer>(session, std::move(get_memory_manager));

                    // Register the event listener.
                    obj_linking_layer->registerJITEventListener(*llvm::JITEventListener::createGDBRegistrationListener());

                    // Make sure the debug info sections aren't stripped.
                    obj_linking_layer->setProcessAllSections(true);

                    return obj_linking_layer;
                })
            .create();

        if(!jit_expected)
        {
            std::cerr << "Failed to create JIT instance: " << toString(jit_expected.takeError()) << std::endl;
            return -1;
        }

        jit = (*jit_expected).release();
    }

    auto dll = jit->createJITDylib(module_name);

    if(!dll)
    {
        std::cerr << "Failed to create JITDylib: " << toString(dll.takeError()) << std::endl;
        return -1;
    }

    // Undo the JITDylib registration when a later step fails. A failure of the removal itself
    // is deliberately discarded: the original error has already been reported.
    const auto discard_dll = [&]() {
        llvm::consumeError(jit->getExecutionSession().removeJITDylib(*dll));
    };

    // Define symbols for Warp's CRT functions subset
    {
        #if defined(__APPLE__)
            #define MANGLING_PREFIX "_"
        #else
            #define MANGLING_PREFIX ""
        #endif

        const auto flags = llvm::JITSymbolFlags::Exported | llvm::JITSymbolFlags::Absolute;
        // SYMBOL maps a name to the address of the host function of the same name.
        // SYMBOL_T does the same but first selects one overload via the function pointer type T.
        #define SYMBOL(sym) { jit->getExecutionSession().intern(MANGLING_PREFIX #sym), { llvm::pointerToJITTargetAddress(&::sym), flags} }
        #define SYMBOL_T(sym, T) { jit->getExecutionSession().intern(MANGLING_PREFIX #sym), { llvm::pointerToJITTargetAddress(static_cast<T>(&::sym)), flags} }

        auto error = dll->define(llvm::orc::absoluteSymbols({
            SYMBOL(printf), SYMBOL(puts), SYMBOL(putchar),
            SYMBOL_T(abs, int(*)(int)), SYMBOL(llabs),
            SYMBOL(fmodf), SYMBOL_T(fmod, double(*)(double, double)),
            SYMBOL(logf), SYMBOL_T(log, double(*)(double)),
            SYMBOL(log2f), SYMBOL_T(log2, double(*)(double)),
            SYMBOL(log10f), SYMBOL_T(log10, double(*)(double)),
            SYMBOL(expf), SYMBOL_T(exp, double(*)(double)),
            SYMBOL(sqrtf), SYMBOL_T(sqrt, double(*)(double)),
            SYMBOL(cbrtf), SYMBOL_T(cbrt, double(*)(double)),
            SYMBOL(powf), SYMBOL_T(pow, double(*)(double, double)),
            SYMBOL(floorf), SYMBOL_T(floor, double(*)(double)),
            SYMBOL(ceilf), SYMBOL_T(ceil, double(*)(double)),
            SYMBOL(fabsf), SYMBOL_T(fabs, double(*)(double)),
            SYMBOL(roundf), SYMBOL_T(round, double(*)(double)),
            SYMBOL(truncf), SYMBOL_T(trunc, double(*)(double)),
            SYMBOL(rintf), SYMBOL_T(rint, double(*)(double)),
            SYMBOL(acosf), SYMBOL_T(acos, double(*)(double)),
            SYMBOL(asinf), SYMBOL_T(asin, double(*)(double)),
            SYMBOL(atanf), SYMBOL_T(atan, double(*)(double)),
            SYMBOL(atan2f), SYMBOL_T(atan2, double(*)(double, double)),
            SYMBOL(cosf), SYMBOL_T(cos, double(*)(double)),
            SYMBOL(sinf), SYMBOL_T(sin, double(*)(double)),
            SYMBOL(tanf), SYMBOL_T(tan, double(*)(double)),
            SYMBOL(sinhf), SYMBOL_T(sinh, double(*)(double)),
            SYMBOL(coshf), SYMBOL_T(cosh, double(*)(double)),
            SYMBOL(tanhf), SYMBOL_T(tanh, double(*)(double)),
            SYMBOL(fmaf),
            SYMBOL(memcpy), SYMBOL(memset), SYMBOL(memmove),
            SYMBOL(_wp_assert),
            SYMBOL(_wp_isfinite),
        #if defined(_WIN64)
            // For functions with large stack frames the compiler will emit a call to
            // __chkstk() to linearly touch each memory page. This grows the stack without
            // triggering the stack overflow guards.
            SYMBOL(__chkstk),
        #elif defined(__APPLE__)
            SYMBOL(__bzero),
            SYMBOL(__sincos_stret), SYMBOL(__sincosf_stret),
        #else
            SYMBOL(sincosf), SYMBOL_T(sincos, void(*)(double,double*,double*)),
        #endif
        }));

        // The helper macros are only meaningful for the table above.
        #undef SYMBOL_T
        #undef SYMBOL
        #undef MANGLING_PREFIX

        if(error)
        {
            std::cerr << "Failed to define symbols: " << llvm::toString(std::move(error)) << std::endl;
            discard_dll();
            return -1;
        }
    }

    // Load the object file into a memory buffer
    auto buffer = llvm::MemoryBuffer::getFile(object_file);
    if(!buffer)
    {
        std::cerr << "Failed to load object file: " << buffer.getError().message() << std::endl;
        discard_dll();
        return -1;
    }

    auto err = jit->addObjectFile(*dll, std::move(*buffer));
    if(err)
    {
        std::cerr << "Failed to add object file: " << llvm::toString(std::move(err)) << std::endl;
        discard_dll();
        return -1;
    }

    return 0;
}
// Removes the in-memory DLL named `module_name` from the JIT.
//
// Returns 0 when the module was removed or when there was nothing to remove (no JIT instance
// yet, or no module registered under that name), and -1 when the removal itself fails
// (a message is printed to stderr).
WP_API int unload_obj(const char* module_name)
{
    if(!jit)  // If there's no JIT instance there are no object files loaded
    {
        return 0;
    }

    auto* dll = jit->getJITDylibByName(module_name);
    if(!dll)  // Unknown name: treated like the no-JIT case, there is nothing to unload
    {
        return 0;
    }

    llvm::Error error = jit->getExecutionSession().removeJITDylib(*dll);

    if(error)
    {
        std::cerr << "Failed to unload: " << llvm::toString(std::move(error)) << std::endl;
        return -1;
    }

    return 0;
}
// Looks up `function_name` in the in-memory DLL named `dll_name`.
//
// Returns the address of the symbol, or 0 when the lookup fails: no JIT instance exists yet,
// no module is registered under `dll_name`, or the symbol cannot be resolved. A message is
// printed to stderr in each failure case.
WP_API uint64_t lookup(const char* dll_name, const char* function_name)
{
    if(!jit)
    {
        std::cerr << "Failed to lookup symbol: no object file has been loaded" << std::endl;
        return 0;
    }

    auto* dll = jit->getJITDylibByName(dll_name);
    if(!dll)
    {
        std::cerr << "Failed to lookup symbol: no module named '" << (dll_name ? dll_name : "(null)") << "' is loaded" << std::endl;
        return 0;
    }

    auto func = jit->lookup(*dll, function_name);

    if(!func)
    {
        std::cerr << "Failed to lookup symbol: " << llvm::toString(func.takeError()) << std::endl;
        return 0;
    }

    return func->getValue();
}
} // extern "C"
} // namespace wp