Spaces:

qbhf2
/

GarmentCode

Sleeping

App Files Files Community

GarmentCode / NvidiaWarp-GarmentCode /warp /native /clang /clang.cpp

qbhf2

added NvidiaWarp and GarmentCode repos

66c9c8a 11 months ago

raw

history blame contribute delete

17.2 kB

	/** Copyright (c) 2023 NVIDIA CORPORATION. All rights reserved.
	* NVIDIA CORPORATION and its licensors retain all intellectual property
	* and proprietary rights in and to this software, related documentation
	* and any modifications thereto. Any use, reproduction, disclosure or
	* distribution of this software and related documentation without an express
	* license agreement from NVIDIA CORPORATION is strictly prohibited.
	*/

	#include "../native/crt.h"

	#include <clang/Frontend/CompilerInstance.h>
	#include <clang/Basic/DiagnosticOptions.h>
	#include <clang/Frontend/TextDiagnosticPrinter.h>
	#include <clang/CodeGen/CodeGenAction.h>
	#include <clang/Basic/TargetInfo.h>
	#include <clang/Lex/PreprocessorOptions.h>

	#include <llvm/Support/TargetSelect.h>
	#include <llvm/IR/Module.h>
	#include <llvm/IR/LLVMContext.h>
	#include <llvm/ExecutionEngine/GenericValue.h>
	#include <llvm/Target/TargetMachine.h>
	#include <llvm/MC/TargetRegistry.h>
	#include <llvm/Support/Host.h>
	#include <llvm/PassRegistry.h>
	#include <llvm/InitializePasses.h>
	#include <llvm/IR/LegacyPassManager.h>
	#include <llvm/IRReader/IRReader.h>
	#include <llvm/Linker/Linker.h>

	#include <llvm/ExecutionEngine/Orc/LLJIT.h>
	#include <llvm/ExecutionEngine/JITEventListener.h>
	#include <llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h>
	#include <llvm/ExecutionEngine/Orc/ExecutionUtils.h>
	#include <llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h>
	#include <llvm/ExecutionEngine/Orc/TargetProcess/TargetExecutionUtils.h>
	#include <llvm/ExecutionEngine/SectionMemoryManager.h>

	#include <cmath>
	#include <vector>
	#include <iostream>
	#include <string>
	#include <cstring>

	// Platform-specific symbols declared by hand so that load_obj() can take their
	// addresses and register them in the symbol table exposed to JIT-loaded code.
	#if defined(_WIN64)
	extern "C" void __chkstk();
	#elif defined(__APPLE__)
	extern "C" void __bzero(void*, size_t);
	extern "C" __double2 __sincos_stret(double);
	extern "C" __float2 __sincosf_stret(float);
	#endif

	extern "C" {

	// GDB and LLDB support debugging of JIT-compiled code by observing calls to __jit_debug_register_code()
	// by putting a breakpoint on it, and retrieving the debug info through __jit_debug_descriptor.
	// On Linux it suffices for these symbols not to be stripped out, while for Windows a .pdb has to contain
	// their information. LLVM defines them, but we don't want a huge .pdb with all LLVM source code's debug
	// info. By forward-declaring them here it suffices to compile this file with /Zi.
	//
	// Note: `struct jit_descriptor` is only forward-declared by this statement (incomplete type);
	// nothing in this file needs its layout.
	extern struct jit_descriptor __jit_debug_descriptor;
	extern void __jit_debug_register_code();

	}

	namespace wp {

	// Target triple used for host (CPU) code generation: passed to Clang via `-triple`
	// in cpp_to_llvm() and to the LLVM target lookup in compile_cpp().
	#if defined (_WIN32)
	// Windows defaults to using the COFF binary format (aka. "msvc" in the target triple).
	// Override it to use the ELF format to support DWARF debug info, but keep using the
	// Microsoft calling convention (see also https://llvm.org/docs/DebuggingJITedCode.html).
	static const char* target_triple = "x86_64-pc-windows-elf";
	#else
	// On every other platform use the LLVM_DEFAULT_TARGET_TRIPLE macro
	// (presumably supplied by the LLVM build configuration -- confirm where it is defined).
	static const char* target_triple = LLVM_DEFAULT_TARGET_TRIPLE;
	#endif

	// Registers every target that LLVM was built with: target infos, targets,
	// MC layers and assembly printers. Called at the start of compile_cpp(),
	// compile_cuda() and (on first use) load_obj().
	static void initialize_llvm()
	{
	llvm::InitializeAllTargetInfos();
	llvm::InitializeAllTargets();
	llvm::InitializeAllTargetMCs();
	llvm::InitializeAllAsmPrinters();
	}

	static std::unique_ptr<llvm::Module> cpp_to_llvm(const std::string& input_file, const char* cpp_src, const char* include_dir, bool debug, llvm::LLVMContext& context)
	{
	// Compilation arguments
	std::vector<const char*> args;
	args.push_back(input_file.c_str());

	args.push_back("-I");
	args.push_back(include_dir);

	args.push_back(debug ? "-O0" : "-O2");

	args.push_back("-triple");
	args.push_back(target_triple);

	#if defined(__x86_64__) \|\| defined(_M_X64)
	args.push_back("-target-feature");
	args.push_back("+f16c"); // Enables support for _Float16
	#endif

	clang::IntrusiveRefCntPtr<clang::DiagnosticOptions> diagnostic_options = new clang::DiagnosticOptions();
	std::unique_ptr<clang::TextDiagnosticPrinter> text_diagnostic_printer =
	std::make_unique<clang::TextDiagnosticPrinter>(llvm::errs(), &*diagnostic_options);
	clang::IntrusiveRefCntPtr<clang::DiagnosticIDs> diagnostic_ids;
	std::unique_ptr<clang::DiagnosticsEngine> diagnostic_engine =
	std::make_unique<clang::DiagnosticsEngine>(diagnostic_ids, &*diagnostic_options, text_diagnostic_printer.release());

	clang::CompilerInstance compiler_instance;

	auto& compiler_invocation = compiler_instance.getInvocation();
	clang::CompilerInvocation::CreateFromArgs(compiler_invocation, args, *diagnostic_engine.release());

	if(debug)
	{
	compiler_invocation.getCodeGenOpts().setDebugInfo(clang::codegenoptions::FullDebugInfo);
	}

	// Map code to a MemoryBuffer
	std::unique_ptr<llvm::MemoryBuffer> buffer = llvm::MemoryBuffer::getMemBufferCopy(cpp_src);
	compiler_invocation.getPreprocessorOpts().addRemappedFile(input_file.c_str(), buffer.get());

	if(!debug)
	{
	compiler_instance.getPreprocessorOpts().addMacroDef("NDEBUG");
	}

	compiler_instance.getLangOpts().MicrosoftExt = 1; // __forceinline / __int64
	compiler_instance.getLangOpts().DeclSpecKeyword = 1; // __declspec

	compiler_instance.createDiagnostics(text_diagnostic_printer.get(), false);

	clang::EmitLLVMOnlyAction emit_llvm_only_action(&context);
	bool success = compiler_instance.ExecuteAction(emit_llvm_only_action);
	buffer.release();

	return success ? std::move(emit_llvm_only_action.takeModule()) : nullptr;
	}

	static std::unique_ptr<llvm::Module> cuda_to_llvm(const std::string& input_file, const char* cpp_src, const char* include_dir, bool debug, llvm::LLVMContext& context)
	{
	// Compilation arguments
	std::vector<const char*> args;
	args.push_back(input_file.c_str());

	args.push_back("-I");
	args.push_back(include_dir);

	args.push_back(debug ? "-O0" : "-O2");

	args.push_back("-triple");
	args.push_back("nvptx64-nvidia-cuda");

	args.push_back("-target-cpu");
	args.push_back("sm_70");

	clang::IntrusiveRefCntPtr<clang::DiagnosticOptions> diagnostic_options = new clang::DiagnosticOptions();
	std::unique_ptr<clang::TextDiagnosticPrinter> text_diagnostic_printer =
	std::make_unique<clang::TextDiagnosticPrinter>(llvm::errs(), &*diagnostic_options);
	clang::IntrusiveRefCntPtr<clang::DiagnosticIDs> diagnostic_ids;
	std::unique_ptr<clang::DiagnosticsEngine> diagnostic_engine =
	std::make_unique<clang::DiagnosticsEngine>(diagnostic_ids, &*diagnostic_options, text_diagnostic_printer.release());

	clang::CompilerInstance compiler_instance;

	auto& compiler_invocation = compiler_instance.getInvocation();
	clang::CompilerInvocation::CreateFromArgs(compiler_invocation, args, *diagnostic_engine.release());

	if(debug)
	{
	compiler_invocation.getCodeGenOpts().setDebugInfo(clang::codegenoptions::FullDebugInfo);
	}

	// Map code to a MemoryBuffer
	std::unique_ptr<llvm::MemoryBuffer> buffer = llvm::MemoryBuffer::getMemBufferCopy(cpp_src);
	compiler_invocation.getPreprocessorOpts().addRemappedFile(input_file.c_str(), buffer.get());

	// According to https://llvm.org/docs/CompileCudaWithLLVM.html, "Both clang and nvcc define `__CUDACC__` during CUDA compilation."
	// But this normally happens in the __clang_cuda_runtime_wrapper.h header, which we don't include.
	// The __CUDA__ and __CUDA_ARCH__ macros are internally defined by llvm-project/clang/lib/Frontend/InitPreprocessor.cpp
	compiler_instance.getPreprocessorOpts().addMacroDef("__CUDACC__");

	if(!debug)
	{
	compiler_instance.getPreprocessorOpts().addMacroDef("NDEBUG");
	}

	compiler_instance.getLangOpts().CUDA = 1;
	compiler_instance.getLangOpts().CUDAIsDevice = 1;
	compiler_instance.getLangOpts().CUDAAllowVariadicFunctions = 1;

	compiler_instance.createDiagnostics(text_diagnostic_printer.get(), false);

	clang::EmitLLVMOnlyAction emit_llvm_only_action(&context);
	bool success = compiler_instance.ExecuteAction(emit_llvm_only_action);
	buffer.release();

	return success ? std::move(emit_llvm_only_action.takeModule()) : nullptr;
	}

	extern "C" {

	/// Compiles C++ source text to a native object file for the host target.
	///
	/// @param cpp_src      NUL-terminated C++ source text.
	/// @param input_file   Name under which the source is presented to the compiler.
	/// @param include_dir  Include directory passed to the compiler.
	/// @param output_file  Path of the object file to write.
	/// @param debug        Forwarded to cpp_to_llvm() (selects optimization level / debug info).
	/// @return 0 on success, -1 on any failure.
	WP_API int compile_cpp(const char* cpp_src, const char* input_file, const char* include_dir, const char* output_file, bool debug)
	{
	    initialize_llvm();

	    llvm::LLVMContext context;
	    std::unique_ptr<llvm::Module> module = cpp_to_llvm(input_file, cpp_src, include_dir, debug, context);

	    if(!module)
	    {
	        return -1;
	    }

	    std::string error;
	    const llvm::Target* target = llvm::TargetRegistry::lookupTarget(target_triple, error);
	    if(!target)
	    {
	        std::cerr << "Failed to look up target: " << error << std::endl;
	        return -1;
	    }

	    const char* CPU = "generic";
	    const char* features = "";
	    llvm::TargetOptions target_options;
	    llvm::Reloc::Model relocation_model = llvm::Reloc::PIC_;  // Position Independent Code
	    llvm::CodeModel::Model code_model = llvm::CodeModel::Large;  // Don't make assumptions about displacement sizes

	    // Owned by a unique_ptr so every return path frees it.
	    std::unique_ptr<llvm::TargetMachine> target_machine(
	        target->createTargetMachine(target_triple, CPU, features, target_options, relocation_model, code_model));
	    if(!target_machine)
	    {
	        std::cerr << "Failed to create target machine" << std::endl;
	        return -1;
	    }

	    module->setDataLayout(target_machine->createDataLayout());

	    std::error_code error_code;
	    llvm::raw_fd_ostream output(output_file, error_code, llvm::sys::fs::OF_None);
	    if(error_code)
	    {
	        std::cerr << "Failed to open output file: " << error_code.message() << std::endl;
	        return -1;
	    }

	    llvm::legacy::PassManager pass_manager;
	    llvm::CodeGenFileType file_type = llvm::CGFT_ObjectFile;

	    // addPassesToEmitFile() returns true when the requested file type cannot be emitted.
	    if(target_machine->addPassesToEmitFile(pass_manager, output, nullptr, file_type))
	    {
	        std::cerr << "Failed to set up object file emission" << std::endl;
	        return -1;
	    }

	    pass_manager.run(*module);
	    output.flush();

	    return 0;
	}

	/// Compiles CUDA source text to PTX assembly (target CPU sm_70, PTX ISA 7.5),
	/// linking in the needed parts of libdevice from
	/// `<include_dir>/libdevice/libdevice.10.bc`.
	///
	/// @param cpp_src      NUL-terminated CUDA source text.
	/// @param input_file   Name under which the source is presented to the compiler.
	/// @param include_dir  Include directory; also the base directory of the libdevice bitcode.
	/// @param output_file  Path of the assembly file to write.
	/// @param debug        Forwarded to cuda_to_llvm() (selects optimization level / debug info).
	/// @return 0 on success, -1 on any failure.
	WP_API int compile_cuda(const char* cpp_src, const char* input_file, const char* include_dir, const char* output_file, bool debug)
	{
	    initialize_llvm();

	    llvm::LLVMContext context;
	    std::unique_ptr<llvm::Module> module = cuda_to_llvm(input_file, cpp_src, include_dir, debug, context);

	    if(!module)
	    {
	        return -1;
	    }

	    std::string error;
	    const llvm::Target* target = llvm::TargetRegistry::lookupTarget("nvptx64-nvidia-cuda", error);
	    if(!target)
	    {
	        std::cerr << "Failed to look up target: " << error << std::endl;
	        return -1;
	    }

	    const char* CPU = "sm_70";
	    const char* features = "+ptx75";  // Warp requires CUDA 11.5, which supports PTX ISA 7.5
	    llvm::TargetOptions target_options;
	    llvm::Reloc::Model relocation_model = llvm::Reloc::PIC_;

	    // Owned by a unique_ptr so the early returns below no longer leak it.
	    std::unique_ptr<llvm::TargetMachine> target_machine(
	        target->createTargetMachine("nvptx64-nvidia-cuda", CPU, features, target_options, relocation_model));
	    if(!target_machine)
	    {
	        std::cerr << "Failed to create target machine" << std::endl;
	        return -1;
	    }

	    module->setDataLayout(target_machine->createDataLayout());

	    // Link libdevice
	    llvm::SMDiagnostic diagnostic;
	    std::string libdevice_path = std::string(include_dir) + "/libdevice/libdevice.10.bc";
	    std::unique_ptr<llvm::Module> libdevice(llvm::parseIRFile(libdevice_path, diagnostic, context));
	    if(!libdevice)
	    {
	        std::cerr << "Failed to load libdevice from " << libdevice_path << ": " << diagnostic.getMessage().str() << std::endl;
	        return -1;
	    }

	    // linkInModule() returns true on error.
	    llvm::Linker linker(*module);
	    if(linker.linkInModule(std::move(libdevice), llvm::Linker::Flags::LinkOnlyNeeded))
	    {
	        std::cerr << "Failed to link libdevice" << std::endl;
	        return -1;
	    }

	    std::error_code error_code;
	    llvm::raw_fd_ostream output(output_file, error_code, llvm::sys::fs::OF_None);
	    if(error_code)
	    {
	        std::cerr << "Failed to open output file: " << error_code.message() << std::endl;
	        return -1;
	    }

	    llvm::legacy::PassManager pass_manager;
	    llvm::CodeGenFileType file_type = llvm::CGFT_AssemblyFile;

	    // addPassesToEmitFile() returns true when the requested file type cannot be emitted.
	    if(target_machine->addPassesToEmitFile(pass_manager, output, nullptr, file_type))
	    {
	        std::cerr << "Failed to set up assembly file emission" << std::endl;
	        return -1;
	    }

	    pass_manager.run(*module);
	    output.flush();

	    return 0;
	}

	// Global JIT instance.
	// Created on the first call to load_obj() and shared by unload_obj() and lookup().
	// Nothing in this file destroys it.
	static llvm::orc::LLJIT* jit = nullptr;

	/// Load an object file into an in-memory DLL named `module_name`.
	///
	/// On first use this creates the global JIT instance, configured with an
	/// RTDyld object linking layer that registers loaded code with the GDB JIT
	/// interface and keeps all sections (so debug info is not stripped).
	/// A new JITDylib is then created, the supported subset of C runtime
	/// functions is defined in it as absolute symbols, and the object file is added.
	///
	/// @param object_file  Path of the object file to load.
	/// @param module_name  Name of the JITDylib to create.
	/// @return 0 on success, -1 on failure (a message is written to stderr).
	WP_API int load_obj(const char* object_file, const char* module_name)
	{
	    if(!jit)
	    {
	        initialize_llvm();

	        auto jit_expected = llvm::orc::LLJITBuilder()
	            .setObjectLinkingLayerCreator(
	                [&](llvm::orc::ExecutionSession &session, const llvm::Triple &triple) {
	                    auto get_memory_manager = []() {
	                        return std::make_unique<llvm::SectionMemoryManager>();
	                    };
	                    auto obj_linking_layer = std::make_unique<llvm::orc::RTDyldObjectLinkingLayer>(session, std::move(get_memory_manager));

	                    // Register the event listener.
	                    obj_linking_layer->registerJITEventListener(*llvm::JITEventListener::createGDBRegistrationListener());

	                    // Make sure the debug info sections aren't stripped.
	                    obj_linking_layer->setProcessAllSections(true);

	                    return obj_linking_layer;
	                })
	            .create();

	        if(!jit_expected)
	        {
	            std::cerr << "Failed to create JIT instance: " << llvm::toString(jit_expected.takeError()) << std::endl;
	            return -1;
	        }

	        // Ownership moves to the global raw pointer.
	        jit = jit_expected->release();
	    }

	    auto dll = jit->createJITDylib(module_name);

	    if(!dll)
	    {
	        std::cerr << "Failed to create JITDylib: " << llvm::toString(dll.takeError()) << std::endl;
	        return -1;
	    }

	    // Define symbols for Warp's CRT functions subset
	    {
	        #if defined(__APPLE__)
	            #define MANGLING_PREFIX "_"
	        #else
	            #define MANGLING_PREFIX ""
	        #endif

	        const auto flags = llvm::JITSymbolFlags::Exported | llvm::JITSymbolFlags::Absolute;
	        // SYMBOL registers ::sym as-is; SYMBOL_T first selects one overload via static_cast<T>.
	        #define SYMBOL(sym) { jit->getExecutionSession().intern(MANGLING_PREFIX #sym), { llvm::pointerToJITTargetAddress(&::sym), flags} }
	        #define SYMBOL_T(sym, T) { jit->getExecutionSession().intern(MANGLING_PREFIX #sym), { llvm::pointerToJITTargetAddress(static_cast<T>(&::sym)), flags} }

	        auto error = dll->define(llvm::orc::absoluteSymbols({
	            SYMBOL(printf), SYMBOL(puts), SYMBOL(putchar),
	            SYMBOL_T(abs, int(*)(int)), SYMBOL(llabs),
	            SYMBOL(fmodf), SYMBOL_T(fmod, double(*)(double, double)),
	            SYMBOL(logf), SYMBOL_T(log, double(*)(double)),
	            SYMBOL(log2f), SYMBOL_T(log2, double(*)(double)),
	            SYMBOL(log10f), SYMBOL_T(log10, double(*)(double)),
	            SYMBOL(expf), SYMBOL_T(exp, double(*)(double)),
	            SYMBOL(sqrtf), SYMBOL_T(sqrt, double(*)(double)),
	            SYMBOL(cbrtf), SYMBOL_T(cbrt, double(*)(double)),
	            SYMBOL(powf), SYMBOL_T(pow, double(*)(double, double)),
	            SYMBOL(floorf), SYMBOL_T(floor, double(*)(double)),
	            SYMBOL(ceilf), SYMBOL_T(ceil, double(*)(double)),
	            SYMBOL(fabsf), SYMBOL_T(fabs, double(*)(double)),
	            SYMBOL(roundf), SYMBOL_T(round, double(*)(double)),
	            SYMBOL(truncf), SYMBOL_T(trunc, double(*)(double)),
	            SYMBOL(rintf), SYMBOL_T(rint, double(*)(double)),
	            SYMBOL(acosf), SYMBOL_T(acos, double(*)(double)),
	            SYMBOL(asinf), SYMBOL_T(asin, double(*)(double)),
	            SYMBOL(atanf), SYMBOL_T(atan, double(*)(double)),
	            SYMBOL(atan2f), SYMBOL_T(atan2, double(*)(double, double)),
	            SYMBOL(cosf), SYMBOL_T(cos, double(*)(double)),
	            SYMBOL(sinf), SYMBOL_T(sin, double(*)(double)),
	            SYMBOL(tanf), SYMBOL_T(tan, double(*)(double)),
	            SYMBOL(sinhf), SYMBOL_T(sinh, double(*)(double)),
	            SYMBOL(coshf), SYMBOL_T(cosh, double(*)(double)),
	            SYMBOL(tanhf), SYMBOL_T(tanh, double(*)(double)),
	            SYMBOL(fmaf),
	            SYMBOL(memcpy), SYMBOL(memset), SYMBOL(memmove),
	            SYMBOL(_wp_assert),
	            SYMBOL(_wp_isfinite),
	        #if defined(_WIN64)
	            // For functions with large stack frames the compiler will emit a call to
	            // __chkstk() to linearly touch each memory page. This grows the stack without
	            // triggering the stack overflow guards.
	            SYMBOL(__chkstk),
	        #elif defined(__APPLE__)
	            SYMBOL(__bzero),
	            SYMBOL(__sincos_stret), SYMBOL(__sincosf_stret),
	        #else
	            SYMBOL(sincosf), SYMBOL_T(sincos, void(*)(double, double*, double*)),
	        #endif
	        }));

	        // The helper macros are only meaningful for the table above.
	        #undef SYMBOL_T
	        #undef SYMBOL
	        #undef MANGLING_PREFIX

	        if(error)
	        {
	            std::cerr << "Failed to define symbols: " << llvm::toString(std::move(error)) << std::endl;
	            return -1;
	        }
	    }

	    // Load the object file into a memory buffer
	    auto buffer = llvm::MemoryBuffer::getFile(object_file);
	    if(!buffer)
	    {
	        std::cerr << "Failed to load object file: " << buffer.getError().message() << std::endl;
	        return -1;
	    }

	    // Both `dll` and `buffer` are result wrappers; pass the values they hold.
	    auto err = jit->addObjectFile(*dll, std::move(*buffer));
	    if(err)
	    {
	        std::cerr << "Failed to add object file: " << llvm::toString(std::move(err)) << std::endl;
	        return -1;
	    }

	    return 0;
	}

	/// Removes the in-memory DLL named `module_name` from the JIT.
	///
	/// @param module_name  Name that was passed to load_obj().
	/// @return 0 on success or when no JIT instance exists; -1 if the module is
	///         unknown or removal fails (a message is written to stderr).
	WP_API int unload_obj(const char* module_name)
	{
	    if(!jit)  // If there's no JIT instance there are no object files loaded
	    {
	        return 0;
	    }

	    auto* dll = jit->getJITDylibByName(module_name);
	    if(!dll)
	    {
	        // Guard against dereferencing a null pointer for a name that was never loaded.
	        std::cerr << "Failed to unload: module '" << module_name << "' is not loaded" << std::endl;
	        return -1;
	    }

	    llvm::Error error = jit->getExecutionSession().removeJITDylib(*dll);

	    if(error)
	    {
	        std::cerr << "Failed to unload: " << llvm::toString(std::move(error)) << std::endl;
	        return -1;
	    }

	    return 0;
	}

	/// Looks up the address of `function_name` in the in-memory DLL `dll_name`.
	///
	/// @param dll_name       Name that was passed to load_obj().
	/// @param function_name  Symbol to resolve.
	/// @return The symbol's address, or 0 if there is no JIT instance, the DLL is
	///         unknown, or the symbol cannot be resolved (a message is written to stderr).
	WP_API uint64_t lookup(const char* dll_name, const char* function_name)
	{
	    if(!jit)
	    {
	        // Nothing has been loaded yet, so there is nothing to search.
	        std::cerr << "Failed to lookup symbol: no JIT instance" << std::endl;
	        return 0;
	    }

	    auto* dll = jit->getJITDylibByName(dll_name);
	    if(!dll)
	    {
	        std::cerr << "Failed to lookup symbol: module '" << dll_name << "' is not loaded" << std::endl;
	        return 0;
	    }

	    auto func = jit->lookup(*dll, function_name);

	    if(!func)
	    {
	        std::cerr << "Failed to lookup symbol: " << llvm::toString(func.takeError()) << std::endl;
	        return 0;
	    }

	    return func->getValue();
	}

	} // extern "C"

	} // namespace wp