# Provenance (from the hosting page this file was captured from):
#   Hugging Face Space: jaothan/podman_rag (no application file)
#   Path:   podman_rag/rag/training/nvidia-bootc/Containerfile
#   Commit: ad73d17 "Upload 356 files" by jaothan (verified, about 1 year ago)
#   Size:   9.82 kB

	# Images used by this build. Declared before the first FROM so they can be
	# referenced in FROM lines; override any of them with --build-arg.
	ARG DRIVER_TOOLKIT_IMAGE="quay.io/ai-lab/nvidia-builder:latest"
	ARG BASEIMAGE="quay.io/centos-bootc/centos-bootc:stream9"
	ARG INSTRUCTLAB_IMAGE="quay.io/ai-lab/instructlab-nvidia:latest"

	# Stage "builder": builds the NVIDIA kernel-module RPM that the final stage
	# copies out of /home/builder/yum-packaging-precompiled-kmod/RPMS.
	FROM ${DRIVER_TOOLKIT_IMAGE} AS builder

	# Root of the NVIDIA driver download site; the .run installer is fetched from
	# ${BASE_URL}/${DRIVER_VERSION}/.
	ARG BASE_URL='https://us.download.nvidia.com/tesla'

	# Architecture overrides. When BUILD_ARCH is empty the build RUN step derives
	# both values from `arch`.
	ARG BUILD_ARCH=''
	ARG TARGET_ARCH=''

	# NVIDIA driver version to download and package.
	ARG DRIVER_VERSION='550.90.07'

	# Passed to rpmbuild as the "vendor" and "_buildhost" macros; the build RUN
	# step falls back to "undefined" and ${HOSTNAME} respectively when empty.
	ARG VENDOR=''
	ARG RPM_HOST=''

	USER builder

	WORKDIR /home/builder
	# OpenSSL request configuration consumed by `openssl req -config` in the
	# build RUN step.
	COPY --chown=1001:0 x509-configuration.ini x509-configuration.ini

	# Build the NVIDIA open kernel module RPM for the kernel-core package that is
	# installed in the builder image.
	#
	# Steps:
	#   1. Read the kernel version/release from the kernel-core RPM. KREL is the
	#      release with the trailing ".elN" / ".elN_M" dist tag stripped, KDIST is
	#      the last dot-separated field of the release (the dist tag itself).
	#   2. Default BUILD_ARCH to `arch`, and TARGET_ARCH to the same value with a
	#      "+64k" suffix removed, when BUILD_ARCH was not supplied.
	#   3. Clone NVIDIA's yum-packaging-precompiled-kmod (branch rhel<major>),
	#      download and extract the driver .run file, and pack its kernel-open
	#      tree as the source tarball.
	#   4. Generate a self-signed X.509 key pair into SOURCES/ (presumably used by
	#      kmod-nvidia.spec for module signing -- confirm against the spec).
	#   5. Run rpmbuild (-bb: binary packages only) against kmod-nvidia.spec.
	RUN export KVER=$(rpm -q --qf "%{VERSION}" kernel-core) \
	        KREL=$(rpm -q --qf "%{RELEASE}" kernel-core | sed 's/\.el.\(_.\)*$//') \
	        KDIST=$(rpm -q --qf "%{RELEASE}" kernel-core | awk -F '.' '{ print "."$NF}') \
	        OS_VERSION_MAJOR=$(grep "^VERSION=" /etc/os-release | cut -d '=' -f 2 | sed 's/"//g' | cut -d '.' -f 1) \
	    && if [ "${BUILD_ARCH}" == "" ]; then \
	        export BUILD_ARCH=$(arch) \
	        && export TARGET_ARCH=$(echo "${BUILD_ARCH}" | sed 's/+64k//') ;\
	    fi \
	    && DRIVER_STREAM=$(echo ${DRIVER_VERSION} | cut -d '.' -f 1) \
	    && git clone --depth 1 --single-branch -b rhel${OS_VERSION_MAJOR} https://github.com/NVIDIA/yum-packaging-precompiled-kmod \
	    && cd yum-packaging-precompiled-kmod \
	    && mkdir BUILD BUILDROOT RPMS SRPMS SOURCES SPECS \
	    && mkdir nvidia-kmod-${DRIVER_VERSION}-${BUILD_ARCH} \
	    && curl -sLOf ${BASE_URL}/${DRIVER_VERSION}/NVIDIA-Linux-${TARGET_ARCH}-${DRIVER_VERSION}.run \
	    && sh ./NVIDIA-Linux-${TARGET_ARCH}-${DRIVER_VERSION}.run --extract-only --target tmp \
	    && mv tmp/kernel-open nvidia-kmod-${DRIVER_VERSION}-${BUILD_ARCH}/kernel \
	    && tar -cJf SOURCES/nvidia-kmod-${DRIVER_VERSION}-${BUILD_ARCH}.tar.xz nvidia-kmod-${DRIVER_VERSION}-${BUILD_ARCH} \
	    && mv kmod-nvidia.spec SPECS/ \
	    && openssl req -x509 -new -nodes -utf8 -sha256 -days 36500 -batch \
	        -config ${HOME}/x509-configuration.ini \
	        -outform DER -out SOURCES/public_key.der \
	        -keyout SOURCES/private_key.priv \
	    && rpmbuild \
	        --define "% _arch ${BUILD_ARCH}" \
	        --define "%_topdir $(pwd)" \
	        --define "debug_package %{nil}" \
	        --define "kernel ${KVER}" \
	        --define "kernel_release ${KREL}" \
	        --define "kernel_dist ${KDIST}" \
	        --define "driver ${DRIVER_VERSION}" \
	        --define "driver_branch ${DRIVER_STREAM}" \
	        --define "vendor ${VENDOR:-undefined}" \
	        --define "_buildhost ${RPM_HOST:-${HOSTNAME}}" \
	        -v -bb SPECS/kmod-nvidia.spec

	# Final stage: the bootable image, based on ${BASEIMAGE}.
	FROM ${BASEIMAGE}

	# --- Build arguments -------------------------------------------------------
	ARG BASE_URL='https://us.download.nvidia.com/tesla'
	ARG VENDOR=''
	ARG DRIVER_TYPE=passthrough
	ARG DRIVER_VERSION='550.90.07'
	ARG CUDA_VERSION='12.4.1'
	ARG TARGET_ARCH=''
	# Additional packages appended to the main `dnf install` list.
	ARG EXTRA_RPM_PACKAGES=''
	# The vGPU version compatibility check is disabled unless overridden.
	ARG DISABLE_VGPU_VERSION_CHECK=true

	# --- Image metadata --------------------------------------------------------
	LABEL vendor=${VENDOR} \
	      org.opencontainers.image.vendor=${VENDOR}

	# --- Values persisted into the image environment ---------------------------
	ENV NVIDIA_DRIVER_TYPE=${DRIVER_TYPE} \
	    NVIDIA_DRIVER_VERSION=${DRIVER_VERSION} \
	    TARGETARCH=${TARGET_ARCH} \
	    DISABLE_VGPU_VERSION_CHECK=${DISABLE_VGPU_VERSION_CHECK}

	# Everything below installs packages and writes system files.
	USER root

	# Kernel-module RPMs produced by the "builder" stage. rpmbuild places binary
	# packages under RPMS/<arch>/, hence the two-level glob; the install step
	# below expects them as /rpms/kmod-nvidia-*.rpm.
	COPY --from=builder /home/builder/yum-packaging-precompiled-kmod/RPMS/*/*.rpm /rpms/
	# Temporary workaround until the permanent fix for libdnf is merged
	COPY nvidia-toolkit-firstboot.service /usr/lib/systemd/system/nvidia-toolkit-firstboot.service
	# Enable common services
	COPY duplicated/common/usr /usr

	# Version identifier written to /usr/lib/os-release (RHEL only) and to the
	# image_version_id label. No default: empty unless passed with --build-arg.
	ARG IMAGE_VERSION_ID

	# TODO: rework this monstrosity into a build.sh (or even not shell script)
	# The need for the `cp /etc/dnf/dnf.conf` is a workaround for https://github.com/containers/bootc/issues/637
	#
	# Installs the kernel module RPMs from /rpms, the NVIDIA user-space driver,
	# CUDA runtime pieces, the container toolkit and a few utilities, then wires
	# up the systemd units. In order:
	#   - /etc/selinux is moved aside for the duration of the package work and
	#     moved back near the end.
	#   - TARGET_ARCH defaults to `arch`. The CUDA repo directory is "sbsa" for
	#     aarch64 and TARGET_ARCH otherwise (an earlier revision overwrote the
	#     "sbsa" value with TARGET_ARCH unconditionally).
	#   - CUDA_DASHED_VERSION is "<major>-<minor>" of CUDA_VERSION.
	#   - The build aborts unless exactly one kernel-core package line is found,
	#     i.e. the kmod RPM must match the kernel already in the base image.
	#   - Fabric manager and NSCQ are installed unless DRIVER_TYPE is "vgpu".
	#   - On RHEL (ID=rhel in /etc/os-release): rhc is installed, os-release gets
	#     VARIANT / VARIANT_ID / RHEL_AI_VERSION_ID, and the bootc auto-update
	#     timer is removed from default.target.wants.
	# NOTE(review): the fabric-manager condition compares TARGET_ARCH with "arm64"
	# while the CUDA repo check above uses "aarch64" -- confirm which is intended.
	# NOTE(review): the final sed edits nvidia-fabricmanager.service even when
	# DRIVER_TYPE is "vgpu" and the package was skipped -- confirm the unit file
	# exists in that configuration.
	RUN mv /etc/selinux /etc/selinux.tmp \
	    && dnf install -y /rpms/kmod-nvidia-*.rpm \
	    && export OS_VERSION_MAJOR=$(grep "^VERSION=" /etc/os-release | cut -d '=' -f 2 | sed 's/"//g' | cut -d '.' -f 1) \
	    && if [ "${TARGET_ARCH}" == "" ]; then \
	        export TARGET_ARCH="$(arch)" ;\
	    fi \
	    && if [ "${TARGET_ARCH}" == "aarch64" ]; then CUDA_REPO_ARCH="sbsa"; fi \
	    && export DRIVER_STREAM=$(echo ${DRIVER_VERSION} | cut -d '.' -f 1) \
	        CUDA_VERSION_ARRAY=(${CUDA_VERSION//./ }) \
	        CUDA_DASHED_VERSION=${CUDA_VERSION_ARRAY[0]}-${CUDA_VERSION_ARRAY[1]} \
	        CUDA_REPO_ARCH=${CUDA_REPO_ARCH:-${TARGET_ARCH}} \
	    && cp -a /etc/dnf/dnf.conf{,.tmp} && mv /etc/dnf/dnf.conf{.tmp,} \
	    && dnf config-manager --best --nodocs --setopt=install_weak_deps=False --save \
	    && dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel${OS_VERSION_MAJOR}/${CUDA_REPO_ARCH}/cuda-rhel${OS_VERSION_MAJOR}.repo \
	    && dnf -y module enable nvidia-driver:${DRIVER_STREAM}/default \
	    && dnf install -y \
	        cloud-init \
	        git \
	        git-lfs \
	        pciutils \
	        tmux \
	        nvidia-driver-${DRIVER_VERSION} \
	        nvidia-driver-cuda-${DRIVER_VERSION} \
	        nvidia-driver-libs-${DRIVER_VERSION} \
	        nvidia-driver-NVML-${DRIVER_VERSION} \
	        cuda-compat-${CUDA_DASHED_VERSION} \
	        cuda-cudart-${CUDA_DASHED_VERSION} \
	        nvidia-persistenced-${DRIVER_VERSION} \
	        nvidia-container-toolkit \
	        rsync \
	        skopeo \
	        ${EXTRA_RPM_PACKAGES} \
	    && if [[ "$(rpm -qa | grep kernel-core | wc -l)" != "1" ]]; then \
	        echo "ERROR - Multiple kernel-core packages detected"; \
	        echo "This usually means that nvidia-drivers are built for a different kernel version than the one installed"; \
	        exit 1; \
	    fi \
	    && if [ "$DRIVER_TYPE" != "vgpu" ] && [ "$TARGET_ARCH" != "arm64" ]; then \
	        versionArray=(${DRIVER_VERSION//./ }); \
	        DRIVER_BRANCH=${versionArray[0]}; \
	        dnf module enable -y nvidia-driver:${DRIVER_BRANCH} && \
	        dnf install -y nvidia-fabric-manager-${DRIVER_VERSION} libnvidia-nscq-${DRIVER_BRANCH}-${DRIVER_VERSION} ; \
	    fi \
	    && . /etc/os-release && if [ "${ID}" == "rhel" ]; then \
	        # Install rhc connect for insights telemetry gathering
	        dnf install -y rhc rhc-worker-playbook; \
	        # Adding rhel ai identity to os-release file for insights usage
	        sed -i -e "/^VARIANT=/ {s/^VARIANT=.*/VARIANT=\"RHEL AI\"/; t}" -e "\$aVARIANT=\"RHEL AI\"" /usr/lib/os-release; \
	        sed -i -e "/^VARIANT_ID=/ {s/^VARIANT_ID=.*/VARIANT_ID=rhel_ai/; t}" -e "\$aVARIANT_ID=rhel_ai" /usr/lib/os-release; \
	        sed -i -e "/^RHEL_AI_VERSION_ID=/ {s/^RHEL_AI_VERSION_ID=.*/RHEL_AI_VERSION_ID='${IMAGE_VERSION_ID}'/; t}" -e "\$aRHEL_AI_VERSION_ID='${IMAGE_VERSION_ID}'" /usr/lib/os-release; \
	        # disable auto upgrade service
	        rm -f /usr/lib/systemd/system/default.target.wants/bootc-fetch-apply-updates.timer; \
	    fi \
	    && dnf clean all \
	    && ln -s ../cloud-init.target /usr/lib/systemd/system/default.target.wants \
	    && mv /etc/selinux.tmp /etc/selinux \
	    && ln -s /usr/lib/systemd/system/nvidia-toolkit-firstboot.service /usr/lib/systemd/system/basic.target.wants/nvidia-toolkit-firstboot.service \
	    && echo "blacklist nouveau" > /etc/modprobe.d/blacklist_nouveau.conf \
	    && sed -i '/\[Unit\]/a ConditionDirectoryNotEmpty=/proc/driver/nvidia-nvswitch/devices' /usr/lib/systemd/system/nvidia-fabricmanager.service \
	    && ln -s /usr/lib/systemd/system/nvidia-fabricmanager.service /etc/systemd/system/multi-user.target.wants/nvidia-fabricmanager.service \
	    && ln -s /usr/lib/systemd/system/nvidia-persistenced.service /etc/systemd/system/multi-user.target.wants/nvidia-persistenced.service

	# Optional public key for root SSH access; nothing is changed when unset.
	ARG SSHPUBKEY

	# The --build-arg "SSHPUBKEY=$(cat ~/.ssh/id_rsa.pub)" option inserts your
	# public key into the image, allowing root access via ssh.
	# sshd is pointed at /usr/ssh/%u.keys in addition to the per-user
	# authorized_keys files. The key is written quoted so that its whitespace is
	# preserved and no glob characters in it are expanded by the shell.
	RUN if [ -n "${SSHPUBKEY}" ]; then \
	        set -eu; mkdir -p /usr/ssh && \
	        echo 'AuthorizedKeysFile /usr/ssh/%u.keys .ssh/authorized_keys .ssh/authorized_keys2' >> /etc/ssh/sshd_config.d/30-auth-system.conf && \
	        echo "${SSHPUBKEY}" > /usr/ssh/root.keys && chmod 0600 /usr/ssh/root.keys; \
	    fi

	# Setup /usr/lib/containers/storage as an additional store for images.
	# Remove once the base images have this set by default.
	# Also make sure not to duplicate if a base image already has it specified.
	# grep -q succeeds when the path is already present, in which case the `||`
	# short-circuits and sed is not run; otherwise sed appends the path on the
	# line after each line matching "additionalimage".
	RUN grep -q /usr/lib/containers/storage /etc/containers/storage.conf || \
	        sed -i -e '/additionalimage.*/a "/usr/lib/containers/storage",' \
	            /etc/containers/storage.conf

	# Install the ilab wrapper script and mark it executable.
	COPY duplicated/ilab-wrapper/ilab /usr/bin/ilab
	RUN chmod +x /usr/bin/ilab

	# Image reference substituted into the wrapper, and the id of the build
	# secret used when pulling that image.
	ARG INSTRUCTLAB_IMAGE="quay.io/ai-lab/instructlab-nvidia:latest"
	ARG INSTRUCTLAB_IMAGE_PULL_SECRET="instructlab-nvidia-pull"

	# Fill in the placeholders in every /usr/bin/ilab* file: the training device
	# becomes "cuda", the container device "nvidia.com/gpu=all", and the image
	# name the value of INSTRUCTLAB_IMAGE.
	RUN for wrapper in /usr/bin/ilab*; do \
	        sed -i \
	            -e 's/__REPLACE_TRAIN_DEVICE__/cuda/' \
	            -e 's/__REPLACE_CONTAINER_DEVICE__/nvidia.com\/gpu=all/' \
	            -e "s%__REPLACE_IMAGE_NAME__%${INSTRUCTLAB_IMAGE}%" \
	            $wrapper; \
	    done

	# Added for running as an OCI Container to prevent Overlay on Overlay issues.
	VOLUME /var/lib/containers

	# Pre-pull the InstructLab image into the image store rooted at
	# /usr/lib/containers/storage. The first source that applies is used:
	#   1. An OCI layout at /run/.input/instructlab-nvidia: pulled from disk and
	#      then tagged as ${INSTRUCTLAB_IMAGE}.
	#   2. A .dockerconfigjson under /run/secrets/${INSTRUCTLAB_IMAGE_PULL_SECRET}:
	#      pulled from the registry using it as the auth file.
	#   3. Otherwise: pulled from the registry without an auth file.
	# Afterwards /usr/lib/containers is made readable (and directories
	# traversable) for all users.
	# NOTE(review): branches 2 and 3 run podman through sudo while branch 1 does
	# not, although this stage already runs as root -- confirm sudo is needed.
	RUN --mount=type=secret,id=${INSTRUCTLAB_IMAGE_PULL_SECRET}/.dockerconfigjson \
	if [ -f "/run/.input/instructlab-nvidia/oci-layout" ]; then \
	IID=$(podman --root /usr/lib/containers/storage --storage-opt overlay.force_mask=shared pull oci:/run/.input/instructlab-nvidia) && \
	podman --root /usr/lib/containers/storage --storage-opt overlay.force_mask=shared image tag ${IID} ${INSTRUCTLAB_IMAGE}; \
	elif [ -f "/run/secrets/${INSTRUCTLAB_IMAGE_PULL_SECRET}/.dockerconfigjson" ]; then \
	IID=$(sudo podman --root /usr/lib/containers/storage pull --storage-opt overlay.force_mask=shared --authfile /run/secrets/${INSTRUCTLAB_IMAGE_PULL_SECRET}/.dockerconfigjson ${INSTRUCTLAB_IMAGE}); \
	else \
	IID=$(sudo podman --root /usr/lib/containers/storage --storage-opt overlay.force_mask=shared pull ${INSTRUCTLAB_IMAGE}); \
	fi && \
	chmod -R a+rX /usr/lib/containers

	# Ship a containers storage.conf under /etc/skel (presumably so that newly
	# created users inherit it as ~/.config/containers/storage.conf -- confirm).
	COPY containers-storage.conf /etc/skel/.config/containers/storage.conf

	# Reset podman's default storage without prompting; stderr is discarded.
	# NOTE(review): presumably clears state left behind by the pull step, which
	# used --root /usr/lib/containers/storage instead -- confirm intent.
	RUN podman system reset --force 2>/dev/null

	# Record the IMAGE_VERSION_ID build argument as an image label.
	LABEL image_version_id="${IMAGE_VERSION_ID}"