#!/usr/bin/env bash
# Submit a GPU-node install/build job for scratch nemotron-ocr-v2.
#
# Generates a one-off sbatch script under $BASE, submits it, and prints the
# job id together with a ready-to-copy `tail -f` command for its stdout log.
#
# Usage:
#   ./submit_install_gpu.sh
#
# Environment (optional):
#   SLURM_TIME — wall time in minutes (default: 45)
#   JOB_NAME   — sbatch job name (default: install_ocr_v2)
#
# Exit status:
#   0 — job submitted and a numeric job id was obtained
#   1 — package directory missing, sbatch failed, or no job id was returned

set -euo pipefail

BASE="/lustre/fsw/portfolios/datascience/users/rchesler/scratch/nemotron-ocr-v2"
PKG="$BASE/nemotron-ocr"

ACCOUNT="datascience_nemo_retriever"
PARTITIONS="batch_block1,batch_block3,batch_block4"
SLURM_TIME="${SLURM_TIME:-45}"
JOB_NAME="${JOB_NAME:-install_ocr_v2}"

TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
LOGS="$BASE/inference_output/install_logs"

# Fail before queueing: the job itself does `cd "$PKG"` and would otherwise
# only report a missing checkout after it had been scheduled.
if [[ ! -d "$PKG" ]]; then
  echo " package directory not found: $PKG" >&2
  exit 1
fi

mkdir -p "$LOGS"

JOB="$BASE/.job_install_${TIMESTAMP}.sh"

# The heredoc delimiter is intentionally unquoted: ${JOB_NAME}, ${ACCOUNT},
# ${PARTITIONS}, ${SLURM_TIME}, ${LOGS} and ${PKG} are expanded now, at
# generation time, while \$HOME and \$PATH are escaped so that they are
# expanded later by the job's own shell.
cat > "$JOB" << ENDSCRIPT
#!/bin/bash
#SBATCH --job-name=${JOB_NAME}
#SBATCH --account=${ACCOUNT}
#SBATCH --partition=${PARTITIONS}
#SBATCH --nodes=1
#SBATCH --gpus-per-node=1
#SBATCH --time=${SLURM_TIME}
#SBATCH --output=${LOGS}/install_%j.out
#SBATCH --error=${LOGS}/install_%j.err
set -euo pipefail

source /etc/profile.d/modules.sh 2>/dev/null || true
if command -v module >/dev/null 2>&1; then
  module load cuda12.2/toolkit/12.2.2 || true
fi

export PATH="\$HOME/.local/bin:\$PATH"
cd "${PKG}"

uv venv
source .venv/bin/activate
uv pip install torch torchvision --index-url https://download.pytorch.org/whl/cu128

rm -f src/nemotron_ocr_cpp/_nemotron_ocr_cpp*.so
BUILD_CPP_FORCE=1 uv pip install -e . -v

python -c "
import torch
print(f'torch={torch.__version__} cuda_available={torch.cuda.is_available()}')
if torch.cuda.is_available():
    print(f'gpu={torch.cuda.get_device_name(0)}')
import nemotron_ocr, nemotron_ocr_cpp
from nemotron_ocr.inference.pipeline_v2 import NemotronOCRV2
print('nemotron_ocr import OK; nemotron_ocr_cpp OK; NemotronOCRV2 OK')
"
ENDSCRIPT
chmod +x "$JOB"

echo "============================================================"
echo " GPU install job (scratch clone)"
echo "============================================================"
echo " package: $PKG"
echo " logs: $LOGS"
echo " job file: $JOB"
echo "============================================================"

# --parsable makes sbatch print only "<jobid>" (or "<jobid>;<cluster>") on
# stdout, so no scraping of the human-readable banner is needed. stderr is
# left attached to the terminal so sbatch's own diagnostics stay visible and
# can never be mistaken for a job id.
if ! SUBMIT_OUT=$(sbatch --parsable "$JOB"); then
  echo " sbatch failed or did not return a job id" >&2
  exit 1
fi

# Drop the optional ";<cluster>" suffix.
JID="${SUBMIT_OUT%%;*}"

if [[ "$JID" =~ ^[0-9]+$ ]]; then
  echo " submitted job ID: $JID"
  echo " tail: tail -f $LOGS/install_${JID}.out"
else
  echo " sbatch failed or did not return a job id" >&2
  # Show what sbatch actually printed to help diagnose the unexpected reply.
  if [[ -n "$SUBMIT_OUT" ]]; then
    echo " sbatch output: $SUBMIT_OUT" >&2
  fi
  exit 1
fi