# Manually-triggered workflow that provisions a GPU runner and opens an SSH
# session into it (via Tailscale, configured in the ssh_runner job below).
name: SSH into our runners

on:
  workflow_dispatch:
    inputs:
      runner_type:
        description: 'Type of runner to test (a10)'
        required: true
      docker_image:
        description: 'Name of the Docker image'
        required: true
      num_gpus:
        description: 'Type of the number of gpus to use (`single` or `multi`)'
        required: true

env:
  HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
  HF_HOME: /mnt/cache
  # Values are quoted so YAML 1.1-style parsers do not coerce "yes"/"true"
  # into booleans or digit runs into integers — the consumers read these as
  # literal environment-variable strings.
  TRANSFORMERS_IS_CI: "yes"
  OMP_NUM_THREADS: "8"
  MKL_NUM_THREADS: "8"
  RUN_SLOW: "yes"
  TF_FORCE_GPU_ALLOW_GROWTH: "true"
  CUDA_VISIBLE_DEVICES: "0,1"
jobs:
  # Map the (num_gpus, runner_type) inputs to a concrete runner group and
  # expose it as a job output consumed by ssh_runner's `runs-on`.
  get_runner:
    name: "Get runner to use"
    runs-on: ubuntu-22.04
    outputs:
      RUNNER: ${{ steps.set_runner.outputs.RUNNER }}
    steps:
      # Single step that both selects and publishes the runner name.
      # (Previously this round-tripped through $GITHUB_ENV in one step and
      # copied into $GITHUB_OUTPUT in a second step — needless indirection.)
      - name: Get runner to use
        id: set_runner
        shell: bash
        env:
          NUM_GPUS: ${{ github.event.inputs.num_gpus }}
          RUNNER_TYPE: ${{ github.event.inputs.runner_type }}
        run: |
          if [[ "$NUM_GPUS" == "single" && "$RUNNER_TYPE" == "a10" ]]; then
            RUNNER="aws-g5-4xlarge-cache-ssh"
          elif [[ "$NUM_GPUS" == "multi" && "$RUNNER_TYPE" == "a10" ]]; then
            RUNNER="aws-g5-12xlarge-cache-ssh"
          else
            # Unknown combination: publish an empty value so the downstream
            # job fails fast instead of landing on an arbitrary runner.
            RUNNER=""
          fi
          echo "Selected runner: $RUNNER"
          echo "RUNNER=$RUNNER" >> "$GITHUB_OUTPUT"
| | ssh_runner: |
| | name: "SSH" |
| | needs: get_runner |
| | runs-on: |
| | group: ${{ needs.get_runner.outputs.RUNNER }} |
| | container: |
| | image: ${{ github.event.inputs.docker_image }} |
| | steps: |
| | - name: Update clone |
| | working-directory: /transformers |
| | env: |
| | commit_sha: ${{ github.sha }} |
| | run: | |
| | git fetch && git checkout "$commit_sha" |
| | |
| | - name: Cleanup |
| | working-directory: /transformers |
| | run: | |
| | rm -rf tests/__pycache__ |
| | rm -rf tests/models/__pycache__ |
| | rm -rf reports |
| | |
| | - name: Show installed libraries and their versions |
| | working-directory: /transformers |
| | run: pip freeze |
| |
|
| | - name: NVIDIA-SMI |
| | run: | |
| | nvidia-smi |
| | |
| | - name: Create python alias |
| | run: | |
| | ln -sf $(which python3) /usr/local/bin/python |
| | ln -sf $(which pip3) /usr/local/bin/pip |
| | echo "✅ python -> python3 symlink created" |
| | |
| | - name: Install psutil for memory monitor |
| | run: | |
| | pip install psutil --break-system-packages |
| | |
| | - name: Download memory monitor script |
| | working-directory: /transformers |
| | run: | |
| | apt-get update && apt-get install -y curl |
| | curl -o memory_monitor.py https://raw.githubusercontent.com/huggingface/transformers/refs/heads/utility_scripts/utils/memory_monitor.py |
| | |
| | - name: Start memory monitor |
| | working-directory: /transformers |
| | continue-on-error: true |
| | run: | |
| | python3 memory_monitor.py --threshold 90 --interval 1 > memory_monitor.log 2>&1 & |
| | echo $! > memory_monitor.pid |
| | echo "Memory monitor started with PID $(cat memory_monitor.pid)" |
| | # Give it a moment to start |
| | sleep 2 |
| | # Verify it's running |
| | ps aux | grep memory_monitor | grep -v grep || echo "Warning: memory monitor may not be running" |
| | |
| | - name: Install utilities |
| | run: | |
| | apt-get install -y nano |
| | |
| | - name: Store Slack infos |
| | |
| | shell: bash |
| | env: |
| | GITHUB_ACTOR: ${{ github.actor }} |
| | run: | |
| | echo "$GITHUB_ACTOR" |
| | github_actor=$GITHUB_ACTOR |
| | github_actor=${github_actor/'-'/'_'} |
| | echo "$github_actor" |
| | echo "github_actor=$github_actor" >> $GITHUB_ENV |
| | |
| | - name: Setup automatic environment for SSH login |
| | run: | |
| | # Create shared environment setup |
| | cat > /root/.env_setup << 'EOF' |
| | # Auto-setup (non-sensitive vars) |
| | export HF_HOME=/mnt/cache |
| | export TRANSFORMERS_IS_CI=yes |
| | export OMP_NUM_THREADS=8 |
| | export MKL_NUM_THREADS=8 |
| | export RUN_SLOW=yes |
| | export TF_FORCE_GPU_ALLOW_GROWTH=true |
| | export CUDA_VISIBLE_DEVICES=0,1 |
| | |
| | cd /transformers 2>/dev/null || true |
| | |
| | |
| | if [ -z "$HF_TOKEN" ]; then |
| | echo "⚠️ HF_TOKEN not set. Set it with:" |
| | echo " export HF_TOKEN=hf_xxxxx" |
| | else |
| | echo "✅ HF_TOKEN is set" |
| | fi |
| | |
| | echo "📁 Working directory: $(pwd)" |
| | EOF |
| | |
| | |
| | echo 'source /root/.env_setup' >> /root/.bash_profile |
| | echo 'source /root/.env_setup' >> /root/.bashrc |
| |
|
| | - name: Store Slack infos |
| | |
| | shell: bash |
| | env: |
| | user_slack_id: ${{ secrets[format('{0}_{1}', env.github_actor, 'SLACK_ID')] }} |
| | default_slack_channel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }} |
| | run: | |
| | echo "$github_actor" |
| | if [ "$user_slack_id" != "" ]; then |
| | echo "SLACKCHANNEL=$user_slack_id" >> $GITHUB_ENV |
| | else |
| | echo "SLACKCHANNEL=$default_slack_channel" >> $GITHUB_ENV |
| | fi |
| | |
| | - name: Tailscale |
| | uses: huggingface/tailscale-action@main |
| | with: |
| | authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }} |
| | slackChannel: ${{ env.SLACKCHANNEL }} |
| | slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} |
| | waitForSSH: true |
| | sshTimeout: 15m |
| |
|