# maurocarlu's picture
# Enhance DVC pull with retry logic and cache cleanup for improved reliability
# c654a00
# CI workflow: lints and unit-tests the project, then pulls the DVC-tracked
# model files and builds the application Docker image.
name: CI Pipeline

# Triggered by pushes to main and feature/* branches, and by pull requests
# that target main.
on:
  push:
    branches: ["main", "feature/*"]
  pull_request:
    branches: ["main"]

jobs:
  # Installs the project with its dependencies, runs the linter and the
  # unit-test suite, and uploads the HTML test report when the job fails.
  unit-tests:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # Delete preinstalled toolchains and Docker images from the runner to
      # reclaim disk space before installing dependencies.
      - name: Free Disk Space
        run: |
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /opt/ghc
          sudo rm -rf /opt/hostedtoolcache/CodeQL
          sudo docker image prune --all --force

      - name: Set up Python 3.10
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML does not read the version as the float 3.1.
          python-version: "3.10"
          cache: 'pip'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          # Install CPU-only PyTorch to save space (we don't need CUDA for tests)
          pip install torch --index-url https://download.pytorch.org/whl/cpu
          # Install other dependencies
          pip install -r requirements.txt --no-cache-dir
          pip install -e .

      - name: Lint with Ruff
        run: |
          make lint

      # Runs only tests marked "unit" and writes a self-contained HTML report.
      - name: Run Unit Tests
        run: |
          pytest tests/unit/ -v -m unit --html=report.html --self-contained-html

      # Preserved contribution from Antonio Fratta
      # The report is uploaded only when an earlier step in this job failed.
      - name: Upload Test Report
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: test-report
          path: report.html

  # Pulls the required model files with DVC and builds the Docker image.
  # Starts only after the unit-tests job has succeeded.
  build-image:
    needs: unit-tests
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # Delete preinstalled toolchains and Docker images from the runner to
      # reclaim disk space before pulling models and building the image.
      - name: Free Disk Space
        run: |
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /opt/ghc
          sudo rm -rf /opt/hostedtoolcache/CodeQL
          sudo docker image prune --all --force

      - name: Set up Python 3.10
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML does not read the version as the float 3.1.
          python-version: "3.10"
          cache: 'pip'

      - name: Install DVC
        run: |
          python -m pip install --upgrade pip
          pip install dvc dvc-s3

      # Credentials are handed to the script through environment variables and
      # expanded inside double quotes, so a value containing whitespace or
      # shell metacharacters cannot alter the command line.
      - name: Configure DVC
        env:
          DAGSHUB_USERNAME: ${{ secrets.DAGSHUB_USERNAME }}
          DAGSHUB_TOKEN: ${{ secrets.DAGSHUB_TOKEN }}
        run: |
          dvc remote modify origin --local auth basic
          dvc remote modify origin --local user "$DAGSHUB_USERNAME"
          dvc remote modify origin --local password "$DAGSHUB_TOKEN"

      # Tries the pull up to max_attempts times. Between attempts the local
      # DVC cache is wiped and the script waits 10 seconds; the final failure
      # aborts the step immediately without a further cleanup or sleep.
      - name: Pull Models with DVC
        run: |
          # Clean any potentially corrupted cache
          rm -rf .dvc/cache/files/md5/e1 || true
          # Retry logic for DVC pull to handle intermittent server errors
          max_attempts=3
          attempt=1
          until dvc pull models/random_forest_embedding_gridsearch.pkl models/label_names.pkl
          do
            if [ "$attempt" -ge "$max_attempts" ]; then
              echo "DVC pull failed after $max_attempts attempts"
              exit 1
            fi
            echo "DVC pull attempt $attempt failed. Retrying in 10 seconds..."
            # Clean cache on retry
            rm -rf .dvc/cache || true
            sleep 10
            attempt=$((attempt+1))
          done

      # Fails the job if either model file is missing after the pull.
      - name: Verify Models Downloaded
        run: |
          if [ ! -f "models/random_forest_embedding_gridsearch.pkl" ] || [ ! -f "models/label_names.pkl" ]; then
            echo "ERROR: Required model files not found after DVC pull"
            exit 1
          fi
          echo "All required model files present"

      - name: Build Docker Image
        run: |
          docker build -t hopcroft-app:latest .