File size: 2,900 Bytes
e057d08
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# =============================================================================
# SAP RPT-1 Benchmarking - Multi-stage Dockerfile
# =============================================================================
# Builds two targets:
#   - sap-rpt1: Python 3.11 with SAP RPT-1 OSS + all dependencies
#   - baselines: Python 3.11 with XGBoost, CatBoost, LightGBM
#
# Usage:
#   docker-compose build
#   docker-compose run sap-rpt1
#   docker-compose run baselines
# =============================================================================

# ---------- Base stage: common foundation for every target ----------
FROM python:3.11-slim AS base

# OS packages: git plus a C/C++ build toolchain (presumably needed by pip
# for VCS installs and source builds -- confirm against requirements.txt).
# The apt package index is deleted in the same layer so it never ends up
# in the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        git \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Only the requirements file is copied at this point, so edits to project
# code do not invalidate the cached dependency layers built on top of it.
COPY requirements.txt ./

# ---------- Target: sap-rpt1 ----------
FROM base AS sap-rpt1

# Layer 1: numerical / plotting stack. Kept in its own layer so the large
# wheels are cached independently of the layers below. The long timeout and
# retries make the download tolerant of slow or flaky connections.
RUN pip install --no-cache-dir --retries 5 --default-timeout=1000 \
    matplotlib==3.9.2 \
    numpy==1.26.4 \
    pandas==2.2.3 \
    scikit-learn==1.6.1 \
    scipy==1.14.1 \
    seaborn==0.13.2

# Layer 2: PyTorch (CPU-only wheel, served from the PyTorch wheel index that
# is added next to PyPI) together with the Hugging Face libraries.
RUN pip install --no-cache-dir --retries 5 --default-timeout=1000 \
    --extra-index-url https://download.pytorch.org/whl/cpu \
    accelerate==1.6.0 \
    datasets==3.5.0 \
    huggingface-hub==0.30.2 \
    pyarrow==20.0.0 \
    torch==2.7.0+cpu \
    torcheval==0.0.7 \
    transformers==4.52.4

# Layer 3: whatever requirements.txt lists (copied into /app by the base
# stage); per the file header this includes SAP RPT-1 itself.
RUN pip install --no-cache-dir --retries 5 --default-timeout=1000 -r requirements.txt

# The project sources are added after all dependency layers so that code
# changes only rebuild from this point on.
COPY . /app

# Make modules under /app/code importable and run from that directory.
ENV PYTHONPATH=/app/code

WORKDIR /app/code

# `python` is the fixed entrypoint; CMD only supplies default arguments, so
# anything passed after the service name on `docker-compose run` replaces them.
ENTRYPOINT ["python"]
CMD ["-m", "runners.run_experiment", "--dataset", "adult", "--model", "sap-rpt1-hf"]

# ---------- Baselines target ----------
# Image for the gradient-boosting baselines (XGBoost, CatBoost, LightGBM).
# Every package is pinned to an exact version so that rebuilding the image
# at a later date reproduces the same benchmark environment.
FROM base AS baselines

# Install core scientific stack (heavy packages); same pins as the
# sap-rpt1 target so both images share identical numerical libraries.
RUN pip install --default-timeout=1000 --retries 5 --no-cache-dir \
    numpy==1.26.4 \
    pandas==2.2.3 \
    scikit-learn==1.6.1 \
    scipy==1.14.1

# Install visualization and utilities
RUN pip install --default-timeout=1000 --retries 5 --no-cache-dir \
    matplotlib==3.9.2 \
    seaborn==0.13.2 \
    pyyaml==6.0.2 \
    tqdm==4.67.1 \
    joblib==1.4.2 \
    python-dotenv==1.0.1

# Install ML frameworks and OpenML.
# The three boosting libraries were previously unpinned, so each rebuild
# could silently pick up a new release and shift the baseline numbers.
# NOTE(review): the versions below were chosen for compatibility with the
# numpy 1.26.4 / scikit-learn 1.6.1 pins above -- confirm they match the
# versions used for any already-published results before merging.
RUN pip install --default-timeout=1000 --retries 5 --no-cache-dir \
    openml==0.14.2 \
    xgboost==2.1.4 \
    catboost==1.2.8 \
    lightgbm==4.6.0

# Copy project code (after dependency layers, so code edits do not
# trigger a dependency reinstall)
COPY . /app

# Set Python path so modules under /app/code are importable
ENV PYTHONPATH=/app/code

WORKDIR /app/code

# Set entrypoint so you can run via arguments natively: arguments given on
# `docker-compose run baselines ...` replace the default CMD below.
ENTRYPOINT ["python"]
CMD ["-m", "runners.run_batch", "--datasets", "config/datasets.yaml", "--models", "config/models.yaml"]