File size: 3,661 Bytes
42b3a03
1ad8acc
42b3a03
1ad8acc
 
42b3a03
a9c928a
 
 
 
42b3a03
1ad8acc
 
42b3a03
 
a9c928a
 
 
 
 
 
 
 
42b3a03
 
1ad8acc
42b3a03
1ad8acc
a9c928a
42b3a03
 
 
a9c928a
 
 
42b3a03
1ad8acc
 
42b3a03
 
 
 
 
 
1ad8acc
42b3a03
 
 
a9c928a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ad8acc
 
 
42b3a03
1ad8acc
 
42b3a03
 
a9c928a
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# ===== Use the official Jupyter base image =====
# Pinned to an explicit tag instead of the moving `latest`, so a rebuild
# resolves the same Python/JupyterLab stack instead of whatever was pushed last.
FROM jupyter/base-notebook:python-3.11

# Switch to root only for the installation steps below
USER root

# ===== Install Java 17 and utilities =====
# The distro OpenJDK package installs into a directory named after the build
# architecture (java-17-openjdk-<arch>). JAVA_HOME below and the kernel specs
# later in this file use the amd64 name, so when that path is missing (non-amd64
# build) a compatibility symlink with that name is created to keep it valid.
# The check is wrapped in `if` so an earlier failure in the && chain is never masked.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        openjdk-17-jdk \
        wget && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* && \
    if [ ! -e /usr/lib/jvm/java-17-openjdk-amd64 ]; then \
        ln -s "/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture)" \
              /usr/lib/jvm/java-17-openjdk-amd64; \
    fi

ENV JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
ENV PATH=$JAVA_HOME/bin:$PATH

# ===== Install Apache Spark 3.5.0 =====
# SPARK_VERSION drives both the distribution name and the download URL.
ENV SPARK_VERSION=3.5.0
ENV SPARK_DIST=spark-${SPARK_VERSION}-bin-hadoop3
ENV SPARK_URL=https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_DIST}.tgz

# Download, unpack into /usr/local and rename to the stable path /usr/local/spark.
# -f makes curl exit non-zero on an HTTP error instead of saving the error page
# as the tarball; -sS hides the progress meter but still prints errors;
# --retry covers transient failures of the archive mirror.
# The tarball is removed in the same layer so it never ends up in the image.
RUN curl -fsSL --retry 3 -o /tmp/spark.tgz "${SPARK_URL}" && \
    tar -xzf /tmp/spark.tgz -C /usr/local/ && \
    rm /tmp/spark.tgz && \
    mv "/usr/local/${SPARK_DIST}" /usr/local/spark

# ===== Spark environment =====
ENV SPARK_HOME=/usr/local/spark
# $JAVA_HOME/bin is already on PATH from the Java section above, so only the
# Spark bin/sbin directories are prepended here.
ENV PATH=$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH
ENV PYSPARK_PYTHON=python3
# The py4j archive name must match the one shipped in $SPARK_HOME/python/lib;
# Spark 3.5.0 normally bundles py4j 0.10.9.7.
# ${PYTHONPATH:+:...} appends the previous value only when one is set, so an
# unset PYTHONPATH no longer leaves a trailing ':' (an empty path entry).
ENV PYTHONPATH=$SPARK_HOME/python/:$SPARK_HOME/python/lib/py4j-0.10.9.7-src.zip${PYTHONPATH:+:${PYTHONPATH}}

# ===== Install Python libraries =====
# NOTE: if you want pyspark from pip, use pyspark==3.5.0 so it matches the Spark binaries.
# 'pyspark' is deliberately NOT installed via pip, to avoid a mismatch with the
# Spark distribution installed under /usr/local/spark.
# This step runs as root, so the files pip writes are root-owned. fix-permissions
# (the helper shipped with the Jupyter Docker Stacks images) makes the conda prefix
# and the user's home group-writable again so the notebook user can still manage packages.
RUN pip install --no-cache-dir pandas matplotlib findspark ipykernel jupyterlab && \
    fix-permissions "${CONDA_DIR}" && \
    fix-permissions "/home/${NB_USER}"

# ===== Fix permissions on the Jupyter home directory =====
# Pre-create the Jupyter runtime directory, hand the whole home tree to
# UID 1000 / GID 100, then open the mode bits on everything underneath it.
# NOTE(review): 777 is world-writable; presumably chosen so the container also
# works when started under an arbitrary UID — confirm before tightening.
RUN <<'EOF'
set -e
mkdir -p /home/jovyan/.local/share/jupyter/runtime
chown -R 1000:100 /home/jovyan
chmod -R 777 /home/jovyan
EOF

# ===== Prepare the working directory and the Spark log directories =====
# One log directory per kernel; the whole /workspace tree is then made
# readable/writable/executable for every user.
RUN mkdir -p /workspace/spark_logs/kernel1 && \
    mkdir -p /workspace/spark_logs/kernel2 && \
    chmod -R 777 /workspace
# Subsequent instructions and the running container start in /workspace.
WORKDIR /workspace

# ===== Create two isolated Spark kernels =====
# Each kernel gets its own kernelspec directory under the system-wide Jupyter
# kernels path.
RUN mkdir -p \
        /usr/local/share/jupyter/kernels/spark_kernel1 \
        /usr/local/share/jupyter/kernels/spark_kernel2

# Kernel 1: launches ipykernel with the Spark environment and writes its
# Spark logs / local scratch data to /workspace/spark_logs/kernel1.
# The quoted heredoc delimiter keeps {connection_file} and the rest of the JSON literal.
RUN cat <<'JSON' > /usr/local/share/jupyter/kernels/spark_kernel1/kernel.json
{
  "argv": ["python3", "-m", "ipykernel_launcher", "-f", "{connection_file}"],
  "display_name": "Spark Kernel 1",
  "language": "python",
  "env": {
    "JAVA_HOME": "/usr/lib/jvm/java-17-openjdk-amd64",
    "SPARK_HOME": "/usr/local/spark",
    "PYSPARK_PYTHON": "python3",
    "SPARK_LOG_DIR": "/workspace/spark_logs/kernel1",
    "SPARK_LOCAL_DIRS": "/workspace/spark_logs/kernel1",
    "PYTHONPATH": "/usr/local/spark/python/:/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip"
  }
}
JSON

# Kernel 2: identical to kernel 1 except for the display name and the
# log / local directory (/workspace/spark_logs/kernel2).
RUN cat <<'JSON' > /usr/local/share/jupyter/kernels/spark_kernel2/kernel.json
{
  "argv": ["python3", "-m", "ipykernel_launcher", "-f", "{connection_file}"],
  "display_name": "Spark Kernel 2",
  "language": "python",
  "env": {
    "JAVA_HOME": "/usr/lib/jvm/java-17-openjdk-amd64",
    "SPARK_HOME": "/usr/local/spark",
    "PYSPARK_PYTHON": "python3",
    "SPARK_LOG_DIR": "/workspace/spark_logs/kernel2",
    "SPARK_LOCAL_DIRS": "/workspace/spark_logs/kernel2",
    "PYTHONPATH": "/usr/local/spark/python/:/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip"
  }
}
JSON

# ===== Switch back to the default Jupyter user =====
USER jovyan

# ===== Hugging Face port (7860) =====
EXPOSE 7860

# ===== Start JupyterLab on port 7860 =====
# ServerApp flags are used for current jupyter_server/jupyterlab; the
# NotebookApp equivalents are passed as well.
# `exec` replaces the bash wrapper with the Jupyter process, so stop signals
# reach Jupyter directly instead of an intermediate shell.
# NOTE(review): the empty token/password disables authentication; presumably
# intentional for this deployment — confirm the port is not publicly reachable.
CMD ["bash", "-c", "echo 'JupyterLab berjalan di port 7860' && exec jupyter lab --ip=0.0.0.0 --port=7860 --no-browser --ServerApp.token='' --ServerApp.password='' --NotebookApp.token='' --NotebookApp.password='' --LabApp.default_url=/lab"]