|
|
|
|
|
import os
import subprocess
import tarfile
import urllib.error
import urllib.request
from pathlib import Path
|
|
|
|
|
def run(cmd):
    """Run a command, echo its captured output, and return the result.

    Args:
        cmd: Either a shell command string (executed through the shell) or
            a sequence of program arguments (executed directly, no shell).

    Returns:
        The ``subprocess.CompletedProcess`` with ``stdout`` and ``stderr``
        captured as text.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero. The
            captured ``stdout`` and ``stderr`` are attached to the exception
            so callers can inspect what went wrong.
    """
    print(f"Running: {cmd}")
    # Only go through the shell for string commands; with shell=True a list
    # would have just its first element executed on POSIX.
    result = subprocess.run(
        cmd,
        shell=isinstance(cmd, str),
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        print(f"Error: {result.stderr}")
        raise subprocess.CalledProcessError(
            result.returncode,
            cmd,
            output=result.stdout,
            stderr=result.stderr,
        )
    print(f"Success: {result.stdout}")
    return result
|
|
|
|
|
def _download_spark(spark_ver, archive_name, dest):
    """Download the Spark tarball to *dest*, trying the Apache archive as a fallback."""
    # downloads.apache.org only hosts current releases; superseded versions
    # are moved to archive.apache.org.
    candidates = (
        f"https://downloads.apache.org/spark/spark-{spark_ver}/{archive_name}",
        f"https://archive.apache.org/dist/spark/spark-{spark_ver}/{archive_name}",
    )
    # Download to a temporary name so an interrupted transfer never leaves a
    # truncated file that a later run would mistake for a complete archive.
    partial = dest.with_name(dest.name + ".part")
    last_error = None
    try:
        for url in candidates:
            try:
                urllib.request.urlretrieve(url, str(partial))
            except urllib.error.HTTPError as err:
                last_error = err
            else:
                os.replace(partial, dest)
                return
        raise last_error
    finally:
        partial.unlink(missing_ok=True)


def _extract_archive(archive, dest_dir):
    """Extract the gzip-compressed tar *archive* into *dest_dir*."""
    with tarfile.open(archive, "r:gz") as tar:
        if hasattr(tarfile, "data_filter"):
            # Reject absolute paths, path traversal and special files.
            tar.extractall(path=dest_dir, filter="data")
        else:
            # Older Python without extraction-filter support.
            tar.extractall(path=dest_dir)


def _replace_symlink(link, target):
    """Point *link* at *target*, replacing an existing symlink or file."""
    # is_symlink() is checked explicitly because exists() follows links and
    # reports False for a dangling symlink, which would make symlink_to fail.
    if link.is_symlink() or link.is_file():
        link.unlink()
    elif link.exists():
        raise FileExistsError(
            f"{link} exists and is not a symlink; remove it manually before installing"
        )
    link.symlink_to(target)


def install_spark_colab(spark_ver="3.5.6"):
    """Download and unpack Apache Spark and expose it at ``~/spark``.

    The ``spark-<ver>-bin-hadoop3`` tarball is fetched into
    ``~/spark_install`` (unless an archive with that name is already there),
    extracted, and ``~/spark`` is made a symlink to the extracted directory.
    ``SPARK_HOME``, ``PATH`` and ``PYSPARK_PYTHON`` are then set in
    ``os.environ`` for the current process, and the downloaded archive is
    deleted. The process working directory is left unchanged.

    Args:
        spark_ver: Spark release to install, e.g. ``"3.5.6"``.

    Returns:
        The ``~/spark`` symlink path as a string.

    Raises:
        urllib.error.URLError: If the tarball cannot be downloaded.
        tarfile.TarError: If the archive cannot be read or extracted.
        FileExistsError: If ``~/spark`` is a real directory rather than a
            symlink or file.
    """
    spark_dir = f"spark-{spark_ver}-bin-hadoop3"

    work_dir = Path.home() / "spark_install"
    work_dir.mkdir(parents=True, exist_ok=True)
    archive = work_dir / f"{spark_dir}.tgz"

    if not archive.exists():
        print(f"Downloading Spark {spark_ver}...")
        _download_spark(spark_ver, archive.name, archive)

    print("Extracting archive...")
    _extract_archive(archive, work_dir)

    spark_home = Path.home() / "spark"
    _replace_symlink(spark_home, work_dir / spark_dir)

    os.environ['SPARK_HOME'] = str(spark_home)
    path_entries = [str(spark_home / "bin"), str(spark_home / "sbin")]
    current_path = os.environ.get('PATH', '')
    if current_path:
        # Skip an empty PATH: a trailing separator would put the current
        # directory on the search path.
        path_entries.append(current_path)
    os.environ['PATH'] = os.pathsep.join(path_entries)
    os.environ['PYSPARK_PYTHON'] = 'python3'

    # The tarball is no longer needed once extracted.
    archive.unlink(missing_ok=True)

    print(f"Spark installed to: {spark_home}")
    print("Environment variables set for this session!")
    print("\nTo verify installation:")
    print("!ls $SPARK_HOME")
    print("!spark-submit --version")

    return str(spark_home)
|
|
|
|
|
# Script entry point: run the installer and keep the returned Spark path.
if __name__ == "__main__":
    spark_path = install_spark_colab()