ETL_pipeline/scripts/install_spark.py
#!/usr/bin/env python3
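"""Download Apache Spark and set it up for the current session (e.g. Colab).

Fetches the release tarball into ~/spark_install, unpacks it, links ~/spark
to the unpacked directory, and exports SPARK_HOME, PATH, and PYSPARK_PYTHON
for this process only.
"""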
import subprocess
import os
import urllib.request
import tarfile
from pathlib import Path


def run(cmd):
    """Run a shell command and capture its output for debugging."""
    print(f"Running: {cmd}")
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        print(f"Error: {result.stderr}")
        raise subprocess.CalledProcessError(result.returncode, cmd)
    print(f"Success: {result.stdout}")
    return result
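# Note: `run` is a generic shell helper that the install flow below does not
# call; an illustrative use would be run("java -version"), since Spark needs
# a JVM (Colab images typically ship with OpenJDK).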


def install_spark_colab():
    """Install Spark into the user's home directory (no sudo needed)."""
    spark_ver = "3.5.6"
    spark_dir = f"spark-{spark_ver}-bin-hadoop3"
    url = f"https://downloads.apache.org/spark/spark-{spark_ver}/spark-{spark_ver}-bin-hadoop3.tgz"
    archive = f"{spark_dir}.tgz"

    # Create a working directory under $HOME
    work_dir = Path.home() / "spark_install"
    work_dir.mkdir(exist_ok=True)
    os.chdir(work_dir)
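    # Note: os.chdir changes the working directory for the whole process, so
    # the relative paths below (the archive name and the extraction target)
    # resolve inside work_dir.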

    # Download the archive only if it is not already present
    if not Path(archive).exists():
        print(f"Downloading Spark {spark_ver}...")
        urllib.request.urlretrieve(url, archive)

    # Extract into the working directory
    print("Extracting archive...")
    with tarfile.open(archive, "r:gz") as tar:
        tar.extractall(path=".")

    # Create/refresh a symlink in the home directory (no sudo needed);
    # check is_symlink() too, since Path.exists() is False for a dangling link
    spark_home = Path.home() / "spark"
    if spark_home.is_symlink() or spark_home.exists():
        spark_home.unlink()
    spark_home.symlink_to(work_dir / spark_dir)

    # Set environment variables for the current session
    os.environ['SPARK_HOME'] = str(spark_home)
    os.environ['PATH'] = f"{spark_home}/bin:{spark_home}/sbin:{os.environ.get('PATH', '')}"
    os.environ['PYSPARK_PYTHON'] = 'python3'
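    # Note: these exports are visible only to this Python process and its
    # children (e.g. a spark-submit launched from here); a new shell or a
    # restarted kernel will not inherit them.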

    # Clean up the downloaded archive
    Path(archive).unlink(missing_ok=True)

    print(f"Spark installed to: {spark_home}")
    print("Environment variables set for this session!")
    print("\nTo verify installation:")
    print("!ls $SPARK_HOME")
    print("!spark-submit --version")
    return str(spark_home)
if __name__ == "__main__":
spark_path = install_spark_colab()
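
# A minimal smoke test, as a sketch: it assumes pyspark is importable in this
# interpreter (e.g. after `pip install pyspark==3.5.6`, matching the version
# downloaded above); the app name "smoke" is illustrative.
#
#     from pyspark.sql import SparkSession
#     spark = SparkSession.builder.master("local[*]").appName("smoke").getOrCreate()
#     print(spark.range(5).count())  # expect 5
#     spark.stop()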