refactor(beam-utils): use direct file operations for beam volumes

This commit refactors the Beam volume interaction logic to use direct file operations (`pathlib`/`shutil` on the mounted volume directory) when running on the Beam platform. It removes the dependency on the `beam cp` command for file transfers, which simplifies the code and improves performance. It also adds a check that detects whether the code is running on Beam and adjusts the file operations accordingly.

Files changed (1) hide show

src/distiller/beam_utils.py +160 -103

src/distiller/beam_utils.py CHANGED Viewed

@@ -13,6 +13,7 @@ Features:
 - Distributed storage optimization
 """
 import json
 import logging
 import shutil
@@ -25,6 +26,52 @@ from typing import Any
 logger = logging.getLogger(__name__)
 class BeamVolumeManager:
 	"""Manager for Beam distributed storage volumes using direct file operations."""
@@ -792,7 +839,7 @@ def download_evaluation_results_from_beam(
 	local_results_dir: str = "code_model2vec/evaluation_results",
 ) -> bool:
 	"""
-	Download evaluation result files from Beam volume to local directory using beam cp.
 	Args:
 	    volume_name: Name of the Beam volume
@@ -806,46 +853,43 @@ def download_evaluation_results_from_beam(
 		local_path = Path(local_results_dir)
 		local_path.mkdir(parents=True, exist_ok=True)
-		# Use beam cp to download individual JSON files
-		remote_path = f"{volume_name}:{remote_results_dir}"
-		# First, list files in the remote directory
-		list_cmd = ["beam", "cp", "-r", "--list-only", remote_path]
-		try:
-			result = subprocess.run(list_cmd, capture_output=True, text=True, check=True)  # noqa: S603
-			remote_files = [line.strip() for line in result.stdout.split("\n") if line.strip().endswith(".json")]
-		except subprocess.CalledProcessError:
-			logger.warning(f"Could not list files in {remote_path}")
-			remote_files = []
-		# Download each JSON file individually
-		downloaded_files = []
-		for file_name in remote_files:
-			if file_name.endswith(".json"):
-				remote_file_path = f"{volume_name}:{remote_results_dir}/{file_name}"
-				local_file_path = local_path / file_name
 				try:
-					download_cmd = ["beam", "cp", remote_file_path, str(local_file_path)]
-					subprocess.run(download_cmd, check=True, capture_output=True)  # noqa: S603
-					downloaded_files.append(file_name)
-					logger.info(f"📥 Downloaded: {file_name}")
 					# Delete the file from Beam volume after successful download
-					delete_cmd = ["beam", "rm", remote_file_path]
-					try:
-						subprocess.run(delete_cmd, check=True, capture_output=True)  # noqa: S603
-						logger.info(f"🗑️ Deleted from volume: {file_name}")
-					except subprocess.CalledProcessError as e:
-						logger.warning(f"⚠️ Could not delete {file_name} from volume: {e}")
-				except subprocess.CalledProcessError as e:
-					logger.warning(f"⚠️ Failed to download {file_name}: {e}")
-		if downloaded_files:
-			logger.info(f"✅ Downloaded {len(downloaded_files)} evaluation result files")
 			return True
-		logger.info("ℹ️ No new evaluation files to download")
 		return True
 	except Exception:
@@ -861,7 +905,7 @@ def download_specific_evaluation_file(
 	file_prefix: str = "codesearchnet_eval",
 ) -> bool:
 	"""
-	Download a specific evaluation or benchmark result file from Beam volume.
 	Args:
 	    volume_name: Name of the Beam volume
@@ -881,28 +925,27 @@ def download_specific_evaluation_file(
 		safe_model_name = model_name.replace("/", "_")
 		filename = f"{file_prefix}_{safe_model_name}.json"
-		remote_file_path = f"{volume_name}:{remote_results_dir}/{filename}"
 		local_file_path = local_path / filename
-		# Download the specific file
-		download_cmd = ["beam", "cp", remote_file_path, str(local_file_path)]
-		subprocess.run(download_cmd, check=True, capture_output=True)  # noqa: S603
 		logger.info(f"📥 Downloaded {file_prefix} results for {model_name}")
 		# Delete the file from Beam volume after successful download
-		delete_cmd = ["beam", "rm", remote_file_path]
-		try:
-			subprocess.run(delete_cmd, check=True, capture_output=True)  # noqa: S603
-			logger.info(f"🗑️ Deleted {file_prefix} results for {model_name} from volume")
-		except subprocess.CalledProcessError as e:
-			logger.warning(f"⚠️ Could not delete {filename} from volume: {e}")
 		return True
-	except subprocess.CalledProcessError:
-		logger.warning(f"⚠️ No {file_prefix} results found for {model_name} on Beam")
-		return False
 	except Exception:
 		logger.exception(f"❌ Error downloading {file_prefix} results for {model_name}")
 		return False
@@ -914,7 +957,7 @@ def download_model_from_beam(
 	local_dir: str,
 ) -> bool:
 	"""
-	Download a model from Beam volume to local directory.
 	Args:
 	    volume_name: Name of the Beam volume
@@ -928,22 +971,27 @@ def download_model_from_beam(
 		local_path = Path(local_dir)
 		local_path.mkdir(parents=True, exist_ok=True)
-		# Use beam cp to download the model directory
-		remote_path = f"{volume_name}:models/{model_name}"
 		local_model_path = local_path / model_name
-		download_cmd = ["beam", "cp", "-r", remote_path, str(local_model_path)]
-		subprocess.run(download_cmd, check=True, capture_output=True)  # noqa: S603
 		logger.info(f"📥 Downloaded model {model_name} from Beam to {local_dir}")
 		return True
-	except subprocess.CalledProcessError as e:
 		logger.warning(f"⚠️ Failed to download model {model_name} from Beam: {e}")
 		return False
-	except Exception:
-		logger.exception(f"❌ Error downloading model {model_name} from Beam")
-		return False
 def upload_model_to_beam(
@@ -952,7 +1000,7 @@ def upload_model_to_beam(
 	local_dir: str,
 ) -> bool:
 	"""
-	Upload a model from local directory to Beam volume.
 	Args:
 	    volume_name: Name of the Beam volume
@@ -968,21 +1016,24 @@ def upload_model_to_beam(
 			logger.error(f"❌ Local model directory does not exist: {local_dir}")
 			return False
-		# Use beam cp to upload the model directory
-		remote_path = f"{volume_name}:models/{model_name}"
-		upload_cmd = ["beam", "cp", "-r", str(local_path), remote_path]
-		subprocess.run(upload_cmd, check=True, capture_output=True)  # noqa: S603
 		logger.info(f"📤 Uploaded model {model_name} to Beam from {local_dir}")
 		return True
-	except subprocess.CalledProcessError as e:
 		logger.warning(f"⚠️ Failed to upload model {model_name} to Beam: {e}")
 		return False
-	except Exception:
-		logger.exception(f"❌ Error uploading model {model_name} to Beam")
-		return False
 def download_checkpoints_from_beam(
@@ -992,7 +1043,7 @@ def download_checkpoints_from_beam(
 	local_checkpoints_dir: str = "code_model2vec/checkpoints",
 ) -> bool:
 	"""
-	Download checkpoint files from Beam volume to local directory.
 	Args:
 	    volume_name: Name of the Beam volume
@@ -1007,52 +1058,49 @@ def download_checkpoints_from_beam(
 		local_path = Path(local_checkpoints_dir)
 		local_path.mkdir(parents=True, exist_ok=True)
 		# Build the pattern for files to download
 		if stage:
 			local_stage_dir = local_path / stage
 			local_stage_dir.mkdir(parents=True, exist_ok=True)
 		else:
-			pass
-		# Use beam cp to download checkpoint files
-		remote_path = f"{volume_name}:{remote_checkpoints_dir}"
-		# First, try to list files
-		list_cmd = ["beam", "cp", "-r", "--list-only", remote_path]
-		try:
-			result = subprocess.run(list_cmd, capture_output=True, text=True, check=True)  # noqa: S603
-			remote_files = [
-				line.strip()
-				for line in result.stdout.split("\n")
-				if line.strip().endswith(".json") and "checkpoints_" in line.strip()
-			]
-		except subprocess.CalledProcessError:
-			logger.warning(f"Could not list checkpoint files in {remote_path}")
 			remote_files = []
-		# Filter by stage if specified
-		if stage:
-			remote_files = [f for f in remote_files if f"checkpoints_{stage}_" in f]
-		# Download each checkpoint file
 		downloaded_files = []
-		for file_name in remote_files:
-			remote_file_path = f"{volume_name}:{remote_checkpoints_dir}/{file_name}"
 			# Determine local subdirectory based on checkpoint stage
-			file_stage = file_name.split("_")[1] if "_" in file_name else "unknown"
 			local_stage_dir = local_path / file_stage
 			local_stage_dir.mkdir(parents=True, exist_ok=True)
-			local_file_path = local_stage_dir / file_name
 			try:
-				download_cmd = ["beam", "cp", remote_file_path, str(local_file_path)]
-				subprocess.run(download_cmd, check=True, capture_output=True)  # noqa: S603
-				downloaded_files.append(file_name)
-				logger.info(f"📥 Downloaded checkpoint: {file_name}")
-			except subprocess.CalledProcessError as e:
-				logger.warning(f"⚠️ Failed to download checkpoint {file_name}: {e}")
 		if downloaded_files:
 			logger.info(f"✅ Downloaded {len(downloaded_files)} checkpoint files")
@@ -1072,7 +1120,7 @@ def upload_checkpoints_to_beam(
 	remote_checkpoints_dir: str = "checkpoints",
 ) -> bool:
 	"""
-	Upload checkpoint files from local directory to Beam volume.
 	Args:
 	    volume_name: Name of the Beam volume
@@ -1089,6 +1137,10 @@ def upload_checkpoints_to_beam(
 			logger.warning(f"⚠️ Local checkpoints directory does not exist: {local_checkpoints_dir}")
 			return True  # Not an error - no checkpoints to upload
 		# Find checkpoint files to upload
 		if stage:
 			# Look in the stage subdirectory
@@ -1105,18 +1157,23 @@ def upload_checkpoints_to_beam(
 			logger.info(f"ℹ️ No checkpoint files found to upload for stage: {stage or 'all'}")
 			return True
-		# Upload each checkpoint file
 		uploaded_files = []
 		for checkpoint_file in checkpoint_files:
-			remote_file_path = f"{volume_name}:{remote_checkpoints_dir}/{checkpoint_file.name}"
 			try:
-				upload_cmd = ["beam", "cp", str(checkpoint_file), remote_file_path]
-				subprocess.run(upload_cmd, check=True, capture_output=True)  # noqa: S603
 				uploaded_files.append(checkpoint_file.name)
 				logger.info(f"📤 Uploaded checkpoint: {checkpoint_file.name}")
-			except subprocess.CalledProcessError as e:
 				logger.warning(f"⚠️ Failed to upload checkpoint {checkpoint_file.name}: {e}")
 		if uploaded_files:

 - Distributed storage optimization
 """
+# ruff: noqa: S603, S607, PLW1510
 import json
 import logging
 import shutil
 logger = logging.getLogger(__name__)
+def _is_running_on_beam() -> bool:
+	"""
+	Detect if we're running on Beam platform or locally.
+	On Beam, volumes are mounted as directories. Locally, we need to use beam CLI.
+	This is a heuristic: the environment variables listed below are checked first,
+	and only if none of them is set is the filesystem probed for the listed mount
+	directories.
+	NOTE(review): the variable names and mount paths are assumed to be what the
+	Beam runtime provides - confirm. Any machine that happens to have one of the
+	probed directories (e.g. "/volumes") is also reported as running on Beam.
+	Returns:
+	    True if any listed environment variable is set to a non-empty value or
+	    any listed mount path exists, False otherwise
+	"""
+	import os
+	# Check for Beam environment variables (an unset or empty value does not count)
+	beam_env_vars = [
+		"BEAM_TASK_ID",
+		"BEAM_FUNCTION_ID",
+		"BEAM_RUN_ID",
+		"BEAM_JOB_ID",
+		"BEAM_CONTAINER_ID",
+	]
+	for env_var in beam_env_vars:
+		if os.environ.get(env_var):
+			return True
+	# No environment variable matched: fall back to checking common Beam mount paths
+	beam_mount_paths = [
+		"/volumes",  # Common Beam volume mount
+		"/mnt/beam",
+		"/var/beam",
+		"/beam",
+	]
+	return any(Path(mount_path).exists() for mount_path in beam_mount_paths)
+def _check_beam_cli_available() -> bool:
+	"""
+	Check if beam CLI is available for local file operations.
+	Runs `beam --version` with captured output and a 10 second timeout.
+	Returns:
+	    True if the command exits with status 0; False if it exits with a
+	    non-zero status, the executable cannot be found, or the call times out
+	"""
+	try:
+		result = subprocess.run(["beam", "--version"], capture_output=True, text=True, timeout=10)
+		return result.returncode == 0
+	except (FileNotFoundError, subprocess.TimeoutExpired):  # other OSError subclasses (e.g. PermissionError) still propagate
+		return False
 class BeamVolumeManager:
 	"""Manager for Beam distributed storage volumes using direct file operations."""
 	local_results_dir: str = "code_model2vec/evaluation_results",
 ) -> bool:
 	"""
+	Download evaluation result files from Beam volume to local directory.
 	Args:
 	    volume_name: Name of the Beam volume
 		local_path = Path(local_results_dir)
 		local_path.mkdir(parents=True, exist_ok=True)
+		if _is_running_on_beam():
+			# Direct file operations when running on Beam
+			remote_path = Path(volume_name) / remote_results_dir
+			if not remote_path.exists():
+				logger.info("ℹ️ No evaluation results directory found on Beam")
+				return True
+			# Find and copy JSON result files
+			remote_files = list(remote_path.glob("*.json"))
+			downloaded_files = []
+			for result_file in remote_files:
+				local_file_path = local_path / result_file.name
 				try:
+					shutil.copy2(result_file, local_file_path)
+					downloaded_files.append(result_file.name)
+					logger.info(f"📥 Downloaded: {result_file.name}")
 					# Delete the file from Beam volume after successful download
+					result_file.unlink()
+					logger.info(f"🗑️ Deleted from volume: {result_file.name}")
+				except Exception as e:
+					logger.warning(f"⚠️ Failed to download {result_file.name}: {e}")
+			if downloaded_files:
+				logger.info(f"✅ Downloaded {len(downloaded_files)} evaluation result files")
+				return True
+			logger.info("ℹ️ No new evaluation files to download")
 			return True
+		# When running locally, we cannot access Beam volumes directly
+		# This would require a proper Beam storage API or CLI tool
+		logger.info("ℹ️ Evaluation results download from local environment not supported")
+		logger.info("ℹ️ Evaluation results are only accessible when running on Beam platform")
 		return True
 	except Exception:
 	file_prefix: str = "codesearchnet_eval",
 ) -> bool:
 	"""
+	Download a specific evaluation or benchmark result file from Beam volume using direct file operations.
 	Args:
 	    volume_name: Name of the Beam volume
 		safe_model_name = model_name.replace("/", "_")
 		filename = f"{file_prefix}_{safe_model_name}.json"
+		# When running on Beam, the volume is mounted as a directory
+		remote_file_path = Path(volume_name) / remote_results_dir / filename
 		local_file_path = local_path / filename
+		if not remote_file_path.exists():
+			logger.warning(f"⚠️ No {file_prefix} results found for {model_name} on Beam")
+			return False
+		# Copy the specific file
+		import shutil
+		shutil.copy2(remote_file_path, local_file_path)
 		logger.info(f"📥 Downloaded {file_prefix} results for {model_name}")
 		# Delete the file from Beam volume after successful download
+		remote_file_path.unlink()
+		logger.info(f"🗑️ Deleted {file_prefix} results for {model_name} from volume")
 		return True
 	except Exception:
 		logger.exception(f"❌ Error downloading {file_prefix} results for {model_name}")
 		return False
 	local_dir: str,
 ) -> bool:
 	"""
+	Download a model from Beam volume to local directory using direct file operations.
 	Args:
 	    volume_name: Name of the Beam volume
 		local_path = Path(local_dir)
 		local_path.mkdir(parents=True, exist_ok=True)
+		# When running on Beam, the volume is mounted as a directory
+		remote_model_path = Path(volume_name) / "models" / model_name
 		local_model_path = local_path / model_name
+		if not remote_model_path.exists():
+			logger.warning(f"⚠️ Model {model_name} not found in Beam volume at {remote_model_path}")
+			return False
+		# Copy the model directory
+		import shutil
+		if local_model_path.exists():
+			shutil.rmtree(local_model_path)
+		shutil.copytree(remote_model_path, local_model_path)
 		logger.info(f"📥 Downloaded model {model_name} from Beam to {local_dir}")
 		return True
+	except Exception as e:
 		logger.warning(f"⚠️ Failed to download model {model_name} from Beam: {e}")
 		return False
 def upload_model_to_beam(
 	local_dir: str,
 ) -> bool:
 	"""
+	Upload a model from local directory to Beam volume using direct file operations.
 	Args:
 	    volume_name: Name of the Beam volume
 			logger.error(f"❌ Local model directory does not exist: {local_dir}")
 			return False
+		# When running on Beam, the volume is mounted as a directory
+		remote_models_dir = Path(volume_name) / "models"
+		remote_models_dir.mkdir(parents=True, exist_ok=True)
+		remote_model_path = remote_models_dir / model_name
+		# Copy the model directory
+		import shutil
+		if remote_model_path.exists():
+			shutil.rmtree(remote_model_path)
+		shutil.copytree(local_path, remote_model_path)
 		logger.info(f"📤 Uploaded model {model_name} to Beam from {local_dir}")
 		return True
+	except Exception as e:
 		logger.warning(f"⚠️ Failed to upload model {model_name} to Beam: {e}")
 		return False
 def download_checkpoints_from_beam(
 	local_checkpoints_dir: str = "code_model2vec/checkpoints",
 ) -> bool:
 	"""
+	Download checkpoint files from Beam volume to local directory using direct file operations.
 	Args:
 	    volume_name: Name of the Beam volume
 		local_path = Path(local_checkpoints_dir)
 		local_path.mkdir(parents=True, exist_ok=True)
+		# When running on Beam, the volume is mounted as a directory
+		remote_base_path = Path(volume_name) / remote_checkpoints_dir
+		# If the remote path doesn't exist, there are no checkpoints to download
+		if not remote_base_path.exists():
+			logger.info(f"ℹ️ No checkpoint directory found at {remote_base_path}")
+			return True
 		# Build the pattern for files to download
 		if stage:
 			local_stage_dir = local_path / stage
 			local_stage_dir.mkdir(parents=True, exist_ok=True)
+			# Look for files in stage-specific directory
+			remote_stage_dir = remote_base_path / stage
+			if remote_stage_dir.exists():
+				remote_files = list(remote_stage_dir.glob(f"checkpoints_{stage}_*.json"))
+			else:
+				remote_files = []
 		else:
+			# Look for all checkpoint files in all stage subdirectories
 			remote_files = []
+			for stage_dir in remote_base_path.iterdir():
+				if stage_dir.is_dir():
+					remote_files.extend(stage_dir.glob("checkpoints_*.json"))
+		# Copy each checkpoint file
 		downloaded_files = []
+		for checkpoint_file in remote_files:
 			# Determine local subdirectory based on checkpoint stage
+			file_stage = checkpoint_file.name.split("_")[1] if "_" in checkpoint_file.name else "unknown"
 			local_stage_dir = local_path / file_stage
 			local_stage_dir.mkdir(parents=True, exist_ok=True)
+			local_file_path = local_stage_dir / checkpoint_file.name
 			try:
+				import shutil
+				shutil.copy2(checkpoint_file, local_file_path)
+				downloaded_files.append(checkpoint_file.name)
+				logger.info(f"📥 Downloaded checkpoint: {checkpoint_file.name}")
+			except Exception as e:
+				logger.warning(f"⚠️ Failed to download checkpoint {checkpoint_file.name}: {e}")
 		if downloaded_files:
 			logger.info(f"✅ Downloaded {len(downloaded_files)} checkpoint files")
 	remote_checkpoints_dir: str = "checkpoints",
 ) -> bool:
 	"""
+	Upload checkpoint files from local directory to Beam volume using direct file operations.
 	Args:
 	    volume_name: Name of the Beam volume
 			logger.warning(f"⚠️ Local checkpoints directory does not exist: {local_checkpoints_dir}")
 			return True  # Not an error - no checkpoints to upload
+		# When running on Beam, the volume is mounted as a directory
+		remote_base_path = Path(volume_name) / remote_checkpoints_dir
+		remote_base_path.mkdir(parents=True, exist_ok=True)
 		# Find checkpoint files to upload
 		if stage:
 			# Look in the stage subdirectory
 			logger.info(f"ℹ️ No checkpoint files found to upload for stage: {stage or 'all'}")
 			return True
+		# Copy each checkpoint file
 		uploaded_files = []
 		for checkpoint_file in checkpoint_files:
+			# Determine remote subdirectory based on checkpoint stage
+			file_stage = checkpoint_file.name.split("_")[1] if "_" in checkpoint_file.name else "unknown"
+			remote_stage_dir = remote_base_path / file_stage
+			remote_stage_dir.mkdir(parents=True, exist_ok=True)
+			remote_file_path = remote_stage_dir / checkpoint_file.name
 			try:
+				import shutil
+				shutil.copy2(checkpoint_file, remote_file_path)
 				uploaded_files.append(checkpoint_file.name)
 				logger.info(f"📤 Uploaded checkpoint: {checkpoint_file.name}")
+			except Exception as e:
 				logger.warning(f"⚠️ Failed to upload checkpoint {checkpoint_file.name}: {e}")
 		if uploaded_files: