# gLens / src / v2 / colab_handler.py
import re
import json
import os
from pathlib import Path
from typing import List, Dict, Any
class ColabNotebookProcessor:
    """Rewrites Jupyter notebooks so Google-Colab-specific code runs locally.

    Colab notebooks commonly mount Google Drive, call ``files.upload()``,
    and reference ``/content/...`` paths.  This processor comments out those
    constructs and redirects file references to dataset files found in a
    local directory.
    """

    def __init__(self, notebook_dir: str = "/tmp/Notebook"):
        """
        Args:
            notebook_dir: Directory scanned for local dataset files that
                will stand in for Colab uploads and Drive paths.
        """
        self.notebook_dir = Path(notebook_dir)
        self.dataset_files = self._get_available_datasets()
        self.dataset_mapping = self._create_dataset_mapping()

    def _get_available_datasets(self) -> List[str]:
        """Return names of dataset files present in ``notebook_dir``.

        Returns an empty list when the directory does not exist.
        """
        if not self.notebook_dir.exists():
            return []
        dataset_extensions = {'.csv', '.xlsx', '.xls', '.json', '.txt', '.parquet'}
        return [f.name for f in self.notebook_dir.iterdir()
                if f.suffix.lower() in dataset_extensions and f.is_file()]

    def _create_dataset_mapping(self) -> Dict[str, str]:
        """Map common dataset references (name, stem, lowercase, generic
        names like ``data.csv``) to an actual local filename.

        NOTE: when several CSV files exist, the generic keys
        (``data.csv``/``train.csv``/...) end up pointing at whichever file
        was iterated last — callers should treat these as best-effort.
        """
        mapping: Dict[str, str] = {}
        for filename in self.dataset_files:
            name_without_ext = Path(filename).stem
            # Direct mappings: exact name, stem, and lowercase variants.
            mapping[filename] = filename
            mapping[name_without_ext] = filename
            mapping[filename.lower()] = filename
            mapping[name_without_ext.lower()] = filename
            # Generic placeholder names frequently seen in tutorials.
            if filename.lower().endswith('.csv'):
                mapping['data.csv'] = filename
                mapping['dataset.csv'] = filename
                mapping['train.csv'] = filename
                mapping['test.csv'] = filename
        return mapping

    def process_notebook(self, notebook_path: str) -> str:
        """Rewrite all code cells of a notebook and save a modified copy.

        Args:
            notebook_path: Path to the ``.ipynb`` file to process.

        Returns:
            Path to the modified notebook (``modified_<name>`` alongside
            the original).

        Raises:
            OSError / json.JSONDecodeError: if the file cannot be read or
            is not valid JSON.
        """
        with open(notebook_path, 'r', encoding='utf-8') as f:
            notebook = json.load(f)
        # Only code cells can contain Colab-specific constructs.
        for cell in notebook.get('cells', []):
            if cell.get('cell_type') == 'code':
                cell['source'] = self._process_code_cell(cell.get('source', []))
        modified_path = str(Path(notebook_path).parent / f"modified_{Path(notebook_path).name}")
        with open(modified_path, 'w', encoding='utf-8') as f:
            json.dump(notebook, f, indent=2)
        return modified_path

    def _process_code_cell(self, source_lines: List[str]) -> List[str]:
        """Rewrite one code cell's source lines.

        Accepts either a list of lines or a single string (notebook JSON
        allows both representations).
        """
        if isinstance(source_lines, str):
            source_lines = source_lines.splitlines(True)
        processed_lines: List[str] = []
        for line in source_lines:
            if self._is_colab_drive_mount(line):
                # Local files need no mount; drop the call entirely.
                processed_lines.append("# Google Drive mount replaced with local file access\n")
            elif self._is_colab_files_upload(line):
                processed_lines.append(self._replace_file_upload(line))
            else:
                processed_lines.append(self._process_line(line))
        return processed_lines

    def _process_line(self, line: str) -> str:
        """Apply all single-line Colab replacements to ``line``."""
        # Comment out Colab-specific imports (unless already commented).
        if self._is_colab_import(line):
            return f"# {line}" if not line.strip().startswith('#') else line
        # Order matters: normalize Drive paths before inspecting file ops.
        line = self._replace_drive_paths(line)
        line = self._replace_file_operations(line)
        line = self._replace_uploaded_files(line)
        return line

    def _is_colab_import(self, line: str) -> bool:
        """True if the line contains a Colab-specific import statement."""
        colab_imports = (
            'from google.colab import drive',
            'from google.colab import files',
            'from google.colab import auth',
            'import google.colab',
        )
        line_stripped = line.strip()
        return any(imp in line_stripped for imp in colab_imports)

    def _is_colab_drive_mount(self, line: str) -> bool:
        """True if the line calls ``drive.mount(...)``."""
        return 'drive.mount(' in line or 'drive.mount (' in line

    def _is_colab_files_upload(self, line: str) -> bool:
        """True if the line calls ``files.upload(...)``."""
        return 'files.upload(' in line or 'files.upload (' in line

    def _replace_drive_paths(self, line: str) -> str:
        """Replace Google Drive / ``/content`` path prefixes with ``./``.

        The quoted-path patterns at the end rarely fire because the plain
        prefix substitutions above them run first; they remain as a
        fallback for unusual spellings.
        """
        drive_patterns = [
            (r'/content/drive/My Drive/', './'),
            (r'/content/drive/MyDrive/', './'),
            (r'/content/drive/', './'),
            (r'/content/', './'),
            (r'"/content/drive/[^"]*"', lambda m: self._find_dataset_match(m.group())),
            (r"'/content/drive/[^']*'", lambda m: self._find_dataset_match(m.group())),
        ]
        # re.sub accepts both string and callable replacements, so a single
        # call covers every pattern.
        for pattern, replacement in drive_patterns:
            line = re.sub(pattern, replacement, line)
        return line

    def _replace_file_operations(self, line: str) -> str:
        """Redirect pandas read calls to a matching local dataset."""
        if 'pd.read_csv(' in line:
            line = self._replace_pandas_read(line, 'csv')
        elif 'pd.read_excel(' in line:
            line = self._replace_pandas_read(line, 'excel')
        return line

    def _replace_pandas_read(self, line: str, file_type: str) -> str:
        """Swap the first quoted string in a pandas read call for the best
        matching local dataset file.

        NOTE: assumes the first quoted string is the file path — a quoted
        keyword argument appearing first would be replaced instead.
        """
        matches = re.findall(r'["\']([^"\']+)["\']', line)
        if matches:
            original_path = matches[0]
            local_file = self._find_best_dataset_match(original_path, file_type)
            if local_file:
                line = line.replace(original_path, local_file)
        return line

    def _replace_uploaded_files(self, line: str) -> str:
        """Replace ``uploaded[...]`` references with a comment pointing at
        the first available local dataset; pass other lines through."""
        if 'uploaded[' in line and self.dataset_files:
            # Capture the original text BEFORE building the replacement,
            # so the "# Original:" comment shows the user's code.
            original = line.strip()
            replacement = f"# Uploaded file replaced with local dataset: {self.dataset_files[0]}\n"
            replacement += f"# Original: {original}\n"
            replacement += f"# Use: '{self.dataset_files[0]}' instead\n"
            return replacement
        return line

    def _replace_file_upload(self, line: str) -> str:
        """Replace a ``files.upload()`` call with a comment listing the
        locally available datasets."""
        comment = "# File upload replaced with local datasets\n"
        if self.dataset_files:
            comment += f"# Available datasets: {', '.join(self.dataset_files)}\n"
        else:
            comment += "# No datasets found in directory\n"
        return comment

    def _find_dataset_match(self, quoted_path: str) -> str:
        """Return a double-quoted local dataset name matching ``quoted_path``.

        Tries exact filename, the precomputed mapping, then substring
        matches; falls back to the first dataset, or the original quoted
        path when no datasets exist.
        """
        path = quoted_path.strip('\'"')
        filename = os.path.basename(path)
        # Exact filename match.
        if filename in self.dataset_files:
            return f'"{filename}"'
        # Mapped alias (stem, lowercase, generic names).
        if filename in self.dataset_mapping:
            return f'"{self.dataset_mapping[filename]}"'
        # Loose substring match in either direction.
        for dataset in self.dataset_files:
            if filename.lower() in dataset.lower() or dataset.lower() in filename.lower():
                return f'"{dataset}"'
        # Last resort: any available dataset.
        if self.dataset_files:
            return f'"{self.dataset_files[0]}"'
        return quoted_path  # Nothing local to offer; keep the original.

    def _find_best_dataset_match(self, original_path: str, file_type: str) -> str:
        """Return the local dataset name best matching ``original_path``,
        preferring files of ``file_type`` ('csv' or 'excel')."""
        filename = os.path.basename(original_path)
        # Restrict candidates by requested type when possible.
        if file_type == 'csv':
            type_filtered = [f for f in self.dataset_files if f.lower().endswith('.csv')]
        elif file_type == 'excel':
            type_filtered = [f for f in self.dataset_files if f.lower().endswith(('.xlsx', '.xls'))]
        else:
            type_filtered = self.dataset_files
        # Exact filename match.
        if filename in type_filtered:
            return filename
        # Same stem, different extension.
        name_without_ext = os.path.splitext(filename)[0]
        for dataset in type_filtered:
            if os.path.splitext(dataset)[0] == name_without_ext:
                return dataset
        # Any file of the right type.
        if type_filtered:
            return type_filtered[0]
        # Any dataset at all.
        if self.dataset_files:
            return self.dataset_files[0]
        return filename  # No datasets available; leave the reference alone.