import re
import json
import os
from pathlib import Path
from typing import List, Dict, Any


class ColabNotebookProcessor:
    """Processes Jupyter notebooks to replace Google Colab specific code
    with local equivalents.

    Drive mounts and ``files.upload()`` calls are commented out, and
    ``/content/drive/...`` paths are rewritten to point at dataset files
    found in ``notebook_dir``.
    """

    def __init__(self, notebook_dir: str = "/tmp/Notebook") -> None:
        """Scan *notebook_dir* for dataset files and build a lookup table.

        Args:
            notebook_dir: Directory expected to hold the local dataset files.
        """
        self.notebook_dir = Path(notebook_dir)
        self.dataset_files = self._get_available_datasets()
        self.dataset_mapping = self._create_dataset_mapping()

    def _get_available_datasets(self) -> List[str]:
        """Return names of dataset files present in the notebook directory."""
        if not self.notebook_dir.exists():
            return []
        dataset_extensions = {'.csv', '.xlsx', '.xls', '.json', '.txt', '.parquet'}
        return [f.name for f in self.notebook_dir.iterdir()
                if f.suffix.lower() in dataset_extensions and f.is_file()]

    def _create_dataset_mapping(self) -> Dict[str, str]:
        """Create a mapping from common dataset references to local filenames.

        Each file is reachable by its exact name, its stem, and the
        lowercase variants of both.  Generic placeholder names that
        notebooks commonly hard-code (``data.csv`` etc.) map to a CSV file;
        when several CSVs exist the last one scanned wins.
        """
        mapping: Dict[str, str] = {}
        for filename in self.dataset_files:
            name_without_ext = Path(filename).stem
            # Direct mappings (exact and case-insensitive).
            mapping[filename] = filename
            mapping[name_without_ext] = filename
            mapping[filename.lower()] = filename
            mapping[name_without_ext.lower()] = filename
            # Common hard-coded placeholder names.
            if filename.lower().endswith('.csv'):
                mapping['data.csv'] = filename
                mapping['dataset.csv'] = filename
                mapping['train.csv'] = filename
                mapping['test.csv'] = filename
        return mapping

    def process_notebook(self, notebook_path: str) -> str:
        """Process a notebook file and return the path to the modified copy.

        Only code cells are rewritten; the result is written next to the
        input as ``modified_<original name>``.
        """
        with open(notebook_path, 'r', encoding='utf-8') as f:
            notebook = json.load(f)

        for cell in notebook.get('cells', []):
            if cell.get('cell_type') == 'code':
                cell['source'] = self._process_code_cell(cell.get('source', []))

        modified_path = str(Path(notebook_path).parent / f"modified_{Path(notebook_path).name}")
        with open(modified_path, 'w', encoding='utf-8') as f:
            json.dump(notebook, f, indent=2)
        return modified_path

    def _process_code_cell(self, source_lines: List[str]) -> List[str]:
        """Process an individual code cell to replace Colab-specific code.

        Accepts either the notebook's list-of-lines form or a plain string
        (normalized with ``splitlines(True)`` to keep line endings).
        """
        if isinstance(source_lines, str):
            source_lines = source_lines.splitlines(True)

        processed_lines: List[str] = []
        for line in source_lines:
            # Whole-line Colab patterns are replaced outright, so test them
            # before running the token-level rewrites in _process_line().
            if self._is_colab_drive_mount(line):
                processed_lines.append(
                    "# Google Drive mount replaced with local file access\n")
            elif self._is_colab_files_upload(line):
                processed_lines.append(self._replace_file_upload(line))
            else:
                processed_lines.append(self._process_line(line))
        return processed_lines

    def _process_line(self, line: str) -> str:
        """Apply per-line Colab replacements and return the rewritten line."""
        # Comment out Colab-specific imports (unless already commented).
        if self._is_colab_import(line):
            return f"# {line}" if not line.strip().startswith('#') else line
        line = self._replace_drive_paths(line)
        line = self._replace_file_operations(line)
        line = self._replace_uploaded_files(line)
        return line

    def _is_colab_import(self, line: str) -> bool:
        """Check if a line contains a Colab-specific import."""
        colab_imports = [
            'from google.colab import drive',
            'from google.colab import files',
            'from google.colab import auth',
            'import google.colab',
        ]
        line_stripped = line.strip()
        return any(imp in line_stripped for imp in colab_imports)

    def _is_colab_drive_mount(self, line: str) -> bool:
        """Check if a line is a drive mount operation."""
        return 'drive.mount(' in line or 'drive.mount (' in line

    def _is_colab_files_upload(self, line: str) -> bool:
        """Check if a line is a files upload operation."""
        return 'files.upload(' in line or 'files.upload (' in line

    def _replace_drive_paths(self, line: str) -> str:
        """Replace Google Drive paths with local paths.

        The quoted full-path patterns must run first: once the bare prefix
        substitutions fire, ``/content/drive/...`` no longer appears in the
        line and the quoted patterns could never match.
        """
        drive_patterns = [
            # Quoted full paths -> best-matching local dataset.
            (r'"/content/drive/[^"]*"', lambda m: self._find_dataset_match(m.group())),
            (r"'/content/drive/[^']*'", lambda m: self._find_dataset_match(m.group())),
            # Bare prefixes -> current directory.
            (r'/content/drive/My Drive/', './'),
            (r'/content/drive/MyDrive/', './'),
            (r'/content/drive/', './'),
            (r'/content/', './'),
        ]
        # re.sub accepts both string and callable replacements directly.
        for pattern, replacement in drive_patterns:
            line = re.sub(pattern, replacement, line)
        return line

    def _replace_file_operations(self, line: str) -> str:
        """Replace pandas file-read operations with local equivalents."""
        if 'pd.read_csv(' in line:
            line = self._replace_pandas_read(line, 'csv')
        elif 'pd.read_excel(' in line:
            line = self._replace_pandas_read(line, 'excel')
        return line

    def _replace_pandas_read(self, line: str, file_type: str) -> str:
        """Point a pandas read call at the best-matching local dataset.

        The first quoted string on the line is assumed to be the file path.
        """
        matches = re.findall(r'["\']([^"\']+)["\']', line)
        if matches:
            original_path = matches[0]
            local_file = self._find_best_dataset_match(original_path, file_type)
            if local_file:
                line = line.replace(original_path, local_file)
        return line

    def _replace_uploaded_files(self, line: str) -> str:
        """Replace references to ``uploaded[...]`` with a local-dataset hint."""
        if 'uploaded[' in line and self.dataset_files:
            # Capture the user's code BEFORE overwriting `line`, so the
            # "# Original:" comment quotes their statement, not ours.
            original = line.strip()
            line = (
                f"# Uploaded file replaced with local dataset: {self.dataset_files[0]}\n"
                f"# Original: {original}\n"
                f"# Use: '{self.dataset_files[0]}' instead\n"
            )
        return line

    def _replace_file_upload(self, line: str) -> str:
        """Replace a files.upload() call with a comment listing datasets."""
        comment = "# File upload replaced with local datasets\n"
        if self.dataset_files:
            comment += f"# Available datasets: {', '.join(self.dataset_files)}\n"
        else:
            comment += "# No datasets found in directory\n"
        return comment

    def _find_dataset_match(self, quoted_path: str) -> str:
        """Find the best matching dataset for a quoted path.

        Returns a double-quoted local filename, or the original quoted
        path unchanged when nothing matches.
        """
        path = quoted_path.strip('\'"')
        filename = os.path.basename(path)

        # Direct match: return the matched filename itself.
        if filename in self.dataset_files:
            return f'"{filename}"'

        # Known alias (stem, lowercase, generic placeholder).
        if filename in self.dataset_mapping:
            return f'"{self.dataset_mapping[filename]}"'

        # Partial (substring) matches either way.
        for dataset in self.dataset_files:
            if filename.lower() in dataset.lower() or dataset.lower() in filename.lower():
                return f'"{dataset}"'

        # Fall back to the first available dataset, if any.
        if self.dataset_files:
            return f'"{self.dataset_files[0]}"'

        return quoted_path  # No match found; keep the original reference.

    def _find_best_dataset_match(self, original_path: str, file_type: str) -> str:
        """Find the best matching dataset file for a pandas read path."""
        filename = os.path.basename(original_path)

        # Narrow the candidates by requested file type.
        if file_type == 'csv':
            type_filtered = [f for f in self.dataset_files if f.lower().endswith('.csv')]
        elif file_type == 'excel':
            type_filtered = [f for f in self.dataset_files if f.lower().endswith(('.xlsx', '.xls'))]
        else:
            type_filtered = self.dataset_files

        # Exact filename match.
        if filename in type_filtered:
            return filename

        # Match on the name without extension.
        name_without_ext = os.path.splitext(filename)[0]
        for dataset in type_filtered:
            if os.path.splitext(dataset)[0] == name_without_ext:
                return dataset

        # First file of the right type, then any dataset at all.
        if type_filtered:
            return type_filtered[0]
        if self.dataset_files:
            return self.dataset_files[0]

        return filename  # No datasets available; keep the original name.