File size: 9,925 Bytes
7f5c744 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 |
import re
import json
import os
from pathlib import Path
from typing import List, Dict, Any
class ColabNotebookProcessor:
    """Processes Jupyter notebooks to replace Google Colab specific code with local equivalents.

    Scans a directory for dataset files, then rewrites notebook code cells so
    that Colab-only constructs (drive mounts, ``files.upload()`` calls,
    ``/content/...`` paths, ``uploaded[...]`` references) point at the locally
    available datasets instead.
    """

    def __init__(self, notebook_dir: str = "/tmp/Notebook"):
        """Initialize the processor.

        Args:
            notebook_dir: Directory scanned for local dataset files.
        """
        self.notebook_dir = Path(notebook_dir)
        self.dataset_files = self._get_available_datasets()
        self.dataset_mapping = self._create_dataset_mapping()

    def _get_available_datasets(self) -> List[str]:
        """Return names of dataset files found in the notebook directory.

        Returns an empty list when the directory does not exist. Only regular
        files with a recognized data extension are included.
        """
        if not self.notebook_dir.exists():
            return []
        dataset_extensions = {'.csv', '.xlsx', '.xls', '.json', '.txt', '.parquet'}
        return [f.name for f in self.notebook_dir.iterdir()
                if f.suffix.lower() in dataset_extensions and f.is_file()]

    def _create_dataset_mapping(self) -> Dict[str, str]:
        """Create a mapping from common dataset references to actual filenames.

        Each dataset is reachable by its exact name, its stem, and the
        lower-cased variants of both. Generic CSV names that notebooks often
        hard-code (``data.csv``, ``train.csv``, ...) map to a CSV dataset.
        Note: when several CSVs exist, the last one iterated wins for the
        generic names (same behavior as before).
        """
        mapping: Dict[str, str] = {}
        for filename in self.dataset_files:
            name_without_ext = Path(filename).stem
            # Direct mappings: exact, stem, and case-insensitive variants.
            mapping[filename] = filename
            mapping[name_without_ext] = filename
            mapping[filename.lower()] = filename
            mapping[name_without_ext.lower()] = filename
            # Common hard-coded CSV names.
            if filename.lower().endswith('.csv'):
                mapping['data.csv'] = filename
                mapping['dataset.csv'] = filename
                mapping['train.csv'] = filename
                mapping['test.csv'] = filename
        return mapping

    def process_notebook(self, notebook_path: str) -> str:
        """Process a notebook and return the path to the modified copy.

        The original notebook is left untouched; the rewritten version is
        saved alongside it with a ``modified_`` prefix.

        Args:
            notebook_path: Path to the ``.ipynb`` file to process.

        Returns:
            Path of the modified notebook file.
        """
        with open(notebook_path, 'r', encoding='utf-8') as f:
            notebook = json.load(f)
        # Rewrite only code cells; markdown/raw cells are left as-is.
        for cell in notebook.get('cells', []):
            if cell.get('cell_type') == 'code':
                cell['source'] = self._process_code_cell(cell.get('source', []))
        modified_path = str(Path(notebook_path).parent / f"modified_{Path(notebook_path).name}")
        with open(modified_path, 'w', encoding='utf-8') as f:
            json.dump(notebook, f, indent=2)
        return modified_path

    def _process_code_cell(self, source_lines: List[str]) -> List[str]:
        """Process an individual code cell, replacing Colab-specific code.

        Accepts either a list of lines or a single string (both occur in
        notebook JSON); always returns a list of lines.
        """
        if isinstance(source_lines, str):
            source_lines = source_lines.splitlines(True)
        processed_lines = []
        for line in source_lines:
            if self._is_colab_drive_mount(line):
                # Drop the mount call entirely; local files need no mounting.
                processed_lines.append("# Google Drive mount replaced with local file access\n")
            elif self._is_colab_files_upload(line):
                # Replace the interactive upload with a dataset listing comment.
                processed_lines.append(self._replace_file_upload(line))
            else:
                processed_lines.append(self._process_line(line))
        return processed_lines

    def _process_line(self, line: str) -> str:
        """Process an individual line, applying all Colab replacements in order."""
        # Comment out Colab-specific imports (unless already a comment).
        if self._is_colab_import(line):
            return f"# {line}" if not line.strip().startswith('#') else line
        line = self._replace_drive_paths(line)
        line = self._replace_file_operations(line)
        line = self._replace_uploaded_files(line)
        return line

    def _is_colab_import(self, line: str) -> bool:
        """Return True if the line contains a Colab-specific import."""
        colab_imports = [
            'from google.colab import drive',
            'from google.colab import files',
            'from google.colab import auth',
            'import google.colab'
        ]
        line_stripped = line.strip()
        return any(imp in line_stripped for imp in colab_imports)

    def _is_colab_drive_mount(self, line: str) -> bool:
        """Return True if the line is a ``drive.mount(...)`` call."""
        return 'drive.mount(' in line or 'drive.mount (' in line

    def _is_colab_files_upload(self, line: str) -> bool:
        """Return True if the line is a ``files.upload(...)`` call."""
        return 'files.upload(' in line or 'files.upload (' in line

    def _replace_drive_paths(self, line: str) -> str:
        """Replace Google Drive paths with local paths.

        Quoted full-drive-path patterns are tried FIRST so that a complete
        drive path can be mapped to a matching local dataset; the plain
        prefix replacements then rewrite any remaining references. (If the
        prefixes ran first, the quoted patterns could never match.)
        """
        drive_patterns = [
            # Full quoted drive paths: map to a local dataset when possible.
            (r'"/content/drive/[^"]*"', lambda m: self._find_dataset_match(m.group())),
            (r"'/content/drive/[^']*'", lambda m: self._find_dataset_match(m.group())),
            # Bare prefixes: rewrite to the local working directory.
            (r'/content/drive/My Drive/', './'),
            (r'/content/drive/MyDrive/', './'),
            (r'/content/drive/', './'),
            (r'/content/', './'),
        ]
        # re.sub accepts either a string or a callable replacement.
        for pattern, replacement in drive_patterns:
            line = re.sub(pattern, replacement, line)
        return line

    def _replace_file_operations(self, line: str) -> str:
        """Replace pandas file-reading calls with local equivalents."""
        if 'pd.read_csv(' in line:
            line = self._replace_pandas_read(line, 'csv')
        elif 'pd.read_excel(' in line:
            line = self._replace_pandas_read(line, 'excel')
        return line

    def _replace_pandas_read(self, line: str, file_type: str) -> str:
        """Point a pandas read call at a matching local dataset file.

        Takes the first quoted string on the line as the path argument;
        leaves the line unchanged when no quoted string is present.
        """
        pattern = r'["\']([^"\']+)["\']'
        matches = re.findall(pattern, line)
        if matches:
            original_path = matches[0]
            local_file = self._find_best_dataset_match(original_path, file_type)
            if local_file:
                line = line.replace(original_path, local_file)
        return line

    def _replace_uploaded_files(self, line: str) -> str:
        """Replace ``uploaded[...]`` references with a local-dataset comment block."""
        if 'uploaded[' in line and self.dataset_files:
            # Capture the original text BEFORE overwriting `line`, so the
            # "Original:" comment shows the real source line.
            original = line.strip()
            line = f"# Uploaded file replaced with local dataset: {self.dataset_files[0]}\n"
            line += f"# Original: {original}\n"
            line += f"# Use: '{self.dataset_files[0]}' instead\n"
        return line

    def _replace_file_upload(self, line: str) -> str:
        """Return a comment block describing the available local datasets."""
        comment = "# File upload replaced with local datasets\n"
        if self.dataset_files:
            comment += f"# Available datasets: {', '.join(self.dataset_files)}\n"
        else:
            comment += "# No datasets found in directory\n"
        return comment

    def _find_dataset_match(self, quoted_path: str) -> str:
        """Find the best matching dataset for a quoted path.

        Returns a double-quoted filename string, or the original quoted path
        when no dataset is available at all.
        """
        path = quoted_path.strip('\'"')
        filename = os.path.basename(path)
        # Exact filename match.
        if filename in self.dataset_files:
            return f'"{filename}"'
        # Known alias (stem / case-insensitive / generic name).
        if filename in self.dataset_mapping:
            return f'"{self.dataset_mapping[filename]}"'
        # Partial (substring) match in either direction.
        for dataset in self.dataset_files:
            if filename.lower() in dataset.lower() or dataset.lower() in filename.lower():
                return f'"{dataset}"'
        # Fall back to the first available dataset.
        if self.dataset_files:
            return f'"{self.dataset_files[0]}"'
        return quoted_path  # No datasets at all: leave the path untouched.

    def _find_best_dataset_match(self, original_path: str, file_type: str) -> str:
        """Find the best matching dataset file for a pandas read call.

        Args:
            original_path: The path referenced by the notebook.
            file_type: ``'csv'`` or ``'excel'`` to filter candidates by
                extension; anything else considers all datasets.

        Returns:
            A local dataset filename, or the original basename when no
            datasets are available.
        """
        filename = os.path.basename(original_path)
        # Restrict candidates to the requested file type where possible.
        if file_type == 'csv':
            type_filtered = [f for f in self.dataset_files if f.lower().endswith('.csv')]
        elif file_type == 'excel':
            type_filtered = [f for f in self.dataset_files if f.lower().endswith(('.xlsx', '.xls'))]
        else:
            type_filtered = self.dataset_files
        # Exact filename match.
        if filename in type_filtered:
            return filename
        # Match on the stem (name without extension).
        name_without_ext = os.path.splitext(filename)[0]
        for dataset in type_filtered:
            if os.path.splitext(dataset)[0] == name_without_ext:
                return dataset
        # First file of the right type, then any file at all.
        if type_filtered:
            return type_filtered[0]
        if self.dataset_files:
            return self.dataset_files[0]
        return filename  # No datasets available: keep the original name.