File size: 18,714 Bytes
7f5c744
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
import os,sys,shutil,subprocess,json,re,ast,tempfile,nbformat,logging,argparse
import pandas as pd
from pathlib import Path
from colab_handler import ColabNotebookProcessor

# -------- Logging --------
# Send all log output to stdout (rather than the default stderr) so it
# interleaves cleanly with captured subprocess output in pipelines/CI.
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] %(levelname)s: %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger(__name__)

# -------- Executor Class --------
class NotebookExecutor:
    """Execute a Jupyter notebook inside a fresh, throwaway virtualenv.

    Pipeline (see run_notebook): rewrite Colab-specific code via
    ColabNotebookProcessor, create a venv under /tmp/Notebook, install
    papermill/ipykernel plus third-party packages inferred from the
    notebook's import statements, register a per-notebook kernel,
    execute with papermill, tear the venv down, and append a Pass/Fail
    row to an incrementally-rewritten CSV report.
    """

    def __init__(self, working_dir: str | None = None, verbose: bool = False, output_csv: str = "/tmp/Notebook/notebook_execution_report.csv"):
        """Configure paths and the Colab pre-processor; does no filesystem work.

        Args:
            working_dir: directory for notebooks, datasets, the venv and
                outputs (defaults to /tmp/Notebook).
            verbose: emit extra progress logging when True.
            output_csv: destination path for the CSV execution report.
        """
        self.working_dir = Path(working_dir) if working_dir else Path("/tmp/Notebook")
        self.verbose = verbose
        # One {'notebook', 'status', 'error_message'} dict per executed notebook.
        self.results: list[dict] = []
        self.output_csv = output_csv
        # Rewrites Colab-specific constructs before execution; the exact
        # transformations live in colab_handler (not visible here).
        self.colab_processor = ColabNotebookProcessor(str(self.working_dir))

    def setup_working_dir(self):
        """Create working directory"""
        # Idempotent: parents created as needed, no error if it already exists.
        self.working_dir.mkdir(parents=True, exist_ok=True)
        if self.verbose:
            logger.info(f"Working directory: {self.working_dir}")

    def clean_working_dir(self):
        """Remove working directory"""
        # Best-effort teardown: failures are logged as warnings, never raised.
        try:
            if self.working_dir.exists():
                shutil.rmtree(self.working_dir)
                if self.verbose:
                    logger.info(f"Cleaned working directory: {self.working_dir}")
        except Exception as e:
            logger.warning(f"Unable to fully clean working dir: {e}")

    def _read_notebook_json(self, notebook_path: Path):
        """Load a notebook file as raw JSON.

        Raises:
            RuntimeError: wrapping any read or parse failure.
        """
        try:
            with open(str(notebook_path), "r", encoding="utf-8") as f:
                return json.load(f)
        except Exception as e:
            raise RuntimeError(f"Failed to read notebook JSON: {e}")

    def list_available_datasets(self):
        """List available datasets in the working directory"""
        # Returns file names (not paths) whose extension looks like data.
        # NOTE(review): Path.iterdir raises if working_dir does not exist --
        # callers are expected to have run setup_working_dir first.
        dataset_extensions = {'.csv', '.xlsx', '.xls', '.json', '.txt', '.parquet'}
        datasets = [f.name for f in self.working_dir.iterdir() 
                   if f.suffix.lower() in dataset_extensions and f.is_file()]
        
        if self.verbose and datasets:
            logger.info(f"Available datasets: {', '.join(datasets)}")
        
        return datasets

    def extract_imports_from_notebook(self, notebook_path: Path):
        """Extract third-party imports from notebook via AST (best-effort).

        Parses every code cell, collects top-level module names from
        import / from-import statements, then subtracts a non-exhaustive
        stdlib list and Colab-only modules. Returns a set of candidate
        pip-installable names; an unreadable notebook yields an empty set.
        """
        imports = set()
        try:
            nb_json = self._read_notebook_json(notebook_path)
        except Exception:
            return set()

        for cell in nb_json.get("cells", []):
            if cell.get("cell_type") != "code":
                continue
            source = cell.get("source", "")
            # Notebook JSON stores cell source as either a string or a list of lines.
            if isinstance(source, list):
                source = "\n".join(source)
            try:
                tree = ast.parse(source)
                for node in ast.walk(tree):
                    if isinstance(node, ast.Import):
                        for alias in node.names:
                            # Keep only the top-level package ("pkg.sub" -> "pkg").
                            imports.add(alias.name.split(".")[0])
                    elif isinstance(node, ast.ImportFrom):
                        if node.module:
                            imports.add(node.module.split(".")[0])
            except Exception:
                # ignore parsing errors in individual cells
                continue

        # Filter standard-library modules (non-exhaustive)
        stdlib = {
            'os','sys','json','re','math','random','datetime','time','collections',
            'itertools','functools','operator','pathlib','urllib','http','xml','html',
            'csv','sqlite3','pickle','logging','unittest','argparse','configparser',
            'io','typing','warnings','copy','string','textwrap','unicodedata','struct',
            'codecs','calendar','hashlib','hmac','secrets','statistics', 'subprocess'
        }
        
        # Filter out Google Colab specific imports as they're handled separately
        colab_modules = {'google', 'colab'}
        
        third_party = imports - stdlib - colab_modules
        return third_party

    def install_packages(self, python_executable: Path, packages: set | list):
        """Install packages into the environment (best-effort). Returns (success, stderr_text)

        Installs one package at a time so a single failure is isolated and
        reported; stops at the first failure. Common import names are
        remapped to their pip distribution names before installing.
        """
        if not packages:
            return True, ""
        
        # map common names -> pip packages
        package_mapping = {
            'sklearn': 'scikit-learn',
            'cv2': 'opencv-python',
            'PIL': 'Pillow',
            'bs4': 'beautifulsoup4',
        }

        mapped = [package_mapping.get(p, p) for p in packages]

        # Install packages one-by-one so errors are isolated
        for pkg in mapped:
            try:
                proc = subprocess.run([str(python_executable), "-m", "pip", "install", pkg],
                                      capture_output=True, text=True, timeout=600)
                if proc.returncode != 0:
                    # Prefer stderr, fall back to stdout, then a generic message.
                    stderr = proc.stderr or proc.stdout or f"pip install returned {proc.returncode}"
                    logger.warning(f"Failed to install {pkg}: {stderr.strip()[:400]}")
                    return False, stderr
            except subprocess.TimeoutExpired:
                msg = f"Timeout while installing {pkg}"
                logger.warning(msg)
                return False, msg
            except Exception as e:
                msg = f"Error while installing {pkg}: {e}"
                logger.warning(msg)
                return False, msg

        return True, ""

    def extract_notebook_error(self, stderr_text: str):
        """Attempt to extract concise error message from papermill/pip stderr.

        Heuristic: scan lines from the end for one mentioning a Traceback /
        Error / Exception keyword; otherwise return the last non-empty line.
        """
        if not stderr_text:
            return "Unknown error occurred"
        lines = stderr_text.strip().splitlines()
        # Look for Traceback or Exception lines
        for line in reversed(lines):
            if any(keyword in line for keyword in ("Traceback", "Error", "Exception", "ModuleNotFoundError", "ImportError")):
                return line.strip()
        # fallback to last non-empty line
        for line in reversed(lines):
            if line.strip():
                return line.strip()
        # Reached only when every line is whitespace (or stderr was blank).
        return lines[-1] if lines else "Unknown error"

    def display_cell_execution_details(self, output_notebook_path: Path):
        """Verbose: show last executed cells (best-effort)."""
        # Reads the papermill output notebook; any read failure just logs.
        try:
            nb = nbformat.read(str(output_notebook_path), as_version=4)
        except Exception as e:
            logger.info(f"Could not read output notebook for cell details: {e}")
            return

        logger.info("CELL-BY-CELL EXECUTION DETAILS (showing up to 10 code cells)")
        shown = 0
        for i, cell in enumerate(nb.cells, start=1):
            if cell.cell_type != "code":
                continue
            shown += 1
            logger.info(f"--- CELL {i} ---")
            # Preview at most the first 6 source lines per cell.
            src_preview = ("\n".join(cell.source.splitlines()[:6]) + ("\n..." if len(cell.source.splitlines()) > 6 else ""))
            logger.info("SOURCE (first lines):\n" + src_preview)
            if getattr(cell, "outputs", None):
                for output in cell.outputs[-2:]:  # show last two outputs per cell
                    if output.output_type == "stream":
                        text = getattr(output, "text", "")
                        # Only the tail of stream output (last 4 lines).
                        logger.info("STREAM OUTPUT:\n" + ("\n".join(text.splitlines()[-4:])))
                    elif output.output_type == "error":
                        ename = getattr(output, "ename", "")
                        evalue = getattr(output, "evalue", "")
                        logger.info(f"ERROR: {ename}: {evalue}")
            if shown >= 10:
                break

    def run_notebook(self, notebook_path: str | Path, timeout: int = 1800):
        """
        Run a single notebook with Colab code replacement and dataset support.
        Returns a dict: {'notebook': <name>, 'status': 'Pass'|'Fail', 'error_message': <msg>}
        """
        # Resolve the input path: absolute string paths are used as-is,
        # relative strings are taken relative to /tmp/Notebook.
        # NOTE(review): /tmp/Notebook is hard-coded here and below even though
        # working_dir is configurable in __init__ -- confirm this is intended.
        try:
            if isinstance(notebook_path, str):
                if notebook_path.startswith('/'):
                    notebook_full_path = Path(notebook_path)
                else:
                    notebook_full_path = Path('/tmp/Notebook') / notebook_path
            else:
                notebook_full_path = Path(notebook_path)
        except Exception as e:
            return {"notebook": str(notebook_path), "status": "Fail", "error_message": f"Invalid path: {e}"}

        notebook_name = notebook_full_path.name
        notebook_dir = Path('/tmp/Notebook')

        # Check if notebook exists
        if not notebook_full_path.exists():
            return {"notebook": notebook_name, "status": "Fail", "error_message": f"Notebook not found at: {notebook_full_path}"}

        # List available datasets
        datasets = self.list_available_datasets()
        if datasets:
            logger.info(f"Processing notebook with {len(datasets)} available dataset(s)")

        # Process notebook for Colab compatibility
        # (falls back to the original notebook if preprocessing fails).
        try:
            processed_notebook_path = self.colab_processor.process_notebook(str(notebook_full_path))
            if self.verbose:
                logger.info(f"Processed notebook for Colab compatibility: {processed_notebook_path}")
        except Exception as e:
            logger.warning(f"Failed to process Colab compatibility: {e}")
            processed_notebook_path = str(notebook_full_path)

        # create fresh venv in the notebook folder
        env_path = notebook_dir / "venv"
        if env_path.exists():
            try:
                shutil.rmtree(env_path)
            except Exception:
                pass

        # create venv
        try:
            venv_proc = subprocess.run([sys.executable, "-m", "venv", str(env_path)], capture_output=True, text=True, timeout=120)
            if venv_proc.returncode != 0:
                stderr = venv_proc.stderr or venv_proc.stdout
                return {"notebook": notebook_name, "status": "Fail",
                        "error_message": f"Failed to create venv: {stderr.strip()[:400]}"}
        except subprocess.TimeoutExpired:
            return {"notebook": notebook_name, "status": "Fail",
                    "error_message": "Timeout while creating virtual environment"}
        except Exception as e:
            return {"notebook": notebook_name, "status": "Fail",
                    "error_message": f"Error creating venv: {e}"}

        # python executable inside venv
        # (Windows uses Scripts\python.exe, POSIX uses bin/python).
        if os.name == "nt":
            python_exec = env_path / "Scripts" / "python.exe"
        else:
            python_exec = env_path / "bin" / "python"

        if not python_exec.exists():
            return {"notebook": notebook_name, "status": "Fail",
                    "error_message": f"Python executable not found in venv: {python_exec}"}

        # Upgrade pip and install pinned minimal tooling
        try:
            # Upgrade pip
            up_proc = subprocess.run([str(python_exec), "-m", "pip", "install", "--upgrade", "pip"],
                                     capture_output=True, text=True, timeout=120)
            if up_proc.returncode != 0:
                # Non-fatal: a stale pip can often still install everything below.
                logger.warning("pip upgrade returned non-zero; continuing if possible")

            # Install pinned papermill / ipykernel / jupyter (stable versions)
            pinned = [
                "papermill==2.5.0",
                "ipykernel==6.29.5",
                "jupyter==1.0.0"
            ]
            install_proc = subprocess.run([str(python_exec), "-m", "pip", "install"] + pinned,
                                          capture_output=True, text=True, timeout=600)
            if install_proc.returncode != 0:
                stderr_text = install_proc.stderr or install_proc.stdout or "pip install returned non-zero"
                return {"notebook": notebook_name, "status": "Fail",
                        "error_message": f"Failed to setup environment (pip install core packages): {stderr_text.strip()[:800]}"}
        except subprocess.TimeoutExpired:
            return {"notebook": notebook_name, "status": "Fail",
                    "error_message": "Timeout installing core packages"}
        except Exception as e:
            return {"notebook": notebook_name, "status": "Fail",
                    "error_message": f"Error installing core packages: {e}"}

        # Install common data-science packages (helps many notebooks run without per-notebook pip)
        common_packages = ["numpy", "pandas", "matplotlib", "seaborn", "scikit-learn", "plotly"]
        try:
            common_proc = subprocess.run([str(python_exec), "-m", "pip", "install"] + common_packages,
                                         capture_output=True, text=True, timeout=600)
            if common_proc.returncode != 0:
                # Non-fatal: the inferred-imports pass below may still cover gaps.
                logger.warning("Installing common packages returned non-zero; continuing")
        except Exception:
            logger.warning("Unexpected error during common package install; continuing")

        # Extract inferred imports and try to install them (best-effort)
        # Use the original notebook for import detection, not the processed one
        inferred = self.extract_imports_from_notebook(notebook_full_path)
        if inferred:
            success, stderr_text = self.install_packages(python_exec, inferred)
            if not success:
                return {"notebook": notebook_name, "status": "Fail",
                        "error_message": f"Failed to install inferred packages: {stderr_text.strip()[:800]}"}

        # Create kernel name and install kernel
        # (sanitize the notebook name so it is a valid kernel identifier).
        # NOTE(review): the kernel is registered with --user but never
        # uninstalled afterwards, so kernelspecs accumulate across runs.
        kernel_name = f"nb_{re.sub(r'[^A-Za-z0-9_]', '_', notebook_name)}"
        try:
            kernel_proc = subprocess.run([str(python_exec), "-m", "ipykernel", "install", "--user",
                                          "--name", kernel_name, "--display-name", f"Python ({kernel_name})"],
                                         capture_output=True, text=True, timeout=60)
            if kernel_proc.returncode != 0:
                stderr_text = kernel_proc.stderr or kernel_proc.stdout or "ipykernel install returned non-zero"
                return {"notebook": notebook_name, "status": "Fail",
                        "error_message": f"Failed to install kernel: {stderr_text.strip()[:800]}"}
        except subprocess.TimeoutExpired:
            return {"notebook": notebook_name, "status": "Fail",
                    "error_message": "Timeout while installing kernel"}
        except Exception as e:
            return {"notebook": notebook_name, "status": "Fail",
                    "error_message": f"Error installing kernel: {e}"}

        # Execute notebook with papermill (use the processed notebook)
        output_path = notebook_dir / "output.ipynb"
        try:
            # cwd is the notebook dir so relative dataset paths inside the
            # notebook resolve against the working directory.
            exec_proc = subprocess.run([str(python_exec), "-m", "papermill",
                                        processed_notebook_path, str(output_path),
                                        "--kernel", kernel_name, "--no-progress-bar"],
                                       capture_output=True, text=True, timeout=timeout, cwd=str(notebook_dir))
            if exec_proc.returncode == 0:
                status = "Pass"
                error_message = ""
                if self.verbose:
                    logger.info("Notebook executed successfully")
                    if output_path.exists():
                        self.display_cell_execution_details(output_path)
            else:
                status = "Fail"
                # Combine both streams: papermill may report on either.
                stderr_text = (exec_proc.stderr or "") + "\n" + (exec_proc.stdout or "")
                concise = self.extract_notebook_error(stderr_text)
                error_message = f"Execution failed: {concise}"
                if self.verbose:
                    logger.error(error_message)
                    if output_path.exists():
                        self.display_cell_execution_details(output_path)
        except subprocess.TimeoutExpired:
            status = "Fail"
            error_message = f"Execution timed out after {timeout} seconds"
        except Exception as e:
            status = "Fail"
            error_message = f"Papermill execution error: {e}"

        # cleanup processed notebook
        # (only if the Colab processor actually produced a separate file).
        try:
            if processed_notebook_path != str(notebook_full_path) and os.path.exists(processed_notebook_path):
                os.remove(processed_notebook_path)
        except Exception:
            pass

        # cleanup venv if present (best-effort)
        try:
            if env_path.exists():
                shutil.rmtree(env_path)
        except Exception:
            logger.info("Could not remove venv directory (non-fatal)")

        result = {"notebook": notebook_name, "status": status, "error_message": error_message}
        # store result
        self.results.append(result)
        # update CSV incrementally
        self._update_csv_report()
        return result

    def _update_csv_report(self):
        """Write incremental CSV with columns notebook,status,error_message"""
        # Rewrites the whole file from self.results on every call; failures
        # are logged as warnings so reporting never breaks execution.
        try:
            df = pd.DataFrame(self.results)
            # Ensure consistent column ordering
            cols = ['notebook', 'status', 'error_message']
            for c in cols:
                if c not in df.columns:
                    df[c] = ""
            df = df[cols]
            df.to_csv(self.output_csv, index=False)
            if self.verbose:
                logger.info(f"Wrote report to {self.output_csv}")
        except Exception as e:
            logger.warning(f"Failed to write CSV report: {e}")

# -------- Public entrypoint --------
def execute_notebook(path: str, timeout: int = 1800, verbose: bool = False, output_csv: str = "notebook_execution_report.csv"):
    """
    Public function to execute a single notebook with Colab support and dataset integration.

    Args:
        path: path to the uploaded .ipynb file (string)
        timeout: execution timeout in seconds (default 1800)
        verbose: enable verbose logging
        output_csv: path to write CSV report (default notebook_execution_report.csv)

    Returns:
        result dict: {'notebook': <name>, 'status': 'Pass'|'Fail', 'error_message': <msg>}
    """
    # Build a fresh executor per call so each invocation gets its own report.
    runner = NotebookExecutor(verbose=verbose, output_csv=output_csv)
    runner.setup_working_dir()
    return runner.run_notebook(path, timeout=timeout)

# -------- CLI main (optional) --------
def main_call(notebook):
    """Main function for executing notebook with enhanced Colab and dataset support"""
    # Verbose mode is always on here to aid debugging from the CLI path.
    runner = NotebookExecutor(verbose=True)
    runner.setup_working_dir()
    return runner.run_notebook(notebook, timeout=1800)