| import multiprocessing |
| import os |
| import re |
| import sys |
| from pathlib import Path |
| from typing import Iterable, Tuple |
|
|
| import nbformat |
| from nbconvert.exporters import MarkdownExporter |
| from nbconvert.preprocessors import Preprocessor |
|
|
# Marker pair delimiting content that is hidden inside the notebook but should
# appear in the generated Markdown. The opening marker starts an HTML comment
# and the closing marker ends it; UnHidePreprocessor strips both strings from
# every cell so whatever was wrapped between them is emitted as regular text.
HIDE_IN_NB_MAGIC_OPEN = "<!-- HIDE_IN_NB"
HIDE_IN_NB_MAGIC_CLOSE = "HIDE_IN_NB -->"
|
|
|
|
class EscapePreprocessor(Preprocessor):
    """Rewrite links and escape code fences so cells survive Markdown export.

    Markdown cells: relative links that end in ``.ipynb`` are pointed at the
    corresponding ``.md`` file.

    Code cells: triple backticks in the source and in the outputs are
    backslash-escaped, a leading ``# title=...`` comment line is moved into
    ``cell.metadata["title"]``, and text outputs that are blank are dropped.
    """

    def preprocess_cell(self, cell, resources, index):
        """Transform a single cell in place.

        Parameters
        ----------
        cell : NotebookNode cell
            Notebook cell being processed.
        resources : dictionary
            Conversion resources; passed through unchanged.
        index : int
            Index of the cell being processed (unused).

        Returns
        -------
        tuple
            The ``(cell, resources)`` pair.
        """
        if cell.cell_type == "markdown":
            # Rewrite [text](target.ipynb) -> [text](target.md). The negative
            # lookahead skips any target containing "//" (absolute URLs).
            cell.source = re.sub(
                r"\[([^\]]*)\]\((?![^\)]*//)([^)]*)\.ipynb\)",
                r"[\1](\2.md)",
                cell.source,
            )

        elif cell.cell_type == "code":
            # Escape code fences inside the source so they cannot terminate
            # the fenced block the cell is rendered into.
            cell.source = cell.source.replace("```", r"\`\`\`")

            # Allow overriding the title via a comment on the first line.
            title_prefix = "# title="
            if cell.source.startswith(title_prefix):
                first_line, _, remainder = cell.source.partition("\n")
                # Slice the prefix off instead of splitting on it, so a title
                # that itself contains "# title=" is kept whole.
                title = first_line[len(title_prefix):]
                if title.startswith('"') and title.endswith('"'):
                    title = title[1:-1]
                cell.metadata["title"] = title
                cell.source = remainder

            if "outputs" in cell:
                filter_out = set()
                for i, output in enumerate(cell["outputs"]):
                    if "text" in output:
                        if not output["text"].strip():
                            # Blank text output: remember it for removal.
                            filter_out.add(i)
                            continue
                        output["text"] = output["text"].replace("```", r"\`\`\`")
                    elif "data" in output:
                        for key, value in output["data"].items():
                            # Only string payloads can contain code fences.
                            if isinstance(value, str):
                                output["data"][key] = value.replace("```", r"\`\`\`")
                cell["outputs"] = [
                    output
                    for i, output in enumerate(cell["outputs"])
                    if i not in filter_out
                ]

        return cell, resources
|
|
|
|
class ExtractAttachmentsPreprocessor(Preprocessor):
    """
    Inlines supported cell attachments into the cell source as base64
    ``data:`` URIs, so the generated Markdown needs no separate files.
    """

    # Attachment MIME types that are inlined; anything else is left untouched.
    _INLINE_MIME_TYPES = frozenset(
        {
            "image/png",
            "image/jpeg",
            "image/svg+xml",
            "application/pdf",
        }
    )

    def preprocess_cell(self, cell, resources, index):
        """
        Replace attachment references in the cell source with data URIs.

        Parameters
        ----------
        cell : NotebookNode cell
            Notebook cell being processed
        resources : dictionary
            Additional resources used in the conversion process. Allows
            preprocessors to pass variables into the Jinja engine.
        index : int
            Index of the cell being processed (unused)

        Returns
        -------
        tuple
            The ``(cell, resources)`` pair; ``cell.source`` is updated in place.
        """
        # Make sure the "outputs" entry exists and is a dict. ``.get`` avoids a
        # KeyError when ``resources`` is a plain dict without that key.
        if not isinstance(resources.get("outputs"), dict):
            resources["outputs"] = {}

        # Loop through all of the attachments in the cell.
        for name, attach in cell.get("attachments", {}).items():
            for mime, data in attach.items():
                if mime not in self._INLINE_MIME_TYPES:
                    continue

                # Only markdown-style link targets are rewritten. Notebooks
                # reference attachments as "(attachment:<name>)"; the bare
                # "(<name>)" form is still handled for backward compatibility.
                data_uri = f"(data:{mime};base64,{data})"
                for reference in (f"(attachment:{name})", f"({name})"):
                    if reference in cell.source:
                        cell.source = cell.source.replace(reference, data_uri)

        return cell, resources
|
|
|
|
class CustomRegexRemovePreprocessor(Preprocessor):
    """Remove cells that are blank or that carry ``#| output: false``."""

    # Matches (from the start of the source) either a whitespace-only cell or
    # a cell containing a "#| output: false" directive anywhere in its text.
    _DROP_CELL_RE = re.compile(r"(?s)(?:\s*\Z)|(?:.*#\s*\|\s*output:\s*false.*)")

    def check_conditions(self, cell):
        """Return True when ``cell`` should be kept, False when dropped."""
        return self._DROP_CELL_RE.match(cell.source) is None

    def preprocess(self, nb, resources):
        """Filter ``nb.cells`` down to the cells accepted by check_conditions."""
        kept_cells = []
        for candidate in nb.cells:
            if self.check_conditions(candidate):
                kept_cells.append(candidate)
        nb.cells = kept_cells
        return nb, resources
|
|
|
|
class UnHidePreprocessor(Preprocessor):
    """Strip the HIDE_IN_NB marker strings from every cell's source."""

    def preprocess_cell(self, cell, resources, index):
        """Remove the opening marker, then the closing marker, from the cell."""
        # Order matters only in that it mirrors open-before-close removal.
        for marker in (HIDE_IN_NB_MAGIC_OPEN, HIDE_IN_NB_MAGIC_CLOSE):
            cell.source = cell.source.replace(marker, "")
        return cell, resources
|
|
|
|
# Module-level Markdown exporter used by _convert_notebook for every notebook.
# It is configured with the four preprocessor classes defined in this file.
# NOTE(review): presumably nbconvert applies them in the order listed here —
# confirm against the nbconvert exporter documentation.
exporter = MarkdownExporter(
    preprocessors=[
        EscapePreprocessor,
        ExtractAttachmentsPreprocessor,
        CustomRegexRemovePreprocessor,
        UnHidePreprocessor,
    ],
    template_name="mdoutput",
    # Relative path, so it depends on the current working directory of the
    # process. NOTE(review): looks like the script expects to be launched from
    # the directory that contains "scripts/" — confirm with the build setup.
    extra_template_basedirs=["./scripts/notebook_convert_templates"],
)
|
|
|
|
def _process_path(tup: Tuple[Path, Path, Path]):
    """Convert one notebook; ``tup`` is (notebook, source root, output root).

    The arguments arrive packed in a single tuple so the function can be used
    directly with ``Pool.map``. The Markdown file is written under the output
    root at the same relative location, with the name ``<stem>.md``.
    """
    nb_file, src_root, dest_root = tup
    rel = nb_file.relative_to(src_root)
    md_name = rel.stem + ".md"
    destination = dest_root / rel.parent / md_name
    _convert_notebook(nb_file, destination, src_root)
|
|
|
|
| def _modify_frontmatter( |
| body: str, notebook_path: Path, intermediate_docs_dir: Path |
| ) -> str: |
| |
| rel_path = notebook_path.relative_to(intermediate_docs_dir).as_posix() |
| edit_url = ( |
| f"https://github.com/langchain-ai/langchain/edit/master/docs/docs/{rel_path}" |
| ) |
| frontmatter = { |
| "custom_edit_url": edit_url, |
| } |
| if re.match(r"^[\s\n]*---\n", body): |
| |
|
|
| for k, v in frontmatter.items(): |
| |
| if re.match(f"{k}: ", body): |
| continue |
| else: |
| body = re.sub(r"^[\s\n]*---\n", f"---\n{k}: {v}\n", body, count=1) |
| return body |
| else: |
| insert = "\n".join([f"{k}: {v}" for k, v in frontmatter.items()]) |
| return f"---\n{insert}\n---\n{body}" |
|
|
|
|
def _convert_notebook(
    notebook_path: Path, output_path: Path, intermediate_docs_dir: Path
) -> Path:
    """Convert one notebook to Markdown and write it to ``output_path``.

    The notebook is read as nbformat version 4, rendered with the module-level
    ``exporter``, given a ``custom_edit_url`` front-matter entry via
    ``_modify_frontmatter``, and written out. Missing parent directories of
    ``output_path`` are created.

    Parameters
    ----------
    notebook_path : Path
        Notebook file to convert.
    output_path : Path
        Destination Markdown file; overwritten if it exists.
    intermediate_docs_dir : Path
        Root directory used to compute the notebook's relative edit URL.

    Returns
    -------
    Path
        ``output_path``, for convenience.
    """
    # Be explicit about UTF-8: without it, open() uses the platform's locale
    # encoding, which can fail or corrupt notebooks containing non-ASCII text.
    with open(notebook_path, encoding="utf-8") as f:
        nb = nbformat.read(f, as_version=4)

    # The resources returned by the exporter are not needed here.
    body, _resources = exporter.from_notebook_node(nb)

    body = _modify_frontmatter(body, notebook_path, intermediate_docs_dir)

    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(body)

    return output_path
|
|
|
|
if __name__ == "__main__":
    # Usage: <script> <intermediate_docs_dir> <output_docs_dir>
    # Optional env var SOURCE_PATHS: whitespace-separated notebook paths,
    # relative to the intermediate docs dir, to convert unconditionally.
    intermediate_docs_dir = Path(sys.argv[1])
    output_docs_dir = Path(sys.argv[2])

    source_paths: Iterable[Path]
    source_paths_arg = os.environ.get("SOURCE_PATHS")
    if source_paths_arg:
        # Explicit list given: split on whitespace and ignore empty pieces.
        pieces = (piece.strip() for piece in re.split(r"\s+", source_paths_arg))
        source_paths = [intermediate_docs_dir / piece for piece in pieces if piece]
    else:
        # No explicit list: rebuild only notebooks whose Markdown output is
        # missing or older than the notebook itself.
        candidates = list(intermediate_docs_dir.glob("**/*.ipynb"))
        stale = []
        for nb_file in candidates:
            rel = nb_file.relative_to(intermediate_docs_dir)
            md_file = output_docs_dir / rel.parent / (rel.stem + ".md")
            if not md_file.exists():
                stale.append(nb_file)
            elif md_file.stat().st_mtime < nb_file.stat().st_mtime:
                stale.append(nb_file)
        source_paths = stale
        print(f"rebuilding {len(source_paths)}/{len(candidates)} notebooks")

    # Each job is one tuple because Pool.map passes a single argument.
    jobs = [
        (notebook_path, intermediate_docs_dir, output_docs_dir)
        for notebook_path in source_paths
    ]
    with multiprocessing.Pool() as pool:
        pool.map(_process_path, jobs)
|
|