DeOldify / fastai /gen_doc /gen_notebooks.py

Initial commit for Hugging Face sync (Clean History)

e9f9fd3 5 months ago

16.2 kB

	"`gen_doc.nbdoc` generates notebook documentation from module functions and links to correct places"
	import pkgutil, inspect, sys,os, importlib,json,enum,warnings,nbformat,re
	from IPython.core.display import display, Markdown
	from nbconvert.preprocessors import ExecutePreprocessor
	from nbformat.sign import NotebookNotary
	from pathlib import Path
	from .core import *
	from .nbdoc import *

	# Names exported by `from <this module> import *`.
	# NOTE(review): `import_mod` is not defined in this file; presumably it comes from one of the
	# star imports above (`.core` / `.nbdoc`) -- confirm.
	__all__ = ['create_module_page', 'update_module_page', 'import_mod',
	'link_nb', 'update_notebooks', 'generate_missing_metadata', 'update_nb_metadata']

	def get_empty_notebook():
	    "Default notebook dict with the minimum metadata (kernel spec, language info, nbformat version); no `cells` key is set."
	    # TODO: check python version and nbformat
	    kernelspec = dict(display_name='Python 3', language='python', name='python3')
	    language_info = dict(
	        codemirror_mode=dict(name='ipython', version=3),
	        file_extension='.py',
	        mimetype='text/x-python',
	        name='python',
	        nbconvert_exporter='python',
	        pygments_lexer='ipython3',
	        version='3.6.6',
	    )
	    notebook = dict(metadata=dict(kernelspec=kernelspec, language_info=language_info))
	    notebook['nbformat'] = 4
	    notebook['nbformat_minor'] = 2
	    return notebook

	def get_md_cell(source, metadata=None):
	    "Markdown cell dict wrapping `source`; `metadata` is used as given, or a fresh empty dict when it is None."
	    if metadata is None: metadata = {}
	    return dict(cell_type='markdown', metadata=metadata, source=source)

	def get_empty_cell(ctype='markdown'):
	    "Cell dict of type `ctype` with empty metadata and an empty source list."
	    return dict(cell_type=ctype, metadata={}, source=[])

	def get_code_cell(code, hidden=False):
	    "Code cell dict holding `code`, with no outputs; `hidden` is stored as the `hide_input` metadata flag."
	    cell_metadata = dict(hide_input=hidden, trusted=True)
	    return dict(cell_type='code', execution_count=0, metadata=cell_metadata, source=code, outputs=[])

	def get_doc_cell(func_name):
	    "Hidden-input code cell whose source is the `show_doc(...)` call for `func_name`."
	    return get_code_cell("show_doc({})".format(func_name), hidden=True)

	def get_global_vars(mod):
	    """Return globally assigned variables of module `mod`.

	    Parses the source file at `mod.__file__` and returns a dict mapping each name assigned at
	    column 0 (except `__all__`) to a markdown snippet: the first source line of the assignment
	    in backticks, followed by `get_source_link(mod, lineno)`.
	    """
	    # https://stackoverflow.com/questions/8820276/docstring-for-variable/31764368#31764368
	    import ast,re
	    with open(mod.__file__, 'r') as f: fstr = f.read()
	    flines = fstr.splitlines()
	    d = {}
	    for node in ast.walk(ast.parse(fstr)):
	        # only plain `name = ...` assignments; tuple/attribute/subscript targets have no `id`
	        if not isinstance(node, ast.Assign) or not hasattr(node.targets[0], 'id'): continue
	        key,lineno = node.targets[0].id,node.targets[0].lineno
	        # ast line numbers are 1-based while `flines` is 0-based
	        codestr = flines[lineno-1]
	        # anchoring at the start of the line keeps only unindented (top level) assignments
	        match = re.match(rf"^({re.escape(key)})\s*=\s*.*", codestr)
	        if match and match.group(1) != '__all__': # only top level assignment
	            d[key] = f'`{codestr}` {get_source_link(mod, lineno)}'
	    return d

	def write_nb(nb, nb_path, mode='w'):
	    "Serialise notebook dict `nb` as nbformat version 4 and write it to `nb_path`, opening the file with `mode`."
	    with open(nb_path, mode) as f:
	        # the file is opened before serialising, exactly as a single-expression write would do
	        serialized = nbformat.writes(nbformat.from_dict(nb), version=4)
	        f.write(serialized)

	class ExecuteShowDocPreprocessor(ExecutePreprocessor):
	    "An ExecutePreprocessor that only executes code cells matching `IMPORT_RE` or `SHOW_DOC_RE`"
	    def preprocess_cell(self, cell, resources, index):
	        "Run `cell` through the parent class when it is a fastai-import or `show_doc` code cell; otherwise return it untouched."
	        if 'source' not in cell or cell.cell_type != "code": return cell, resources
	        src = cell['source']
	        if not (IMPORT_RE.search(src) or SHOW_DOC_RE.search(src)): return cell, resources
	        return super().preprocess_cell(cell, resources, index)

	def execute_nb(fname, metadata=None, save=True, show_doc_only=False):
	    """Execute notebook `fname`, passing `metadata` (or an empty dict) as the preprocessing resources.

	    With `show_doc_only` the `ExecuteShowDocPreprocessor` is used instead of the plain
	    `ExecutePreprocessor`. When `save` is true the executed notebook is written back to
	    `fname` and signed.
	    """
	    # Any module used in the notebook that isn't inside must be in the same directory as this script
	    with open(fname) as f: nb = nbformat.read(f, as_version=4)
	    if show_doc_only: ep_class = ExecuteShowDocPreprocessor
	    else: ep_class = ExecutePreprocessor
	    ep = ep_class(timeout=600, kernel_name='python3')
	    ep.preprocess(nb, metadata or {})
	    if not save: return
	    with open(fname, 'wt') as f: nbformat.write(nb, f)
	    NotebookNotary().sign(nb)

	def _symbol_skeleton(name):
	    "Cell pair for symbol `name`: its `show_doc` code cell followed by a markdown cell holding the back-ticked name."
	    doc_cell = get_doc_cell(name)
	    note_cell = get_md_cell(f"`{name}`")
	    return [doc_cell, note_cell]

	def create_module_page(mod, dest_path, force=False):
	    """Create the documentation notebook for module `mod` in path `dest_path`.

	    The notebook is made of: a title and an introduction placeholder, a hidden import cell,
	    one markdown cell per exported global variable, a `show_doc` cell plus a markdown cell for
	    every name returned by `get_ft_names`, and finally the `UNDOC_HEADER` cell.
	    It is written to `get_doc_path(mod, dest_path)` (mode 'w' when `force`, otherwise 'x' so
	    an existing file is not overwritten), executed with `execute_nb`, and its path is returned.
	    """
	    nb = get_empty_notebook()
	    mod_name = mod.__name__
	    strip_name = strip_fastai(mod_name)
	    init_cell = [get_md_cell(f'## Title for {strip_name} (use plain english, not module name!)'),
	                 get_md_cell('Type an introduction of the package here.')]
	    # both lines must be complete star imports, otherwise the cell fails when the notebook is executed below
	    cells = [get_code_cell(f'from fastai.gen_doc.nbdoc import *\nfrom {mod_name} import *', True)]

	    gvar_map = get_global_vars(mod)
	    if gvar_map: cells.append(get_md_cell('### Global Variable Definitions:'))
	    for name in get_exports(mod):
	        if name in gvar_map: cells.append(get_md_cell(gvar_map[name]))

	    for ft_name in get_ft_names(mod, include_inner=True):
	        if not hasattr(mod, ft_name):
	            warnings.warn(f"Module {strip_name} doesn't have a function named {ft_name}.")
	            continue
	        cells += _symbol_skeleton(ft_name)
	    nb['cells'] = init_cell + cells + [get_md_cell(UNDOC_HEADER)]

	    doc_path = get_doc_path(mod, dest_path)
	    write_nb(nb, doc_path, 'w' if force else 'x')
	    execute_nb(doc_path)
	    return doc_path

	# Directory names and file-name endings skipped by `get_module_names` when no `exclude` is given.
	_default_exclude = ['.ipynb_checkpoints', '__pycache__', '__init__.py', 'imports']

	def get_module_names(path_dir, exclude=None):
	    """Search a given `path_dir` and return all the modules contained inside except those in `exclude`.

	    `path_dir` is a `Path`. The result starts with the directory's own name, followed by dotted
	    names for the sub-directories (recursively) and then for the `.py` files. An entry is
	    skipped when it is a directory named in `exclude` or when its name ends with any item of
	    `exclude`; `exclude` defaults to `_default_exclude` and also applies to sub-directories.
	    """
	    if exclude is None: exclude = _default_exclude
	    files = sorted(path_dir.glob('*'), key=lambda x: (x.is_dir(), x.name), reverse=True) # directories first
	    res = [f'{path_dir.name}']
	    for f in files:
	        if f.is_dir() and f.name in exclude: continue # exclude directories
	        if any(f.name.endswith(ex) for ex in exclude): continue # exclude extensions

	        if f.suffix == '.py': res.append(f'{path_dir.name}.{f.stem}')
	        # hand `exclude` down so a caller-supplied list is honoured at every depth
	        elif f.is_dir(): res += [f'{path_dir.name}.{name}' for name in get_module_names(f, exclude)]
	    return res

	def read_nb(fname):
	    "Load the notebook stored at `fname`, parsed by nbformat as version 4."
	    with open(fname,'r') as f:
	        raw = f.read()
	        return nbformat.reads(raw, as_version=4)

	# Captures the first argument (word characters and dots) of a `show_doc(` call.
	SHOW_DOC_RE = re.compile(r"show_doc\(([\w\.]*)")
	def read_nb_content(cells, mod_name):
	    "Map every name documented by a `show_doc(...)` call in a code cell to the index of that cell (the last cell wins); `mod_name` is not used."
	    return {name: idx
	            for idx, cell in enumerate(cells)
	            if cell['cell_type'] == 'code'
	            for name in SHOW_DOC_RE.findall(cell['source'])}

	def read_nb_types(cells):
	    """Map names of documented global variables to the index of their markdown cell.

	    A markdown cell counts when its source starts with `name =`, optionally preceded by an
	    opening backtick or `<code>` tag. When several cells document the same name, the last
	    one wins.
	    """
	    doc_fns = {}
	    for i, cell in enumerate(cells):
	        if cell['cell_type'] == 'markdown':
	            # optional "<code>" or backtick prefix, then the whole variable name, then "="
	            match = re.match(r"^(?:<code>|`)?(\w+)\s*=\s*", cell['source'])
	            if match is not None: doc_fns[match.group(1)] = i
	    return doc_fns

	def link_markdown_cells(cells, modules):
	    "Replace, in place, the source of every markdown cell with the result of `link_docstring(modules, source)`."
	    for cell in cells:
	        if cell['cell_type'] != 'markdown': continue
	        cell['source'] = link_docstring(modules, cell['source'])

	def get_insert_idx(pos_dict, name):
	    "Position stored for the first key of `pos_dict` (in dict order) that is not case-insensitively smaller than `name`, or -1 when there is none."
	    for key, pos in pos_dict.items():
	        if not (str.lower(key) < str.lower(name)): return pos
	    return -1

	def update_pos(pos_dict, start_key, nbr=2):
	    "Shift by `nbr`, in place, the position of every key that is case-insensitively at or after `start_key`; returns the same `pos_dict`."
	    for key in list(pos_dict):
	        if str.lower(key) < str.lower(start_key): continue
	        pos_dict[key] = pos_dict[key] + nbr
	    return pos_dict

	def insert_cells(cells, pos_dict, ft_name, append=False):
	    """Add the `show_doc` cell and an empty cell for `ft_name` to `cells` (modified in place).

	    The pair goes at the index given by `get_insert_idx`, in which case `pos_dict` is shifted
	    with `update_pos`; it goes at the end when `append` is set or no index was found.
	    Returns `(cells, pos_dict)`.
	    """
	    idx = get_insert_idx(pos_dict, ft_name)
	    new_pair = [get_doc_cell(ft_name), get_empty_cell()]
	    if append or idx == -1:
	        cells.extend(new_pair)
	        return cells, pos_dict
	    cells[idx:idx] = new_pair
	    pos_dict = update_pos(pos_dict, ft_name, 2)
	    return cells, pos_dict

	def get_doc_path(mod, dest_path):
	    "Path (as built by `os.path.join`) of the `.ipynb` file for `mod` inside `dest_path`, named after `strip_fastai(mod.__name__)`."
	    nb_name = f'{strip_fastai(mod.__name__)}.ipynb'
	    return os.path.join(dest_path, nb_name)

	def generate_missing_metadata(dest_file):
	    """Append an `update_nb_metadata(...)` code cell for notebook `dest_file` to `jekyll_metadata.ipynb`.

	    The metadata notebook is looked up next to `dest_file`. Nothing is written when either
	    notebook is missing (a message is printed) or when the metadata notebook already has a
	    cell for this file. The new call carries the notebook's current `jekyll` metadata as
	    keyword arguments.
	    """
	    fn = Path(dest_file)
	    meta_fn = fn.parent/'jekyll_metadata.ipynb'
	    if not (fn.exists() and meta_fn.exists()):
	        return print('Could not find notebooks:', fn, meta_fn)
	    metadata_nb = read_nb(meta_fn)

	    if has_metadata_cell(metadata_nb['cells'], fn.name): return
	    jekyll = read_nb(fn)['metadata'].get('jekyll', {})
	    fmt_params = ''.join(f',\n {k}={stringify(v)}' for k,v in jekyll.items())
	    call_src = f"update_nb_metadata('{fn.name}'{fmt_params})"
	    metadata_nb['cells'].append(get_code_cell(call_src, hidden=False))
	    write_nb(metadata_nb, meta_fn)

	def update_nb_metadata(nb_path=None, title=None, summary=None, keywords='fastai', overwrite=True, **kwargs):
	    """Creates jekyll metadata for given notebook path.

	    `title`, `summary`, `keywords` and any extra `kwargs` that are not None replace the
	    notebook's `jekyll` metadata entry; the notebook is then written back and signed.
	    Nothing is written when every value is None. `overwrite` is accepted but not used here.
	    """
	    nb = read_nb(nb_path)
	    candidates = dict(title=title, summary=summary, keywords=keywords)
	    candidates.update(kwargs)
	    data = {key: val for key, val in candidates.items() if val is not None} # remove none values
	    if not data: return
	    nb['metadata']['jekyll'] = data
	    write_nb(nb, nb_path)
	    NotebookNotary().sign(nb)

	def has_metadata_cell(cells, fn):
	    "Return the first cell whose source contains an `update_nb_metadata('<fn>'` call, or None when there is none."
	    # literal substring test: `fn` holds dots (e.g. 'vision.data.ipynb') that must not act as regex wildcards
	    needle = f"update_nb_metadata('{fn}'"
	    for c in cells:
	        if needle in c['source']: return c
	    return None

	def stringify(s):
	    "Return `s` as a quoted Python string literal when it is a `str`, otherwise return `s` unchanged."
	    # repr keeps plain strings as 'text' and also escapes embedded quotes, backslashes and newlines
	    return repr(s) if isinstance(s, str) else s

	# Captures the dotted fastai module name of a `from fastai... ` import statement.
	IMPORT_RE = re.compile(r"from (fastai[\.\w_]*)")
	def get_imported_modules(cells, nb_module_name=''):
	    """Import and return the modules relevant to a notebook.

	    Candidates are tried in this order: the names from `get_top_level_modules()`, the fastai
	    modules imported in the notebook's code cells, then `nb_module_name` and its parent
	    packages. Each is imported with `import_mod(..., ignore_errors=True)`; None results are
	    dropped.
	    """
	    module_names = get_top_level_modules()
	    nb_imports = []
	    for cell in cells:
	        # the pattern runs on every cell's source; only matches found in code cells are kept
	        for found in IMPORT_RE.finditer(cell['source']):
	            if cell['cell_type'] == 'code': nb_imports.append(found.group(1))
	    parts = nb_module_name.split('.')
	    parent_modules = ['.'.join(parts[:(x+1)]) for x in range_of(parts)] # Imports parent modules - a.b.c = [a, a.b, a.b.c]
	    imported = []
	    for candidate in module_names + nb_imports + parent_modules:
	        mod = import_mod(candidate, ignore_errors=True)
	        if mod is not None: imported.append(mod)
	    return imported

	def get_top_level_modules(num_levels=1):
	    "Names of the modules found in the `fastai` package directory with at most `num_levels` dots, those with more dots first."
	    mod_dir = Path(import_mod('fastai').__file__).parent
	    shallow = [name for name in get_module_names(mod_dir) if name.count('.') <= num_levels]
	    shallow.sort(key=lambda s: s.count('.'), reverse=True) # Submodules first (sorted by periods)
	    return shallow

	# Markdown headers that open the "new" and "undocumented" sections of a documentation notebook.
	NEW_FT_HEADER = '## New Methods - Please document or move to the undocumented section'
	UNDOC_HEADER = '## Undocumented Methods - Methods moved below this line will intentionally be hidden'
	def parse_sections(cells):
	    """Split `cells` into `(old_cells, undoc_cells, new_cells)`.

	    Cells go to the old section until a markdown cell starting with `UNDOC_HEADER` or
	    `NEW_FT_HEADER` switches the target section; the header cell itself belongs to the
	    section it opens. A section that ends up empty is replaced by a single header cell.
	    """
	    sections = {'old': [], 'undoc': [], 'new': []}
	    active = 'old'
	    for cell in cells:
	        if cell['cell_type'] == 'markdown':
	            src = cell['source']
	            if re.match(NEW_FT_HEADER, src): active = 'new'
	            elif re.match(UNDOC_HEADER, src): active = 'undoc'
	        sections[active].append(cell)
	    undoc_cells = sections['undoc'] if sections['undoc'] else [get_md_cell(UNDOC_HEADER)]
	    new_cells = sections['new'] if sections['new'] else [get_md_cell(NEW_FT_HEADER)]
	    return sections['old'], undoc_cells, new_cells

	def remove_undoc_cells(cells):
	    "Keep only the cells that come before the undocumented / new-methods sections, as split by `parse_sections`."
	    return parse_sections(cells)[0]

	# currently code vbox sub-cells mainly
	def remove_code_cell_jupyter_widget_state_elem(cells):
	    """Drop Jupyter widget-view outputs from the code cells of `cells`.

	    For every code cell that has an `outputs` entry, outputs whose `data` holds the
	    'application/vnd.jupyter.widget-view+json' mime type are removed. The cells are modified
	    in place and the same `cells` object is returned.
	    """
	    widget_mime = 'application/vnd.jupyter.widget-view+json'
	    for c in cells:
	        if c['cell_type'] != 'code' or 'outputs' not in c: continue
	        # item access works for plain dict outputs as well as for nbformat NotebookNode objects
	        c['outputs'] = [out for out in c['outputs'] if not ('data' in out and widget_mime in out['data'])]
	    return cells

	def update_module_page(mod, dest_path='.'):
	    """Update the documentation notebook of a given module.

	    Reads the notebook at `get_doc_path(mod, dest_path)`, links its markdown cells, refreshes
	    (or appends) one markdown cell per exported global variable, and appends a `show_doc`
	    cell plus an empty cell to the new-methods section for every function name that no
	    `show_doc` cell documents yet. The notebook is written back and its path returned.
	    """
	    doc_path = get_doc_path(mod, dest_path)
	    strip_name = strip_fastai(mod.__name__)
	    nb = read_nb(doc_path)
	    cells = nb['cells']

	    link_markdown_cells(cells, get_imported_modules(cells, mod.__name__))

	    type_dict = read_nb_types(cells)
	    gvar_map = get_global_vars(mod)
	    for name in get_exports(mod):
	        if name not in gvar_map: continue
	        code = gvar_map[name]
	        if name in type_dict: cells[type_dict[name]] = get_md_cell(code)
	        else: cells.append(get_md_cell(code))

	    pos_dict = read_nb_content(cells, strip_name)
	    ft_names = get_ft_names(mod, include_inner=True)
	    # de-duplicate while keeping the order of `ft_names`, so the generated notebook is the same on every run
	    new_fts = [name for name in dict.fromkeys(ft_names) if name not in pos_dict]
	    if new_fts: print(f'Found new fuctions for {mod}. Please document:\n{new_fts}')
	    existing, undoc_cells, new_cells = parse_sections(cells)
	    for ft_name in new_fts: new_cells.extend([get_doc_cell(ft_name), get_empty_cell()])
	    # `new_cells` always holds at least its header; only rebuild the notebook when something follows it
	    if len(new_cells) > 1: nb['cells'] = existing + undoc_cells + new_cells

	    write_nb(nb, doc_path)
	    return doc_path

	def link_nb(nb_path):
	    "Link the markdown cells of the notebook at `nb_path`, write it back, then sign the re-read file."
	    nb = read_nb(nb_path)
	    cells = nb['cells']
	    modules = get_imported_modules(cells, Path(nb_path).stem)
	    link_markdown_cells(cells, modules)
	    write_nb(nb, nb_path)
	    notary = NotebookNotary()
	    notary.sign(read_nb(nb_path))

	def get_module_from_notebook(doc_path):
	    "Module name for a notebook path: `fastai.` followed by the file name without its last suffix."
	    stem = Path(doc_path).stem
	    return 'fastai.' + stem

	def check_nbconvert_version():
	    "Assert that the installed nbconvert is at least version 5.4.0."
	    import nbconvert
	    installed = nbconvert.version_info
	    assert installed >= (5,4,0), "Please update nbconvert to >=5.4 for consistent .html output"

	def update_notebooks(source_path, dest_path=None, update_html=True, document_new_fns=False,
	                     update_nb_links=True, html_path=None, force=False):
	    """`source_path` can be a directory or a file. Assume all modules reside in the fastai directory.

	    - Notebook file: optionally add cells for new functions (`document_new_fns`), make sure a
	      jekyll metadata cell exists, optionally link and re-execute the `show_doc` cells
	      (`update_nb_links`), and optionally convert to HTML (`update_html`). The conversion is
	      skipped when the HTML file is newer than the notebook, unless `force` is set.
	    - Name starting with `fastai.`: treated as a module name; its notebook in `dest_path` is
	      created when missing and then updated as above.
	    - Directory: every `*.ipynb` file in it is updated as above, in sorted order.

	    `dest_path` defaults to the notebook's folder and `html_path` to `dest_path/../docs`.
	    """
	    from .convert2html import convert_nb
	    source_path = Path(source_path)
	    # options handed unchanged to the recursive calls; `force` is included so it is not lost
	    # when the source is a module name or a directory
	    shared = dict(dest_path=dest_path, update_html=update_html, document_new_fns=document_new_fns,
	                  update_nb_links=update_nb_links, html_path=html_path, force=force)

	    if source_path.is_file():
	        dest_path = source_path.parent if dest_path is None else Path(dest_path)
	        html_path = dest_path/'..'/'docs' if html_path is None else Path(html_path)
	        doc_path = source_path
	        assert source_path.suffix == '.ipynb', 'Must update from notebook or module'
	        if document_new_fns:
	            mod = import_mod(get_module_from_notebook(source_path))
	            if not mod: print('Could not find module for path:', source_path)
	            elif mod.__file__.endswith('__init__.py'): pass
	            else: update_module_page(mod, dest_path)
	        generate_missing_metadata(doc_path)
	        if update_nb_links:
	            print(f'Updating notebook {doc_path}. Please wait...')
	            link_nb(doc_path)
	            execute_nb(doc_path, {'metadata': {'path': doc_path.parent}}, show_doc_only=True)
	        if update_html:
	            check_nbconvert_version()
	            html_fn = html_path/doc_path.with_suffix('.html').name
	            if not force and html_fn.is_file():
	                in_mod = os.path.getmtime(doc_path)
	                out_mod = os.path.getmtime(html_fn)
	                # the existing HTML is more recent than the notebook: nothing to convert
	                if in_mod < out_mod: return
	            convert_nb(doc_path, html_path)

	    elif (source_path.name.startswith('fastai.')):
	        # Do module update
	        assert dest_path is not None, 'To update a module, you must specify a destination folder for where notebook resides'
	        mod = import_mod(source_path.name)
	        if not mod: return print('Could not find module for:', source_path)
	        doc_path = Path(dest_path)/(strip_fastai(mod.__name__)+'.ipynb')
	        if not doc_path.exists():
	            print('Notebook does not exist. Creating:', doc_path)
	            create_module_page(mod, dest_path)
	        update_notebooks(doc_path, **shared)
	    elif source_path.is_dir():
	        for f in sorted(Path(source_path).glob('*.ipynb')):
	            update_notebooks(f, **shared)
	    else: print('Could not resolve source file:', source_path)