learn / langchain /docs /scripts /check_templates.py

Upload folder using huggingface_hub

a80f6e6 verified 10 months ago

3.01 kB

	import json
	import re
	import sys
	from functools import cache
	from pathlib import Path
	from typing import Dict, Iterable, List, Union

	CURR_DIR = Path(__file__).parent.absolute()
	CLI_TEMPLATE_DIR = (
	CURR_DIR.parent.parent / "libs/cli/langchain_cli/integration_template/docs"
	)

	INFO_BY_DIR: Dict[str, Dict[str, Union[int, str]]] = {
	"chat": {
	"issue_number": 22296,
	},
	"document_loaders": {
	"issue_number": 22866,
	},
	"stores": {"issue_number": 24888},
	"llms": {
	"issue_number": 24803,
	},
	"text_embedding": {"issue_number": 14856},
	"toolkits": {"issue_number": 24820},
	"tools": {"issue_number": "TODO"},
	"vectorstores": {"issue_number": 24800},
	"retrievers": {"issue_number": 24908},
	}


	@cache
	def _get_headers(doc_dir: str) -> Iterable[str]:
	"""Gets all markdown headers ## and below from the integration template.

	Ignores headers that contain "TODO"."""
	ipynb_name = f"{doc_dir}.ipynb"
	if not (CLI_TEMPLATE_DIR / ipynb_name).exists():
	raise FileNotFoundError(f"Could not find {ipynb_name} in {CLI_TEMPLATE_DIR}")
	with open(CLI_TEMPLATE_DIR / ipynb_name, "r") as f:
	nb = json.load(f)

	headers: List[str] = []
	for cell in nb["cells"]:
	if cell["cell_type"] == "markdown":
	for line in cell["source"]:
	if not line.startswith("## ") or "TODO" in line:
	continue
	header = line.strip()
	headers.append(header)
	return headers


	def check_header_order(path: Path) -> None:
	if path.name.startswith("index."):
	# skip index pages
	return
	doc_dir = path.parent.name
	if doc_dir not in INFO_BY_DIR:
	# Skip if not a directory we care about
	return
	if "toolkit" in path.name:
	headers = _get_headers("toolkits")
	else:
	headers = _get_headers(doc_dir)
	issue_number = INFO_BY_DIR[doc_dir].get("issue_number", "nonexistent")

	print(f"Checking {doc_dir} page {path}")

	with open(path, "r") as f:
	doc = f.read()
	notfound = []
	for header in headers:
	index = doc.find(header)
	if index == -1:
	notfound.append(header)
	doc = doc[index + len(header) :]
	if notfound:
	notfound_headers = "\n- ".join(notfound)
	raise ValueError(
	f"Document {path} is missing headers:"
	"\n- "
	f"{notfound_headers}"
	"\n\n"
	"Please see https://github.com/langchain-ai/langchain/issues/"
	f"{issue_number} for instructions on how to correctly format a "
	f"{doc_dir} integration page."
	)


	def main(*new_doc_paths: Union[str, Path]) -> None:
	for path in new_doc_paths:
	path = Path(path).resolve().absolute()
	if CURR_DIR.parent / "docs" / "integrations" in path.parents:
	check_header_order(path)
	else:
	continue


	if __name__ == "__main__":
	main(*sys.argv[1:])