# Ultralytics šŸš€ AGPL-3.0 License - https://ultralytics.com/license """ Automates building and post-processing of MkDocs documentation, especially for multilingual projects. This script streamlines generating localized documentation and updating HTML links for correct formatting. Key Features: - Automated building of MkDocs documentation: Compiles main documentation and localized versions from separate MkDocs configuration files. - Post-processing of generated HTML files: Updates HTML files to remove '.md' from internal links, ensuring correct navigation in web-based documentation. Usage: - Run from the root directory of your MkDocs project. - Ensure MkDocs is installed and configuration files (main and localized) are present. - The script builds documentation using MkDocs, then scans HTML files in 'site' to update links. - Ideal for projects with Markdown documentation served as a static website. Note: - Requires Python and MkDocs to be installed and configured. """ from __future__ import annotations import os import re import shutil import subprocess import tempfile import time from pathlib import Path import yaml from bs4 import BeautifulSoup from minijinja import Environment, load_from_path try: from plugin import postprocess_site # mkdocs-ultralytics-plugin except ImportError: postprocess_site = None from build_reference import build_reference_docs, build_reference_for from ultralytics.utils import LINUX, LOGGER, MACOS from ultralytics.utils.tqdm import TQDM os.environ["JUPYTER_PLATFORM_DIRS"] = "1" # fix DeprecationWarning: Jupyter is migrating to use standard platformdirs DOCS = Path(__file__).parent.resolve() SITE = DOCS.parent / "site" LINK_PATTERN = re.compile(r"(https?://[^\s()<>]*[^\s()<>.,:;!?\'\"])") TITLE_PATTERN = re.compile(r"(.*?)", flags=re.IGNORECASE | re.DOTALL) MD_LINK_PATTERN = re.compile(r'(["\']?)([^"\'>\s]+?)\.md(["\']?)') DOC_KIND_LABELS = {"Class", "Function", "Method", "Property"} DOC_KIND_COLORS = { "Class": "#039dfc", # blue "Method": "#ef5eff", # magenta "Function": "#fc9803", # orange "Property": "#02e835", # green } def prepare_docs_markdown(clone_repos: bool = True): """Build docs using mkdocs.""" LOGGER.info("Removing existing build artifacts") shutil.rmtree(SITE, ignore_errors=True) shutil.rmtree(DOCS / "repos", ignore_errors=True) if clone_repos: # Get hub-sdk repo repo = "https://github.com/ultralytics/hub-sdk" local_dir = DOCS / "repos" / Path(repo).name subprocess.run( ["git", "clone", "-q", "--depth=1", "--single-branch", "-b", "main", repo, str(local_dir)], check=True ) shutil.rmtree(DOCS / "en/hub/sdk", ignore_errors=True) # delete if exists shutil.copytree(local_dir / "docs", DOCS / "en/hub/sdk") # for docs LOGGER.info(f"Cloned/Updated {repo} in {local_dir}") # Get docs repo repo = "https://github.com/ultralytics/docs" local_dir = DOCS / "repos" / Path(repo).name subprocess.run( ["git", "clone", "-q", "--depth=1", "--single-branch", "-b", "main", repo, str(local_dir)], check=True ) shutil.rmtree(DOCS / "en/compare", ignore_errors=True) # delete if exists shutil.copytree(local_dir / "docs/en/compare", DOCS / "en/compare") # for docs LOGGER.info(f"Cloned/Updated {repo} in {local_dir}") # Add frontmatter for file in TQDM((DOCS / "en").rglob("*.md"), desc="Adding frontmatter"): update_markdown_files(file) def update_markdown_files(md_filepath: Path): """Create or update a Markdown file, ensuring frontmatter is present.""" if md_filepath.exists(): content = md_filepath.read_text().strip() # Replace apostrophes content = content.replace("ā€˜", "'").replace("’", "'") # Add frontmatter if missing if not content.strip().startswith("---\n"): header = "---\ncomments: true\ndescription: TODO ADD DESCRIPTION\nkeywords: TODO ADD KEYWORDS\n---\n\n" content = header + content # Ensure MkDocs admonitions "=== " lines are preceded and followed by empty newlines lines = content.split("\n") new_lines = [] for i, line in enumerate(lines): stripped_line = line.strip() if stripped_line.startswith("=== "): if i > 0 and new_lines[-1] != "": new_lines.append("") new_lines.append(line) if i < len(lines) - 1 and lines[i + 1].strip() != "": new_lines.append("") else: new_lines.append(line) content = "\n".join(new_lines) # Add EOF newline if missing if not content.endswith("\n"): content += "\n" # Save page md_filepath.write_text(content) return def update_docs_html(): """Update titles, edit links, and convert plaintext links in HTML documentation in one pass.""" from concurrent.futures import ProcessPoolExecutor html_files = list(SITE.rglob("*.html")) if not html_files: LOGGER.info("Updated HTML files: 0") return desc = f"Updating HTML at {SITE}" max_workers = os.cpu_count() or 1 with ProcessPoolExecutor(max_workers=max_workers) as executor: pbar = TQDM(executor.map(_process_html_file, html_files), total=len(html_files), desc=desc) updated = 0 for res in pbar: updated += bool(res) pbar.set_description(f"{desc} ({updated}/{len(html_files)} updated)") def _process_html_file(html_file: Path) -> bool: """Process a single HTML file; returns True if modified.""" try: content = html_file.read_text(encoding="utf-8") except Exception as e: LOGGER.warning(f"Could not read {html_file}: {e}") return False changed = False try: rel_path = html_file.relative_to(SITE).as_posix() except ValueError: rel_path = html_file.name # For pages sourced from external repos (hub-sdk, compare), drop edit/copy buttons to avoid wrong links if rel_path.startswith(("hub/sdk/", "compare/")): before = content content = re.sub( r']*class="[^"]*md-content__button[^"]*"[^>]*>.*?', "", content, flags=re.IGNORECASE | re.DOTALL, ) if content != before: changed = True if rel_path == "404.html": new_content = re.sub(r".*?", "Ultralytics Docs - Not Found", content) if new_content != content: content, changed = new_content, True new_content = update_docs_soup(content, html_file=html_file) if new_content != content: content, changed = new_content, True new_content = _rewrite_md_links(content) if new_content != content: content, changed = new_content, True if changed: try: html_file.write_text(content, encoding="utf-8") return True except Exception as e: LOGGER.warning(f"Could not write {html_file}: {e}") return False def update_docs_soup(content: str, html_file: Path | None = None, max_title_length: int = 70) -> str: """Convert plaintext links to HTML hyperlinks, truncate long meta titles, and remove code line hrefs.""" title_match = TITLE_PATTERN.search(content) needs_title_trim = bool( title_match and len(title_match.group(1)) > max_title_length and "-" in title_match.group(1) ) needs_link_conversion = (" max_title_length and "-" in title_tag.text: title_tag.string = title_tag.text.rsplit("-", 1)[0].strip() modified = True # Find the main content area main_content = soup.find("main") or soup.find("div", class_="md-content") if not main_content: return str(soup) if modified else content # Convert plaintext links to HTML hyperlinks if needs_link_conversion: for paragraph in main_content.select("p, li"): for text_node in paragraph.find_all(string=True, recursive=False): if text_node.parent.name not in {"a", "code"}: new_text = LINK_PATTERN.sub(r'\1', str(text_node)) if " 0: tail = " " if tail: span.insert_after(tail) modified = True highlight_labels(soup.select("main h1, main h2, main h3, main h4, main h5")) highlight_labels(soup.select("nav.md-nav--secondary .md-ellipsis, nav.md-nav__list .md-ellipsis")) if "reference" in rel_path: for ellipsis in soup.select("nav.md-nav--secondary .md-ellipsis"): kind = ellipsis.find(class_=lambda c: c and "doc-kind" in c.split()) text = str(kind.next_sibling).strip() if kind and kind.next_sibling else ellipsis.get_text(strip=True) if "." not in text: continue ellipsis.clear() short = text.rsplit(".", 1)[-1] if kind: ellipsis.append(kind) ellipsis.append(f" {short}") else: ellipsis.append(short) modified = True if needs_kind_highlight and not modified and soup.select(".doc-kind"): # Ensure style injection when pre-existing badges are present modified = True if modified: head = soup.find("head") if head and not soup.select("style[data-doc-kind]"): style = soup.new_tag("style", attrs={"data-doc-kind": "true"}) style.string = ( ".doc-kind{display:inline-flex;align-items:center;gap:0.25em;padding:0.21em 0.59em;border-radius:999px;" "font-weight:700;font-size:0.81em;letter-spacing:0.06em;text-transform:uppercase;" "line-height:1;color:var(--doc-kind-color,#f8fafc);" "background:var(--doc-kind-bg,rgba(255,255,255,0.12));}" f".doc-kind-class{{--doc-kind-color:{DOC_KIND_COLORS['Class']};--doc-kind-bg:rgba(3,157,252,0.22);}}" f".doc-kind-function{{--doc-kind-color:{DOC_KIND_COLORS['Function']};--doc-kind-bg:rgba(252,152,3,0.22);}}" f".doc-kind-method{{--doc-kind-color:{DOC_KIND_COLORS['Method']};--doc-kind-bg:rgba(239,94,255,0.22);}}" f".doc-kind-property{{--doc-kind-color:{DOC_KIND_COLORS['Property']};--doc-kind-bg:rgba(2,232,53,0.22);}}" ) head.append(style) return str(soup) if modified else content def _rewrite_md_links(content: str) -> str: """Replace .md references with trailing slashes in HTML content, skipping GitHub links.""" if ".md" not in content: return content lines = [] for line in content.split("\n"): if "github.com" not in line: line = line.replace("index.md", "") line = MD_LINK_PATTERN.sub(r"\1\2/\3", line) lines.append(line) return "\n".join(lines) # Precompiled regex patterns for minification HTML_COMMENT = re.compile(r"") HTML_PRESERVE = re.compile(r"<(pre|code|textarea|script)[^>]*>[\s\S]*?", re.IGNORECASE) HTML_TAG_SPACE = re.compile(r">\s+<") HTML_MULTI_SPACE = re.compile(r"\s{2,}") HTML_EMPTY_LINE = re.compile(r"^\s*$\n", re.MULTILINE) CSS_COMMENT = re.compile(r"/\*[\s\S]*?\*/") def remove_comments_and_empty_lines(content: str, file_type: str) -> str: """Remove comments and empty lines from a string of code, preserving newlines and URLs. Args: content (str): Code content to process. file_type (str): Type of file ('html', 'css', or 'js'). Returns: (str): Cleaned content with comments and empty lines removed. Notes: Typical reductions for Ultralytics Docs are: - Total HTML reduction: 2.83% (1301.56 KB saved) - Total CSS reduction: 1.75% (2.61 KB saved) - Total JS reduction: 13.51% (99.31 KB saved) """ if file_type == "html": content = HTML_COMMENT.sub("", content) # Remove HTML comments # Preserve whitespace in
, ,