Upload folder using huggingface_hub

88df9e4 verified about 1 month ago

10 kB

	/*
	Parses fenced code blocks with `annotate` in info string.
	Results in single line comments split out, output format is:

	.annotate
	.annotate-row (n)
	.annotate-code
	.annotate-note

	Contributing rules:
	- You must include `annotate` in the info string
	- You must include a language on the starting ` ``` ` tag.
	- Notes must start with one of: `#`, `//`, `<!--`, `%` or `%%`, `--`. (comment tag)
	- The comment tag style must match the language on the code fence.
	- Multiline-style comments, such as `/*` are not supported.
	- You can include any number of spaces before the comment tag starts.
	- You can include any number of spaces after the comment tag ends.
	- You can leave the text after the comment tag blank to create a blank annotation.
	- You cannot create a blank code block however.
	- Anything after the comment tag will be parsed with Markdown.
	- You can use any inline Markdown tag in the comment; recommend against using block tags such as headings, blockquote, horizontal rules, tables, lists, or code fences.
	- Multiple lines in a row with the comment tag will result in a single annotation.
	- Empty lines, or lines that contain only space characters, will be discarded.
	- You must start the code section with a single line comment, otherwise the two will be flipped.
	- For HTML style, you can include a line after your annotations such as `<!-- -->` to maintain syntax highlighting; this will not impact what renders.

	`parse-info-string.ts` plugin is required for this to work, and must come before `remark-rehype`.
	`annotate` must come before the `highlight` plugin.
	*/

	import yaml from 'js-yaml'
	import fs from 'fs'
	import { chunk, last } from 'lodash-es'
	import { visit } from 'unist-util-visit'
	import { h } from 'hastscript'
	import { fromMarkdown } from 'mdast-util-from-markdown'
	import { toHast } from 'mdast-util-to-hast'
	import type { Root } from 'mdast'
	import { header } from './code-header'
	import findPage from '@/frame/lib/find-page'

	/**
	 * Shape of one entry loaded from `./data/code-languages.yml`, as far as this
	 * plugin is concerned.
	 */
	interface LanguageConfig {
	  /** Names the single-line comment style; used as the key into `commentRegexes`. */
	  comment: 'number' | 'slash' | 'xml' | 'percent' | 'hyphen'
	  // Any other fields of the entry are not read in this file.
	  [key: string]: any
	}

	/** Info-string metadata attached to a code node; `annotate` opts the block in. */
	interface ElementMeta {
	  annotate?: boolean
	  [key: string]: any
	}

	/** Minimal view of a hast element, covering only what this plugin touches. */
	interface ElementNode {
	  type: 'element'
	  tagName: string
	  properties: { className?: string[]; [key: string]: any }
	  children: any[]
	  data?: { meta?: ElementMeta }
	}

	// Language table, read synchronously from disk once at module load.
	// The path is relative, so it resolves against the process working directory.
	type LanguageTable = Record<string, LanguageConfig>
	const languagesYaml = fs.readFileSync('./data/code-languages.yml', 'utf8')
	const languages = yaml.load(languagesYaml) as LanguageTable

	// Single-line comment matchers, keyed by the `comment` value of a language
	// entry. Each pattern is anchored at the start of the line and includes any
	// amount of whitespace before and after the comment tag, so the same regex
	// is used both to detect a note line (`test`) and to strip the tag from it
	// (`replace`).
	const commentRegexes = {
	  // Also known as hash or sharp; but the Unicode name is "number sign".
	  // The reason this has 2 variants is because the hash is used, in bash,
	  // both for the hash-bang and for comments.
	  // For example:
	  //
	  //   #!/bin/bash
	  //
	  // ...is not a comment.
	  // But if you only look for `#` followed by anything-but `!` it will not
	  // match if the line is just `#`.
	  //
	  //   > /^\s*#[^!]\s*/.test('#')
	  //   false
	  //
	  // Which makes sense, because the `#` is not followed by anything.
	  // That's why we use the | operator to make an "exception" for that case.
	  // Note that `[^!]` consumes the character right after the `#`, so notes
	  // are expected to have a space after the tag.
	  number: /^\s*#[^!]\s*|^\s*#$/,
	  slash: /^\s*\/\/\s*/,
	  xml: /^\s*<!--\s*/,
	  // Accepts both `%` and `%%`.
	  percent: /^\s*%%?\s*/,
	  hyphen: /^\s*--\s*/,
	}

	// Test handed to `visit`: accepts only `pre` elements whose code child
	// carries a truthy `annotate` flag in its info-string metadata.
	// Using 'any' for node because unist-util-visit requires broad type compatibility
	const matcher = (node: any): node is ElementNode => {
	  if (node.type !== 'element') return false
	  if (node.tagName !== 'pre') return false
	  return Boolean(getPreMeta(node).annotate)
	}

	/**
	 * Plugin entry point. Returns a transformer that replaces every `pre`
	 * element accepted by `matcher` with the annotated markup built by
	 * `createAnnotatedNode`.
	 *
	 * @param context Passed through unchanged to `createAnnotatedNode`.
	 * @returns A transformer that mutates the tree in place.
	 */
	// Using 'any' for context because unified plugins receive different context types depending on processor configuration
	export default function annotate(context: any) {
	  // Using 'any' for tree because unified's AST types are complex and vary between processors
	  return (tree: any) => {
	    // Using 'any' for parent because unist-util-visit's callback typing doesn't provide specific parent types
	    visit(tree, matcher, (node: ElementNode, index: number | undefined, parent: any) => {
	      // Without a parent and an index there is no slot to write the replacement into.
	      if (index === undefined || !parent) return
	      parent.children[index] = createAnnotatedNode(node, context)
	    })
	  }
	}

	/**
	 * Builds the annotated replacement for one `pre` element.
	 *
	 * The language is taken from the first class name of the `code` child and
	 * the source from its first text child. Non-blank lines are grouped into
	 * alternating runs of notes and code, and the runs are paired into rows.
	 *
	 * @param node The matched `pre` element.
	 * @param context Forwarded to `template` for rendering the notes.
	 * @returns The hast tree produced by `template`.
	 * @throws Error when `validate` rejects the input, or when a row is missing
	 *   either its note or its code.
	 */
	// Using 'any' for context to match the plugin signature, and return type because hastscript returns complex hast types
	function createAnnotatedNode(node: ElementNode, context: any): any {
	  // Fall back to empty strings on a malformed node so that `validate` reports
	  // a descriptive error instead of a bare TypeError surfacing here.
	  const codeElement = node.children?.[0]
	  const lang: string = codeElement?.properties?.className?.[0]?.replace('language-', '') ?? ''
	  const code: string = codeElement?.children?.[0]?.value ?? ''

	  // Check the code is parse-able
	  validate(lang, code)

	  // Group into code and notes
	  const lines = code.split('\n').filter(hasChar)
	  const groups = chunkBy(lines, matchComment(lang))

	  // Group groups into rows: [note lines, code lines]
	  const rows = chunk(groups, 2)

	  // Check the rows are formatted correctly; a trailing note without code
	  // yields a row with only one member.
	  for (const [note, codeBlock] of rows) {
	    if (note === undefined || codeBlock === undefined) {
	      throw new Error(
	        "Each annotation must have a note and a code block. If you're trying to create a blank annotation, you can use a single line comment with a space after it.",
	      )
	    }
	  }

	  // Render the HTML
	  return template({ lang, code, rows, context })
	}

	/**
	 * Checks that an annotated code block can be parsed.
	 *
	 * @param lang Language name taken from the code fence.
	 * @param code Raw text of the code block.
	 * @throws Error when the language is missing or not listed in `languages`,
	 *   or when the block does not begin with a single-line comment.
	 */
	function validate(lang: string, code: string): void {
	  if (!lang) {
	    throw new Error('No language specified for annotate info string.')
	  }
	  if (!languages[lang]) {
	    throw new Error(
	      `Unsupported language for annotate info string. Please use one of: ${Object.keys(
	        languages,
	      )}.`,
	    )
	  }

	  // Blank lines are discarded before grouping, so the first line that counts
	  // is the first non-blank one.
	  const firstLine = code.split('\n').find(hasChar) ?? ''
	  if (!getRegexp(lang).test(firstLine)) {
	    throw new Error(
	      `Make sure the annotated code example starts with a single line annotation. It's currently starting with: ${firstLine}`,
	    )
	  }

	  // NOTE(review): with start-anchored patterns this looks implied by the
	  // check above; kept as a safeguard.
	  if (!new RegExp(getRegexp(lang), 'm').test(code)) {
	    throw new Error(
	      'Make sure the comment syntax matches the language. Use single-line comments only.',
	    )
	  }
	}

	/** Looks up the comment pattern for `lang` via the language's `comment` style. */
	function getRegexp(lang: string): RegExp {
	  const style = languages[lang].comment
	  return commentRegexes[style]
	}

	/** True when the line contains at least one non-whitespace character. */
	function hasChar(line: string): boolean {
	  const trimmed = line.trim()
	  return trimmed.length > 0
	}

	/**
	 * Splits `arr` into consecutive runs, starting a new run every time the
	 * result of `predicate` flips between neighbouring items.
	 *
	 * @param arr Items to group, in order.
	 * @param predicate Classifier; evaluated exactly once per item.
	 * @returns The runs in original order. An empty input yields `[[]]`.
	 */
	function chunkBy(arr: string[], predicate: (item: string) => boolean): string[][] {
	  let current: string[] = []
	  const groups: string[][] = [current]
	  // Classification of the previous item; undefined until the first item is seen.
	  let on: boolean | undefined
	  for (const item of arr) {
	    const matches = predicate(item)
	    if (on !== undefined && matches !== on) {
	      current = []
	      groups.push(current)
	    }
	    on = matches
	    current.push(item)
	  }
	  return groups
	}

	/** Returns a predicate telling whether a line matches the comment pattern of `lang`. */
	function matchComment(lang: string): (line: string) => boolean {
	  const commentPattern = getRegexp(lang)
	  return function isCommentLine(line: string): boolean {
	    return commentPattern.test(line)
	  }
	}

	/**
	 * Builds the `annotate-toggle` element holding the two display-mode buttons,
	 * "Beside" and "Inline".
	 */
	// Using 'any' return type because hastscript's h() function returns complex hast element types
	function getSubnav(): any {
	  // Both buttons share every property except `value` and their label.
	  const displayOption = (value: string, label: string) =>
	    h(
	      'button',
	      { name: 'annotate-display', value, type: 'button', className: 'annotate-option' },
	      [label],
	    )

	  const options = [displayOption('beside', 'Beside'), displayOption('inline', 'Inline')]
	  return h('div', { className: 'annotate-toggle' }, options)
	}

	/**
	 * Renders the full annotated component: a header, the "beside" view with one
	 * row per note/code pair, and the "inline" view holding the untouched code.
	 *
	 * @param lang Language name, used for the `language-*` class and the header.
	 * @param code Complete original code, shown in the inline view.
	 * @param rows Pairs of `[note lines, code lines]`.
	 * @param context Forwarded to `mdToHast` when rendering notes.
	 */
	// Using 'any' for context and return type due to hastscript's complex type definitions
	function template({
	  lang,
	  code,
	  rows,
	  context,
	}: {
	  lang: string
	  code: string
	  rows: string[][][]
	  context: any
	}): any {
	  const stripTag = removeComment(lang)

	  // pre > code matches the mdast -> hast tree of a regular fenced code block.
	  const fence = (text: string) => h('pre', h('code', { className: `language-${lang}` }, text))

	  const renderRow = ([note, codeBlock]: string[][]) => {
	    const codeCell = h('div', { className: 'annotate-code' }, fence(codeBlock.join('\n')))
	    const noteCell = h(
	      'div',
	      { className: 'annotate-note' },
	      mdToHast(note.map(stripTag).join('\n'), context),
	    )
	    return h('div', { className: 'annotate-row' }, [codeCell, noteCell])
	  }

	  const headerEl = h('div', { className: 'annotate-header' }, header(lang, code, getSubnav()))
	  const besideEl = h('div', { className: 'annotate-beside' }, rows.map(renderRow))
	  const inlineEl = h('div', { className: 'annotate-inline' }, [fence(code)])

	  return h('div', { class: 'annotate beside' }, headerEl, besideEl, inlineEl)
	}

	/**
	 * Converts note text from Markdown to a hast tree, resolving AUTOTITLE links
	 * on the intermediate mdast tree first.
	 */
	// Using 'any' for context and return type to maintain compatibility with mdast-util-to-hast complex types
	function mdToHast(text: string, context: any): any {
	  const noteTree: Root = fromMarkdown(text)
	  // Mutates the tree in place, so it must run before the hast conversion.
	  processAutotitleInMdast(noteTree, context)
	  const converted = toHast(noteTree)
	  return converted
	}

	/**
	 * Replaces `AUTOTITLE` link text with the raw title of the linked page.
	 *
	 * Only links whose URL starts with `/` are considered, and only text
	 * children consisting of `AUTOTITLE` with optional surrounding whitespace
	 * are rewritten. When the page cannot be found, or the lookup throws, the
	 * text is left as it is.
	 *
	 * This can be reused for other MDAST processing that needs AUTOTITLE support.
	 *
	 * @param mdast Tree to update in place.
	 * @param context Source of the `pages` and `redirects` passed to `findPage`.
	 */
	// Using 'any' for context because it may or may not have pages/redirects properties depending on usage
	function processAutotitleInMdast(mdast: Root, context: any): void {
	  const autotitlePattern = /^\s*AUTOTITLE\s*$/

	  visit(mdast, 'link', (node) => {
	    if (!node.url || !node.url.startsWith('/')) return

	    for (const child of node.children) {
	      if (child.type !== 'text' || !autotitlePattern.test(child.value)) continue

	      try {
	        // The lookup sits inside the try so that a failing lookup also takes
	        // the warning path instead of aborting the whole render.
	        const page = findPage(node.url, context.pages, context.redirects)
	        if (page) {
	          // Use rawTitle for synchronous processing in annotations
	          child.value = page.rawTitle || 'AUTOTITLE'
	        }
	      } catch (error) {
	        // Keep AUTOTITLE if we can't get the title
	        console.warn(
	          `Could not resolve AUTOTITLE for ${node.url}:`,
	          error instanceof Error ? error.message : String(error),
	        )
	      }
	    }
	  })
	}

	/** Returns a function that strips the first comment-pattern match of `lang` from a line. */
	function removeComment(lang: string): (line: string) => string {
	  const commentPattern = getRegexp(lang)
	  return function stripCommentTag(line: string): string {
	    return line.replace(commentPattern, '')
	  }
	}

	/**
	 * Reads the info-string metadata stored on the first child of a `pre` element.
	 *
	 * @param node The `pre` element to inspect.
	 * @returns The `data.meta` object of the first child, or an empty object
	 *   when any link in that chain is missing.
	 */
	function getPreMeta(node: ElementNode): { annotate?: boolean; [key: string]: any } {
	  // Here's why this monstrosity works:
	  // https://github.com/syntax-tree/mdast-util-to-hast/blob/c87cd606731c88a27dbce4bfeaab913a9589bf83/lib/handlers/code.js#L40-L42
	  return node.children?.[0]?.data?.meta ?? {}
	}