// Source: carbon-tokenization/frontend/src/editor/extensions/code-block-shiki.tsx (11 kB)
// Commit bf2abd0 by tfrere (HF Staff):
//   fix(highlight): auto-detect language for code blocks without explicit lang
import { findChildren } from "@tiptap/core";
import { CodeBlock, type CodeBlockOptions } from "@tiptap/extension-code-block";
import { Plugin, PluginKey } from "@tiptap/pm/state";
import { Decoration, DecorationSet } from "@tiptap/pm/view";
import type { Node as ProsemirrorNode } from "@tiptap/pm/model";
import type { EditorView } from "@tiptap/pm/view";
import type { Element as HastElement, ElementContent, Root as HastRoot } from "hast";
import {
getSharedHighlighter,
isSupportedLang,
normalizeLang,
SHIKI_THEMES,
type ShikiHighlighter,
} from "#shared/shiki-config";
import { detectShikiLang } from "#shared/detect-lang";
/**
* TipTap code block with Shiki-powered syntax highlighting.
*
* Architecture mirrors `@tiptap/extension-code-block-lowlight`:
* 1. Extends the base `CodeBlock` node (input rules, keymap, etc.).
* 2. Registers a ProseMirror plugin that produces a `DecorationSet` from
* a Shiki `codeToHast()` call for every `codeBlock` in the document.
* 3. Token coloring uses `Decoration.inline` with `style`/`class` (rendered
* by PM as wrapping spans). Line numbers use `Decoration.widget`: one
* `<span class="code-line-num">` inserted at each source line start.
*
* Why widgets for numbers (not CSS counters): PM merges overlapping inline
* decorations into a single flat span per text range, so an outer `.line`
* decoration would NOT produce a nesting parent usable by `counter-increment`.
* A widget at line start is a standalone DOM node, which the same CSS can
* style identically in the editor and in the published output (the publisher
* transformer injects the exact same `<span class="code-line-num">` markup).
*
* The language label (e.g. "python") is rendered by pure CSS using
* `pre::after { content: attr(data-lang) }` - no extra DOM.
*
* Because Shiki is async (WASM engine + grammars), token decorations stay
* empty until the shared highlighter resolves; line-number widgets are emitted
* right away. A view-level hook then dispatches a refresh transaction, causing
* the whole doc to re-decorate at once.
*/
/**
* Note on `walkLineTokens` (defined below, after `readHastClasses`): it walks
* Shiki's token tree inside a single `<span class="line">` and pushes one
* `Decoration.inline` per non-empty leaf text node that inherits at least one
* `style` or `class` from its ancestors. The `line` class itself is stripped
* from class propagation because it only marks Shiki's line wrapper, not the
* tokens.
*/
/**
 * Collects the CSS class names declared on a hast element.
 *
 * Shiki emits a plain `class` string rather than the JSX-style `className`
 * array, but both properties are consulted (`class` first) so the helper
 * keeps working if upstream changes. Arrays are stringified entry by entry,
 * strings are split on whitespace, and empty names are dropped. Any other
 * value yields an empty list.
 *
 * @param node - The hast element to inspect.
 * @returns The non-empty class names, in declaration order.
 */
function readHastClasses(node: HastElement): string[] {
  const properties = node.properties ?? {};
  const declared = properties.class ?? properties.className;

  let names: string[];
  if (Array.isArray(declared)) {
    names = declared.map((entry) => String(entry));
  } else if (typeof declared === "string") {
    names = declared.split(/\s+/);
  } else {
    return [];
  }
  return names.filter((name) => name.length > 0);
}
/**
 * Recursively converts the hast children of one Shiki line into inline
 * decorations.
 *
 * @param nodes - Children to visit (text nodes and nested elements).
 * @param offset - Document position of the first character in `nodes`.
 * @param out - Array that receives the generated decorations (mutated).
 * @param styleStack - `style` strings inherited from enclosing elements.
 * @param classStack - Class names inherited from enclosing elements.
 * @returns The document position just past the last visited character.
 */
function walkLineTokens(
  nodes: readonly ElementContent[],
  offset: number,
  out: Decoration[],
  styleStack: readonly string[] = [],
  classStack: readonly string[] = [],
): number {
  // The stacks are fixed for the duration of one call, so decide once
  // whether text at this nesting level carries any styling at all.
  const hasStyling = styleStack.length > 0 || classStack.length > 0;
  let cursor = offset;

  for (const node of nodes) {
    switch (node.type) {
      case "text": {
        const end = cursor + node.value.length;
        if (end > cursor && hasStyling) {
          const attrs: Record<string, string> = {};
          if (styleStack.length > 0) attrs.style = styleStack.join(";");
          if (classStack.length > 0) attrs.class = classStack.join(" ");
          out.push(Decoration.inline(cursor, end, attrs));
        }
        cursor = end;
        break;
      }
      case "element": {
        const styleProp = node.properties?.style;
        const childStyles =
          typeof styleProp === "string" && styleProp !== "" ? styleStack.concat(styleProp) : styleStack;
        // `line` only marks Shiki's per-line wrapper; it must not reach tokens.
        const ownClasses = readHastClasses(node).filter((cls) => cls !== "line");
        const childClasses = ownClasses.length > 0 ? classStack.concat(ownClasses) : classStack;
        cursor = walkLineTokens(node.children, cursor, out, childStyles, childClasses);
        break;
      }
      default:
        // Other node kinds contribute no characters.
        break;
    }
  }
  return cursor;
}
/**
 * Returns the children of the `<code>` element nested in the first `<pre>`
 * of a Shiki hast root.
 *
 * The `<pre>` and `<code>` wrappers themselves are skipped so that their
 * styles do not leak onto tokens. Only the first `<pre>` is considered; if
 * it is missing, or has no `<code>` child, an empty list is returned.
 *
 * @param hast - Root node produced by Shiki.
 * @returns The `<code>` element's children, or `[]` when not found.
 */
function getCodeChildren(hast: HastRoot): readonly ElementContent[] {
  for (const outer of hast.children) {
    if (outer.type !== "element" || outer.tagName !== "pre") continue;
    for (const inner of outer.children) {
      if (inner.type === "element" && inner.tagName === "code") {
        return inner.children;
      }
    }
    // First <pre> found but it has no <code>: do not look at later <pre>s.
    return [];
  }
  return [];
}
/**
 * Type guard: true when `node` is a hast element whose class list contains
 * `line`, i.e. one of Shiki's per-line wrapper spans.
 */
function isLineElement(node: ElementContent): node is HastElement {
  return node.type === "element" && readHastClasses(node).includes("line");
}
/**
 * Creates the gutter element for one source line: a non-editable,
 * `aria-hidden` `<span class="code-line-num">` whose text is the line number.
 *
 * @param n - Line number to display.
 * @returns The freshly created span (not attached to the document).
 */
function buildLineNumberWidget(n: number): HTMLSpanElement {
  const gutterCell = document.createElement("span");
  gutterCell.className = "code-line-num";
  gutterCell.textContent = `${n}`;

  const attributes: ReadonlyArray<readonly [string, string]> = [
    ["contenteditable", "false"],
    ["aria-hidden", "true"],
  ];
  for (const [attrName, attrValue] of attributes) {
    gutterCell.setAttribute(attrName, attrValue);
  }
  return gutterCell;
}
/** Inputs consumed by `buildDecorations` for one full decoration pass. */
interface DecorationBuildCtx {
// Document that is searched for code-block nodes and that the resulting
// `DecorationSet` is created against.
doc: ProsemirrorNode;
// Node type name used to recognise code blocks (`node.type.name === name`).
name: string;
// Shiki highlighter; while `null`, only line-number widgets are produced.
highlighter: ShikiHighlighter | null;
// Language used when a block's `language` attribute is empty, or `null`.
defaultLanguage: string | null;
}
/**
 * Appends the gutter widget for one source line of a code block.
 *
 * `lineNumber` is a parameter, so every widget's render callback closes over
 * its own value rather than a shared loop counter.
 */
function pushLineNumberWidget(
  out: Decoration[],
  blockPos: number,
  widgetPos: number,
  lineNumber: number,
): void {
  out.push(
    Decoration.widget(widgetPos, () => buildLineNumberWidget(lineNumber), {
      side: -1,
      key: `cln-${blockPos}-${lineNumber}`,
      ignoreSelection: true,
    }),
  );
}

/**
 * Builds the complete `DecorationSet` for every code block in `doc`.
 *
 * For each block it always emits one line-number widget per source line,
 * even while `highlighter` is still `null`, so the gutter is present before
 * Shiki is ready. When a highlighter is available and the block is non-empty
 * it also emits one inline decoration per styled token.
 *
 * The language is the block's `language` attribute, else `defaultLanguage`,
 * normalised through `normalizeLang`. If that yields nothing, the language is
 * auto-detected from the text. Unsupported languages are highlighted as
 * `"text"`. A block whose `codeToHast` call throws keeps its line numbers but
 * gets no token decorations.
 *
 * @returns A decoration set created against `doc`.
 */
function buildDecorations({ doc, name, highlighter, defaultLanguage }: DecorationBuildCtx): DecorationSet {
  const decorations: Decoration[] = [];

  for (const { node, pos } of findChildren(doc, (candidate) => candidate.type.name === name)) {
    const source = node.textContent;
    // Position of the first character inside the block (just past its open token).
    const contentStart = pos + 1;

    // Line 1 always exists; every "\n" then starts a new line right after it.
    pushLineNumberWidget(decorations, pos, contentStart, 1);
    let lineCount = 1;
    let newlineAt = source.indexOf("\n");
    while (newlineAt !== -1) {
      lineCount += 1;
      pushLineNumberWidget(decorations, pos, contentStart + newlineAt + 1, lineCount);
      newlineAt = source.indexOf("\n", newlineAt + 1);
    }

    if (!highlighter || !source) continue;

    // Auto-detect when no explicit language is set, so language-less blocks
    // get highlighted in the editor just like in the published output.
    const declared = node.attrs.language || defaultLanguage || "";
    const lang = normalizeLang(declared) || detectShikiLang(source);

    let hast: HastRoot;
    try {
      hast = highlighter.codeToHast(source, {
        lang: isSupportedLang(lang) ? lang : "text",
        themes: SHIKI_THEMES,
        defaultColor: false,
      }) as HastRoot;
    } catch {
      continue;
    }

    // Bare text nodes between line wrappers only advance the cursor; line
    // elements are walked for tokens; anything else is skipped.
    let cursor = contentStart;
    for (const child of getCodeChildren(hast)) {
      if (child.type === "text") {
        cursor += child.value.length;
      } else if (isLineElement(child)) {
        cursor = walkLineTokens(child.children, cursor, decorations);
      }
    }
  }

  return DecorationSet.create(doc, decorations);
}
/**
 * Key of the Shiki plugin. It identifies the plugin state (a `DecorationSet`)
 * and doubles as the transaction meta key carrying `{ refresh: true }`.
 */
const shikiPluginKey = new PluginKey<DecorationSet>("shiki");
/**
 * Creates the ProseMirror plugin that owns the Shiki decoration set.
 *
 * State handling:
 * - `init` builds decorations for the initial document. The highlighter is
 *   still `null` at that point, so only line numbers are produced.
 * - `apply` rebuilds everything when the transaction carries
 *   `{ refresh: true }` under `shikiPluginKey`. It also rebuilds when the
 *   document changed and either the selection head is (or was) inside a code
 *   block, the number of code blocks changed, or a step's `from`/`to` range
 *   fully covers a previously existing code block. Otherwise the existing
 *   set is mapped through the transaction.
 *
 * View hook: requests the shared highlighter and, once it resolves,
 * dispatches a refresh transaction unless the view was destroyed. A failed
 * load is reported with `console.warn` instead of surfacing as an unhandled
 * promise rejection; code blocks then keep their line numbers and stay
 * unhighlighted.
 *
 * @param name - Node type name of the code block.
 * @param defaultLanguage - Language for blocks without one, or `null`.
 * @returns The configured plugin.
 */
function createShikiPlugin(name: string, defaultLanguage: string | null): Plugin<DecorationSet> {
  let highlighter: ShikiHighlighter | null = null;

  const build = (doc: ProsemirrorNode): DecorationSet =>
    buildDecorations({ doc, name, highlighter, defaultLanguage });
  const isCodeBlock = (node: ProsemirrorNode): boolean => node.type.name === name;

  return new Plugin<DecorationSet>({
    key: shikiPluginKey,
    state: {
      init: (_cfg, { doc }) => build(doc),
      apply(tr, decorationSet, oldState, newState) {
        if (tr.getMeta(shikiPluginKey)?.refresh) return build(tr.doc);

        // Selection-only transactions can never require a rebuild, so return
        // before paying for two full document traversals.
        if (!tr.docChanged) return decorationSet.map(tr.mapping, tr.doc);

        const selectionTouchesBlock =
          oldState.selection.$head.parent.type.name === name ||
          newState.selection.$head.parent.type.name === name;
        if (selectionTouchesBlock) return build(tr.doc);

        const oldNodes = findChildren(oldState.doc, isCodeBlock);
        const newNodes = findChildren(newState.doc, isCodeBlock);
        const needsRebuild =
          newNodes.length !== oldNodes.length ||
          tr.steps.some((step) => {
            // Not every step type exposes a range; only those with both
            // `from` and `to` can be compared against block boundaries.
            const s = step as unknown as { from?: number; to?: number };
            return (
              s.from !== undefined &&
              s.to !== undefined &&
              oldNodes.some((n) => n.pos >= (s.from as number) && n.pos + n.node.nodeSize <= (s.to as number))
            );
          });

        return needsRebuild ? build(tr.doc) : decorationSet.map(tr.mapping, tr.doc);
      },
    },
    props: {
      decorations(state) {
        return shikiPluginKey.getState(state);
      },
    },
    view(view: EditorView) {
      let cancelled = false;

      // Two-argument `then`: the rejection handler covers only the highlighter
      // load, so an error thrown by `dispatch` is not mislabelled or swallowed.
      getSharedHighlighter().then(
        (h) => {
          if (cancelled) return;
          highlighter = h;
          if (!view.isDestroyed) {
            view.dispatch(view.state.tr.setMeta(shikiPluginKey, { refresh: true }));
          }
        },
        (error: unknown) => {
          if (cancelled) return;
          console.warn("[code-block-shiki] failed to load the Shiki highlighter; code blocks stay unhighlighted", error);
        },
      );

      return {
        destroy() {
          cancelled = true;
        },
      };
    },
  });
}
/**
 * TipTap `CodeBlock` extension with Shiki highlighting.
 *
 * - `addProseMirrorPlugins` keeps the parent's plugins and appends the Shiki
 *   decoration plugin, configured with this node's name and default language.
 * - `addNodeView` renders `<pre class="shiki"><code>…</code></pre>`. The
 *   resolved language is exposed as `data-lang` on the `<pre>` and as a
 *   `language-*` class on the `<code>`; both are cleared when no language
 *   can be resolved.
 *
 * The node view resolves the language with the same rule as the decoration
 * builder: the `language` attribute, else the `defaultLanguage` option,
 * normalised, then auto-detection from the block text. This keeps the label
 * in step with the highlighting that is actually applied.
 */
export const CodeBlockShiki = CodeBlock.extend<CodeBlockOptions>({
  addProseMirrorPlugins() {
    const parentPlugins = this.parent?.() || [];
    return [
      ...parentPlugins,
      createShikiPlugin(this.name, this.options.defaultLanguage ?? null),
    ];
  },

  addNodeView() {
    // Captured once so the node view follows the extension's actual name and
    // options instead of hard-coded values.
    const nodeName = this.name;
    const defaultLanguage = this.options.defaultLanguage ?? null;

    return ({ node, HTMLAttributes }) => {
      const pre = document.createElement("pre");
      pre.classList.add("shiki");
      for (const [key, value] of Object.entries(HTMLAttributes)) {
        // Skip absent/false attributes rather than rendering them as text.
        if (value !== undefined && value !== null && value !== false) {
          pre.setAttribute(key, String(value));
        }
      }

      const code = document.createElement("code");
      pre.append(code);

      const applyLang = (n: ProsemirrorNode) => {
        // Same resolution order as the decoration builder, so the label and
        // the highlighting cannot disagree.
        const raw = (n.attrs.language as string | null | undefined) || defaultLanguage || "";
        const resolved = normalizeLang(raw) || detectShikiLang(n.textContent || "");
        if (resolved) {
          pre.dataset.lang = resolved;
          code.className = `language-${resolved}`;
        } else {
          delete pre.dataset.lang;
          code.className = "";
        }
      };
      applyLang(node);

      return {
        dom: pre,
        contentDOM: code,
        update(updatedNode) {
          // Returning false makes the editor rebuild the node view, so only
          // do that when the node is of a different type.
          if (updatedNode.type.name !== nodeName) return false;
          applyLang(updatedNode);
          return true;
        },
      };
    };
  },
});