import koffi from "koffi";
import { join } from "path";
import "../services/sentry";
import * as Sentry from "@sentry/node";
import dotenv from "dotenv";
import { logger } from "./logger";
import { stat } from "fs/promises";

dotenv.config();

// TODO: add a timeout to the Go parser
/**
 * Path of the Go HTML-to-Markdown shared library, resolved relative to the
 * process's current working directory.
 */
const goExecutablePath = join(
  process.cwd(),
  "sharedLibs",
  "go-html-to-md",
  "html-to-markdown.so",
);

/**
 * Message of the error thrown when the shared library is missing on disk.
 * `parseMarkdown` compares against it to log that case as a warning instead
 * of reporting it as an exception.
 */
const GO_LIBRARY_NOT_FOUND_MESSAGE = "Go shared library not found";

/** The subset of the bound koffi function that this module relies on. */
interface GoConvertFunction {
  async(html: string, callback: (err: Error, res: string) => void): void;
}

/** The subset of a DOM node that the custom Turndown link rule reads. */
interface InlineLinkNode {
  nodeName: string;
  title?: string;
  getAttribute(name: string): string | null;
}

/** The subset of Turndown options that the custom link rule reads. */
interface InlineLinkOptions {
  linkStyle?: string;
}

/**
 * Lazily-created singleton wrapping the `ConvertHTMLToMarkdown` function
 * exported by the Go shared library.
 */
class GoMarkdownConverter {
  private static instance: GoMarkdownConverter | undefined;
  private readonly convert: GoConvertFunction;

  private constructor() {
    const lib = koffi.load(goExecutablePath);
    // Bound as: string ConvertHTMLToMarkdown(string)
    this.convert = lib.func("ConvertHTMLToMarkdown", "string", ["string"]);
  }

  /**
   * Returns the shared converter, loading the library on first use.
   *
   * @throws Error with message "Go shared library not found" when the
   *   library file cannot be stat'ed.
   */
  public static async getInstance(): Promise<GoMarkdownConverter> {
    let instance = GoMarkdownConverter.instance;
    if (!instance) {
      try {
        await stat(goExecutablePath);
      } catch {
        throw new Error(GO_LIBRARY_NOT_FOUND_MESSAGE);
      }
      instance = new GoMarkdownConverter();
      GoMarkdownConverter.instance = instance;
    }
    return instance;
  }

  /**
   * Converts HTML to Markdown through the Go library.
   *
   * Uses the bound function's callback-based `async` form and adapts it to a
   * Promise, rejecting when the callback reports an error.
   *
   * @param html - HTML source to convert.
   * @returns The Markdown produced by the Go function.
   */
  public async convertHTMLToMarkdown(html: string): Promise<string> {
    return new Promise<string>((resolve, reject) => {
      this.convert.async(html, (err: Error, res: string) => {
        if (err) {
          reject(err);
        } else {
          resolve(res);
        }
      });
    });
  }
}

/**
 * Converts an HTML string to Markdown.
 *
 * When the `USE_GO_MARKDOWN_PARSER` environment variable is exactly `"true"`,
 * the Go shared library is tried first. If it is disabled, missing, or fails,
 * conversion falls back to Turndown with the GFM plugin. Both paths
 * post-process the result with `processMultiLineLinks` and
 * `removeSkipToContentLinks`.
 *
 * @param html - HTML to convert; `null`, `undefined` and `""` yield `""`.
 * @returns The Markdown, or `""` if the fallback conversion also throws.
 */
export async function parseMarkdown(
  html: string | null | undefined,
): Promise<string> {
  if (!html) {
    return "";
  }

  try {
    if (process.env.USE_GO_MARKDOWN_PARSER === "true") {
      const converter = await GoMarkdownConverter.getInstance();
      let markdownContent = await converter.convertHTMLToMarkdown(html);

      markdownContent = processMultiLineLinks(markdownContent);
      markdownContent = removeSkipToContentLinks(markdownContent);
      // logger.info(`HTML to Markdown conversion using Go parser successful`);
      return markdownContent;
    }
  } catch (error) {
    const isMissingLibrary =
      error instanceof Error && error.message === GO_LIBRARY_NOT_FOUND_MESSAGE;

    if (!isMissingLibrary) {
      Sentry.captureException(error);
      logger.error(
        `Error converting HTML to Markdown with Go parser: ${error}`,
      );
    } else {
      // A missing library is not reported to Sentry; it is only logged.
      logger.warn(
        "Tried to use Go parser, but it doesn't exist in the file system.",
        { goExecutablePath },
      );
    }
  }

  // Fallback to TurndownService if Go parser fails or is not enabled.
  // These modules are required here so they are only loaded when the
  // fallback path is actually reached.
  const TurndownService = require("turndown");
  const turndownPluginGfm = require("joplin-turndown-plugin-gfm");

  const turndownService = new TurndownService();
  turndownService.addRule("inlineLink", {
    filter: function (node: InlineLinkNode, options: InlineLinkOptions) {
      return (
        options.linkStyle === "inlined" &&
        node.nodeName === "A" &&
        node.getAttribute("href")
      );
    },
    replacement: function (content: string, node: InlineLinkNode) {
      // The filter only matches anchors with a truthy href; the fallback to
      // "" just keeps this null-safe.
      const href = (node.getAttribute("href") ?? "").trim();
      const title = node.title ? ' "' + node.title + '"' : "";
      // Each link is emitted followed by a newline.
      return "[" + content.trim() + "](" + href + title + ")\n";
    },
  });
  turndownService.use(turndownPluginGfm.gfm);

  try {
    // turndown() is synchronous, so no await is needed.
    let markdownContent: string = turndownService.turndown(html);
    markdownContent = processMultiLineLinks(markdownContent);
    markdownContent = removeSkipToContentLinks(markdownContent);

    return markdownContent;
  } catch (error) {
    logger.error("Error converting HTML to Markdown", { error });
    return ""; // Optionally return an empty string or handle the error as needed
  }
}

/**
 * Inserts a backslash before every newline that occurs while at least one
 * `[` is still unclosed, so link text spanning several lines is marked with
 * backslash-newline sequences.
 *
 * Bracket depth is incremented on `[` and decremented on `]`, never dropping
 * below zero.
 *
 * @param markdownContent - Markdown to process.
 * @returns The Markdown with newlines inside brackets prefixed by `\`.
 */
function processMultiLineLinks(markdownContent: string): string {
  let linkOpenCount = 0;
  let newMarkdownContent = "";

  for (let i = 0; i < markdownContent.length; i++) {
    const char = markdownContent.charAt(i);

    if (char === "[") {
      linkOpenCount++;
    } else if (char === "]") {
      linkOpenCount = Math.max(0, linkOpenCount - 1);
    }

    const insideLinkContent = linkOpenCount > 0;
    if (insideLinkContent && char === "\n") {
      newMarkdownContent += "\\" + "\n";
    } else {
      newMarkdownContent += char;
    }
  }

  return newMarkdownContent;
}

/**
 * Removes "skip to content" anchor links such as `[Skip to Content](#page)`
 * or `[Skip to content](#skip)`. The link text is matched case-insensitively
 * and any `#fragment` target is accepted.
 *
 * @param markdownContent - Markdown to clean.
 * @returns The Markdown with every matching link removed.
 */
function removeSkipToContentLinks(markdownContent: string): string {
  // Remove [Skip to Content](#page) and [Skip to content](#skip)
  return markdownContent.replace(/\[Skip to Content\]\(#[^\)]*\)/gi, "");
}