import koffi, { KoffiFunction } from "koffi";
import { join } from "path";
import { stat } from "fs/promises";
import { platform } from "os";

// TODO: add a timeout to the Rust transformer

/**
 * Location of the compiled html-transformer shared library, resolved against
 * the current working directory. The `.dylib` name is used on macOS and the
 * `.so` name on every other platform.
 */
const rustExecutablePath = join(
  process.cwd(),
  "sharedLibs/html-transformer/target/release/",
  platform() === "darwin" ? "libhtml_transformer.dylib" : "libhtml_transformer.so",
);

/**
 * Options for `transformHtml`. The object is JSON-serialised as-is and handed
 * to the native `transform_html` function, so the snake_case keys are part of
 * the wire format and must not be renamed.
 */
type TransformHtmlOptions = {
  html: string;
  url: string;
  include_tags: string[];
  exclude_tags: string[];
  only_main_content: boolean;
};

/**
 * Lazily-created singleton that binds the native html-transformer functions
 * through koffi and exposes them as promise-returning methods.
 */
class RustHTMLTransformer {
  /** Result string the native side returns to signal a failure. */
  private static readonly ERROR_SENTINEL = "RUSTFC:ERROR";

  private static instance: RustHTMLTransformer | undefined;

  private readonly _extractLinks: KoffiFunction;
  private readonly _extractMetadata: KoffiFunction;
  private readonly _transformHtml: KoffiFunction;
  private readonly _freeString: KoffiFunction;
  private readonly _getInnerJSON: KoffiFunction;

  private constructor() {
    const lib = koffi.load(rustExecutablePath);
    this._freeString = lib.func("free_string", "void", ["string"]);

    // Every native call returns a string declared as a koffi disposable type
    // with `free_string` as its disposer. The type name carries a random
    // suffix so it is unique each time a transformer is constructed.
    const cstn = "CString:" + crypto.randomUUID();
    const freedResultString = koffi.disposable(cstn, "string", this._freeString);

    this._extractLinks = lib.func("extract_links", freedResultString, ["string"]);
    this._extractMetadata = lib.func("extract_metadata", freedResultString, ["string"]);
    this._transformHtml = lib.func("transform_html", freedResultString, ["string"]);
    this._getInnerJSON = lib.func("get_inner_json", freedResultString, ["string"]);
  }

  /**
   * Returns the shared transformer, loading the native library on first use.
   *
   * @throws Error if the shared library file cannot be stat'ed.
   */
  public static async getInstance(): Promise<RustHTMLTransformer> {
    let instance = RustHTMLTransformer.instance;
    if (!instance) {
      try {
        await stat(rustExecutablePath);
      } catch (_) {
        throw new Error("Rust html-transformer shared library not found");
      }
      instance = new RustHTMLTransformer();
      RustHTMLTransformer.instance = instance;
    }
    return instance;
  }

  /**
   * Runs one native function through koffi's asynchronous call interface and
   * adapts its `(err, result)` callback to a promise.
   *
   * Rejects when koffi reports an error or when the native side returns the
   * error sentinel; otherwise resolves with the raw result string.
   */
  private callNative(fn: KoffiFunction, input: string): Promise<string> {
    return new Promise<string>((resolve, reject) => {
      fn.async(input, (err: Error | null, res: string) => {
        if (err) {
          reject(err);
        } else if (res === RustHTMLTransformer.ERROR_SENTINEL) {
          reject(new Error("Something went wrong on the Rust side."));
        } else {
          resolve(res);
        }
      });
    });
  }

  /**
   * Calls the native `extract_links` on `html` and parses its JSON result.
   *
   * Parsing happens here, after the native call has settled, so malformed
   * output rejects the returned promise instead of throwing from inside the
   * koffi callback where nothing could catch it.
   */
  public async extractLinks(html: string): Promise<string[]> {
    const raw = await this.callNative(this._extractLinks, html);
    return JSON.parse(raw);
  }

  /**
   * Calls the native `extract_metadata` on `html` and parses its JSON result.
   * The shape of the parsed value is defined by the native library and is not
   * validated here.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- shape is owned by the native library
  public async extractMetadata(html: string): Promise<any> {
    const raw = await this.callNative(this._extractMetadata, html);
    return JSON.parse(raw);
  }

  /**
   * Serialises `opts` to JSON, passes it to the native `transform_html`, and
   * resolves with the string it returns.
   */
  public async transformHtml(opts: TransformHtmlOptions): Promise<string> {
    return this.callNative(this._transformHtml, JSON.stringify(opts));
  }

  /**
   * Passes `html` to the native `get_inner_json` and resolves with the string
   * it returns (not parsed here).
   */
  public async getInnerJSON(html: string): Promise<string> {
    return this.callNative(this._getInnerJSON, html);
  }
}

/**
 * Extracts links from an HTML document using the native transformer.
 *
 * @param html - Document source; `null`, `undefined` and `""` short-circuit.
 * @returns The parsed result of the native call, or `[]` for empty input.
 * @throws Error if the native library is missing or the native call fails.
 */
export async function extractLinks(
  html: string | null | undefined,
): Promise<string[]> {
  if (!html) {
    return [];
  }

  const converter = await RustHTMLTransformer.getInstance();
  return await converter.extractLinks(html);
}

/**
 * Extracts metadata from an HTML document using the native transformer.
 *
 * @param html - Document source; `null`, `undefined` and `""` short-circuit.
 * @returns The parsed result of the native call. Empty input yields `[]`,
 *   which is kept as-is for backward compatibility with existing callers.
 * @throws Error if the native library is missing or the native call fails.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- shape is owned by the native library
export async function extractMetadata(
  html: string | null | undefined,
): Promise<any> {
  if (!html) {
    return [];
  }

  const converter = await RustHTMLTransformer.getInstance();
  return await converter.extractMetadata(html);
}

/**
 * Runs the native HTML transformation described by `opts`.
 *
 * @returns The string produced by the native `transform_html` function.
 * @throws Error if the native library is missing or the native call fails.
 */
export async function transformHtml(
  opts: TransformHtmlOptions,
): Promise<string> {
  const converter = await RustHTMLTransformer.getInstance();
  return await converter.transformHtml(opts);
}

/**
 * Runs the native `get_inner_json` function on `html`.
 *
 * @returns The string produced by the native call (not parsed here).
 * @throws Error if the native library is missing or the native call fails.
 */
export async function getInnerJSON(
  html: string,
): Promise<string> {
  const converter = await RustHTMLTransformer.getInstance();
  return await converter.getInnerJSON(html);
}