Spaces:
Paused
Paused
| import koffi, { KoffiFunction } from "koffi"; | |
| import { join } from "path"; | |
| import { stat } from "fs/promises"; | |
| import { platform } from "os"; | |
| // TODO: add a timeout to the Rust transformer | |
/**
 * File names of the compiled html-transformer library on platforms that do
 * not use the `lib<name>.so` convention. Any platform missing from this table
 * falls back to `libhtml_transformer.so`.
 */
const rustLibraryFileNames: Record<string, string> = {
  darwin: "libhtml_transformer.dylib",
  // Rust cdylib artifacts on Windows carry no "lib" prefix.
  win32: "html_transformer.dll",
};

/**
 * Absolute path of the html-transformer shared library, resolved against the
 * current working directory of the process (not the location of this file).
 */
const rustExecutablePath = join(
  process.cwd(),
  "sharedLibs/html-transformer/target/release/",
  rustLibraryFileNames[platform()] ?? "libhtml_transformer.so",
);
/**
 * Options accepted by `transformHtml`. The whole object is serialised with
 * `JSON.stringify` and handed to the native `transform_html` function, so the
 * snake_case keys are part of the wire format and must not be renamed.
 */
type TransformHtmlOptions = {
  /** HTML markup to transform. */
  html: string;
  /** URL associated with the markup (presumably the page it came from — confirm against the Rust side). */
  url: string;
  /** Tag filters to include (exact matching rules are defined by the native library). */
  include_tags: string[];
  /** Tag filters to exclude (exact matching rules are defined by the native library). */
  exclude_tags: string[];
  /** Flag forwarded to the native library; presumably restricts output to the page's main content. */
  only_main_content: boolean;
};
/**
 * Singleton binding to the native html-transformer shared library, loaded
 * through koffi.
 *
 * Every public method passes one string to a native function using koffi's
 * asynchronous call form and exposes the outcome as a Promise. Native return
 * values are declared with a koffi disposable string type whose disposer is
 * the library's own `free_string`.
 */
class RustHTMLTransformer {
  /** Sentinel string the native side returns instead of a result when it fails. */
  private static readonly ERROR_SENTINEL = "RUSTFC:ERROR";

  private static instance: RustHTMLTransformer | undefined;

  private readonly _extractLinks: KoffiFunction;
  private readonly _extractMetadata: KoffiFunction;
  private readonly _transformHtml: KoffiFunction;
  private readonly _freeString: KoffiFunction;
  private readonly _getInnerJSON: KoffiFunction;

  /** Loads the shared library and binds its exported functions. Use `getInstance()`. */
  private constructor() {
    const lib = koffi.load(rustExecutablePath);
    this._freeString = lib.func("free_string", "void", ["string"]);

    // The type name gets a random suffix so that constructing the class more
    // than once never registers the same koffi type name twice.
    const disposableTypeName = "CString:" + crypto.randomUUID();
    const freedResultString = koffi.disposable(disposableTypeName, "string", this._freeString);

    this._extractLinks = lib.func("extract_links", freedResultString, ["string"]);
    this._extractMetadata = lib.func("extract_metadata", freedResultString, ["string"]);
    this._transformHtml = lib.func("transform_html", freedResultString, ["string"]);
    this._getInnerJSON = lib.func("get_inner_json", freedResultString, ["string"]);
  }

  /**
   * Returns the shared instance, creating it on first use.
   *
   * @throws Error if the shared library file cannot be stat'ed at the expected path.
   */
  public static async getInstance(): Promise<RustHTMLTransformer> {
    if (RustHTMLTransformer.instance) {
      return RustHTMLTransformer.instance;
    }

    try {
      await stat(rustExecutablePath);
    } catch {
      throw new Error("Rust html-transformer shared library not found");
    }

    const created = new RustHTMLTransformer();
    RustHTMLTransformer.instance = created;
    return created;
  }

  /**
   * Calls a bound native function asynchronously with a single string argument.
   *
   * Rejects with koffi's error when the call itself fails, and with a generic
   * Error when the native side answers with the error sentinel. Keeping this
   * in one place means every entry point treats the sentinel the same way.
   */
  private invoke(fn: KoffiFunction, input: string): Promise<string> {
    return new Promise<string>((resolve, reject) => {
      fn.async(input, (err: Error, res: string) => {
        if (err) {
          reject(err);
          return;
        }
        if (res === RustHTMLTransformer.ERROR_SENTINEL) {
          reject(new Error("Something went wrong on the Rust side."));
          return;
        }
        resolve(res);
      });
    });
  }

  /**
   * Runs the native `extract_links` function on `html`.
   *
   * @returns the JSON-decoded native result.
   * @throws (as a rejection) if the native call fails or its output is not valid JSON.
   */
  public async extractLinks(html: string): Promise<string[]> {
    const raw = await this.invoke(this._extractLinks, html);
    // Parsing here rather than inside the koffi callback turns a malformed
    // payload into a rejected promise instead of an uncaught exception.
    return JSON.parse(raw);
  }

  /**
   * Runs the native `extract_metadata` function on `html`.
   *
   * @returns the JSON-decoded native result; its shape is defined by the native library.
   * @throws (as a rejection) if the native call fails or its output is not valid JSON.
   */
  public async extractMetadata(html: string): Promise<any> {
    const raw = await this.invoke(this._extractMetadata, html);
    // See extractLinks: parse outside the callback so failures reject.
    return JSON.parse(raw);
  }

  /**
   * Serialises `opts` to JSON and runs the native `transform_html` function on it.
   *
   * @returns the string produced by the native library, unmodified.
   * @throws (as a rejection) if the native call fails or reports an error.
   */
  public async transformHtml(opts: TransformHtmlOptions): Promise<string> {
    return this.invoke(this._transformHtml, JSON.stringify(opts));
  }

  /**
   * Runs the native `get_inner_json` function on `html`.
   *
   * @returns the string produced by the native library, unmodified (not parsed here).
   * @throws (as a rejection) if the native call fails or reports an error.
   */
  public async getInnerJSON(html: string): Promise<string> {
    return this.invoke(this._getInnerJSON, html);
  }
}
/**
 * Extracts links from an HTML document using the native transformer.
 *
 * @param html markup to scan; `null`, `undefined` and `""` short-circuit to an
 *   empty array without touching the native library.
 * @returns the list of links decoded from the native library's JSON output.
 */
export async function extractLinks(
  html: string | null | undefined,
): Promise<string[]> {
  if (!html) return [];

  const transformer = await RustHTMLTransformer.getInstance();
  const links = await transformer.extractLinks(html);
  return links;
}
/**
 * Extracts metadata from an HTML document using the native transformer.
 *
 * @param html markup to inspect; `null`, `undefined` and `""` short-circuit
 *   without touching the native library.
 * @returns the value decoded from the native library's JSON output, or an
 *   empty array when no markup was supplied.
 */
export async function extractMetadata(
  html: string | null | undefined,
): Promise<any> {
  if (!html) return [];

  const transformer = await RustHTMLTransformer.getInstance();
  const metadata = await transformer.extractMetadata(html);
  return metadata;
}
/**
 * Transforms HTML with the native transformer according to `opts`.
 *
 * @param opts markup plus filtering options; forwarded to the native library as JSON.
 * @returns the string produced by the native library.
 */
export async function transformHtml(
  opts: TransformHtmlOptions,
): Promise<string> {
  const transformer = await RustHTMLTransformer.getInstance();
  const transformed = await transformer.transformHtml(opts);
  return transformed;
}
/**
 * Runs the native `get_inner_json` routine on the given markup.
 *
 * @param html markup handed to the native library as-is.
 * @returns the string produced by the native library (returned unparsed).
 */
export async function getInnerJSON(
  html: string,
): Promise<string> {
  const transformer = await RustHTMLTransformer.getInstance();
  const innerJson = await transformer.getInnerJSON(html);
  return innerJson;
}