Fire-crawl / src /lib /html-transformer.ts
Echo-AI-official's picture
Upload 280 files
0e759d2 verified
import koffi, { KoffiFunction } from "koffi";
import { join } from "path";
import { stat } from "fs/promises";
import { platform } from "os";
// TODO: add a timeout to the Rust transformer
/**
 * Location of the compiled Rust html-transformer shared library, resolved
 * against the current working directory of the process. macOS ("darwin")
 * builds use the `.dylib` artifact; every other platform uses the `.so` one.
 */
const rustExecutablePath = (() => {
  const isMac = platform() === "darwin";
  const libraryFile = isMac
    ? "libhtml_transformer.dylib"
    : "libhtml_transformer.so";
  return join(
    process.cwd(),
    "sharedLibs/html-transformer/target/release/",
    libraryFile,
  );
})();
/**
 * Options accepted by `transformHtml`. The whole object is serialized with
 * `JSON.stringify` and handed to the native `transform_html` function, so the
 * snake_case property names are part of the wire format and must not change.
 */
type TransformHtmlOptions = {
  /** Raw HTML document to transform. */
  html: string;
  /** URL associated with the document — presumably used to resolve relative references; confirm on the Rust side. */
  url: string;
  /** Tag filters to keep — exact matching semantics are defined by the Rust side (TODO confirm). */
  include_tags: string[];
  /** Tag filters to drop — exact matching semantics are defined by the Rust side (TODO confirm). */
  exclude_tags: string[];
  /** Flag forwarded to the native transformer; presumably restricts output to the main content (TODO confirm). */
  only_main_content: boolean;
};
/**
 * Lazily-created singleton that binds the functions exported by the Rust
 * html-transformer shared library (loaded through koffi) and exposes them as
 * Promise-returning methods.
 *
 * Every native function takes a single C string and returns a C string that
 * is released through the library's own `free_string` once koffi has decoded
 * it (see the disposable type set up in the constructor).
 */
class RustHTMLTransformer {
  /** Value the native side returns instead of a result when it failed. */
  private static readonly ERROR_SENTINEL = "RUSTFC:ERROR";

  private static instance: RustHTMLTransformer;

  private readonly _extractLinks: KoffiFunction;
  private readonly _extractMetadata: KoffiFunction;
  private readonly _transformHtml: KoffiFunction;
  private readonly _freeString: KoffiFunction;
  private readonly _getInnerJSON: KoffiFunction;

  private constructor() {
    const lib = koffi.load(rustExecutablePath);
    this._freeString = lib.func("free_string", "void", ["string"]);

    // Returned strings are owned by Rust, so they are declared as a disposable
    // type whose disposer is `free_string`. The type name carries a random
    // UUID — presumably so that registering it more than once can never
    // collide with an existing koffi type name.
    const cstn = "CString:" + crypto.randomUUID();
    const freedResultString = koffi.disposable(cstn, "string", this._freeString);

    this._extractLinks = lib.func("extract_links", freedResultString, ["string"]);
    this._extractMetadata = lib.func("extract_metadata", freedResultString, ["string"]);
    this._transformHtml = lib.func("transform_html", freedResultString, ["string"]);
    this._getInnerJSON = lib.func("get_inner_json", freedResultString, ["string"]);
  }

  /**
   * Returns the shared transformer, loading the native library on first use.
   *
   * @throws Error if the shared library file cannot be stat'ed.
   */
  public static async getInstance(): Promise<RustHTMLTransformer> {
    if (!RustHTMLTransformer.instance) {
      try {
        await stat(rustExecutablePath);
      } catch (_) {
        throw new Error("Rust html-transformer shared library not found");
      }
      // Another caller may have finished initialization while this one was
      // suspended on `stat`; re-check so the library is only loaded once.
      if (!RustHTMLTransformer.instance) {
        RustHTMLTransformer.instance = new RustHTMLTransformer();
      }
    }
    return RustHTMLTransformer.instance;
  }

  /**
   * Runs a native function asynchronously and settles a Promise with the
   * decoded result.
   *
   * The decode step runs inside the koffi callback, i.e. after the Promise
   * executor has already returned, so anything it throws is caught here and
   * turned into a rejection instead of escaping as an uncaught exception.
   */
  private static callNative<T>(
    fn: KoffiFunction,
    input: string,
    decode: (raw: string) => T,
  ): Promise<T> {
    return new Promise<T>((resolve, reject) => {
      fn.async(input, (err: Error | null, res: string) => {
        if (err) {
          reject(err);
          return;
        }
        try {
          resolve(decode(res));
        } catch (decodeError) {
          reject(decodeError);
        }
      });
    });
  }

  /** Passes a native result through unchanged, throwing on the error sentinel. */
  private static ensureSuccess(raw: string): string {
    if (raw === RustHTMLTransformer.ERROR_SENTINEL) {
      throw new Error("Something went wrong on the Rust side.");
    }
    return raw;
  }

  /**
   * Parses a native result as JSON. The sentinel is checked first: it is
   * never valid JSON, so this cannot mask a legitimate result and only
   * yields a clearer error than a SyntaxError would.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- shape is defined by the Rust side
  private static parseJson(raw: string): any {
    return JSON.parse(RustHTMLTransformer.ensureSuccess(raw));
  }

  /**
   * Calls the native `extract_links` function.
   *
   * @param html - HTML document passed to the native function.
   * @returns The JSON-decoded native result.
   */
  public async extractLinks(html: string): Promise<string[]> {
    return RustHTMLTransformer.callNative<string[]>(
      this._extractLinks,
      html,
      RustHTMLTransformer.parseJson,
    );
  }

  /**
   * Calls the native `extract_metadata` function.
   *
   * @param html - HTML document passed to the native function.
   * @returns The JSON-decoded native result; its shape is defined by the Rust side.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- kept for caller compatibility
  public async extractMetadata(html: string): Promise<any> {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    return RustHTMLTransformer.callNative<any>(
      this._extractMetadata,
      html,
      RustHTMLTransformer.parseJson,
    );
  }

  /**
   * Calls the native `transform_html` function with the JSON-serialized options.
   *
   * @param opts - Transformation options; serialized with `JSON.stringify`.
   * @returns The string produced by the native function.
   */
  public async transformHtml(opts: TransformHtmlOptions): Promise<string> {
    return RustHTMLTransformer.callNative<string>(
      this._transformHtml,
      JSON.stringify(opts),
      RustHTMLTransformer.ensureSuccess,
    );
  }

  /**
   * Calls the native `get_inner_json` function.
   *
   * @param html - HTML document passed to the native function.
   * @returns The string produced by the native function (not parsed here).
   */
  public async getInnerJSON(html: string): Promise<string> {
    return RustHTMLTransformer.callNative<string>(
      this._getInnerJSON,
      html,
      RustHTMLTransformer.ensureSuccess,
    );
  }
}
/**
 * Extracts links from an HTML document via the Rust transformer.
 *
 * @param html - HTML source; any falsy value (`null`, `undefined`, `""`)
 *   short-circuits without touching the native library.
 * @returns The transformer's `extractLinks` result, or an empty array when no
 *   HTML was supplied.
 */
export async function extractLinks(
  html: string | null | undefined,
): Promise<string[]> {
  if (html) {
    const transformer = await RustHTMLTransformer.getInstance();
    const links = await transformer.extractLinks(html);
    return links;
  }
  return [];
}
/**
 * Extracts metadata from an HTML document via the Rust transformer.
 *
 * @param html - HTML source; any falsy value (`null`, `undefined`, `""`)
 *   short-circuits without touching the native library.
 * @returns The transformer's `extractMetadata` result, or an empty array when
 *   no HTML was supplied.
 */
export async function extractMetadata(
  html: string | null | undefined,
): Promise<any> {
  if (html) {
    const transformer = await RustHTMLTransformer.getInstance();
    const metadata = await transformer.extractMetadata(html);
    return metadata;
  }
  // NOTE(review): the empty-input fallback is an array, mirroring extractLinks —
  // confirm callers do not expect an object here.
  return [];
}
/**
 * Runs the Rust HTML transformation with the given options.
 *
 * @param opts - Options forwarded unchanged to the shared transformer instance.
 * @returns The string produced by the transformer's `transformHtml` method.
 */
export async function transformHtml(
  opts: TransformHtmlOptions,
): Promise<string> {
  const transformer = await RustHTMLTransformer.getInstance();
  const transformed = await transformer.transformHtml(opts);
  return transformed;
}
/**
 * Forwards an HTML document to the Rust transformer's `getInnerJSON` method.
 *
 * @param html - HTML source forwarded unchanged to the shared transformer instance.
 * @returns The string produced by the transformer (returned as-is, not parsed).
 */
export async function getInnerJSON(
  html: string,
): Promise<string> {
  const transformer = await RustHTMLTransformer.getInstance();
  const innerJson = await transformer.getInnerJSON(html);
  return innerJson;
}