Spaces:
Paused
Paused
File size: 4,236 Bytes
0e759d2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
import koffi, { KoffiFunction } from "koffi";
import { join } from "path";
import { stat } from "fs/promises";
import { platform } from "os";
// TODO: add a timeout to the Rust transformer
/**
 * Location of the compiled html-transformer shared library, resolved against
 * the working directory the process was started from. macOS ("darwin") gets
 * the `.dylib` build; every other platform falls back to the `.so` build.
 */
const rustExecutablePath = join(
  process.cwd(),
  "sharedLibs/html-transformer/target/release/",
  platform() !== "darwin"
    ? "libhtml_transformer.so"
    : "libhtml_transformer.dylib",
);
/**
 * Argument bundle for `transformHtml`. The object is serialised with
 * `JSON.stringify` and handed to the native `transform_html` function as a
 * single string, so the property names are part of the wire format and must
 * stay exactly as spelled here (snake_case).
 */
type TransformHtmlOptions = {
  /** HTML source to transform. */
  html: string;
  /** URL associated with the document (presumably its origin — confirm against the Rust side). */
  url: string;
  /** Tags to include; exact matching rules are defined by the native library. */
  include_tags: string[];
  /** Tags to exclude; exact matching rules are defined by the native library. */
  exclude_tags: string[];
  /** Flag forwarded to the native library; presumably restricts output to the main content. */
  only_main_content: boolean;
};
/**
 * Lazily created singleton that binds the functions exported by the native
 * html-transformer shared library (loaded through koffi) and exposes them as
 * promise-returning methods.
 *
 * Every native call goes through koffi's asynchronous call form. All four
 * operations share one result-handling path, so a koffi error, the native
 * failure sentinel, and a malformed JSON payload all surface as a rejected
 * promise instead of an exception thrown from inside the koffi callback.
 */
class RustHTMLTransformer {
  /** String the native side returns in place of a result when it failed. */
  private static readonly NATIVE_ERROR_SENTINEL = "RUSTFC:ERROR";

  private static instance: RustHTMLTransformer | undefined;

  private readonly _extractLinks: KoffiFunction;
  private readonly _extractMetadata: KoffiFunction;
  private readonly _transformHtml: KoffiFunction;
  private readonly _freeString: KoffiFunction;
  private readonly _getInnerJSON: KoffiFunction;

  /** Loads the shared library and binds its exported functions. Use `getInstance()`. */
  private constructor() {
    const lib = koffi.load(rustExecutablePath);

    // `free_string` is registered below as the disposer of every string the
    // library returns, so koffi hands the native pointer back once it has
    // produced the JS string.
    this._freeString = lib.func("free_string", "void", ["string"]);

    // The type name carries a random UUID — presumably so that registering
    // the disposable type can never collide with an existing koffi type name.
    const cstn = "CString:" + crypto.randomUUID();
    const freedResultString = koffi.disposable(cstn, "string", this._freeString);

    this._extractLinks = lib.func("extract_links", freedResultString, ["string"]);
    this._extractMetadata = lib.func("extract_metadata", freedResultString, ["string"]);
    this._transformHtml = lib.func("transform_html", freedResultString, ["string"]);
    this._getInnerJSON = lib.func("get_inner_json", freedResultString, ["string"]);
  }

  /**
   * Returns the shared transformer, creating it on first use.
   *
   * @throws Error when `stat` on the shared library path fails.
   */
  public static async getInstance(): Promise<RustHTMLTransformer> {
    if (RustHTMLTransformer.instance) {
      return RustHTMLTransformer.instance;
    }

    try {
      await stat(rustExecutablePath);
    } catch (_) {
      throw new Error("Rust html-transformer shared library not found");
    }

    // A concurrent caller may have completed initialisation while this one
    // was suspended on stat(); re-check so the library is only loaded once.
    if (!RustHTMLTransformer.instance) {
      RustHTMLTransformer.instance = new RustHTMLTransformer();
    }
    return RustHTMLTransformer.instance;
  }

  /**
   * Invokes a bound native function asynchronously with a single string
   * argument and resolves with the raw string it returns.
   *
   * Rejects when koffi reports an error or when the native side answers with
   * the failure sentinel.
   */
  private static callNative(fn: KoffiFunction, input: string): Promise<string> {
    return new Promise<string>((resolve, reject) => {
      fn.async(input, (err: Error | null, res: string) => {
        if (err) {
          reject(err);
        } else if (res === RustHTMLTransformer.NATIVE_ERROR_SENTINEL) {
          reject(new Error("Something went wrong on the Rust side."));
        } else {
          resolve(res);
        }
      });
    });
  }

  /**
   * Runs the native `extract_links` function on `html`.
   *
   * @returns The JSON-decoded result of the native call.
   */
  public async extractLinks(html: string): Promise<string[]> {
    const raw = await RustHTMLTransformer.callNative(this._extractLinks, html);
    // Parsing after the await means a malformed payload rejects this promise
    // rather than throwing inside the koffi callback.
    return JSON.parse(raw);
  }

  /**
   * Runs the native `extract_metadata` function on `html`.
   *
   * @returns The JSON-decoded result of the native call.
   */
  public async extractMetadata(html: string): Promise<any> {
    const raw = await RustHTMLTransformer.callNative(this._extractMetadata, html);
    return JSON.parse(raw);
  }

  /**
   * Runs the native `transform_html` function on the JSON-serialised options.
   *
   * @returns The string returned by the native call, unparsed.
   */
  public async transformHtml(opts: TransformHtmlOptions): Promise<string> {
    return RustHTMLTransformer.callNative(this._transformHtml, JSON.stringify(opts));
  }

  /**
   * Runs the native `get_inner_json` function on `html`.
   *
   * @returns The string returned by the native call, unparsed.
   */
  public async getInnerJSON(html: string): Promise<string> {
    return RustHTMLTransformer.callNative(this._getInnerJSON, html);
  }
}
/**
 * Extracts links from an HTML document using the native transformer.
 *
 * @param html HTML source; `null`, `undefined` and the empty string are
 *   answered with an empty array without touching the native library.
 * @returns Whatever the transformer's `extractLinks` resolves with.
 */
export async function extractLinks(
  html: string | null | undefined,
): Promise<string[]> {
  if (html) {
    const transformer = await RustHTMLTransformer.getInstance();
    return transformer.extractLinks(html);
  }
  return [];
}
/**
 * Extracts metadata from an HTML document using the native transformer.
 *
 * @param html HTML source; `null`, `undefined` and the empty string are
 *   answered with an empty array without touching the native library.
 * @returns Whatever the transformer's `extractMetadata` resolves with.
 */
export async function extractMetadata(
  html: string | null | undefined,
): Promise<any> {
  if (html) {
    const transformer = await RustHTMLTransformer.getInstance();
    return transformer.extractMetadata(html);
  }
  // NOTE(review): the empty-input fallback is an array, not an object; kept
  // as-is because callers may rely on it.
  return [];
}
/**
 * Transforms HTML through the native transformer.
 *
 * @param opts Options forwarded unchanged to the transformer's `transformHtml`.
 * @returns The string the transformer resolves with.
 */
export async function transformHtml(
  opts: TransformHtmlOptions,
): Promise<string> {
  const transformer = await RustHTMLTransformer.getInstance();
  const transformed = await transformer.transformHtml(opts);
  return transformed;
}
/**
 * Runs the native transformer's `getInnerJSON` on an HTML document.
 *
 * @param html HTML source forwarded unchanged to the transformer.
 * @returns The string the transformer resolves with.
 */
export async function getInnerJSON(
  html: string,
): Promise<string> {
  const transformer = await RustHTMLTransformer.getInstance();
  const innerJson = await transformer.getInnerJSON(html);
  return innerJson;
}
|