Spaces:

Echo-AI-official
/

Fire-crawl

Paused

App Files Files Community

Fire-crawl / sharedLibs /html-transformer /src /lib.rs

Echo-AI-official

Upload 280 files

0e759d2 verified 9 months ago

raw

history blame contribute delete

13.7 kB

	use std::{collections::HashMap, ffi::{CStr, CString}};

	use kuchikiki::{parse_html, traits::TendrilSink};
	use serde::Deserialize;
	use serde_json::Value;
	use url::Url;

	/// Extracts links from HTML
	///
	/// # Safety
	/// Input options must be a C HTML string. Output will be a JSON string array. Output string must be freed with free_string.
	#[no_mangle]
	pub unsafe extern "C" fn extract_links(html: const libc::c_char) -> mut libc::c_char {
	let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap();

	let document = parse_html().one(html);

	let mut out: Vec<String> = Vec::new();

	let anchors: Vec<_> = document.select("a[href]").unwrap().collect();
	for anchor in anchors {
	let mut href = anchor.attributes.borrow().get("href").unwrap().to_string();

	if href.starts_with("http:/") && !href.starts_with("http://") {
	href = format!("http://{}", &href[6..]);
	} else if href.starts_with("https:/") && !href.starts_with("https://") {
	href = format!("https://{}", &href[7..]);
	}

	out.push(href);
	}

	CString::new(serde_json::ser::to_string(&out).unwrap()).unwrap().into_raw()
	}

	/// Copies the `content` of the first `<meta name="$metaName">` element into `$out` under the key
	/// `$outName`.
	///
	/// `$out` is the metadata map and `$document` the parsed document. Only the first matching
	/// element is examined; nothing is inserted when there is no match or it has no `content`.
	macro_rules! insert_meta_name {
	    ($out:ident, $document:ident, $metaName:expr, $outName:expr) => {
	        if let Some(content) = $document
	            .select(&format!("meta[name=\"{}\"]", $metaName))
	            .unwrap()
	            .next()
	            .and_then(|meta| meta.attributes.borrow().get("content").map(|value| value.to_string()))
	        {
	            $out.insert(($outName).to_string(), Value::String(content));
	        }
	    };
	}

	/// Copies the `content` of the first `<meta property="$metaName">` element into `$out` under the
	/// key `$outName`.
	///
	/// `$out` is the metadata map and `$document` the parsed document. Only the first matching
	/// element is examined; nothing is inserted when there is no match or it has no `content`.
	macro_rules! insert_meta_property {
	    ($out:ident, $document:ident, $metaName:expr, $outName:expr) => {
	        if let Some(content) = $document
	            .select(&format!("meta[property=\"{}\"]", $metaName))
	            .unwrap()
	            .next()
	            .and_then(|meta| meta.attributes.borrow().get("content").map(|value| value.to_string()))
	        {
	            $out.insert(($outName).to_string(), Value::String(content));
	        }
	    };
	}

	/// Extracts metadata from HTML
	///
	/// # Safety
	/// Input options must be a C HTML string. Output will be a JSON object. Output string must be freed with free_string.
	#[no_mangle]
	pub unsafe extern "C" fn extract_metadata(html: const libc::c_char) -> mut libc::c_char {
	let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap();

	let document = parse_html().one(html);
	let mut out = HashMap::<String, Value>::new();

	if let Some(title) = document.select("title").unwrap().next() {
	out.insert("title".to_string(), Value::String(title.text_contents()));
	}
	// insert_meta_name!(out, document, "description", "description");

	if let Some(favicon_link) = document.select("link[rel=\"icon\"]").unwrap().next()
	.and_then(\|x\| x.attributes.borrow().get("href").map(\|x\| x.to_string()))
	.or_else(\|\| document.select("link[rel*=\"icon\"]").unwrap().next()
	.and_then(\|x\| x.attributes.borrow().get("href").map(\|x\| x.to_string()))) {
	out.insert("favicon".to_string(), Value::String(favicon_link));
	}

	if let Some(lang) = document.select("html[lang]").unwrap().next().and_then(\|x\| x.attributes.borrow().get("lang").map(\|x\| x.to_string())) {
	out.insert("language".to_string(), Value::String(lang));
	}

	// insert_meta_name!(out, document, "keywords", "keywords");
	// insert_meta_name!(out, document, "robots", "robots");
	insert_meta_property!(out, document, "og:title", "ogTitle");
	insert_meta_property!(out, document, "og:description", "ogDescription");
	insert_meta_property!(out, document, "og:url", "ogUrl");
	insert_meta_property!(out, document, "og:image", "ogImage");
	insert_meta_property!(out, document, "og:audio", "ogAudio");
	insert_meta_property!(out, document, "og:determiner", "ogDeterminer");
	insert_meta_property!(out, document, "og:locale", "ogLocale");

	for meta in document.select("meta[property=\"og:locale:alternate\"]").unwrap() {
	let attrs = meta.attributes.borrow();

	if let Some(content) = attrs.get("content") {
	if let Some(v) = out.get_mut("og:locale:alternate") {
	match v {
	Value::Array(x) => {
	x.push(Value::String(content.to_string()));
	},
	_ => unreachable!(),
	}
	} else {
	out.insert("og:locale:alternate".to_string(), Value::Array(vec! [Value::String(content.to_string())]));
	}
	}
	}

	insert_meta_property!(out, document, "og:site_name", "ogSiteName");
	insert_meta_property!(out, document, "og:video", "ogVideo");
	insert_meta_name!(out, document, "article:section", "articleSection");
	insert_meta_name!(out, document, "article:tag", "articleTag");
	insert_meta_property!(out, document, "article:published_time", "publishedTime");
	insert_meta_property!(out, document, "article:modified_time", "modifiedTime");
	insert_meta_name!(out, document, "dcterms.keywords", "dcTermsKeywords");
	insert_meta_name!(out, document, "dc.description", "dcDescription");
	insert_meta_name!(out, document, "dc.subject", "dcSubject");
	insert_meta_name!(out, document, "dcterms.subject", "dcTermsSubject");
	insert_meta_name!(out, document, "dcterms.audience", "dcTermsAudience");
	insert_meta_name!(out, document, "dc.type", "dcType");
	insert_meta_name!(out, document, "dcterms.type", "dcTermsType");
	insert_meta_name!(out, document, "dc.date", "dcDate");
	insert_meta_name!(out, document, "dc.date.created", "dcDateCreated");
	insert_meta_name!(out, document, "dcterms.created", "dcTermsCreated");

	for meta in document.select("meta").unwrap() {
	let meta = meta.as_node().as_element().unwrap();
	let attrs = meta.attributes.borrow();

	if let Some(name) = attrs.get("name").or_else(\|\| attrs.get("property")) {
	if let Some(content) = attrs.get("content") {
	if let Some(v) = out.get(name) {
	match v {
	Value::String(_) => {
	if name != "title" { // preserve title tag in metadata
	out.insert(name.to_string(), Value::Array(vec! [v.clone(), Value::String(content.to_string())]));
	}
	},
	Value::Array(_) => {
	match out.get_mut(name) {
	Some(Value::Array(x)) => {
	x.push(Value::String(content.to_string()));
	},
	_ => unreachable!(),
	}
	},
	_ => unreachable!(),
	}
	} else {
	out.insert(name.to_string(), Value::String(content.to_string()));
	}
	}
	}
	}

	CString::new(serde_json::ser::to_string(&out).unwrap()).unwrap().into_raw()
	}

	/// CSS selectors for page furniture (headers, footers, navigation, sidebars, modals, ads,
	/// language pickers, social links, breadcrumbs, share buttons, widgets, cookie banners).
	///
	/// When `only_main_content` is set, `_transform_html_inner` removes every element matching one
	/// of these selectors, unless a `FORCE_INCLUDE_MAIN_TAGS` selector matches within that element.
	const EXCLUDE_NON_MAIN_TAGS: [&str; 41] = [
	"header",
	"footer",
	"nav",
	"aside",
	".header",
	".top",
	".navbar",
	"#header",
	".footer",
	".bottom",
	"#footer",
	".sidebar",
	".side",
	".aside",
	"#sidebar",
	".modal",
	".popup",
	"#modal",
	".overlay",
	".ad",
	".ads",
	".advert",
	"#ad",
	".lang-selector",
	".language",
	"#language-selector",
	".social",
	".social-media",
	".social-links",
	"#social",
	".menu",
	".navigation",
	"#nav",
	".breadcrumbs",
	"#breadcrumbs",
	".share",
	"#share",
	".widget",
	"#widget",
	".cookie",
	"#cookie",
	];

	/// CSS selectors that protect content from the `only_main_content` clean-up.
	///
	/// An element matched by `EXCLUDE_NON_MAIN_TAGS` is kept when `select` on that element finds a
	/// match for any selector listed here.
	const FORCE_INCLUDE_MAIN_TAGS: [&str; 13] = [
	"#main",

	// Swoogo event software uses `.widget` on all of its content, which would otherwise be
	// stripped by the `.widget` entry in EXCLUDE_NON_MAIN_TAGS.
	".swoogo-cols",
	".swoogo-text",
	".swoogo-table-div",
	".swoogo-space",
	".swoogo-alert",
	".swoogo-sponsors",
	".swoogo-title",
	".swoogo-tabs",
	".swoogo-logo",
	".swoogo-image",
	".swoogo-button",
	".swoogo-agenda",
	];

	/// Options for `transform_html`, deserialised from the JSON string passed over FFI.
	///
	/// No field declares a serde default, so the JSON object is expected to provide all five.
	/// NOTE(review): the type name is misspelt ("Tranform"); it is kept because other items in
	/// this file refer to it.
	#[derive(Deserialize)]
	struct TranformHTMLOptions {
	// Raw HTML to transform.
	html: String,
	// Page URL; relative `img[src]` and `a[href]` values are resolved against it.
	url: String,
	// CSS selectors; when non-empty, only matching elements are kept.
	include_tags: Vec<String>,
	// CSS selectors whose matches are removed.
	exclude_tags: Vec<String>,
	// When true, elements matching EXCLUDE_NON_MAIN_TAGS are removed as well.
	only_main_content: bool,
	}

	/// One candidate image taken from an `<img>` element's `srcset` (or its plain `src`).
	struct ImageSource {
	// URL of the candidate image, as written in the attribute.
	url: String,
	// Integer part of the descriptor (`2` for `2x`, `640` for `640w`); candidates are ranked by it.
	size: i32,
	// True when the descriptor ends in `x`; false otherwise (e.g. a `w` width descriptor).
	is_x: bool,
	}

	/// Parses one comma-separated candidate of a `srcset` attribute.
	///
	/// A candidate is a URL optionally followed by a descriptor such as `2x` or `640w`; a missing
	/// descriptor is treated as `1x`. Returns `None` for an empty candidate, or when the descriptor
	/// without its final character is not an integer (so fractional densities like `1.5x` are
	/// skipped).
	fn _parse_srcset_candidate(candidate: &str) -> Option<ImageSource> {
	    // `split_whitespace` tolerates repeated spaces, tabs and newlines between URL and descriptor.
	    let mut parts = candidate.split_whitespace();
	    let url = parts.next()?;
	    let descriptor = parts.next().unwrap_or("1x");

	    // Split off the unit by character, not by byte, so a multi-byte unit cannot cause a panic.
	    let mut chars = descriptor.chars();
	    let unit = chars.next_back()?;
	    let size = chars.as_str().parse().ok()?;

	    Some(ImageSource {
	        url: url.to_string(),
	        size,
	        is_x: unit == 'x',
	    })
	}

	/// Cleans up raw HTML according to `opts` and returns the serialised document.
	///
	/// Steps, in order:
	/// 1. If `include_tags` is non-empty, keep only the elements matching those selectors, placed
	///    inside the `<div>` of a fresh document.
	/// 2. Remove every `head`, `meta`, `noscript`, `style` and `script` element.
	/// 3. Remove every element matching an `exclude_tags` selector.
	/// 4. If `only_main_content` is set, remove matches of `EXCLUDE_NON_MAIN_TAGS` unless a
	///    `FORCE_INCLUDE_MAIN_TAGS` selector matches within them.
	/// 5. Point each `img[srcset]` at its largest candidate by rewriting `src`.
	/// 6. Resolve every `img[src]` and `a[href]` against `opts.url`.
	///
	/// # Errors
	/// Returns `Err(())` when an `include_tags` selector is rejected by `select`, or when `opts.url`
	/// cannot be parsed by `Url::parse`.
	fn _transform_html_inner(opts: TranformHTMLOptions) -> Result<String, ()> {
	    let mut document = parse_html().one(opts.html);

	    if !opts.include_tags.is_empty() {
	        let filtered = parse_html().one("<div></div>");
	        let container = filtered.select_first("div")?;

	        for selector in opts.include_tags.iter() {
	            // Collect the matches before touching the tree they were found in.
	            let matches: Vec<_> = document.select(selector)?.collect();
	            for node in matches {
	                container.as_node().append(node.as_node().clone());
	            }
	        }

	        document = filtered;
	    }

	    // Elements that never carry readable content.
	    for selector in ["head", "meta", "noscript", "style", "script"] {
	        while let Ok(node) = document.select_first(selector) {
	            node.as_node().detach();
	        }
	    }

	    for selector in opts.exclude_tags.iter() {
	        // TODO: implement weird version
	        // Any error from `select_first` (including no remaining match) ends the loop.
	        while let Ok(node) = document.select_first(selector) {
	            node.as_node().detach();
	        }
	    }

	    if opts.only_main_content {
	        for selector in EXCLUDE_NON_MAIN_TAGS.iter() {
	            let candidates: Vec<_> = document.select(selector)?.collect();
	            for candidate in candidates {
	                let holds_forced_content = FORCE_INCLUDE_MAIN_TAGS.iter().any(|forced| {
	                    candidate
	                        .as_node()
	                        .select(forced)
	                        .is_ok_and(|mut hits| hits.next().is_some())
	                });
	                if !holds_forced_content {
	                    candidate.as_node().detach();
	                }
	            }
	        }
	    }

	    let srcset_images: Vec<_> = document.select("img[srcset]")?.collect();
	    for img in srcset_images {
	        let mut candidates: Vec<ImageSource> = img
	            .attributes
	            .borrow()
	            .get("srcset")
	            .ok_or(())?
	            .split(',')
	            .filter_map(_parse_srcset_candidate)
	            .collect();

	        // With only pixel-density candidates, the plain `src` competes as an implicit 1x image.
	        if candidates.iter().all(|candidate| candidate.is_x) {
	            if let Some(src) = img.attributes.borrow().get("src").map(|src| src.to_string()) {
	                candidates.push(ImageSource {
	                    url: src,
	                    size: 1,
	                    is_x: true,
	                });
	            }
	        }

	        // Stable descending sort: among equally large candidates the earliest one wins.
	        candidates.sort_by(|a, b| b.size.cmp(&a.size));

	        if let Some(biggest) = candidates.first() {
	            img.attributes.borrow_mut().insert("src", biggest.url.clone());
	        }
	    }

	    let url = Url::parse(&opts.url).map_err(|_| ())?;

	    let src_images: Vec<_> = document.select("img[src]")?.collect();
	    for img in src_images {
	        let old = img.attributes.borrow().get("src").map(|src| src.to_string()).ok_or(())?;
	        // Values that cannot be joined onto the base URL are left as they are.
	        if let Ok(new) = url.join(&old) {
	            img.attributes.borrow_mut().insert("src", new.to_string());
	        }
	    }

	    let href_anchors: Vec<_> = document.select("a[href]")?.collect();
	    for anchor in href_anchors {
	        let old = anchor.attributes.borrow().get("href").map(|href| href.to_string()).ok_or(())?;
	        if let Ok(new) = url.join(&old) {
	            anchor.attributes.borrow_mut().insert("href", new.to_string());
	        }
	    }

	    Ok(document.to_string())
	}

	/// Transforms rawHtml to html (formerly removeUnwantedElements)
	///
	/// # Safety
	/// Input options must be a C JSON string. Output will be an HTML string. Output string must be freed with free_string.
	#[no_mangle]
	pub unsafe extern "C" fn transform_html(opts: const libc::c_char) -> mut libc::c_char {
	let opts: TranformHTMLOptions = match unsafe { CStr::from_ptr(opts) }.to_str().map_err(\|_\| ()).and_then(\|x\| serde_json::de::from_str(x).map_err(\|_\| ())) {
	Ok(x) => x,
	Err(_) => {
	return CString::new("RUSTFC:ERROR").unwrap().into_raw();
	}
	};

	let out = match _transform_html_inner(opts) {
	Ok(x) => x,
	Err(_) => "RUSTFC:ERROR".to_string(),
	};

	CString::new(out).unwrap().into_raw()
	}

	/// Parses `html` and returns the text content of its `<body>` element.
	///
	/// Fails with `Err(())` when `select_first("body")` does.
	fn _get_inner_json(html: &str) -> Result<String, ()> {
	    let document = parse_html().one(html);
	    let body = document.select_first("body")?;
	    Ok(body.text_contents())
	}

	/// For JSON pages retrieved by browser engines, this function can be used to transform it back into valid JSON.
	///
	/// # Safety
	/// Input must be a C HTML string. Output will be an HTML string. Output string must be freed with free_string.
	#[no_mangle]
	pub unsafe extern "C" fn get_inner_json(html: const libc::c_char) -> mut libc::c_char {
	let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap();

	let out = match _get_inner_json(html) {
	Ok(x) => x,
	Err(_) => "RUSTFC:ERROR".to_string(),
	};

	CString::new(out).unwrap().into_raw()
	}

	/// Frees a string allocated in Rust-land.
	///
	/// Passing a null pointer is a no-op, mirroring C's `free(NULL)`.
	///
	/// # Safety
	/// ptr must be null or a non-freed string pointer returned by Rust code.
	#[no_mangle]
	pub unsafe extern "C" fn free_string(ptr: *mut libc::c_char) {
	    // `CString::from_raw` on a null pointer is undefined behaviour, so bail out first.
	    if ptr.is_null() {
	        return;
	    }
	    // SAFETY: `ptr` is non-null and, per the contract above, came from `CString::into_raw`
	    // and has not been freed yet.
	    drop(unsafe { CString::from_raw(ptr) });
	}