Spaces:
Paused
Paused
| import { load } from "cheerio"; // rustified | |
| import { Document } from "../../../controllers/v1/types"; | |
| import { Meta } from ".."; | |
| import { extractMetadata as _extractMetadata } from "../../../lib/html-transformer"; | |
| export async function extractMetadataRust( | |
| meta: Meta, | |
| html: string, | |
| ): Promise<Partial<Document["metadata"]>> { | |
| const fromRust = await _extractMetadata(html); | |
| return { | |
| ...fromRust, | |
| ...(fromRust.favicon ? { | |
| favicon: new URL(fromRust.favicon, meta.url) | |
| } : {}), | |
| scrapeId: meta.id, | |
| }; | |
| } | |
| export async function extractMetadata( | |
| meta: Meta, | |
| html: string, | |
| ): Promise<Partial<Document["metadata"]>> { | |
| try { | |
| return await extractMetadataRust(meta, html); | |
| } catch (error) { | |
| meta.logger.warn("Failed to call html-transformer! Falling back to cheerio...", { | |
| error, | |
| module: "scrapeURL", method: "extractMetadata" | |
| }); | |
| } | |
| let title: string | undefined = undefined; | |
| let description: string | undefined = undefined; | |
| let favicon: string | undefined = undefined; | |
| let language: string | undefined = undefined; | |
| let keywords: string | undefined = undefined; | |
| let robots: string | undefined = undefined; | |
| let ogTitle: string | undefined = undefined; | |
| let ogDescription: string | undefined = undefined; | |
| let ogUrl: string | undefined = undefined; | |
| let ogImage: string | undefined = undefined; | |
| let ogAudio: string | undefined = undefined; | |
| let ogDeterminer: string | undefined = undefined; | |
| let ogLocale: string | undefined = undefined; | |
| let ogLocaleAlternate: string[] | undefined = undefined; | |
| let ogSiteName: string | undefined = undefined; | |
| let ogVideo: string | undefined = undefined; | |
| let dcTermsCreated: string | undefined = undefined; | |
| let dcDateCreated: string | undefined = undefined; | |
| let dcDate: string | undefined = undefined; | |
| let dcTermsType: string | undefined = undefined; | |
| let dcType: string | undefined = undefined; | |
| let dcTermsAudience: string | undefined = undefined; | |
| let dcTermsSubject: string | undefined = undefined; | |
| let dcSubject: string | undefined = undefined; | |
| let dcDescription: string | undefined = undefined; | |
| let dcTermsKeywords: string | undefined = undefined; | |
| let modifiedTime: string | undefined = undefined; | |
| let publishedTime: string | undefined = undefined; | |
| let articleTag: string | undefined = undefined; | |
| let articleSection: string | undefined = undefined; | |
| const customMetadata: Record<string, string | string[]> = {}; | |
| const soup = load(html); | |
| try { | |
| title = soup("title").first().text().trim() || undefined; | |
| description = soup('meta[name="description"]').attr("content") || undefined; | |
| const faviconLink = | |
| soup('link[rel="icon"]').attr("href") || | |
| soup('link[rel*="icon"]').first().attr("href") || | |
| undefined; | |
| if (faviconLink) { | |
| const baseUrl = new URL(meta.url).origin; | |
| favicon = faviconLink.startsWith("http") | |
| ? faviconLink | |
| : `${baseUrl}${faviconLink}`; | |
| } | |
| // Assuming the language is part of the URL as per the regex pattern | |
| language = soup("html").attr("lang") || undefined; | |
| keywords = soup('meta[name="keywords"]').attr("content") || undefined; | |
| robots = soup('meta[name="robots"]').attr("content") || undefined; | |
| ogTitle = soup('meta[property="og:title"]').attr("content") || undefined; | |
| ogDescription = | |
| soup('meta[property="og:description"]').attr("content") || undefined; | |
| ogUrl = soup('meta[property="og:url"]').attr("content") || undefined; | |
| ogImage = soup('meta[property="og:image"]').attr("content") || undefined; | |
| ogAudio = soup('meta[property="og:audio"]').attr("content") || undefined; | |
| ogDeterminer = | |
| soup('meta[property="og:determiner"]').attr("content") || undefined; | |
| ogLocale = soup('meta[property="og:locale"]').attr("content") || undefined; | |
| ogLocaleAlternate = | |
| soup('meta[property="og:locale:alternate"]') | |
| .map((i, el) => soup(el).attr("content")) | |
| .get() || undefined; | |
| ogSiteName = | |
| soup('meta[property="og:site_name"]').attr("content") || undefined; | |
| ogVideo = soup('meta[property="og:video"]').attr("content") || undefined; | |
| articleSection = | |
| soup('meta[name="article:section"]').attr("content") || undefined; | |
| articleTag = soup('meta[name="article:tag"]').attr("content") || undefined; | |
| publishedTime = | |
| soup('meta[property="article:published_time"]').attr("content") || | |
| undefined; | |
| modifiedTime = | |
| soup('meta[property="article:modified_time"]').attr("content") || | |
| undefined; | |
| dcTermsKeywords = | |
| soup('meta[name="dcterms.keywords"]').attr("content") || undefined; | |
| dcDescription = | |
| soup('meta[name="dc.description"]').attr("content") || undefined; | |
| dcSubject = soup('meta[name="dc.subject"]').attr("content") || undefined; | |
| dcTermsSubject = | |
| soup('meta[name="dcterms.subject"]').attr("content") || undefined; | |
| dcTermsAudience = | |
| soup('meta[name="dcterms.audience"]').attr("content") || undefined; | |
| dcType = soup('meta[name="dc.type"]').attr("content") || undefined; | |
| dcTermsType = | |
| soup('meta[name="dcterms.type"]').attr("content") || undefined; | |
| dcDate = soup('meta[name="dc.date"]').attr("content") || undefined; | |
| dcDateCreated = | |
| soup('meta[name="dc.date.created"]').attr("content") || undefined; | |
| dcTermsCreated = | |
| soup('meta[name="dcterms.created"]').attr("content") || undefined; | |
| try { | |
| // Extract all meta tags for custom metadata | |
| soup("meta").each((i, elem) => { | |
| try { | |
| const name = soup(elem).attr("name") || soup(elem).attr("property"); | |
| const content = soup(elem).attr("content"); | |
| if (name && content) { | |
| if (customMetadata[name] === undefined) { | |
| customMetadata[name] = content; | |
| } else if (Array.isArray(customMetadata[name])) { | |
| (customMetadata[name] as string[]).push(content); | |
| } else { | |
| customMetadata[name] = [customMetadata[name] as string, content]; | |
| } | |
| } | |
| } catch (error) { | |
| meta.logger.error(`Error extracting custom metadata (in)`, { error }); | |
| } | |
| }); | |
| } catch (error) { | |
| meta.logger.error(`Error extracting custom metadata`, { error }); | |
| } | |
| } catch (error) { | |
| meta.logger.error(`Error extracting metadata`, { error }); | |
| } | |
| return { | |
| title, | |
| description, | |
| favicon, | |
| language, | |
| keywords, | |
| robots, | |
| ogTitle, | |
| ogDescription, | |
| ogUrl, | |
| ogImage, | |
| ogAudio, | |
| ogDeterminer, | |
| ogLocale, | |
| ogLocaleAlternate, | |
| ogSiteName, | |
| ogVideo, | |
| dcTermsCreated, | |
| dcDateCreated, | |
| dcDate, | |
| dcTermsType, | |
| dcType, | |
| dcTermsAudience, | |
| dcTermsSubject, | |
| dcSubject, | |
| dcDescription, | |
| dcTermsKeywords, | |
| modifiedTime, | |
| publishedTime, | |
| articleTag, | |
| articleSection, | |
| scrapeId: meta.id, | |
| ...customMetadata, | |
| }; | |
| } | |