| import type { Attachments } from "./types"; |
| import { lookupDomainByCompanyName } from "./utils/domain-lookup"; |
|
|
/**
 * MIME types an attachment must carry to pass `getAllowedAttachments`:
 * a handful of image formats, PDF, and the generic
 * `application/octet-stream` type.
 */
export const allowedMimeTypes: string[] = [
  "image/heic",
  "image/png",
  "image/jpeg",
  "image/jpg",
  "application/pdf",
  "application/octet-stream",
];
|
|
/**
 * Keeps only the attachments whose `ContentType` is listed in
 * `allowedMimeTypes`.
 *
 * @param attachments - Attachments to filter; may be omitted.
 * @returns The matching attachments, or `undefined` when no list was passed.
 */
export function getAllowedAttachments(attachments?: Attachments) {
  if (attachments == null) {
    return undefined;
  }

  return attachments.filter(({ ContentType }) =>
    allowedMimeTypes.includes(ContentType),
  );
}
|
|
| |
| |
| |
| |
/**
 * Extracts the lower-cased root domain from an email address.
 *
 * Accepted input shapes:
 * - a bare address (`billing@acme.com`),
 * - an address wrapped in angle brackets (`<billing@acme.com>`),
 * - a display-name form (`Acme Billing <billing@acme.com>`).
 *
 * Sub-domains are dropped (`mail.acme.com` -> `acme.com`), except that three
 * labels are kept when the domain ends in one of the known two-part TLDs
 * (`acme.co.uk`).
 *
 * @param email - Raw email string; may be empty, `null` or `undefined`.
 * @returns The root domain, or `null` when no valid address/domain is found.
 */
export function getDomainFromEmail(email?: string | null): string | null {
  if (!email) return null;

  const emailPattern = /^[^\s@]+@([^\s@]+)$/;
  const trimmed = email.trim();

  // First try the input as a bare (optionally bracket-wrapped) address.
  let domain = trimmed.replace(/[<>]/g, "").match(emailPattern)?.[1];

  // Otherwise fall back to the display-name form and use only the address
  // inside the trailing angle brackets ("Name <user@host>").
  if (!domain) {
    const bracketed = trimmed.match(/<([^<>]+)>$/)?.[1];
    domain = bracketed?.trim().match(emailPattern)?.[1];
  }

  if (!domain) return null;

  const normalized = domain.toLowerCase();
  const domainParts = normalized.split(".");

  // A leading, trailing or doubled dot yields an empty label; such a domain
  // is malformed and would otherwise produce results like ".com".
  if (domainParts.some((part) => part.length === 0)) return null;

  // Well-known mailbox providers are returned unchanged.
  const commonEmailServices = [
    "gmail.com",
    "yahoo.com",
    "outlook.com",
    "hotmail.com",
    "icloud.com",
    "protonmail.com",
  ];
  if (commonEmailServices.includes(normalized)) {
    return normalized;
  }

  // Suffixes under which the registrable domain spans three labels.
  const twoPartTLDs = [
    "co.uk",
    "com.au",
    "co.nz",
    "co.za",
    "com.br",
    "com.mx",
    "co.jp",
    "com.cn",
  ];

  const lastTwo = domainParts.slice(-2).join(".");
  if (domainParts.length >= 3 && twoPartTLDs.includes(lastTwo)) {
    return domainParts.slice(-3).join(".");
  }

  // For one- or two-label domains this is the whole domain; for longer ones
  // it strips the sub-domains.
  return lastTwo;
}
|
|
| |
| |
| |
| |
/**
 * Reduces a website string (URL or bare host) to its lower-cased root domain.
 *
 * Strips, in order: surrounding whitespace, an `http://` / `https://` /
 * protocol-relative `//` prefix, a leading `www.`, everything from the first
 * `/`, `?` or `#` onward, and a trailing `:port`. Sub-domains are then
 * dropped, keeping three labels only for the known two-part TLDs
 * (`shop.example.co.uk` -> `example.co.uk`).
 *
 * @param domain - Website/URL string, or `null`.
 * @returns The root domain, or `null` when the input is empty or contains no
 *   host part (e.g. whitespace only, `"https://"`, `"/path"`).
 */
export function removeProtocolFromDomain(domain: string | null): string | null {
  if (!domain) return null;

  const withoutPrefix = domain
    .trim()
    .replace(/^(?:(?:https?:)?\/\/)?(?:www\.)?/i, "")
    .toLowerCase();

  // Host is whatever precedes the first path, query or fragment delimiter;
  // a port suffix is not part of the domain.
  const host = (withoutPrefix.split(/[\/?#]/)[0] ?? "").replace(/:\d*$/, "");

  // Nothing left to call a domain: report "no domain" instead of "".
  if (!host) return null;

  const domainParts = host.split(".");

  // Suffixes under which the registrable domain spans three labels.
  const twoPartTLDs = [
    "co.uk",
    "com.au",
    "co.nz",
    "co.za",
    "com.br",
    "com.mx",
    "co.jp",
    "com.cn",
  ];

  const lastTwo = domainParts.slice(-2).join(".");
  if (domainParts.length >= 3 && twoPartTLDs.includes(lastTwo)) {
    return domainParts.slice(-3).join(".");
  }

  // For one- or two-label hosts this is the whole host; for longer ones it
  // strips the sub-domains.
  return lastTwo;
}
|
|
| |
| |
| |
| |
/**
 * Resolves a website domain for a vendor, trying sources in priority order:
 *
 * 1. the explicit `website`, normalised via `removeProtocolFromDomain`;
 * 2. the domain of `email` via `getDomainFromEmail`, unless it belongs to one
 *    of the listed common mailbox providers;
 * 3. a lookup by `vendorName` through `lookupDomainByCompanyName`.
 *
 * A failing lookup is logged as a warning (when a logger is supplied) and
 * treated as "not found" rather than rethrown.
 *
 * @param website - Website or URL string, if known.
 * @param email - Contact email, if known.
 * @param vendorName - Vendor/company name used for the lookup fallback.
 * @param logger - Optional logger, also forwarded to the lookup helper.
 * @returns The resolved domain, or `null` when no source produced one.
 */
export async function extractWebsite(
  website: string | null | undefined,
  email: string | null | undefined,
  vendorName: string | null | undefined,
  logger?: ReturnType<typeof import("@midday/logger").createLoggerWithContext>,
): Promise<string | null> {
  // 1. Explicit website wins.
  const fromWebsite = website ? removeProtocolFromDomain(website) : null;
  if (fromWebsite) {
    return fromWebsite;
  }

  // 2. Email domain, as long as it is not a generic mailbox provider.
  const fromEmail = email ? getDomainFromEmail(email) : null;
  if (fromEmail) {
    const commonEmailServices = [
      "gmail.com",
      "yahoo.com",
      "outlook.com",
      "hotmail.com",
      "icloud.com",
      "protonmail.com",
    ];
    const isGenericProvider = commonEmailServices.includes(fromEmail);
    if (!isGenericProvider) {
      return fromEmail;
    }
  }

  // 3. Last resort: look the company up by name.
  if (!vendorName) {
    return null;
  }

  try {
    const lookedUpDomain = await lookupDomainByCompanyName(vendorName, logger);
    if (lookedUpDomain) {
      return lookedUpDomain;
    }
  } catch (error) {
    // Best effort only: record the failure and fall through to `null`.
    const message = error instanceof Error ? error.message : "Unknown error";
    logger?.warn("Domain lookup failed during website extraction", {
      vendorName,
      error: message,
    });
  }

  return null;
}
|
|
/**
 * Maps a MIME type to a document type label.
 *
 * @param mimetype - MIME type string, compared exactly (case-sensitive).
 * @returns `"invoice"` for `application/pdf` and `application/octet-stream`,
 *   `"receipt"` for everything else.
 */
export function getDocumentTypeFromMimeType(mimetype: string): string {
  const isInvoiceType =
    mimetype === "application/pdf" || mimetype === "application/octet-stream";

  return isInvoiceType ? "invoice" : "receipt";
}
|
|
/**
 * Returns the leading part of `text`, sized to fit an approximate token
 * budget.
 *
 * The budget is converted to a word count using the rough ratio of 0.75
 * words per token, so `maxTokens` tokens allow `maxTokens * 0.75` words.
 * Words are whitespace-separated and re-joined with single spaces.
 *
 * @param text - Source text.
 * @param maxTokens - Approximate token budget (default 1200, i.e. 900 words).
 * @returns At most the first `floor(maxTokens * 0.75)` words of `text`.
 */
export function getContentSample(text: string, maxTokens = 1200): string {
  const approxWordsPerToken = 0.75;

  // words = tokens * (words per token). Dividing here would let the sample
  // exceed the budget it is supposed to respect. A negative budget means
  // "no words", not "all but the last N".
  const maxWords = Math.max(0, Math.floor(maxTokens * approxWordsPerToken));

  // Trim first so leading whitespace does not consume a word slot as "".
  return text.trim().split(/\s+/).slice(0, maxWords).join(" ");
}
|
|
/** Non-image MIME types (plus `image/heic`) that are explicitly supported. */
const supportedMimeTypesForProcessing = new Set<string>([
  "application/pdf",
  "application/x-pdf",
  "text/csv",
  "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
  "application/docx",
  "text/plain",
  "application/vnd.openxmlformats-officedocument.presentationml.presentation",
  "application/pptx",
  "application/rtf",
  "text/markdown",
  "application/vnd.oasis.opendocument.text",
  "image/heic",
]);

/**
 * Tells whether a MIME type is supported for processing.
 *
 * A type is supported when it is listed in `supportedMimeTypesForProcessing`
 * or when it starts with `image/` (any image sub-type). Matching is exact and
 * case-sensitive.
 *
 * @param mimetype - MIME type to check.
 * @returns `true` when the type is supported, otherwise `false`.
 */
export function isMimeTypeSupportedForProcessing(mimetype: string): boolean {
  return (
    supportedMimeTypesForProcessing.has(mimetype) ||
    mimetype.startsWith("image/")
  );
}
|
|
/**
 * Removes whole RTF groups that carry no body text: font/colour/style tables,
 * document info, pictures, embedded objects, and any ignorable destination
 * (a group opening with `{\*`). Braces are matched with a depth counter so
 * groups nested inside a removed group are removed with it.
 */
function stripRtfNonTextGroups(rtf: string): string {
  const skippableGroupStart =
    /^\{\\(?:\*|(?:fonttbl|colortbl|stylesheet|info|pict|object)(?![a-z]))/i;

  let kept = "";
  let copyFrom = 0;
  let i = 0;

  while (i < rtf.length) {
    const ch = rtf[i];

    // A backslash escapes the next character (`\{`, `\}`, `\\`), so that
    // character can never open or close a group.
    if (ch === "\\") {
      i += 2;
      continue;
    }

    if (ch !== "{" || !skippableGroupStart.test(rtf.slice(i, i + 16))) {
      i++;
      continue;
    }

    // Keep everything up to the group, then skip to its matching brace.
    kept += rtf.slice(copyFrom, i);
    let depth = 0;
    while (i < rtf.length) {
      const inner = rtf[i];
      if (inner === "\\") {
        i += 2;
        continue;
      }
      if (inner === "{") depth++;
      else if (inner === "}") depth--;
      i++;
      if (depth === 0) break;
    }
    copyFrom = i;
  }

  return kept + rtf.slice(copyFrom);
}

/**
 * Extracts an approximate plain-text rendering from an RTF document.
 *
 * The buffer is decoded as UTF-8, non-text groups are removed, paragraph and
 * line breaks become `\n`, tabs become `\t`, and every remaining control
 * word, escape and brace is dropped. `\uN` and `\'hh` character escapes are
 * discarded, not decoded. Blank lines and runs of spaces/tabs are collapsed.
 *
 * @param buffer - Raw RTF file contents.
 * @returns The extracted text, trimmed.
 */
export function extractTextFromRtf(buffer: Buffer): string {
  let rtfContent = stripRtfNonTextGroups(buffer.toString("utf-8"));

  // Unicode escapes (`\u8217?`) are dropped together with their `?` fallback.
  rtfContent = rtfContent.replace(/\\u-?\d+\??/g, "");

  // Structural control words must be turned into whitespace BEFORE the
  // generic control-word removal below, otherwise they are deleted and the
  // words on either side run together. The lookahead keeps longer words such
  // as `\pard` or `\paperw` from being mistaken for `\par`.
  rtfContent = rtfContent
    .replace(/\\(?:par|line)(?![a-z]) ?/gi, "\n")
    .replace(/\\tab(?![a-z]) ?/gi, "\t");

  // Any other control word, with its optional (possibly negative) numeric
  // parameter and the single delimiting space.
  rtfContent = rtfContent.replace(/\\[a-z]+-?\d* ?/gi, "");

  // Hex character escapes (`\'e9`).
  rtfContent = rtfContent.replace(/\\'[0-9a-f]{2}/gi, "");

  // Remaining group delimiters.
  rtfContent = rtfContent.replace(/[{}]/g, "");

  // Collapse blank lines and runs of spaces/tabs.
  rtfContent = rtfContent.replace(/\r?\n\s*\r?\n/g, "\n");
  rtfContent = rtfContent.replace(/[ \t]{2,}/g, " ");

  return rtfContent.trim();
}
|
|
/**
 * Normalises text to a single line of space-separated words.
 *
 * Non-whitespace control characters (C0 controls other than U+0009–U+000D,
 * DEL, and the C1 range U+0080–U+009F) are deleted. Every run of whitespace,
 * including tabs and line breaks, is then collapsed to one space and the
 * result is trimmed.
 *
 * @param text - Text to clean.
 * @returns The cleaned, single-line text.
 */
export function cleanText(text: string): string {
  // Tab, LF, VT, FF and CR (U+0009–U+000D) are deliberately left out of this
  // class: they are word separators and must survive until the whitespace
  // pass below, otherwise "line1\nline2" would collapse into "line1line2".
  // The pattern is built from a string so the source holds no literal
  // control characters.
  const nonWhitespaceControlChars = new RegExp(
    [
      "[",
      "\\u0000-\\u0008",
      "\\u000E-\\u001F",
      "\\u007F-\\u009F",
      "]",
    ].join(""),
    "g",
  );

  const withoutControlChars = text.replace(nonWhitespaceControlChars, "");

  // Collapse every whitespace run (spaces, tabs, line breaks) to one space.
  return withoutControlChars.replace(/\s+/g, " ").trim();
}
|
|
/**
 * Truncates `text` to at most `maxWords` whitespace-separated words.
 *
 * @param text - Text to limit; an empty string yields `""`.
 * @param maxWords - Maximum number of words to keep.
 * @returns `text` unchanged when it is within the limit, otherwise its first
 *   `maxWords` words joined with single spaces.
 */
export function limitWords(text: string, maxWords: number): string {
  if (text) {
    const tokens = text.split(/\s+/);
    const exceedsLimit = tokens.length > maxWords;

    // Within the limit the original string (including its own spacing) is
    // handed back untouched.
    return exceedsLimit ? tokens.slice(0, maxWords).join(" ") : text;
  }

  return "";
}
|
|
| export { mapLanguageCodeToPostgresConfig } from "./utils/language-mapping"; |
|
|