Spaces:

AUXteam
/

Midday

Paused

File size: 9,318 Bytes

c09f67c

import type { Attachments } from "./types";
import { lookupDomainByCompanyName } from "./utils/domain-lookup";

/**
 * MIME types accepted for incoming attachments.
 * Consumed by `getAllowedAttachments` to filter attachment lists.
 */
export const allowedMimeTypes: string[] = [
  // Images
  "image/heic",
  "image/png",
  "image/jpeg",
  "image/jpg",
  // Documents
  "application/pdf",
  // Generic binary payloads
  "application/octet-stream",
];

/**
 * Keeps only the attachments whose `ContentType` is listed in `allowedMimeTypes`.
 *
 * @param attachments Attachment list to filter; may be omitted.
 * @returns The filtered list, or `undefined` when no list was given.
 */
export function getAllowedAttachments(attachments?: Attachments) {
  return attachments?.filter(({ ContentType }) =>
    allowedMimeTypes.includes(ContentType),
  );
}

/**
 * Extract the root domain from an email address.
 *
 * Accepts a bare address (`user@host`), an address wrapped in angle brackets
 * (`<user@host>`), and the display-name form (`Jane Doe <user@host>`).
 * Subdomains are dropped (`mail.example.com` -> `example.com`), except that a
 * known two-part TLD keeps one extra label (`mail.example.co.uk` ->
 * `example.co.uk`). The result is always lowercase.
 *
 * @param email Raw email string; may be empty, null or undefined.
 * @returns The lowercase root domain, or `null` when no address can be parsed.
 */
export function getDomainFromEmail(email?: string | null): string | null {
  if (!email) return null;

  // Pulls the lowercase host part out of a single `local@host` token.
  const domainOf = (candidate: string): string | undefined =>
    candidate.match(/^[^\s@]+@([^\s@]+)$/)?.[1]?.toLowerCase();

  // First try the whole input with whitespace trimmed and angle brackets removed.
  let domain = domainOf(email.trim().replace(/[<>]/g, ""));

  // Fall back to the address inside angle brackets so that
  // "Display Name <user@host>" is parsed instead of rejected.
  if (!domain) {
    const bracketed = email.match(/<([^<>]+)>/)?.[1];
    domain = bracketed ? domainOf(bracketed.trim()) : undefined;
  }

  if (!domain) return null;

  // Suffixes that span two labels; the registrable domain keeps three labels.
  const twoPartTLDs = [
    "co.uk",
    "com.au",
    "co.nz",
    "co.za",
    "com.br",
    "com.mx",
    "co.jp",
    "com.cn",
  ];

  const labels = domain.split(".");
  const hasTwoPartTLD =
    labels.length >= 3 && twoPartTLDs.includes(labels.slice(-2).join("."));

  // slice(-n) returns the whole array when it has fewer than n labels,
  // so short domains come back unchanged.
  return labels.slice(hasTwoPartTLD ? -3 : -2).join(".");
}

/**
 * Reduce a URL or domain string to its lowercase root domain.
 *
 * Strips an `http://` / `https://` scheme, a leading `www.`, any path, query
 * string or fragment, and a trailing `:port`. Subdomains are then dropped,
 * keeping one extra label for known two-part TLDs (`shop.example.co.uk` ->
 * `example.co.uk`).
 *
 * @param domain URL or domain string; may be null.
 * @returns The root domain, or `null` when the input is empty or has no host.
 */
export function removeProtocolFromDomain(domain: string | null): string | null {
  if (!domain) return null;

  // Remove protocol (http://, https://) and a leading "www."
  const withoutScheme = domain
    .trim()
    .replace(/^(https?:\/\/)?(www\.)?/i, "")
    .toLowerCase();

  // The host ends at the first "/", "?" or "#"; a trailing ":port" is not part
  // of the domain and is removed as well.
  const host = (withoutScheme.split(/[\/?#]/)[0] ?? "").replace(/:\d*$/, "");

  // Nothing left (e.g. "https://" or "/only/a/path"): there is no domain.
  if (!host) return null;

  // Suffixes that span two labels; the registrable domain keeps three labels.
  const twoPartTLDs = [
    "co.uk",
    "com.au",
    "co.nz",
    "co.za",
    "com.br",
    "com.mx",
    "co.jp",
    "com.cn",
  ];

  const labels = host.split(".");
  const hasTwoPartTLD =
    labels.length >= 3 && twoPartTLDs.includes(labels.slice(-2).join("."));

  // slice(-n) returns the whole array when it has fewer than n labels,
  // so short hosts come back unchanged.
  return labels.slice(hasTwoPartTLD ? -3 : -2).join(".");
}

/**
 * Resolve a vendor website from invoice/receipt fields.
 *
 * Sources are tried in order and the first non-empty result wins:
 *   1. the explicit `website` value, normalised by `removeProtocolFromDomain`;
 *   2. the domain of `email`, unless it is a common mailbox provider;
 *   3. the result of `lookupDomainByCompanyName` for `vendorName`.
 *
 * @param website Website as found on the document, if any.
 * @param email Contact email as found on the document, if any.
 * @param vendorName Vendor/company name used for the lookup fallback.
 * @param logger Optional logger; receives a warning when the lookup throws.
 * @returns The resolved domain, or `null` when every source comes up empty.
 */
export async function extractWebsite(
  website: string | null | undefined,
  email: string | null | undefined,
  vendorName: string | null | undefined,
  logger?: ReturnType<typeof import("@midday/logger").createLoggerWithContext>,
): Promise<string | null> {
  // 1) Explicit website field
  const fromWebsite = website ? removeProtocolFromDomain(website) : null;
  if (fromWebsite) return fromWebsite;

  // 2) Domain of the email address, ignoring shared mailbox providers
  const fromEmail = email ? getDomainFromEmail(email) : null;
  if (fromEmail) {
    const mailboxProviders = [
      "gmail.com",
      "yahoo.com",
      "outlook.com",
      "hotmail.com",
      "icloud.com",
      "protonmail.com",
    ];
    const isMailboxProvider = mailboxProviders.includes(fromEmail);
    if (!isMailboxProvider) return fromEmail;
  }

  // 3) Company-name lookup; without a vendor name there is nothing left to try
  if (!vendorName) return null;

  try {
    const resolved = await lookupDomainByCompanyName(vendorName, logger);
    if (resolved) return resolved;
  } catch (error) {
    // A failed lookup is logged and treated as "no result" rather than rethrown.
    logger?.warn("Domain lookup failed during website extraction", {
      vendorName,
      error: error instanceof Error ? error.message : "Unknown error",
    });
  }

  return null;
}

/**
 * Map a MIME type to a document type label.
 *
 * @param mimetype MIME type string, compared exactly (case-sensitive).
 * @returns `"invoice"` for `application/pdf` and `application/octet-stream`,
 *          `"receipt"` for everything else.
 */
export function getDocumentTypeFromMimeType(mimetype: string): string {
  const isInvoiceMimeType =
    mimetype === "application/pdf" || mimetype === "application/octet-stream";

  return isInvoiceMimeType ? "invoice" : "receipt";
}

/**
 * Return the leading part of `text` that fits an approximate token budget.
 *
 * The budget is converted to a word count with the rough heuristic of
 * 0.75 words per token, so the default of 1200 tokens yields at most
 * 900 words. Words are split on any whitespace and re-joined with single
 * spaces.
 *
 * @param text Text to sample from.
 * @param maxTokens Approximate token budget (default 1200).
 * @returns The first words of `text`, joined by single spaces.
 */
export function getContentSample(text: string, maxTokens = 1200): string {
  const approxWordsPerToken = 0.75; // Rough estimate

  // words = tokens * (words per token). Dividing here would allow ~1.78x more
  // words than the budget covers. Clamp at 0 so a negative budget cannot make
  // slice() count from the end of the array.
  const maxWords = Math.max(0, Math.floor(maxTokens * approxWordsPerToken));

  return text.split(/\s+/).slice(0, maxWords).join(" ");
}

// Non-image MIME types accepted for processing, plus HEIC.
const supportedMimeTypesForProcessing = new Set([
  // PDF
  "application/pdf",
  "application/x-pdf",
  // Tabular / plain text
  "text/csv",
  // Word documents
  "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
  "application/docx",
  "text/plain",
  // Presentations
  "application/vnd.openxmlformats-officedocument.presentationml.presentation",
  "application/pptx",
  // Other text formats
  "application/rtf",
  "text/markdown",
  "application/vnd.oasis.opendocument.text",
  // Handled via conversion
  "image/heic",
  // Currently disabled:
  // "application/vnd.apple.pages",
  // "application/x-iwork-pages-sffpages",
  // "application/epub+zip",
]);

/**
 * Tells whether a MIME type is accepted for document or image processing.
 *
 * A type is accepted when it is listed in `supportedMimeTypesForProcessing`
 * or when it starts with `image/`. The comparison is exact and case-sensitive.
 *
 * @param mimetype The MIME type string to check.
 * @returns True if the MIME type is supported, false otherwise.
 */
export function isMimeTypeSupportedForProcessing(mimetype: string): boolean {
  const isListed = supportedMimeTypesForProcessing.has(mimetype);

  // Any image subtype is accepted, listed or not.
  return isListed || mimetype.startsWith("image/");
}

// Matches the start of an RTF group whose whole content is dropped: ignorable
// destinations (`{\*\...}`), font/colour/style tables, embedded objects and
// pictures.
const RTF_SKIPPED_GROUP_START =
  /^\{\\(?:\*|(?:fonttbl|colortbl|stylesheet|object|pict)(?![a-z]))/i;

// One RTF token per match, in priority order:
//   1. `\uN` Unicode escape (capture 1 = N), with optional delimiter space
//      and optional `?` fallback character
//   2. any other control word (capture 2 = name), optional signed numeric
//      parameter, optional delimiter space
//   3. `\'hh` hex escape
//   4. escaped literal `\\`, `\{` or `\}` (capture 3 = the literal)
//   5. bare group brace
const RTF_TOKEN =
  /\\u(-?\d+) ?\??|\\([a-z]+)(?:-?\d+)? ?|\\'[0-9a-f]{2}|\\([\\{}])|[{}]/gi;

// Returns the index just past the `}` that closes the group opening at `start`.
function findRtfGroupEnd(rtf: string, start: number): number {
  let depth = 0;
  for (let i = start; i < rtf.length; i++) {
    const ch = rtf[i];
    if (ch === "\\") {
      // Skip the escaped character so `\{` and `\}` do not affect nesting.
      i++;
      continue;
    }
    if (ch === "{") {
      depth++;
    } else if (ch === "}") {
      depth--;
      if (depth === 0) return i + 1;
    }
  }
  // Unterminated group: treat the rest of the input as part of it.
  return rtf.length;
}

// Removes every group matched by RTF_SKIPPED_GROUP_START, nested groups included.
function stripRtfDestinations(rtf: string): string {
  const kept: string[] = [];
  let copyFrom = 0;
  let i = 0;

  while (i < rtf.length) {
    const ch = rtf[i];
    if (ch === "\\") {
      // Step over the escape pair so an escaped brace is never taken for a group.
      i += 2;
      continue;
    }
    if (ch === "{" && RTF_SKIPPED_GROUP_START.test(rtf.slice(i, i + 16))) {
      kept.push(rtf.slice(copyFrom, i));
      i = findRtfGroupEnd(rtf, i);
      copyFrom = i;
      continue;
    }
    i++;
  }

  kept.push(rtf.slice(copyFrom));
  return kept.join("");
}

/**
 * Extract the plain text from an RTF document.
 *
 * Processing steps:
 *   1. Drop metadata/binary groups (font, colour and style tables, pictures,
 *      objects, `\*` destinations) using balanced-brace matching, so nested
 *      groups do not leak into the text.
 *   2. Translate tokens: `\par` and `\line` become a newline, `\tab` a tab,
 *      `\uN` the corresponding UTF-16 code unit, `\\` `\{` `\}` their literal
 *      character. All other control words, `\'hh` hex escapes and group
 *      braces are removed.
 *   3. Collapse blank lines and runs of spaces/tabs, then trim.
 *
 * Limitation: `\'hh` escapes are dropped rather than decoded, because the
 * document code page is not interpreted here.
 *
 * @param buffer Raw RTF file content, read as UTF-8.
 * @returns The extracted text.
 */
export function extractTextFromRtf(buffer: Buffer): string {
  const body = stripRtfDestinations(buffer.toString("utf-8"));

  const text = body.replace(
    RTF_TOKEN,
    (_token: string, unicode?: string, word?: string, symbol?: string) => {
      if (unicode !== undefined) {
        // \uN carries a signed 16-bit value; negatives wrap into 0x8000-0xFFFF.
        const code = Number(unicode);
        return String.fromCharCode(code < 0 ? code + 0x10000 : code);
      }

      if (word !== undefined) {
        switch (word.toLowerCase()) {
          case "par":
          case "line":
            return "\n";
          case "tab":
            return "\t";
          default:
            // Formatting and other control words carry no text.
            return "";
        }
      }

      // Escaped literal -> the character itself; hex escapes and braces -> "".
      return symbol ?? "";
    },
  );

  return text
    .replace(/\r?\n\s*\r?\n/g, "\n") // multiple newlines -> single
    .replace(/[ \t]{2,}/g, " ") // multiple spaces/tabs -> single
    .trim();
}

// C0 and C1 control characters, excluding the whitespace controls
// U+0009-U+000D (tab, LF, VT, FF, CR). The pattern is assembled from string
// pieces with Unicode escapes to avoid the eslint `no-control-regex` error.
const NON_WHITESPACE_CONTROL_CHARS = new RegExp(
  [
    "[",
    "\\u0000-\\u0008", // C0 controls before tab
    "\\u000E-\\u001F", // C0 controls after carriage return
    "\\u007F-\\u009F", // DEL and C1 controls
    "]",
  ].join(""),
  "g",
);

/**
 * Normalise text to a single line with no control characters.
 *
 * Non-whitespace control characters are removed. Every run of whitespace,
 * including tabs and line breaks, becomes a single space, and the result is
 * trimmed. Tabs and line breaks are deliberately left for the whitespace
 * step: deleting them with the other controls would fuse the words on either
 * side ("a\nb" -> "ab").
 *
 * @param text Text to clean.
 * @returns The cleaned, single-spaced text.
 */
export function cleanText(text: string): string {
  return text
    .replace(NON_WHITESPACE_CONTROL_CHARS, "")
    .replace(/\s+/g, " ")
    .trim();
}

/**
 * Truncate text to at most `maxWords` whitespace-separated words.
 *
 * Text already within the limit is returned unchanged. Longer text is cut to
 * its first `maxWords` words, joined by single spaces.
 *
 * @param text Text to truncate.
 * @param maxWords Maximum number of words to keep; values below 0 count as 0.
 * @returns The original or truncated text; `""` for empty or blank input.
 */
export function limitWords(text: string, maxWords: number): string {
  if (!text) return "";

  // Trim before splitting so leading whitespace does not produce an empty
  // first token that would use up one of the word slots.
  const trimmed = text.trim();
  if (!trimmed) return "";

  const words = trimmed.split(/\s+/); // Split by any whitespace

  if (words.length <= maxWords) {
    return text;
  }

  // Clamp at 0: a negative end index would make slice() count from the end.
  return words.slice(0, Math.max(0, maxWords)).join(" ");
}

export { mapLanguageCodeToPostgresConfig } from "./utils/language-mapping";