File size: 2,690 Bytes
bd28470
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import { distance } from "fastest-levenshtein";
import { getSupabaseClient } from "../../shared/supabase/client";
import { logger } from "../../shared/utils/logger";

/**
 * Checks whether a company already exists in the `companies` table.
 *
 * Lookup order:
 *  1. Exact match on the normalized domain (cheapest).
 *  2. Fuzzy match of the normalized name against the 500 most recently
 *     discovered companies, using normalized Levenshtein similarity
 *     (`1 - distance / maxLength`) with a threshold of 0.88.
 *
 * Query failures are logged and treated as "no match" so that a transient
 * database problem does not abort the caller.
 *
 * @param domain - Company domain; normalized with `normalizeDomain` before lookup.
 * @param name - Company name; normalized with `normalizeName` before comparison.
 * @returns `{ isDupe: true, existingId }` when a match is found,
 *          otherwise `{ isDupe: false }`.
 */
export async function isDuplicate(
  domain: string,
  name: string
): Promise<{ isDupe: boolean; existingId?: string }> {
  const similarityThreshold = 0.88;
  const recentWindow = 500;
  const db = getSupabaseClient();

  // ── 1. Exact domain match (fastest) ─────────────────────────
  // limit(1) keeps maybeSingle() from reporting an error (and yielding no
  // data) when several rows already share the same domain.
  const { data: byDomain, error: domainError } = await db
    .from("companies")
    .select("id, domain, name")
    .eq("domain", normalizeDomain(domain))
    .limit(1)
    .maybeSingle();

  if (domainError) {
    logger.warn({ domain, err: domainError }, "Duplicate check: domain lookup failed");
  }

  if (byDomain) {
    logger.debug({ domain, existingId: byDomain.id }, "Duplicate: exact domain match");
    return { isDupe: true, existingId: byDomain.id };
  }

  // ── 2. Fuzzy name match against recent records ────────────────
  const normalizedInput = normalizeName(name);

  // A name that normalizes to nothing can never reach the threshold
  // (similarity would be 0 or NaN), so skip the query entirely.
  if (normalizedInput.length === 0) return { isDupe: false };

  const { data: recent, error: recentError } = await db
    .from("companies")
    .select("id, name")
    .order("discovered_at", { ascending: false })
    .limit(recentWindow);

  if (recentError) {
    logger.warn({ name, err: recentError }, "Duplicate check: recent companies lookup failed");
  }

  if (!recent) return { isDupe: false };

  for (const existing of recent) {
    // Rows without a usable name cannot be compared.
    if (typeof existing.name !== "string") continue;

    const normalizedExisting = normalizeName(existing.name);
    const dist = distance(normalizedInput, normalizedExisting);
    // normalizedInput is non-empty here, so maxLen is always > 0.
    const maxLen = Math.max(normalizedInput.length, normalizedExisting.length);
    const similarity = 1 - dist / maxLen;

    if (similarity >= similarityThreshold) {
      logger.debug(
        { input: name, existing: existing.name, similarity: similarity.toFixed(2) },
        "Duplicate: fuzzy name match"
      );
      return { isDupe: true, existingId: existing.id };
    }
  }

  return { isDupe: false };
}

/**
 * Checks the `suppression_list` table for a domain before any processing.
 *
 * The lookup matches either the domain exactly as given or its normalized
 * form (see `normalizeDomain`), so inputs such as `WWW.Example.com/` still
 * hit an entry stored as `example.com`.
 *
 * @param domain - Domain to look up.
 * @returns `true` when a matching suppression entry exists, otherwise `false`.
 */
export async function isSuppressed(domain: string): Promise<boolean> {
  const db = getSupabaseClient();

  // De-duplicate in case the input is already in normalized form.
  const candidates = [...new Set([domain, normalizeDomain(domain)])];

  // limit(1) keeps maybeSingle() from reporting an error when the raw and
  // normalized forms (or duplicate entries) match more than one row.
  const { data, error } = await db
    .from("suppression_list")
    .select("id")
    .in("domain", candidates)
    .limit(1)
    .maybeSingle();

  if (error) {
    // NOTE(review): a failed lookup still reports "not suppressed", as before;
    // consider failing closed if callers can handle a thrown error.
    logger.warn({ domain, err: error }, "Suppression check: lookup failed");
  }

  return !!data;
}

// ─── Helpers ─────────────────────────────────────────────────

/**
 * Canonicalizes a domain for equality comparison.
 *
 * Trims surrounding whitespace, lowercases, removes a leading `www.` and a
 * single trailing slash.
 *
 * @param domain - Raw domain string.
 * @returns The canonical form, e.g. `"  WWW.Example.com/ "` -> `"example.com"`.
 */
function normalizeDomain(domain: string): string {
  // Trim first: the `^www\.` and `\/$` anchors only match when no
  // whitespace surrounds the value.
  return domain.trim().toLowerCase().replace(/^www\./, "").replace(/\/$/, "");
}

/**
 * Canonicalizes a company name for fuzzy comparison.
 *
 * Steps: lowercase, fold accented letters to their base letter, drop common
 * legal-entity suffixes (inc, ltd, llc, ...), remove everything that is not
 * `a-z`, `0-9` or whitespace, collapse whitespace runs to a single space,
 * and trim.
 *
 * @param name - Raw company name.
 * @returns The canonical form, e.g. `"Acme Inc. Holdings"` -> `"acme holdings"`.
 */
function normalizeName(name: string): string {
  return name
    .toLowerCase()
    // Decompose accented characters and drop the combining marks so that
    // "nestlé" becomes "nestle" instead of losing the letter entirely.
    .normalize("NFKD")
    .replace(/[\u0300-\u036f]/g, "")
    .replace(/\b(inc|ltd|llc|corp|co|limited|plc|gmbh|pty|pvt|srl|bv|ag|sa)\b\.?/gi, "")
    .replace(/[^a-z0-9\s]/g, "")
    // Removing a suffix or punctuation in the middle of a name leaves
    // doubled spaces, which would inflate the edit distance.
    .replace(/\s+/g, " ")
    .trim();
}