clienttarget-python / src /discovery /lib /deduplicator.ts
iDevBuddy
feat: Phase 1 — AI Client Acquisition System
bd28470
raw
history blame
2.69 kB
import { distance } from "fastest-levenshtein";
import { getSupabaseClient } from "../../shared/supabase/client";
import { logger } from "../../shared/utils/logger";
/**
 * Checks whether a company already exists in the Supabase `companies` table.
 *
 * Strategy:
 *  1. Exact match on the normalized domain (cheapest check).
 *  2. Fallback: fuzzy name match (Levenshtein-based similarity) against the
 *     most recently discovered companies.
 *
 * Query failures are logged and treated as "no match" for that step, so the
 * function keeps its best-effort contract and never throws because of a
 * failed lookup.
 *
 * @param domain - Candidate company domain; normalized before the lookup.
 * @param name - Candidate company name; normalized before comparison.
 * @returns `{ isDupe: true, existingId }` when a match is found, otherwise
 *          `{ isDupe: false }`.
 */
export async function isDuplicate(
  domain: string,
  name: string
): Promise<{ isDupe: boolean; existingId?: string }> {
  // Number of most-recent rows scanned by the fuzzy name fallback.
  const recentScanLimit = 500;
  // Minimum normalized similarity (0..1) for two names to count as the same company.
  const similarityThreshold = 0.88;

  const db = getSupabaseClient();

  // ── 1. Exact domain match (fastest) ─────────────────────────
  // `.limit(1)` keeps `maybeSingle()` from erroring when the table already
  // holds several rows for the same domain; any one of them proves a duplicate.
  const { data: byDomain, error: domainError } = await db
    .from("companies")
    .select("id, domain, name")
    .eq("domain", normalizeDomain(domain))
    .limit(1)
    .maybeSingle();

  if (domainError) {
    logger.warn({ domain, error: domainError }, "Duplicate check: domain lookup failed");
  }

  if (byDomain) {
    logger.debug({ domain, existingId: byDomain.id }, "Duplicate: exact domain match");
    return { isDupe: true, existingId: byDomain.id };
  }

  // ── 2. Fuzzy name match against recent records ────────────────
  const { data: recent, error: recentError } = await db
    .from("companies")
    .select("id, name")
    .order("discovered_at", { ascending: false })
    .limit(recentScanLimit);

  if (recentError) {
    logger.warn({ name, error: recentError }, "Duplicate check: recent companies lookup failed");
  }

  if (!recent) return { isDupe: false };

  const normalizedInput = normalizeName(name);
  // A name that normalizes to nothing carries no signal to compare on.
  if (normalizedInput.length === 0) return { isDupe: false };

  for (const existing of recent) {
    // Rows without a usable name cannot be compared.
    if (typeof existing.name !== "string") continue;

    const normalizedExisting = normalizeName(existing.name);
    const maxLen = Math.max(normalizedInput.length, normalizedExisting.length);
    // Edit distance scaled to 0..1 by the longer string, then inverted:
    // 1 means identical, 0 means nothing in common.
    const similarity = 1 - distance(normalizedInput, normalizedExisting) / maxLen;

    if (similarity >= similarityThreshold) {
      logger.debug(
        { input: name, existing: existing.name, similarity: similarity.toFixed(2) },
        "Duplicate: fuzzy name match"
      );
      return { isDupe: true, existingId: existing.id };
    }
  }

  return { isDupe: false };
}
/**
 * Checks the `suppression_list` table before any processing.
 *
 * The lookup matches either the domain exactly as given or its normalized
 * form (lowercased, without a leading `www.` or a trailing slash), so
 * cosmetic variants of a suppressed domain cannot slip through.
 *
 * A failed query is logged and reported as "not suppressed", which is the
 * same result callers got before when the lookup failed.
 *
 * @param domain - Domain to look up.
 * @returns `true` when a matching suppression entry exists, otherwise `false`.
 */
export async function isSuppressed(domain: string): Promise<boolean> {
  const db = getSupabaseClient();

  // Raw form first (previous behavior), plus the normalized form; de-duplicated
  // so an already-normalized input produces a single candidate.
  const candidates = Array.from(new Set([domain, normalizeDomain(domain)]));

  // `.limit(1)` keeps `maybeSingle()` from erroring when several rows match.
  const { data, error } = await db
    .from("suppression_list")
    .select("id")
    .in("domain", candidates)
    .limit(1)
    .maybeSingle();

  if (error) {
    logger.warn({ domain, error }, "Suppression check: lookup failed");
  }

  return !!data;
}
// ─── Helpers ─────────────────────────────────────────────────
/**
 * Canonicalizes a domain for equality comparison.
 *
 * Surrounding whitespace is removed first so that the anchored patterns
 * below (`^www.` and a trailing `/`) actually see the first and last
 * meaningful characters of the input.
 *
 * @param domain - Raw domain string.
 * @returns The trimmed, lowercased domain without a leading `www.` or a
 *          single trailing slash.
 */
function normalizeDomain(domain: string): string {
  return domain
    .trim()
    .toLowerCase()
    .replace(/^www\./, "")
    .replace(/\/$/, "");
}
/**
 * Canonicalizes a company name for fuzzy comparison.
 *
 * Steps, in order:
 *  - lowercase;
 *  - fold accented letters to their base letter (NFKD decomposition, then
 *    removal of combining marks) so "Nestlé" and "Nestle" compare equal
 *    instead of the accented letter being deleted;
 *  - remove common legal-entity suffixes (inc, ltd, llc, ...), with an
 *    optional trailing period;
 *  - remove every character that is not a-z, 0-9 or whitespace;
 *  - collapse whitespace runs to one space, so a suffix removed from the
 *    middle of a name does not leave a double space that inflates the
 *    edit distance;
 *  - trim.
 *
 * @param name - Raw company name.
 * @returns The normalized name; may be empty when nothing comparable remains.
 */
function normalizeName(name: string): string {
  return name
    .toLowerCase()
    .normalize("NFKD")
    .replace(/[\u0300-\u036f]/g, "")
    .replace(/\b(inc|ltd|llc|corp|co|limited|plc|gmbh|pty|pvt|srl|bv|ag|sa)\b\.?/gi, "")
    .replace(/[^a-z0-9\s]/g, "")
    .replace(/\s+/g, " ")
    .trim();
}