Spaces:

dexakif
/

clienttarget

Running

clienttarget / src /discovery /lib /email-classifier.ts

iDevBuddy

feat: Phase 1 — AI Client Acquisition System

bd28470 3 days ago

7.34 kB

	/**
	* Email Classifier — 3-Tier Decision System
	*
	* Tier 1: Hard REJECT (noreply, support, jobs → instant discard)
	* Tier 2: LLM Context Check (operations, admin, system → depends on company size/industry)
	* Tier 3: High confidence KEEP (personal format, ceo@, partnerships@)
	*
	* Key insight: admin@ at a 5-person dental clinic reaches the owner.
	* admin@ at a 500-person corp reaches an assistant. Context matters.
	*/

	import { callLLM } from "../../shared/llm/nvidia-client";
	import { SYSTEM_PROMPTS, buildEmailClassifyPrompt } from "../../shared/llm/prompts";
	import { MODELS } from "../../shared/llm/nvidia-client";
	import { logger } from "../../shared/utils/logger";

	/**
	 * Decision tier that produced a classification:
	 * "reject" (hard deny-list), "context_check" (LLM / size fallback) or "keep" (rule-based keep).
	 */
	export type EmailTier = "reject" | "context_check" | "keep";

	/**
	 * Final verdict attached to a classified address.
	 * "personal", "authority" and "outsourcing" come from the rule-based keep paths,
	 * "context_verified" from a positive context check, and "rejected" from either
	 * the hard deny-list or a negative context check.
	 */
	export type EmailVerdict = "personal" | "authority" | "context_verified" | "outsourcing" | "rejected";

	/** Outcome of classifying a single email address. */
	export interface ClassificationResult {
		/** The address that was classified, as passed in by the caller. */
		email: string;
		/** Which decision tier produced this result. */
		tier: EmailTier;
		/** Keep/reject verdict and the reason category behind it. */
		verdict: EmailVerdict;
		/** Confidence in the verdict (the fixed rule-based paths in this file use 0.4–1.0). */
		confidence: number;
		/** Human-readable guess at who receives mail sent to this address. */
		likelyReaches: string;
		/** Human-readable explanation of why this verdict was chosen. */
		reason: string;
	}

	// ─── Tier 1: ALWAYS REJECT ──────────────────────────────────

	/**
	 * Local parts that are always discarded without consulting the LLM.
	 *
	 * classifyEmail looks these up using the local part with every character
	 * outside a–z removed, so punctuated spellings such as "no-reply" or
	 * "do-not-reply" are reached through their letters-only forms
	 * ("noreply", "donotreply").
	 */
	const HARD_REJECT_PREFIXES = new Set<string>([
		// Automated / system senders
		"noreply",
		"no-reply",
		"no_reply",
		"donotreply",
		"do-not-reply",
		"notifications",
		"automated",
		"bounces",
		"mailer",
		"postmaster",
		"unsubscribe",
		"spam",
		"abuse",
		// Support queues (never reach a decision-maker)
		"support",
		"helpdesk",
		"tickets",
		"complaints",
		"feedback",
		// Recruiting inboxes (irrelevant for outreach)
		"jobs",
		"careers",
		"apply",
		"recruitment",
		"hiring",
		"talent",
	]);

	// ─── Tier 2: CONTEXT-DEPENDENT (LLM decides) ────────────────

	/**
	 * Role and department inboxes whose usefulness depends on the company
	 * (size, industry); these are meant to be decided by the LLM context check.
	 */
	const CONTEXT_CHECK_PREFIXES = new Set<string>([
		"operations",
		"admin",
		"system",
		"info",
		"office",
		"hello",
		"contact",
		"enquiries",
		"general",
		"team",
		"accounts",
		"finance",
		"billing",
		"sales",
		"marketing",
		"hr",
		"legal",
		"compliance",
		"reception",
		"manager",
	]);

	// ─── Tier 3: HIGH CONFIDENCE KEEP ───────────────────────────

	/** Local parts that name a decision-maker role; a match is kept with verdict "authority". */
	const AUTHORITY_PREFIXES = new Set<string>([
		"ceo",
		"owner",
		"founder",
		"president",
		"cto",
		"coo",
		"partner",
		"principal",
		"director",
		"md",
		"gm",
		"head",
	]);

	/**
	 * Local parts treated as a signal that the company works with outside
	 * vendors; a match is kept with verdict "outsourcing".
	 */
	const OUTSOURCING_PREFIXES = new Set<string>([
		"partnerships",
		"vendors",
		"procurement",
		"outsource",
		"collaborate",
		"projects",
		"business",
		"growth",
	]);

	// ─── Personal email pattern (firstname, firstname.lastname) ─
	// Shape check only: two or more lowercase letters, optionally followed by a
	// dot and two or more lowercase letters. Any single lowercase word (e.g.
	// "info", "sales") also satisfies it, so on its own it cannot tell a first
	// name from a role inbox.
	const PERSONAL_PATTERN = /^[a-z]{2,}(\.[a-z]{2,})?$/;
	// One lowercase initial, a dot, then two or more lowercase letters.
	const INITIAL_PATTERN = /^[a-z]\.[a-z]{2,}$/; // j.smith

	/**
	 * Main classifier — determines if an email address is worth pursuing.
	 *
	 * Decision order (first match wins):
	 *   1. Hard-reject prefix            → tier "reject", no LLM call.
	 *   2. Authority prefix              → tier "keep", verdict "authority".
	 *   3. Outsourcing prefix            → tier "keep", verdict "outsourcing".
	 *   4. Known role/department inbox   → LLM context check.
	 *   5. Personal-looking local part   → tier "keep", verdict "personal".
	 *   6. Anything else                 → LLM context check.
	 *
	 * The role-based lookups deliberately run before the personal-format test:
	 * the personal pattern accepts any purely alphabetic word, so testing it first
	 * would label role inboxes such as "info", "admin" or "ceo" as personal and
	 * leave the authority, outsourcing and context-check tiers unreachable.
	 *
	 * @param email - Address to classify. The rule-based tiers inspect only the
	 *   text before the first "@"; the full address is forwarded to the LLM check.
	 * @param companyContext - Company details forwarded to the LLM context check.
	 * @param traceId - Trace identifier forwarded to the LLM context check.
	 * @returns The classification for `email`.
	 */
	export async function classifyEmail(
		email: string,
		companyContext: {
			name: string;
			employeeCount: number | null;
			industry: string;
			websiteSnippet: string;
		},
		traceId: string
	): Promise<ClassificationResult> {
		// Local part as written (lower-cased): used for the format patterns and in messages.
		const fullPrefix = (email.split("@")[0] ?? "").toLowerCase();
		// Letters-only form, so "no-reply", "no_reply" and "noreply" share one lookup key.
		const prefix = fullPrefix.replace(/[^a-z]/g, "");

		// ── Tier 1: Hard reject ────────────────────────────────────
		if (HARD_REJECT_PREFIXES.has(prefix)) {
			return {
				email,
				tier: "reject",
				verdict: "rejected",
				confidence: 1.0,
				likelyReaches: "automated inbox or department queue",
				reason: `"${fullPrefix}@" is a known non-personal email type`,
			};
		}

		// ── Tier 3: Authority prefix → instant keep ────────────────
		if (AUTHORITY_PREFIXES.has(prefix)) {
			return {
				email,
				tier: "keep",
				verdict: "authority",
				confidence: 0.90,
				likelyReaches: `${prefix.toUpperCase()} or equivalent executive`,
				reason: `"${fullPrefix}@" is a known decision-maker prefix`,
			};
		}

		// ── Tier 3: Outsourcing signal → keep ──────────────────────
		if (OUTSOURCING_PREFIXES.has(prefix)) {
			return {
				email,
				tier: "keep",
				verdict: "outsourcing",
				confidence: 0.80,
				likelyReaches: "vendor/partnership manager (purchasing authority likely)",
				reason: `"${fullPrefix}@" signals company outsources services`,
			};
		}

		// ── Tier 2: Known role inbox → ask LLM ─────────────────────
		// Must precede the personal-format test, which these words would also satisfy.
		if (CONTEXT_CHECK_PREFIXES.has(prefix)) {
			return contextCheckWithLLM(email, companyContext, traceId);
		}

		// ── Tier 3: Personal format → instant keep ─────────────────
		// Reached only when the local part is not a known role word.
		if (PERSONAL_PATTERN.test(fullPrefix) || INITIAL_PATTERN.test(fullPrefix)) {
			return {
				email,
				tier: "keep",
				verdict: "personal",
				confidence: 0.95,
				likelyReaches: "individual person (personal email format)",
				reason: `"${fullPrefix}@" matches personal email pattern`,
			};
		}

		// ── Unknown prefix → default to LLM context check ─────────
		return contextCheckWithLLM(email, companyContext, traceId);
	}

	/**
	 * LLM-powered context check for ambiguous email prefixes.
	 * Uses the FAST model to save tokens — this is a simple classification.
	 *
	 * Outcomes:
	 *   - Parsed LLM answer  → verdict follows the model's `keep` flag; the reported
	 *     confidence is sanitised to a finite number in [0, 1].
	 *   - No parsed answer   → kept with confidence 0.5.
	 *   - Error in the try block (e.g. thrown by the LLM call) → logged, then
	 *     decided by a company-size heuristic with confidence 0.4.
	 *
	 * @param email - Address being classified.
	 * @param context - Company details passed into the classification prompt.
	 * @param traceId - Trace identifier passed through to the LLM call.
	 * @returns A result whose tier is always "context_check".
	 */
	async function contextCheckWithLLM(
		email: string,
		context: {
			name: string;
			employeeCount: number | null;
			industry: string;
			websiteSnippet: string;
		},
		traceId: string
	): Promise<ClassificationResult> {
		try {
			const response = await callLLM({
				operation: "email_classify",
				model: MODELS.FAST, // 8B model — fast + cheap for simple classification
				systemPrompt: SYSTEM_PROMPTS.EMAIL_CLASSIFIER,
				userPrompt: buildEmailClassifyPrompt({
					email,
					company_name: context.name,
					company_size: context.employeeCount,
					industry: context.industry,
					website_snippet: context.websiteSnippet,
				}),
				temperature: 0.1,
				maxTokens: 200,
				jsonMode: true,
				traceId,
			});

			if (response.parsed) {
				// Only an explicit boolean true counts as "keep"; anything else rejects.
				const keep = response.parsed.keep === true;

				// Model output is untrusted: a non-numeric value would become NaN and an
				// out-of-range one would break comparisons against the fixed 0–1 scores
				// used elsewhere, so fall back to 0.5 and clamp to [0, 1].
				const rawConfidence = Number(response.parsed.confidence ?? 0.5);
				const confidence = Number.isFinite(rawConfidence)
					? Math.min(1, Math.max(0, rawConfidence))
					: 0.5;

				return {
					email,
					tier: "context_check",
					verdict: keep ? "context_verified" : "rejected",
					confidence,
					likelyReaches: String(response.parsed.likely_reaches ?? "unknown"),
					reason: String(response.parsed.reason ?? "LLM context check"),
				};
			}

			// LLM failed to respond properly → conservative: keep it, low confidence
			return {
				email,
				tier: "context_check",
				verdict: "context_verified",
				confidence: 0.5,
				likelyReaches: "unknown — LLM parse failed",
				reason: "LLM context check failed — keeping with low confidence",
			};
		} catch (err) {
			logger.warn({ email, err }, "Email LLM classify failed — keeping conservatively");

			// Fallback: rule-based size heuristic. An unknown employee count is treated
			// as 0, i.e. as a small company, so the address is kept.
			const isSmall = (context.employeeCount ?? 0) < 30;
			return {
				email,
				tier: "context_check",
				verdict: isSmall ? "context_verified" : "rejected",
				confidence: 0.4,
				likelyReaches: isSmall ? "likely owner/manager (small company)" : "likely department inbox (large company)",
				reason: `Fallback: company size ${context.employeeCount ?? "unknown"} → ${isSmall ? "small=keep" : "large=reject"}`,
			};
		}
	}