Spaces:

dexakif
/

clienttarget-python

Running

File size: 7,336 Bytes

bd28470

/**
 * Email Classifier — 3-Tier Decision System
 * 
 * Tier 1: Hard REJECT (noreply, support, jobs → instant discard)
 * Tier 2: LLM Context Check (operations, admin, system → depends on company size/industry)
 * Tier 3: High confidence KEEP (personal format, ceo@, partnerships@)
 * 
 * Key insight: admin@ at a 5-person dental clinic reaches the owner.
 * admin@ at a 500-person corp reaches an assistant. Context matters.
 */

import { callLLM } from "../../shared/llm/nvidia-client";
import { SYSTEM_PROMPTS, buildEmailClassifyPrompt } from "../../shared/llm/prompts";
import { MODELS } from "../../shared/llm/nvidia-client";
import { logger } from "../../shared/utils/logger";

/**
 * Stage of the classifier that produced a result: "reject" for the hard-reject
 * rule, "keep" for the rule-based keep branches, "context_check" for anything
 * resolved by the LLM context check (including its fallbacks).
 */
export type EmailTier = "reject" | "context_check" | "keep";
/**
 * Label attached to a classified address. "personal", "authority" and
 * "outsourcing" come from the rule-based keep branches, "context_verified"
 * from a context check that decided to keep, and "rejected" from either the
 * hard-reject rule or a context check that decided against the address.
 */
export type EmailVerdict = "personal" | "authority" | "context_verified" | "outsourcing" | "rejected";

/** Outcome of classifying a single email address. */
export interface ClassificationResult {
  /** The address as it was passed to the classifier. */
  email: string;
  /** Which classifier stage produced this result. */
  tier: EmailTier;
  /** Keep/reject label for the address. */
  verdict: EmailVerdict;
  /**
   * Certainty score, higher meaning more certain. Rule-based branches use
   * fixed values; the context check uses the model-reported value or a low
   * fallback. Intended to lie in 0–1 (every constant in this file does).
   */
  confidence: number;
  /** Human-readable guess at who reads mail sent to this address. */
  likelyReaches: string;
  /** Human-readable explanation of why this verdict was chosen. */
  reason: string;
}

// ─── Tier 1: ALWAYS REJECT ──────────────────────────────────

/**
 * Local parts that `classifyEmail` rejects immediately, without consulting
 * the LLM.
 *
 * The lookup key used by `classifyEmail` has every non-letter stripped, so the
 * punctuated spellings below ("no-reply", "no_reply", "do-not-reply") are in
 * practice covered by their letters-only forms; they are kept for readability.
 */
const HARD_REJECT_PREFIXES = new Set<string>([
  // Machine-generated senders and mail plumbing
  "noreply",
  "no-reply",
  "no_reply",
  "donotreply",
  "do-not-reply",
  "notifications",
  "automated",
  "bounces",
  "mailer",
  "postmaster",
  "unsubscribe",
  "spam",
  "abuse",
  // Support queues — these do not reach a decision-maker
  "support",
  "helpdesk",
  "tickets",
  "complaints",
  "feedback",
  // Recruiting inboxes — irrelevant for outreach
  "jobs",
  "careers",
  "apply",
  "recruitment",
  "hiring",
  "talent",
]);

// ─── Tier 2: CONTEXT-DEPENDENT (LLM decides) ────────────────

/**
 * Role-style local parts whose usefulness depends on the company behind them
 * (size, industry). They are meant to be resolved by the LLM context check
 * rather than by a fixed keep/reject rule.
 */
const CONTEXT_CHECK_PREFIXES = new Set<string>([
  // Generic front-door and back-office inboxes
  "operations",
  "admin",
  "system",
  "info",
  "office",
  "hello",
  "contact",
  "enquiries",
  "general",
  "team",
  // Departmental inboxes
  "accounts",
  "finance",
  "billing",
  "sales",
  "marketing",
  "hr",
  "legal",
  "compliance",
  "reception",
  "manager",
]);

// ─── Tier 3: HIGH CONFIDENCE KEEP ───────────────────────────

/**
 * Local parts that name a decision-making role. `classifyEmail` keeps these
 * with the "authority" verdict.
 */
const AUTHORITY_PREFIXES = new Set<string>([
  "ceo",
  "owner",
  "founder",
  "president",
  "cto",
  "coo",
  "partner",
  "principal",
  "director",
  "md",
  "gm",
  "head",
]);

/**
 * Local parts that suggest the company works with outside vendors.
 * `classifyEmail` keeps these with the "outsourcing" verdict.
 */
const OUTSOURCING_PREFIXES = new Set<string>([
  "partnerships",
  "vendors",
  "procurement",
  "outsource",
  "collaborate",
  "projects",
  "business",
  "growth",
]);

// ─── Personal email pattern (firstname, firstname.lastname) ─
// Both patterns expect an already lower-cased local part (they only accept a-z).
//
// PERSONAL_PATTERN: two or more letters, optionally followed by one dot and two
// or more letters — e.g. "john" or "john.smith". Digits, hyphens, underscores
// and a second dot all prevent a match.
// NOTE: this also matches any plain word, including role names such as "admin",
// "info" or "ceo", so a match alone does not prove the address is personal.
const PERSONAL_PATTERN = /^[a-z]{2,}(\.[a-z]{2,})?$/;
// INITIAL_PATTERN: a single initial, a dot, then two or more letters.
const INITIAL_PATTERN = /^[a-z]\.[a-z]{2,}$/;  // j.smith

/**
 * Main classifier — determines if email is worth pursuing.
 *
 * Only the local part (the text before the first "@") is inspected. Checks run
 * in this order and the first match wins:
 *
 *   1. Empty local part or hard-reject prefix → tier "reject"
 *   2. Authority prefix                       → tier "keep", verdict "authority"
 *   3. Outsourcing prefix                     → tier "keep", verdict "outsourcing"
 *   4. Context-dependent role prefix          → LLM context check
 *   5. Personal-looking local part            → tier "keep", verdict "personal"
 *   6. Anything else                          → LLM context check
 *
 * Role prefixes are deliberately tested before the personal pattern: that
 * pattern accepts any run of two or more letters, so words such as "admin",
 * "info" or "ceo" would otherwise be labelled "personal" and never reach their
 * own branch.
 *
 * @param email - Address to classify. It is returned unchanged in the result.
 * @param companyContext - Company details forwarded to the LLM context check.
 * @param traceId - Trace identifier forwarded to the LLM context check.
 * @returns The classification for `email`.
 */
export async function classifyEmail(
  email: string,
  companyContext: {
    name: string;
    employeeCount: number | null;
    industry: string;
    websiteSnippet: string;
  },
  traceId: string
): Promise<ClassificationResult> {
  // split() always yields at least one element; the default only exists so the
  // destructure stays type-safe under noUncheckedIndexedAccess.
  const [localPart = ""] = email.split("@");
  const fullPrefix = localPart.toLowerCase();
  // Letters-only key for the set lookups, so "no-reply", "info2" and "c.e.o"
  // match "noreply", "info" and "ceo".
  const prefix = fullPrefix.replace(/[^a-z]/g, "");

  // ── Guard: nothing before "@" → not a usable address ───────
  // Rejecting here avoids spending an LLM call on an empty local part.
  if (fullPrefix.length === 0) {
    return {
      email,
      tier: "reject",
      verdict: "rejected",
      confidence: 1.0,
      likelyReaches: "nobody (address has no local part)",
      reason: 'Email has no local part before "@"',
    };
  }

  // ── Tier 1: Hard reject ────────────────────────────────────
  if (HARD_REJECT_PREFIXES.has(prefix)) {
    return {
      email,
      tier: "reject",
      verdict: "rejected",
      confidence: 1.0,
      likelyReaches: "automated inbox or department queue",
      reason: `"${fullPrefix}@" is a known non-personal email type`,
    };
  }

  // ── Tier 3: Authority prefix → instant keep ────────────────
  if (AUTHORITY_PREFIXES.has(prefix)) {
    return {
      email,
      tier: "keep",
      verdict: "authority",
      confidence: 0.90,
      likelyReaches: `${prefix.toUpperCase()} or equivalent executive`,
      reason: `"${fullPrefix}@" is a known decision-maker prefix`,
    };
  }

  // ── Tier 3: Outsourcing signal → keep ──────────────────────
  if (OUTSOURCING_PREFIXES.has(prefix)) {
    return {
      email,
      tier: "keep",
      verdict: "outsourcing",
      confidence: 0.80,
      likelyReaches: "vendor/partnership manager (purchasing authority likely)",
      reason: `"${fullPrefix}@" signals company outsources services`,
    };
  }

  // ── Tier 2: Context check needed → ask LLM ────────────────
  // Must come before the personal pattern, which would also match these words.
  if (CONTEXT_CHECK_PREFIXES.has(prefix)) {
    return contextCheckWithLLM(email, companyContext, traceId);
  }

  // ── Tier 3: Personal format → instant keep ─────────────────
  // Only reached once every known role word has been ruled out above.
  if (PERSONAL_PATTERN.test(fullPrefix) || INITIAL_PATTERN.test(fullPrefix)) {
    return {
      email,
      tier: "keep",
      verdict: "personal",
      confidence: 0.95,
      likelyReaches: "individual person (personal email format)",
      reason: `"${fullPrefix}@" matches personal email pattern`,
    };
  }

  // ── Unknown prefix → default to LLM context check ─────────
  return contextCheckWithLLM(email, companyContext, traceId);
}

/**
 * LLM-powered context check for ambiguous email prefixes.
 * Uses FAST model (8B) to save tokens — this is a simple classification.
 *
 * Sends the address and company context to `callLLM` in JSON mode and maps the
 * reply onto a `ClassificationResult` with tier "context_check":
 *
 *   - Parsed reply: verdict is "context_verified" only when `keep` is exactly
 *     `true`, otherwise "rejected". The reported confidence is sanitized:
 *     missing or non-numeric values become 0.5, everything else is clamped
 *     to the 0–1 range.
 *   - No parsed reply: kept as "context_verified" at confidence 0.5.
 *   - Error thrown while calling the LLM: logged as a warning, then decided by
 *     company size — fewer than 30 employees (or unknown size) keeps,
 *     otherwise rejects — at confidence 0.4.
 *
 * @param email - Address being classified; returned unchanged in the result.
 * @param context - Company details passed into the classification prompt.
 * @param traceId - Trace identifier forwarded to `callLLM`.
 * @returns The context-check classification for `email`.
 */
async function contextCheckWithLLM(
  email: string,
  context: {
    name: string;
    employeeCount: number | null;
    industry: string;
    websiteSnippet: string;
  },
  traceId: string
): Promise<ClassificationResult> {
  try {
    const response = await callLLM({
      operation: "email_classify",
      model: MODELS.FAST,    // 8B model — fast + cheap for simple classification
      systemPrompt: SYSTEM_PROMPTS.EMAIL_CLASSIFIER,
      userPrompt: buildEmailClassifyPrompt({
        email,
        company_name: context.name,
        company_size: context.employeeCount,
        industry: context.industry,
        website_snippet: context.websiteSnippet,
      }),
      temperature: 0.1,
      maxTokens: 200,
      jsonMode: true,
      traceId,
    });

    const parsed = response.parsed;
    if (parsed) {
      // Model output is untrusted: it may answer "high", 85, or -1. Never let
      // NaN or an out-of-range score leak into downstream comparisons.
      const reported = Number(parsed.confidence ?? 0.5);
      const confidence = Number.isFinite(reported)
        ? Math.min(1, Math.max(0, reported))
        : 0.5;

      return {
        email,
        tier: "context_check",
        // Strict comparison on purpose: anything but a literal `true` rejects.
        verdict: parsed.keep === true ? "context_verified" : "rejected",
        confidence,
        likelyReaches: String(parsed.likely_reaches ?? "unknown"),
        reason: String(parsed.reason ?? "LLM context check"),
      };
    }

    // LLM failed to respond properly → conservative: keep it, low confidence
    return {
      email,
      tier: "context_check",
      verdict: "context_verified",
      confidence: 0.5,
      likelyReaches: "unknown — LLM parse failed",
      reason: "LLM context check failed — keeping with low confidence",
    };

  } catch (err) {
    logger.warn({ email, err }, "Email LLM classify failed — keeping conservatively");

    // Fallback: rule-based size heuristic. An unknown head count is treated
    // as 0, i.e. as a small company, so the address is kept.
    const isSmall = (context.employeeCount ?? 0) < 30;
    return {
      email,
      tier: "context_check",
      verdict: isSmall ? "context_verified" : "rejected",
      confidence: 0.4,
      likelyReaches: isSmall ? "likely owner/manager (small company)" : "likely department inbox (large company)",
      reason: `Fallback: company size ${context.employeeCount ?? "unknown"} → ${isSmall ? "small=keep" : "large=reject"}`,
    };
  }
}