clienttarget / src /discovery /lib /normalizer.ts
iDevBuddy
feat: Phase 1 — AI Client Acquisition System
bd28470
import { InsertCompany } from "../../shared/supabase/schema";
import { ScrapedCompany } from "./web-scraper";
import { LinkedInCompanyData } from "./linkedin-scraper";
import { SerperResult } from "../providers/serper";
/**
 * Merges the raw records gathered for one company into a single
 * `InsertCompany` row.
 *
 * For name, description, employee count, employee range and industry the
 * first non-nullish value wins, checked in this order:
 * LinkedIn > scraped website > Serper search result (or a computed fallback).
 *
 * @param serperResult Search hit that led to the company; its title, snippet
 *   and link are also stored verbatim under `raw_data`.
 * @param website Data scraped from the company website; supplies the domain,
 *   tech stack and job postings.
 * @param linkedin LinkedIn company data, or `null` when none is available.
 * @param region Region code of the discovery run; stored as-is and used to
 *   derive the country when LinkedIn gives no headquarters.
 * @param source Label identifying where the lead came from; stored as-is.
 * @returns The record to insert, always with status `"discovered"`.
 */
export function normalizeCompany(
  serperResult: SerperResult,
  website: ScrapedCompany,
  linkedin: LinkedInCompanyData | null,
  region: string,
  source: string
): InsertCompany {
  const { title, snippet, link } = serperResult;

  // Descriptive fields: LinkedIn first, then the website, then a fallback.
  const name = linkedin?.name ?? website.name ?? cleanTitle(title);
  const description = linkedin?.description ?? website.description ?? snippet;
  const industry = linkedin?.industry ?? website.industry ?? null;

  // Headcount: the range is only estimated when neither source states one.
  const employeeCount = linkedin?.employeeCount ?? website.employeeCount ?? null;
  const employeeRange =
    linkedin?.employeeRange ?? website.employeeRange ?? estimateRange(employeeCount);

  // Country comes from the LinkedIn headquarters string when there is one,
  // otherwise from the region code.
  const headquarters = linkedin?.headquarters;
  const country = headquarters
    ? extractCountry(headquarters)
    : regionToCountry(region);

  // The search link is only inspected for a LinkedIn company URL when LinkedIn
  // data is present; in every other case the website's own link is used.
  const urlFromSearchLink =
    linkedin !== null ? extractLinkedInCompanyUrl(link) : null;
  const linkedinUrl = urlFromSearchLink ?? website.linkedinUrl;

  return {
    domain: website.domain,
    name: name ?? "Unknown",
    industry,
    employee_count: employeeCount,
    employee_range: employeeRange,
    // Descriptions are truncated to 1000 characters.
    description: description?.slice(0, 1000) ?? null,
    website_url: `https://${website.domain}`,
    linkedin_url: linkedinUrl ?? null,
    country,
    region,
    tech_stack: website.techStack,
    growth_signals: buildGrowthSignals(website, linkedin),
    raw_data: {
      serper_title: title,
      serper_snippet: snippet,
      serper_link: link,
    },
    source,
    status: "discovered",
  };
}
// ─── Helpers ─────────────────────────────────────────────────
/**
 * Derives a company name from a search-result page title.
 *
 * The title is split into segments on "|" and on dashes used as separators,
 * boilerplate words ("home", "official", "website", "welcome to") are removed
 * from each segment, and the first segment that still has content is returned.
 * Skipping empty segments means "Home | Acme Corp" yields "Acme Corp" rather
 * than an empty string.
 *
 * @param title Raw page title from the search result.
 * @returns The cleaned name, or "" when nothing but boilerplate remains.
 */
function cleanTitle(title: string): string {
  // "|" always separates segments. A hyphen, en dash or em dash only counts as
  // a separator when it has whitespace on at least one side, so hyphenated
  // names such as "Coca-Cola" are kept whole.
  const separator = /\s*\|\s*|\s+[-\u2013\u2014]+\s*|\s*[-\u2013\u2014]+\s+/;
  const boilerplate = /\b(home|official|website|welcome to)\b/gi;

  for (const segment of title.split(separator)) {
    const cleaned = segment
      .replace(boilerplate, "")
      .replace(/\s+/g, " ") // removing a word can leave a double space behind
      .trim();
    if (cleaned.length > 0) return cleaned;
  }
  return "";
}
/**
 * Maps an exact employee count onto a coarse head-count band label.
 *
 * @param count Known employee count, or `null` when unknown.
 * @returns The band label, or `null` when `count` is falsy (`null`, `0`, `NaN`).
 *   Any count below 50 is labelled "10-49", including counts under 10.
 */
function estimateRange(count: number | null): string | null {
  if (!count) {
    return null;
  }

  // Each entry is [exclusive upper bound, label], in ascending order.
  const bands: ReadonlyArray<readonly [number, string]> = [
    [50, "10-49"],
    [100, "50-99"],
    [200, "100-199"],
    [500, "200-499"],
    [1000, "500-999"],
  ];

  for (const [upperBound, label] of bands) {
    if (count < upperBound) {
      return label;
    }
  }
  return "1000+";
}
/**
 * Extracts the country from a comma-separated headquarters string such as
 * "San Francisco, California, United States".
 *
 * The country is taken to be the last non-empty segment, so a trailing comma
 * or trailing whitespace does not produce an empty result.
 *
 * @param headquarters Headquarters location text.
 * @returns The trimmed last non-empty segment, or `null` when the input holds
 *   no non-blank segment.
 */
function extractCountry(headquarters: string): string | null {
  const segments = headquarters
    .split(",")
    .map((segment) => segment.trim())
    .filter((segment) => segment.length > 0);

  return segments[segments.length - 1] ?? null;
}
/**
 * Region codes with a known full country name.
 *
 * Kept in a `Map` rather than a plain object so that lookups never resolve
 * inherited object properties such as "constructor" or "toString".
 */
const REGION_COUNTRY_NAMES: ReadonlyMap<string, string> = new Map([
  ["US", "United States"],
  ["UK", "United Kingdom"],
  ["AU", "Australia"],
  ["UAE", "United Arab Emirates"],
  ["SA", "Saudi Arabia"],
  ["SG", "Singapore"],
]);

/**
 * Expands a region code into a full country name.
 *
 * @param region Region code, matched exactly (case-sensitive).
 * @returns The country name for a known code; otherwise `region` unchanged.
 */
function regionToCountry(region: string): string {
  return REGION_COUNTRY_NAMES.get(region) ?? region;
}
/**
 * Pulls the LinkedIn company-page URL out of a link, dropping anything after
 * the company slug (sub-paths, query string, fragment).
 *
 * Accepts the bare host, "www." and two-letter country subdomains
 * (e.g. "uk.linkedin.com"), and matches the host case-insensitively.
 *
 * @param url Link to inspect.
 * @returns The URL up to and including the company slug, or `null` when the
 *   link is not a LinkedIn company page.
 */
function extractLinkedInCompanyUrl(url: string): string | null {
  const companyPage =
    /https?:\/\/(?:www\.|[a-z]{2}\.)?linkedin\.com\/company\/[^/?#]+/i;
  const match = companyPage.exec(url);
  return match ? match[0] : null;
}
/**
 * Collects growth signals for a company from its job postings and recent
 * LinkedIn posts.
 *
 * Job postings are included only when flagged with `hasAiSignal`; every recent
 * LinkedIn post is included and tagged by a keyword check. Job signals come
 * first, and the combined list is capped at 10 entries.
 *
 * @param website Scraped website data; its `jobPostings` are read.
 * @param linkedin LinkedIn data whose `recentPosts` are read, or `null`.
 * @returns At most 10 signal objects. All signals from one call share the same
 *   `detected_at` timestamp.
 */
function buildGrowthSignals(
  website: ScrapedCompany,
  linkedin: LinkedInCompanyData | null
): object[] {
  const maxSignals = 10;
  // One timestamp per call, so every signal of a run carries the same value.
  const detectedAt = new Date().toISOString();

  // "ai" must be a whole word here: without the leading \b, words that merely
  // end in "ai" ("Dubai", "Mumbai") were tagged as AI-related.
  const aiKeywords = /automat|\bai\b|machine learning|digital/i;
  // Case-sensitive on purpose: catches camel-case terms ending in "AI"
  // ("OpenAI", "GenAI") that the whole-word rule above would miss.
  const aiSuffix = /[a-z]AI\b/;

  const signals: object[] = [];

  // AI-related job postings
  for (const job of website.jobPostings) {
    if (!job.hasAiSignal) continue;
    signals.push({
      type: "job_posting",
      content: job.title,
      source_url: job.url,
      ai_related: true,
      detected_at: detectedAt,
    });
  }

  // Recent LinkedIn posts (content truncated to 200 characters)
  for (const post of linkedin?.recentPosts ?? []) {
    signals.push({
      type: "social_post",
      content: post.slice(0, 200),
      ai_related: aiKeywords.test(post) || aiSuffix.test(post),
      detected_at: detectedAt,
    });
  }

  return signals.slice(0, maxSignals);
}