import type { InsertCompany } from "../../shared/supabase/schema";
import type { ScrapedCompany } from "./web-scraper";
import type { LinkedInCompanyData } from "./linkedin-scraper";
import type { SerperResult } from "../providers/serper";

/** Maximum number of description characters kept on the normalized record. */
const MAX_DESCRIPTION_LENGTH = 1000;

/** Maximum number of growth signals kept on the normalized record. */
const MAX_GROWTH_SIGNALS = 10;

/** Maximum number of characters kept from a single social post. */
const MAX_POST_CONTENT_LENGTH = 200;

/**
 * Separators between the parts of a page title: a pipe anywhere, or a dash
 * surrounded by whitespace. Bare hyphens are NOT separators so hyphenated
 * names ("Coca-Cola") stay intact.
 */
const TITLE_SEPARATOR = /\s*\|\s*|\s+[-–—]\s+/;

/** Boilerplate words removed from a title segment. */
const TITLE_NOISE = /\b(home|official|website|welcome to)\b/gi;

/** Separator characters / whitespace left dangling at either end of a segment. */
const TITLE_EDGE_PUNCTUATION = /^[\s\-–—:|]+|[\s\-–—:|]+$/g;

/**
 * Case-insensitive AI keywords. "ai" must be a standalone word so that words
 * merely ending in "ai" (Dubai, Mumbai, Shanghai) are not flagged.
 */
const AI_KEYWORDS = /automat|\bai\b|machine learning|digital/i;

/** Case-sensitive: an upper-case "AI" suffix glued to a lower-case stem (OpenAI, GenAI). */
const AI_CAMEL_SUFFIX = /[a-z]AI\b/;

/** LinkedIn company page URL, with an optional `www.` or locale (`uk.`) subdomain. */
const LINKEDIN_COMPANY_URL =
  /https?:\/\/(?:[a-z]{2,3}\.)?linkedin\.com\/company\/[^/?#]+/i;

/** Region code (upper-case) → display country name. */
const REGION_COUNTRY_NAMES: ReadonlyMap<string, string> = new Map([
  ["US", "United States"],
  ["UK", "United Kingdom"],
  ["AU", "Australia"],
  ["UAE", "United Arab Emirates"],
  ["SA", "Saudi Arabia"],
  ["SG", "Singapore"],
]);

/**
 * Normalizes raw data from multiple sources into a single canonical Company record.
 * Priority: LinkedIn > Website > Serper snippet
 *
 * Blank (empty / whitespace-only) names and descriptions are treated as missing,
 * so a lower-priority source can still supply them. If no source yields a name
 * the record is named "Unknown".
 *
 * @param serperResult - Search hit; supplies fallback title/snippet and is copied into `raw_data`.
 * @param website - Scraped website data; supplies the domain, tech stack and job postings.
 * @param linkedin - Scraped LinkedIn data, or `null` when none is available.
 * @param region - Region code; stored as-is and used to derive the country when
 *   LinkedIn headquarters does not provide one.
 * @param source - Stored as-is on the record.
 * @returns The record to insert, with `status` set to `"discovered"`.
 */
export function normalizeCompany(
  serperResult: SerperResult,
  website: ScrapedCompany,
  linkedin: LinkedInCompanyData | null,
  region: string,
  source: string
): InsertCompany {
  const name =
    firstNonBlank(
      linkedin?.name,
      website.name,
      cleanTitle(serperResult.title)
    ) ?? "Unknown";

  const description = firstNonBlank(
    linkedin?.description,
    website.description,
    serperResult.snippet
  );

  const employeeCount = linkedin?.employeeCount ?? website.employeeCount ?? null;
  const employeeRange =
    linkedin?.employeeRange ??
    website.employeeRange ??
    estimateRange(employeeCount);
  const industry = linkedin?.industry ?? website.industry ?? null;

  // Fall back to the region when headquarters is absent OR has no usable
  // country segment (e.g. "London, ").
  const headquartersCountry = linkedin?.headquarters
    ? extractCountry(linkedin.headquarters)
    : null;
  const country = headquartersCountry ?? regionToCountry(region);

  // NOTE(review): the Serper link is only inspected for a LinkedIn URL when
  // LinkedIn data was supplied; otherwise only the website's own link is used.
  // Preserved from the original logic — confirm this gating is intentional.
  const serperLinkedInUrl =
    linkedin !== null ? extractLinkedInCompanyUrl(serperResult.link) : null;
  const linkedinUrl = serperLinkedInUrl ?? website.linkedinUrl ?? null;

  return {
    domain: website.domain,
    name,
    industry,
    employee_count: employeeCount,
    employee_range: employeeRange,
    description:
      description !== null ? description.slice(0, MAX_DESCRIPTION_LENGTH) : null,
    website_url: `https://${website.domain}`,
    linkedin_url: linkedinUrl,
    country,
    region,
    tech_stack: website.techStack,
    growth_signals: buildGrowthSignals(website, linkedin),
    raw_data: {
      serper_title: serperResult.title,
      serper_snippet: serperResult.snippet,
      serper_link: serperResult.link,
    },
    source,
    status: "discovered",
  };
}

// ─── Helpers ─────────────────────────────────────────────────

/**
 * Returns the first candidate that is non-empty after trimming (trimmed),
 * or `null` when every candidate is null, undefined or blank.
 */
function firstNonBlank(
  ...candidates: (string | null | undefined)[]
): string | null {
  for (const candidate of candidates) {
    const trimmed = candidate?.trim();
    if (trimmed) return trimmed;
  }
  return null;
}

/**
 * Derives a company name from a page title.
 *
 * The title is split on pipes and whitespace-delimited dashes; each segment has
 * boilerplate words removed, whitespace collapsed and dangling separators
 * stripped. The first segment that is still non-empty wins, so a title such as
 * "Home - Acme Corp" yields "Acme Corp" rather than an empty string.
 *
 * @returns The cleaned name, or `""` when nothing but boilerplate remains.
 */
function cleanTitle(title: string): string {
  for (const segment of title.split(TITLE_SEPARATOR)) {
    const cleaned = segment
      .replace(TITLE_NOISE, "")
      .replace(/\s+/g, " ")
      .replace(TITLE_EDGE_PUNCTUATION, "");
    if (cleaned.length > 0) return cleaned;
  }
  return "";
}

/**
 * Maps an employee count to a coarse range label.
 *
 * @returns `null` for a missing, non-finite, zero or negative count; otherwise
 *   one of "1-9", "10-49", "50-99", "100-199", "200-499", "500-999", "1000+".
 */
function estimateRange(count: number | null): string | null {
  if (count === null || !Number.isFinite(count) || count <= 0) return null;
  if (count < 10) return "1-9";
  if (count < 50) return "10-49";
  if (count < 100) return "50-99";
  if (count < 200) return "100-199";
  if (count < 500) return "200-499";
  if (count < 1000) return "500-999";
  return "1000+";
}

/**
 * Takes the text after the last comma of a headquarters string as the country.
 * A string without commas is returned whole (trimmed).
 *
 * @returns The trimmed last segment, or `null` when that segment is blank.
 */
function extractCountry(headquarters: string): string | null {
  const lastSegment = headquarters.split(",").pop()?.trim();
  return lastSegment ? lastSegment : null;
}

/**
 * Resolves a region code to a country name. The lookup ignores case and
 * surrounding whitespace; unknown regions are returned unchanged.
 */
function regionToCountry(region: string): string {
  return REGION_COUNTRY_NAMES.get(region.trim().toUpperCase()) ?? region;
}

/**
 * Extracts the LinkedIn company page URL (scheme, host and `/company/<slug>`)
 * from `url`, dropping any trailing path, query string or fragment.
 *
 * @returns The matched URL prefix, or `null` when `url` contains none.
 */
function extractLinkedInCompanyUrl(url: string): string | null {
  const match = LINKEDIN_COMPANY_URL.exec(url);
  return match ? match[0] : null;
}

/**
 * Builds the growth-signal list: job postings flagged with `hasAiSignal` first,
 * followed by recent LinkedIn posts, capped at {@link MAX_GROWTH_SIGNALS}.
 * All signals produced by one call share the same `detected_at` timestamp.
 */
function buildGrowthSignals(
  website: ScrapedCompany,
  linkedin: LinkedInCompanyData | null
): object[] {
  const detectedAt = new Date().toISOString();

  // AI-related job postings
  const jobSignals = (website.jobPostings ?? [])
    .filter((job) => job.hasAiSignal)
    .map((job) => ({
      type: "job_posting",
      content: job.title,
      source_url: job.url,
      ai_related: true,
      detected_at: detectedAt,
    }));

  // Recent LinkedIn posts
  const postSignals = (linkedin?.recentPosts ?? []).map((post) => ({
    type: "social_post",
    content: post.slice(0, MAX_POST_CONTENT_LENGTH),
    ai_related: AI_KEYWORDS.test(post) || AI_CAMEL_SUFFIX.test(post),
    detected_at: detectedAt,
  }));

  return [...jobSignals, ...postSignals].slice(0, MAX_GROWTH_SIGNALS);
}