Spaces:
Running
Running
| import { InsertCompany } from "../../shared/supabase/schema"; | |
| import { ScrapedCompany } from "./web-scraper"; | |
| import { LinkedInCompanyData } from "./linkedin-scraper"; | |
| import { SerperResult } from "../providers/serper"; | |
/**
 * Normalizes raw data from multiple sources into a single canonical Company record.
 *
 * Field priority: LinkedIn > Website > Serper snippet. Each field takes the
 * first non-nullish value in that order.
 *
 * @param serperResult - Serper search result; its title and snippet are the
 *   last-resort name and description, and title/snippet/link are copied into
 *   `raw_data`.
 * @param website - Data scraped from the company website; supplies the domain,
 *   tech stack and job postings.
 * @param linkedin - LinkedIn company data, or `null` when none is available.
 * @param region - Stored as-is in `region`, and mapped to a country when
 *   LinkedIn provides no headquarters.
 * @param source - Stored unchanged in the `source` field.
 * @returns A company record with `status` set to `"discovered"`.
 */
export function normalizeCompany(
  serperResult: SerperResult,
  website: ScrapedCompany,
  linkedin: LinkedInCompanyData | null,
  region: string,
  source: string
): InsertCompany {
  const name =
    linkedin?.name ??
    website.name ??
    cleanTitle(serperResult.title);

  const description =
    linkedin?.description ??
    website.description ??
    serperResult.snippet;

  const employeeCount =
    linkedin?.employeeCount ??
    website.employeeCount ??
    null;

  // Only estimate a range from the headcount when no source reports one.
  const employeeRange =
    linkedin?.employeeRange ??
    website.employeeRange ??
    estimateRange(employeeCount);

  const industry =
    linkedin?.industry ??
    website.industry ??
    null;

  const country =
    linkedin?.headquarters
      ? extractCountry(linkedin.headquarters)
      : regionToCountry(region);

  // The Serper link is only inspected when LinkedIn data is present; in every
  // other case the URL found on the website is used.
  const linkedinUrl =
    linkedin !== null
      ? extractLinkedInCompanyUrl(serperResult.link) ?? website.linkedinUrl
      : website.linkedinUrl;

  const growthSignals = buildGrowthSignals(website, linkedin);

  return {
    domain: website.domain,
    // `??` cannot catch a blank name (e.g. a search title that cleans down to
    // ""), so fall back on any empty or whitespace-only value.
    name: name?.trim() || "Unknown",
    industry,
    employee_count: employeeCount,
    employee_range: employeeRange,
    // Descriptions are truncated to 1000 characters.
    description: description?.slice(0, 1000) ?? null,
    website_url: `https://${website.domain}`,
    linkedin_url: linkedinUrl ?? null,
    country,
    region,
    tech_stack: website.techStack,
    growth_signals: growthSignals,
    raw_data: {
      serper_title: serperResult.title,
      serper_snippet: serperResult.snippet,
      serper_link: serperResult.link,
    },
    source,
    status: "discovered",
  };
}
| // ─── Helpers ───────────────────────────────────────────────── | |
/**
 * Derives a company name from a search-result title.
 *
 * Keeps the text before the first `|`, `-`, en dash or em dash, removes
 * boilerplate words ("home", "official", "website", "welcome to"), then
 * collapses the whitespace those removals leave behind.
 *
 * @param title - Raw page title.
 * @returns The cleaned name; may be an empty string when nothing is left.
 */
function cleanTitle(title: string): string {
  // \u2013 and \u2014 are the en and em dash. They are written as escapes so
  // the pattern survives encoding round-trips of this file.
  const [head = ""] = title.split(/[|\-\u2013\u2014]/);
  return head
    .replace(/\b(home|official|website|welcome to)\b/gi, "")
    .replace(/\s+/g, " ")
    .trim();
}
/**
 * Buckets a headcount into a coarse employee-range label.
 *
 * @param count - Number of employees, or `null` when unknown.
 * @returns `null` for a missing or zero count; otherwise the label of the
 *   first bucket whose upper bound exceeds `count`, or `"1000+"`.
 */
function estimateRange(count: number | null): string | null {
  if (!count) {
    return null;
  }

  // [exclusive upper bound, label], in ascending order.
  const buckets: ReadonlyArray<readonly [number, string]> = [
    [50, "10-49"],
    [100, "50-99"],
    [200, "100-199"],
    [500, "200-499"],
    [1000, "500-999"],
  ];

  for (const [upperBound, label] of buckets) {
    if (count < upperBound) {
      return label;
    }
  }
  return "1000+";
}
/**
 * Takes the text after the last comma of a headquarters string as the country.
 *
 * @param headquarters - Comma-separated location string.
 * @returns The trimmed last segment, or `null` when that segment is blank
 *   (empty input or a trailing comma).
 */
function extractCountry(headquarters: string): string | null {
  const segments = headquarters.split(",");
  const lastSegment = segments[segments.length - 1]?.trim();
  // `||` rather than `??`: an empty string means no country was found.
  return lastSegment || null;
}
/**
 * Region codes that have a known full country name.
 *
 * A `Map` is used instead of an object literal so that inherited keys such as
 * "constructor" or "toString" can never be returned as a country.
 */
const REGION_COUNTRY_NAMES: ReadonlyMap<string, string> = new Map([
  ["US", "United States"],
  ["UK", "United Kingdom"],
  ["AU", "Australia"],
  ["UAE", "United Arab Emirates"],
  ["SA", "Saudi Arabia"],
  ["SG", "Singapore"],
]);

/**
 * Expands a region code into a country name.
 *
 * @param region - Region code; matched exactly (case-sensitive).
 * @returns The mapped country name, or `region` unchanged when it is not a
 *   known code.
 */
function regionToCountry(region: string): string {
  return REGION_COUNTRY_NAMES.get(region) ?? region;
}
/**
 * Finds a LinkedIn company-page URL inside `url`.
 *
 * The match covers the scheme, an optional `www.`, and the company slug; it
 * stops at the first `/`, `?` or `#` after the slug.
 *
 * @param url - String to search.
 * @returns The matched URL, or `null` when `url` contains no company-page URL.
 */
function extractLinkedInCompanyUrl(url: string): string | null {
  const companyUrlPattern = /https?:\/\/(www\.)?linkedin\.com\/company\/[^/?#]+/;
  const found = companyUrlPattern.exec(url);
  if (found === null) {
    return null;
  }
  return found[0];
}
/**
 * Keywords that mark a social post as AI/automation related.
 *
 * "ai" is bounded on both sides so it only matches the standalone word and
 * not the tail of words such as "Dubai" or "Mumbai".
 */
const AI_KEYWORD_PATTERN = /automat|\bai\b|machine learning|digital/i;

/** Maximum number of signals returned for one company. */
const MAX_GROWTH_SIGNALS = 10;

/**
 * Collects growth signals from scraped job postings and LinkedIn posts.
 *
 * Job postings come first, so they win when the cap is reached. All signals
 * from one call share a single `detected_at` timestamp.
 *
 * @param website - Scraped website data; only `jobPostings` is read.
 * @param linkedin - LinkedIn data, or `null`; only `recentPosts` is read.
 * @returns At most 10 signal objects.
 */
function buildGrowthSignals(
  website: ScrapedCompany,
  linkedin: LinkedInCompanyData | null
): object[] {
  const detectedAt = new Date().toISOString();

  // Only postings already flagged with an AI signal are kept, so every job
  // signal is marked `ai_related`.
  const jobSignals = website.jobPostings
    .filter((posting) => posting.hasAiSignal)
    .map((posting) => ({
      type: "job_posting",
      content: posting.title,
      source_url: posting.url,
      ai_related: true,
      detected_at: detectedAt,
    }));

  // Recent LinkedIn posts: content is truncated to 200 characters, but the
  // keyword check runs on the full post text.
  const postSignals = (linkedin?.recentPosts ?? []).map((post) => ({
    type: "social_post",
    content: post.slice(0, 200),
    ai_related: AI_KEYWORD_PATTERN.test(post),
    detected_at: detectedAt,
  }));

  return [...jobSignals, ...postSignals].slice(0, MAX_GROWTH_SIGNALS);
}