import { chromium, Browser, BrowserContext } from "playwright";
import { playwrightLimiter } from "../../shared/utils/rate-limiter";
import { logger } from "../../shared/utils/logger";

/**
 * Company-level data collected from a LinkedIn company page.
 * Every scalar field is `null` (and every list empty) when it could not be found.
 */
export interface LinkedInCompanyData {
  /** Content of the page's `og:title` meta tag. */
  name: string | null;
  /** Content of the page's `og:description` meta tag. */
  description: string | null;
  /** Exact employee count parsed from the page text, e.g. "1,234 employees" -> 1234. */
  employeeCount: number | null;
  /** Company-size range parsed from the page text, e.g. "51-200". */
  employeeRange: string | null;
  /** Line following the "Industry" label in the page text. */
  industry: string | null;
  /** Line following the "Headquarters" label in the page text. */
  headquarters: string | null;
  /** Not populated by `scrapeLinkedInCompany` in this file; always `null` from that function. */
  website: string | null;
  /** Not populated by `scrapeLinkedInCompany` in this file; always `[]` from that function. */
  recentPosts: string[];
  /** People found on the company's people page, decision-makers first. */
  decisionMakers: LinkedInPerson[];
}

/** A person heuristically extracted from the company's people page text. */
export interface LinkedInPerson {
  fullName: string;
  title: string;
  /** URL of the company's people page (not an individual profile URL). */
  linkedinUrl: string;
  /** True when `title` contains one of `DECISION_MAKER_TITLES` as a whole word/phrase. */
  isDecisionMaker: boolean;
}

const DECISION_MAKER_TITLES = [
  "ceo",
  "chief executive",
  "founder",
  "co-founder",
  "cofounder",
  "cto",
  "chief technology",
  "coo",
  "chief operating",
  "vp",
  // Listed explicitly because titles are matched as whole words, so "vp" alone
  // no longer matches inside "SVP"/"EVP".
  "svp",
  "evp",
  "vice president",
  "director",
  "head of",
  "managing director",
  "general manager",
  "president",
];

/** Escapes regex metacharacters so a title keyword can be embedded in a pattern verbatim. */
function escapeRegExp(literal: string): string {
  return literal.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/**
 * Matches any decision-maker keyword as a whole word/phrase, case-insensitively.
 * Word boundaries prevent substring false positives such as "coo" in
 * "Coordinator" or "cto" in "Doctor".
 */
const DECISION_MAKER_PATTERN = new RegExp(
  `\\b(?:${DECISION_MAKER_TITLES.map(escapeRegExp).join("|")})\\b`,
  "i"
);

/** Upper bound on the number of people returned by `extractDecisionMakers`. */
const MAX_PEOPLE = 10;

/**
 * Exact head-count such as "1,234 employees". The lookbehind rejects a number
 * that is the tail of a longer number or the upper bound of a range
 * ("51-200 employees"), which is reported through `employeeRange` instead.
 * Follower counts are deliberately not matched.
 */
const EMPLOYEE_COUNT_PATTERN = /(?<![\d,–-][ \t]*)(\d[\d,]*)\s*employees/i;

/** Company-size range such as "51-200 employees" (hyphen or en dash). */
const EMPLOYEE_RANGE_PATTERN = /(\d+[\d,]*)\s*[-–]\s*(\d+[\d,]*)\s*employees/i;

/** Removes a single trailing slash so path segments can be appended safely. */
function stripTrailingSlash(url: string): string {
  return url.replace(/\/$/, "");
}

/**
 * Parses the fields that are read from the visible text of the About page.
 *
 * @param pageText - `document.body.innerText` of the company's About page.
 * @returns The parsed fields; each one is `null` when its pattern does not match.
 */
function parseAboutText(
  pageText: string
): Pick<
  LinkedInCompanyData,
  "employeeCount" | "employeeRange" | "industry" | "headquarters"
> {
  const rawCount = pageText.match(EMPLOYEE_COUNT_PATTERN)?.[1];
  const rangeMatch = pageText.match(EMPLOYEE_RANGE_PATTERN);
  const industry = pageText.match(/Industry\s*\n([^\n]+)/i)?.[1];
  const headquarters = pageText.match(/Headquarters\s*\n([^\n]+)/i)?.[1];

  return {
    employeeCount:
      rawCount !== undefined ? parseInt(rawCount.replace(/,/g, ""), 10) : null,
    employeeRange: rangeMatch ? `${rangeMatch[1]}-${rangeMatch[2]}` : null,
    industry: industry !== undefined ? industry.trim() : null,
    headquarters: headquarters !== undefined ? headquarters.trim() : null,
  };
}

/**
 * Scrapes a LinkedIn company's About and People pages with a headless browser.
 *
 * No login is performed; only what the pages serve to an anonymous visitor is read.
 * Navigation or extraction failures are logged as a warning and whatever was
 * collected up to that point is returned. Failures to acquire the rate-limit
 * token, launch the browser, or open a context/page are not caught here and
 * reject the returned promise.
 *
 * @param linkedinUrl - Company page URL, with or without a trailing slash.
 * @returns The collected company data; fields that were not found stay `null`/empty.
 */
export async function scrapeLinkedInCompany(
  linkedinUrl: string
): Promise<LinkedInCompanyData> {
  await playwrightLimiter.consume("linkedin");

  const result: LinkedInCompanyData = {
    name: null,
    description: null,
    employeeCount: null,
    employeeRange: null,
    industry: null,
    headquarters: null,
    website: null,
    recentPosts: [],
    decisionMakers: [],
  };
  const baseUrl = stripTrailingSlash(linkedinUrl);

  const browser = await chromium.launch({ headless: true, args: ["--no-sandbox"] });
  try {
    const context = await browser.newContext({
      userAgent:
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
      locale: "en-US",
    });
    const page = await context.newPage();

    try {
      // ── Company About Page ────────────────────────────────────
      await page.goto(`${baseUrl}/about/`, {
        waitUntil: "domcontentloaded",
        timeout: 20_000,
      });
      // Small delay to let client-side JS render the page text.
      await page.waitForTimeout(2000);

      const pageText = await page.evaluate(() => document.body.innerText);
      Object.assign(result, parseAboutText(pageText));

      // Name and description come from Open Graph meta tags; a missing tag
      // makes $eval reject, which is mapped to null.
      result.name = await page
        .$eval('meta[property="og:title"]', (el) => el.getAttribute("content"))
        .catch(() => null);
      result.description = await page
        .$eval('meta[property="og:description"]', (el) => el.getAttribute("content"))
        .catch(() => null);

      logger.info(
        { linkedinUrl, employees: result.employeeCount, industry: result.industry },
        "LinkedIn company scraped"
      );

      // ── People Page (public) ─────────────────────────────────
      await page.goto(`${baseUrl}/people/`, {
        waitUntil: "domcontentloaded",
        timeout: 15_000,
      });
      await page.waitForTimeout(1500);

      const peopleText = await page.evaluate(() => document.body.innerText);
      result.decisionMakers = extractDecisionMakers(peopleText, linkedinUrl);

      logger.info(
        { linkedinUrl, dmCount: result.decisionMakers.length },
        "LinkedIn people scraped"
      );
    } catch (err) {
      logger.warn({ linkedinUrl, err }, "LinkedIn scrape partial failure");
    }
  } finally {
    // Closing the browser also closes its contexts and pages. Doing it in the
    // outer finally guarantees the Chromium process is released even when
    // creating the context or page fails.
    await browser.close();
  }

  return result;
}

/**
 * Builds a Google search URL that looks for a company's LinkedIn page.
 *
 * This function performs no request: it only returns the URL of a Google query
 * of the form `site:linkedin.com/company "<companyName>"`. Fetching that URL
 * and picking a LinkedIn result is left to the caller.
 *
 * @param companyName - Company name to search for. Double quotes are removed
 *   so they cannot break the quoted phrase in the query.
 * @returns The URL-encoded Google search URL.
 */
export function buildLinkedInSearchUrl(companyName: string): string {
  const phrase = companyName.replace(/"/g, "").trim();
  const q = encodeURIComponent(`site:linkedin.com/company "${phrase}"`);
  return `https://www.google.com/search?q=${q}`;
}

/**
 * Heuristically extracts people from the visible text of a company's people page.
 *
 * The text is split into trimmed lines (lines of 2 characters or fewer are
 * dropped). A line is treated as a name when it starts with a capitalised word
 * followed by another capitalised word and has at most four space-separated
 * parts; the following line is taken as that person's title. A pair is kept
 * when the title matches a decision-maker keyword or is shorter than 60
 * characters. At most `MAX_PEOPLE` entries are collected.
 *
 * @param text - `innerText` of the people page.
 * @param companyUrl - Company page URL, with or without a trailing slash.
 * @returns The people found, decision-makers first (page order is otherwise preserved).
 */
function extractDecisionMakers(text: string, companyUrl: string): LinkedInPerson[] {
  const lines = text
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line.length > 2);
  // Normalised so a trailing slash on the input does not yield "//people/".
  const peopleUrl = `${stripTrailingSlash(companyUrl)}/people/`;
  const people: LinkedInPerson[] = [];

  for (let i = 0; i < lines.length - 1 && people.length < MAX_PEOPLE; i++) {
    const nameLine = lines[i] ?? "";
    const titleLine = lines[i + 1] ?? "";

    const looksLikeName =
      /^[A-Z][a-z]+ [A-Z]/.test(nameLine) && nameLine.split(" ").length <= 4;
    if (!looksLikeName) continue;

    const isDecisionMaker = DECISION_MAKER_PATTERN.test(titleLine);
    if (isDecisionMaker || titleLine.length < 60) {
      people.push({
        fullName: nameLine,
        title: titleLine,
        linkedinUrl: peopleUrl, // public people page, not an individual profile
        isDecisionMaker,
      });
      i++; // the title line has been consumed; do not re-test it as a name
    }
  }

  // Decision-makers first; Array.prototype.sort is stable, so page order is
  // preserved within each group.
  return people.sort((a, b) => Number(b.isDecisionMaker) - Number(a.isDecisionMaker));
}