clienttarget / src /discovery /lib /linkedin-scraper.ts
iDevBuddy
feat: Phase 1 — AI Client Acquisition System
bd28470
import { chromium, Browser, BrowserContext } from "playwright";
import { playwrightLimiter } from "../../shared/utils/rate-limiter";
import { logger } from "../../shared/utils/logger";
/**
 * Company-level data collected by `scrapeLinkedInCompany`.
 *
 * Every scalar field starts out as `null` and both lists start out empty;
 * a field is only filled in when the matching meta tag or text pattern is
 * found on the page.
 */
export interface LinkedInCompanyData {
/** Content of the about page's `og:title` meta tag, or null when it cannot be read. */
name: string | null;
/** Content of the about page's `og:description` meta tag, or null when it cannot be read. */
description: string | null;
/** Numeric count parsed from the about page text, or null when nothing matched. */
employeeCount: number | null;
/** "low-high" string built from a "N - M employees" pattern in the about page text. */
employeeRange: string | null;
/** Trimmed text of the line following an "Industry" label in the about page text. */
industry: string | null;
/** Trimmed text of the line following a "Headquarters" label in the about page text. */
headquarters: string | null;
/** Initialised to null by the scraper; never assigned in the code visible here. */
website: string | null;
/** Initialised to an empty list by the scraper; never populated in the code visible here. */
recentPosts: string[];
/** People extracted from the company's people page text by `extractDecisionMakers`. */
decisionMakers: LinkedInPerson[];
}
/**
 * One person heuristically extracted from a company's people page text
 * (a name-looking line followed by a title line).
 */
export interface LinkedInPerson {
/** The line that matched the name heuristic, as it appeared on the page (trimmed). */
fullName: string;
/** The line immediately following the name line (trimmed). */
title: string;
/** URL of the company's people page — not an individual profile URL. */
linkedinUrl: string;
/** True when the title matched one of the `DECISION_MAKER_TITLES` entries. */
isDecisionMaker: boolean;
}
/**
 * Lower-case title fragments that mark a person as a decision-maker.
 * `extractDecisionMakers` checks each fragment against the lower-cased title line.
 */
const DECISION_MAKER_TITLES = [
  // Chief executive / founders
  "ceo",
  "chief executive",
  "founder",
  "co-founder",
  "cofounder",
  // Other C-level roles
  "cto",
  "chief technology",
  "coo",
  "chief operating",
  // Senior management
  "vp",
  "vice president",
  "director",
  "head of",
  "managing director",
  "general manager",
  "president",
];
/**
 * Scrapes a LinkedIn company's public "about" and "people" pages.
 *
 * No login is performed; only what the unauthenticated page renders is read.
 * NOTE(review): an earlier version of this comment stated that this involves
 * no TOS violation — the code cannot establish that; confirm separately.
 *
 * The page scrape itself is best-effort: a navigation or extraction error is
 * logged as a warning and whatever was collected up to that point is returned.
 * Failures while launching the browser or creating the context/page still
 * propagate to the caller.
 *
 * @param linkedinUrl Company page URL; trailing slashes are ignored.
 * @returns The collected data; fields that could not be extracted stay
 *          `null` (or empty for the list fields).
 */
export async function scrapeLinkedInCompany(
  linkedinUrl: string
): Promise<LinkedInCompanyData> {
  await playwrightLimiter.consume("linkedin");

  const result: LinkedInCompanyData = {
    name: null,
    description: null,
    employeeCount: null,
    employeeRange: null,
    industry: null,
    headquarters: null,
    website: null,
    recentPosts: [],
    decisionMakers: [],
  };

  // Normalise once so "/about/", "/people/" and the per-person URL never
  // end up with a double slash.
  const baseUrl = linkedinUrl.replace(/\/+$/, "");

  const browser = await chromium.launch({ headless: true, args: ["--no-sandbox"] });
  try {
    // Context/page creation sits inside the outer try so the Chromium process
    // is still shut down if either of them throws.
    const context = await browser.newContext({
      userAgent:
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
      locale: "en-US",
    });
    const page = await context.newPage();

    try {
      // ── Company About Page ────────────────────────────────────
      await page.goto(`${baseUrl}/about/`, { waitUntil: "domcontentloaded", timeout: 20_000 });
      // Small fixed delay to let client-side JS render the page.
      await page.waitForTimeout(2000);

      const pageText = await page.evaluate(() => document.body.innerText);
      Object.assign(result, parseAboutText(pageText));

      // Name and description come from Open Graph meta tags; a missing tag
      // makes $eval reject, which is mapped to null.
      result.name = await page
        .$eval('meta[property="og:title"]', (el) => el.getAttribute("content"))
        .catch(() => null);
      result.description = await page
        .$eval('meta[property="og:description"]', (el) => el.getAttribute("content"))
        .catch(() => null);

      logger.info(
        { linkedinUrl, employees: result.employeeCount, industry: result.industry },
        "LinkedIn company scraped"
      );

      // ── People Page (public) ─────────────────────────────────
      await page.goto(`${baseUrl}/people/`, { waitUntil: "domcontentloaded", timeout: 15_000 });
      await page.waitForTimeout(1500);

      const peopleText = await page.evaluate(() => document.body.innerText);
      result.decisionMakers = extractDecisionMakers(peopleText, baseUrl);
      logger.info({ linkedinUrl, dmCount: result.decisionMakers.length }, "LinkedIn people scraped");
    } catch (err) {
      logger.warn({ linkedinUrl, err }, "LinkedIn scrape partial failure");
    } finally {
      // Closing the context also closes every page that belongs to it.
      await context.close();
    }
  } finally {
    await browser.close();
  }

  return result;
}

/** Fields of `LinkedInCompanyData` that are derived from the about page's visible text. */
type AboutTextFields = Pick<
  LinkedInCompanyData,
  "employeeCount" | "employeeRange" | "industry" | "headquarters"
>;

/** Parses employee figures, industry and headquarters out of the about page's visible text. */
function parseAboutText(pageText: string): AboutTextFields {
  const fields: AboutTextFields = {
    employeeCount: null,
    employeeRange: null,
    industry: null,
    headquarters: null,
  };

  // Exact head count, e.g. "1,234 employees". Follower counts are deliberately
  // not accepted, and the lookbehinds stop the upper bound of a range
  // ("51-200 employees") from being read as an exact count.
  const countDigits = pageText.match(/(?<![\d,])(?<![-–]\s*)(\d[\d,]*)\s*employees/i)?.[1];
  if (countDigits !== undefined) {
    fields.employeeCount = parseInt(countDigits.replace(/,/g, ""), 10);
  }

  // Size range, e.g. "51-200 employees"; recorded whether or not a count was found.
  const rangeMatch = pageText.match(/(\d+[\d,]*)\s*[-–]\s*(\d+[\d,]*)\s*employees/i);
  if (rangeMatch) {
    const [, low, high] = rangeMatch;
    if (low !== undefined && high !== undefined) {
      fields.employeeRange = `${low}-${high}`;
    }
  }

  // Labelled values: the value is the line directly below its label.
  const industry = pageText.match(/Industry\s*\n([^\n]+)/i)?.[1];
  if (industry !== undefined) fields.industry = industry.trim();

  const headquarters = pageText.match(/Headquarters\s*\n([^\n]+)/i)?.[1];
  if (headquarters !== undefined) fields.headquarters = headquarters.trim();

  return fields;
}
/**
 * Builds a Google search URL that looks for a company's LinkedIn page.
 *
 * The query restricts results to `linkedin.com/company` and searches for the
 * company name as an exact phrase. This function only constructs the URL; it
 * performs no request and does not resolve the LinkedIn URL itself.
 *
 * @param companyName Company name to search for. Double quotes are removed
 *                    and whitespace is collapsed so the name cannot break out
 *                    of the exact-phrase quotes.
 * @returns The Google search URL with the query percent-encoded.
 */
export function buildLinkedInSearchUrl(companyName: string): string {
  // An embedded `"` would terminate the exact-phrase operator early.
  const phrase = companyName.replace(/"/g, " ").replace(/\s+/g, " ").trim();
  const q = encodeURIComponent(`site:linkedin.com/company "${phrase}"`);
  return `https://www.google.com/search?q=${q}`;
}
/**
 * Heuristically extracts people from the visible text of a company's people page.
 *
 * The text is split into trimmed lines (lines of two characters or fewer are
 * dropped). A line that looks like a name — starts with a capitalised word
 * followed by another capital, and has at most four space-separated words —
 * is paired with the line after it, which is taken as the title. The pair is
 * kept when the title matches a decision-maker keyword or is shorter than 60
 * characters. At most 10 people are collected.
 *
 * @param text       Visible text of the people page.
 * @param companyUrl Company page URL; trailing slashes are ignored.
 * @returns Collected people, decision-makers first (page order otherwise).
 */
function extractDecisionMakers(text: string, companyUrl: string): LinkedInPerson[] {
  const lines = text
    .split("\n")
    .map((l) => l.trim())
    .filter((l) => l.length > 2);

  // Strip trailing slashes so the people URL never contains "//people/".
  const peopleUrl = `${companyUrl.replace(/\/+$/, "")}/people/`;
  const titlePattern = buildDecisionMakerPattern(DECISION_MAKER_TITLES);
  // Hoisted out of the loop: capitalised first word followed by another capital.
  const namePattern = /^[A-Z][a-z]+ [A-Z]/;

  const people: LinkedInPerson[] = [];
  for (let i = 0; i < lines.length - 1; i++) {
    const nameLine = lines[i] ?? "";
    const titleLine = lines[i + 1] ?? "";

    // Names are typically 2-4 words; the title follows on the next line.
    const isName = namePattern.test(nameLine) && nameLine.split(" ").length <= 4;
    if (!isName) continue;

    const titleLower = titleLine.toLowerCase();
    const isDecisionMaker = titlePattern.test(titleLower);
    if (isDecisionMaker || titleLower.length < 60) {
      people.push({
        fullName: nameLine,
        title: titleLine,
        linkedinUrl: peopleUrl, // public people page, not an individual profile
        isDecisionMaker,
      });
      i++; // the title line has been consumed; do not test it as a name
    }
    if (people.length >= 10) break;
  }

  // Sort: decision-makers first.
  return people.sort((a, b) => Number(b.isDecisionMaker) - Number(a.isDecisionMaker));
}

/**
 * Compiles title keywords into one regex that matches them as whole words or
 * phrases in lower-cased text, so "coo" no longer matches "coordinator" and
 * "cto" no longer matches "doctor" or "instructor".
 */
function buildDecisionMakerPattern(keywords: readonly string[]): RegExp {
  // An empty alternation would match every string; use a never-matching pattern instead.
  if (keywords.length === 0) return /(?!)/;

  const alternatives = keywords.map((keyword) => {
    const escaped = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // Two-letter acronyms ("vp") keep an open left edge so prefixed forms
    // such as "svp" / "evp" are still recognised.
    const leftEdge = keyword.length > 2 ? "(?<![a-z])" : "";
    // Optional plural "s" keeps phrases like "board of directors" matching.
    return `${leftEdge}${escaped}s?(?![a-z])`;
  });
  return new RegExp(alternatives.join("|"));
}