Spaces:

dexakif
/

clienttarget

Running

clienttarget / src /discovery /lib /linkedin-scraper.ts

iDevBuddy

feat: Phase 1 — AI Client Acquisition System

bd28470 3 days ago

5.59 kB

	import { chromium, Browser, BrowserContext } from "playwright";
	import { playwrightLimiter } from "../../shared/utils/rate-limiter";
	import { logger } from "../../shared/utils/logger";

	/**
	 * Data collected from a company's public LinkedIn pages.
	 *
	 * Every scalar field is `null` until the scraper finds a value for it; the
	 * array fields start out empty.
	 */
	export interface LinkedInCompanyData {
	  /** Content of the page's `og:title` meta tag, or `null` if it is missing. */
	  name: string | null;
	  /** Content of the page's `og:description` meta tag, or `null` if it is missing. */
	  description: string | null;
	  /** Count parsed from the About page text by the scraper; `null` when nothing matched. */
	  employeeCount: number | null;
	  /** Company-size band formatted as `"<low>-<high>"`; `null` when nothing matched. */
	  employeeRange: string | null;
	  /** Text of the line following the "Industry" label on the About page. */
	  industry: string | null;
	  /** Text of the line following the "Headquarters" label on the About page. */
	  headquarters: string | null;
	  /** NOTE(review): not populated by any code visible in this part of the file. */
	  website: string | null;
	  /** NOTE(review): not populated by any code visible in this part of the file. */
	  recentPosts: string[];
	  /** People extracted from the company's people page, decision-makers first. */
	  decisionMakers: LinkedInPerson[];
	}

	/**
	 * A person heuristically extracted from the text of a company's people page.
	 */
	export interface LinkedInPerson {
	/** The text line that was recognised as a person's name. */
	fullName: string;
	/** The text line immediately following the name line. */
	title: string;
	/** URL of the company's people page (not an individual profile URL). */
	linkedinUrl: string;
	/** Whether the title matched one of the DECISION_MAKER_TITLES keywords. */
	isDecisionMaker: boolean;
	}

	/**
	 * Lower-case title keywords that mark a person as a decision-maker.
	 * Titles are lower-cased before being compared against these entries.
	 */
	const DECISION_MAKER_TITLES = [
	  // Chief executive / founders
	  "ceo",
	  "chief executive",
	  "founder",
	  "co-founder",
	  "cofounder",
	  // Technology and operations chiefs
	  "cto",
	  "chief technology",
	  "coo",
	  "chief operating",
	  // Senior management
	  "vp",
	  "vice president",
	  "director",
	  "head of",
	  "managing director",
	  "general manager",
	  "president",
	];

	/**
	 * Scrapes the "about" and "people" pages of a LinkedIn company profile.
	 *
	 * No login is performed; only the two URLs derived from `linkedinUrl` are
	 * requested in a headless Chromium instance.
	 * NOTE(review): the previous comment claimed this involves "no TOS
	 * violation" — that cannot be verified from the code; confirm separately.
	 *
	 * The scrape is best-effort: a failure while navigating or extracting is
	 * logged as a warning and whatever was collected so far is returned.
	 *
	 * @param linkedinUrl Company profile URL; one trailing slash is tolerated.
	 * @returns The collected data; fields that could not be extracted stay
	 *   `null` (or empty for arrays).
	 * @throws Whatever the rate limiter, browser launch, context/page creation
	 *   or browser shutdown throws — those errors are not swallowed.
	 */
	export async function scrapeLinkedInCompany(
	  linkedinUrl: string
	): Promise<LinkedInCompanyData> {
	  await playwrightLimiter.consume("linkedin");

	  const result: LinkedInCompanyData = {
	    name: null,
	    description: null,
	    employeeCount: null,
	    employeeRange: null,
	    industry: null,
	    headquarters: null,
	    website: null,
	    recentPosts: [],
	    decisionMakers: [],
	  };

	  const browser = await chromium.launch({ headless: true, args: ["--no-sandbox"] });
	  let context: BrowserContext | undefined;

	  try {
	    // Context and page creation happen inside the try so that a failure here
	    // can no longer leave the launched browser process running.
	    context = await browser.newContext({
	      userAgent:
	        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
	        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
	      locale: "en-US",
	    });
	    const page = await context.newPage();
	    const baseUrl = linkedinUrl.replace(/\/$/, "");

	    try {
	      // ── Company About Page ──────────────────────────────────
	      await page.goto(`${baseUrl}/about/`, {
	        waitUntil: "domcontentloaded",
	        timeout: 20_000,
	      });

	      // Small fixed delay to let client-side JS render the page.
	      await page.waitForTimeout(2000);

	      const pageText = await page.evaluate(() => document.body.innerText);

	      const { employeeCount, employeeRange } = parseEmployeeInfo(pageText);
	      result.employeeCount = employeeCount;
	      result.employeeRange = employeeRange;

	      // Company name and description come from the Open Graph meta tags;
	      // $eval rejects when the tag is absent, which maps to null.
	      result.name = await page
	        .$eval('meta[property="og:title"]', (el) => el.getAttribute("content"))
	        .catch(() => null);

	      result.description = await page
	        .$eval('meta[property="og:description"]', (el) => el.getAttribute("content"))
	        .catch(() => null);

	      // Industry and HQ are the lines following their labels in the page text.
	      const industry = pageText.match(/Industry\s*\n([^\n]+)/i)?.[1];
	      if (industry) result.industry = industry.trim();

	      const headquarters = pageText.match(/Headquarters\s*\n([^\n]+)/i)?.[1];
	      if (headquarters) result.headquarters = headquarters.trim();

	      logger.info(
	        { linkedinUrl, employees: result.employeeCount, industry: result.industry },
	        "LinkedIn company scraped"
	      );

	      // ── People Page (public) ────────────────────────────────
	      await page.goto(`${baseUrl}/people/`, {
	        waitUntil: "domcontentloaded",
	        timeout: 15_000,
	      });
	      await page.waitForTimeout(1500);

	      const peopleText = await page.evaluate(() => document.body.innerText);
	      result.decisionMakers = extractDecisionMakers(peopleText, linkedinUrl);

	      logger.info(
	        { linkedinUrl, dmCount: result.decisionMakers.length },
	        "LinkedIn people scraped"
	      );
	    } catch (err) {
	      // Best-effort: keep whatever was extracted before the failure.
	      logger.warn({ linkedinUrl, err }, "LinkedIn scrape partial failure");
	    }
	  } finally {
	    // Close the context first (this also closes its pages), and close the
	    // browser even if closing the context throws.
	    try {
	      await context?.close();
	    } finally {
	      await browser.close();
	    }
	  }

	  return result;
	}

	/**
	 * Parses the employee head count and the company-size band from page text.
	 *
	 * The size band ("11-50 employees") is located first and removed before the
	 * exact count is searched, so the band's upper bound is not mistaken for a
	 * head count. Follower counts are deliberately not treated as employees.
	 */
	function parseEmployeeInfo(
	  pageText: string
	): { employeeCount: number | null; employeeRange: string | null } {
	  const rangeMatch = pageText.match(/(\d[\d,]*)\s*[-–]\s*(\d[\d,]*)\s*employees/i);
	  const employeeRange = rangeMatch ? `${rangeMatch[1]}-${rangeMatch[2]}` : null;

	  const textWithoutRanges = pageText.replace(
	    /\d[\d,]*\s*[-–]\s*\d[\d,]*\s*employees/gi,
	    " "
	  );
	  const digits = textWithoutRanges
	    .match(/(\d[\d,]*)\s*employees/i)?.[1]
	    ?.replace(/,/g, "");
	  const employeeCount = digits ? parseInt(digits, 10) : null;

	  return { employeeCount, employeeRange };
	}

	/**
	 * Builds a Google search URL that looks for a company's LinkedIn page.
	 *
	 * The query restricts results to `linkedin.com/company` and searches for the
	 * company name as an exact (double-quoted) phrase. No request is made here;
	 * only the URL string is produced.
	 *
	 * @param companyName Company name to search for.
	 * @returns The Google search URL with the query URI-encoded.
	 */
	export function buildLinkedInSearchUrl(companyName: string): string {
	  const searchEndpoint = "https://www.google.com/search?q=";
	  const rawQuery = `site:linkedin.com/company "${companyName}"`;
	  return searchEndpoint + encodeURIComponent(rawQuery);
	}

	/**
	 * Heuristically extracts people from the visible text of a people page.
	 *
	 * The text is split into trimmed lines (lines of two characters or fewer are
	 * dropped). A line that looks like a 2–4 word capitalised name is paired with
	 * the following line, which is taken as the title. The pair is kept when the
	 * title matches a DECISION_MAKER_TITLES keyword or is shorter than 60
	 * characters. At most 10 people are collected.
	 *
	 * Keyword matching: entries of up to three letters (acronyms such as "cto")
	 * must appear as a whole word, optionally with a one-letter prefix so "SVP"
	 * and "EVP" still count; this stops "coordinator" or "contractor" from being
	 * flagged. Longer entries are matched as substrings.
	 *
	 * @param text Visible text of the people page.
	 * @param companyUrl Company profile URL; trailing slashes are ignored.
	 * @returns Extracted people, decision-makers first, otherwise in page order.
	 */
	function extractDecisionMakers(text: string, companyUrl: string): LinkedInPerson[] {
	  const MAX_PEOPLE = 10;
	  const MAX_TITLE_LENGTH = 60;

	  const escapeRegExp = (s: string): string => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
	  const acronyms = DECISION_MAKER_TITLES.filter((t) => t.length <= 3).map(escapeRegExp);
	  const phrases = DECISION_MAKER_TITLES.filter((t) => t.length > 3);
	  const acronymPattern =
	    acronyms.length > 0 ? new RegExp(`\\b[a-z]?(?:${acronyms.join("|")})\\b`) : null;

	  const hasDecisionMakerTitle = (titleLower: string): boolean =>
	    (acronymPattern?.test(titleLower) ?? false) ||
	    phrases.some((phrase) => titleLower.includes(phrase));

	  // Unicode-aware so that names such as "José García" are recognised.
	  const namePattern = /^\p{Lu}\p{Ll}+ \p{Lu}/u;

	  // Strip trailing slashes so the result never contains "//people/".
	  const peoplePageUrl = `${companyUrl.replace(/\/+$/, "")}/people/`;

	  const lines = text
	    .split("\n")
	    .map((l) => l.trim())
	    .filter((l) => l.length > 2);
	  const people: LinkedInPerson[] = [];

	  for (let i = 0; i < lines.length - 1; i++) {
	    const nameLine = lines[i] ?? "";
	    const titleLine = lines[i + 1] ?? "";

	    // Names are typically 2-4 words, with the title on the next line.
	    const isName = namePattern.test(nameLine) && nameLine.split(" ").length <= 4;
	    if (!isName) continue;

	    const titleLower = titleLine.toLowerCase();
	    const isDecisionMaker = hasDecisionMakerTitle(titleLower);

	    if (isDecisionMaker || titleLower.length < MAX_TITLE_LENGTH) {
	      people.push({
	        fullName: nameLine,
	        title: titleLine,
	        linkedinUrl: peoplePageUrl, // public people page
	        isDecisionMaker,
	      });
	      i++; // the title line has been consumed; do not test it as a name
	    }

	    if (people.length >= MAX_PEOPLE) break;
	  }

	  // Sort: decision-makers first (the sort is stable, so page order is kept).
	  return people.sort((a, b) => Number(b.isDecisionMaker) - Number(a.isDecisionMaker));
	}