Spaces:
Running
Running
import { chromium, Browser, BrowserContext } from "playwright";
import { playwrightLimiter } from "../../shared/utils/rate-limiter";
import { logger } from "../../shared/utils/logger";
/**
 * Data collected from a LinkedIn public company page.
 *
 * Scalar fields are `null` when the value is unknown; list fields are empty
 * when nothing was collected.
 */
export interface LinkedInCompanyData {
  /** Company name, or `null` when unknown. */
  name: string | null;
  /** Short company description, or `null` when unknown. */
  description: string | null;
  /** Exact employee count, or `null` when unknown. */
  employeeCount: number | null;
  /** Employee size bucket as text (for example "51-200"), or `null` when unknown. */
  employeeRange: string | null;
  /** Industry label, or `null` when unknown. */
  industry: string | null;
  /** Headquarters location, or `null` when unknown. */
  headquarters: string | null;
  /** Company website, or `null` when unknown. */
  website: string | null;
  /** Text of recent company posts; empty when none were collected. */
  recentPosts: string[];
  /** People found for the company; empty when none were collected. */
  decisionMakers: LinkedInPerson[];
}
/** A person associated with a LinkedIn company page. */
export interface LinkedInPerson {
  /** Display name of the person. */
  fullName: string;
  /** Job title or headline text recorded for the person. */
  title: string;
  /** LinkedIn URL recorded for the person. */
  linkedinUrl: string;
  /** Whether the title identifies this person as a decision maker. */
  isDecisionMaker: boolean;
}
/**
 * Lower-case job-title keywords that mark a person as a decision maker.
 * Declared read-only so the shared list cannot be mutated by accident.
 */
const DECISION_MAKER_TITLES: readonly string[] = [
  "ceo", "chief executive", "founder", "co-founder", "cofounder",
  "cto", "chief technology", "coo", "chief operating",
  "vp", "vice president", "director", "head of",
  "managing director", "general manager", "president",
];
/** Fields of {@link LinkedInCompanyData} that are parsed from the About page text. */
type LinkedInAboutFields = Pick<
  LinkedInCompanyData,
  "employeeCount" | "employeeRange" | "industry" | "headquarters" | "website"
>;

/** Pulls employee figures, industry, headquarters and website out of the About page's visible text. */
function parseLinkedInAboutText(pageText: string): LinkedInAboutFields {
  const parsed: LinkedInAboutFields = {
    employeeCount: null,
    employeeRange: null,
    industry: null,
    headquarters: null,
    website: null,
  };

  // Size bucket such as "51-200 employees"; hyphen, en dash and em dash are accepted.
  const rangeMatch = pageText.match(
    /(\d[\d,]*)\s*[-\u2013\u2014]\s*(\d[\d,]*)\s*employees/i
  );
  if (rangeMatch) {
    parsed.employeeRange = `${rangeMatch[1]}-${rangeMatch[2]}`;
  } else {
    // Open-ended bucket such as "10,001+ employees".
    const openEndedMatch = pageText.match(/(\d[\d,]*)\s*\+\s*employees/i);
    if (openEndedMatch) parsed.employeeRange = `${openEndedMatch[1]}+`;
  }

  // Exact head count ("1,234 employees"). Follower counts are deliberately not
  // accepted, and the lookbehind keeps the upper bound of a range ("51-200")
  // from being mistaken for an exact count.
  const countMatch = pageText.match(
    /(?<![\d,]|[-\u2013\u2014]\s*)(\d[\d,]*)\s*employees/i
  );
  const countDigits = countMatch?.[1];
  if (countDigits) {
    parsed.employeeCount = parseInt(countDigits.replace(/,/g, ""), 10);
  }

  // Labelled fields: the value is the first non-empty line after the label.
  const industryMatch = pageText.match(/Industry\s*\n([^\n]+)/i);
  if (industryMatch?.[1]) parsed.industry = industryMatch[1].trim();

  const hqMatch = pageText.match(/Headquarters\s*\n([^\n]+)/i);
  if (hqMatch?.[1]) parsed.headquarters = hqMatch[1].trim();

  const websiteMatch = pageText.match(/Website\s*\n([^\n]+)/i);
  if (websiteMatch?.[1]) parsed.website = websiteMatch[1].trim();

  return parsed;
}

/**
 * Scrapes a LinkedIn public company page.
 * Only reads publicly visible data - no login is performed.
 *
 * Loads the company's `/about/` page and then its `/people/` page in a
 * headless Chromium instance. Scraping is best-effort: a navigation or
 * parsing error is logged and whatever was collected up to that point is
 * returned. The browser is always closed, even when setup fails part-way.
 *
 * Errors from the rate limiter, from launching the browser, or from creating
 * the context/page propagate to the caller.
 *
 * @param linkedinUrl - Company page URL; one trailing slash is tolerated.
 * @returns The collected data; fields that could not be extracted stay
 *   `null` (or empty for the list fields).
 */
export async function scrapeLinkedInCompany(
  linkedinUrl: string
): Promise<LinkedInCompanyData> {
  await playwrightLimiter.consume("linkedin");

  const result: LinkedInCompanyData = {
    name: null,
    description: null,
    employeeCount: null,
    employeeRange: null,
    industry: null,
    headquarters: null,
    website: null,
    recentPosts: [],
    decisionMakers: [],
  };
  const baseUrl = linkedinUrl.replace(/\/$/, "");

  const browser = await chromium.launch({ headless: true, args: ["--no-sandbox"] });
  try {
    const context = await browser.newContext({
      userAgent:
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
      locale: "en-US",
    });
    const page = await context.newPage();

    try {
      // -- Company About page --
      await page.goto(`${baseUrl}/about/`, {
        waitUntil: "domcontentloaded",
        timeout: 20_000,
      });
      // Small fixed delay to let client-side rendering finish.
      await page.waitForTimeout(2000);

      const pageText = await page.evaluate(() => document.body.innerText);
      Object.assign(result, parseLinkedInAboutText(pageText));

      // Name and description come from Open Graph meta tags; a missing tag
      // leaves the field null instead of aborting the scrape.
      result.name = await page
        .$eval('meta[property="og:title"]', (el) => el.getAttribute("content"))
        .catch(() => null);
      result.description = await page
        .$eval('meta[property="og:description"]', (el) => el.getAttribute("content"))
        .catch(() => null);

      logger.info(
        { linkedinUrl, employees: result.employeeCount, industry: result.industry },
        "LinkedIn company scraped"
      );

      // -- People page (public) --
      await page.goto(`${baseUrl}/people/`, {
        waitUntil: "domcontentloaded",
        timeout: 15_000,
      });
      await page.waitForTimeout(1500);

      const peopleText = await page.evaluate(() => document.body.innerText);
      // Pass the slash-trimmed URL so derived people links have no "//".
      result.decisionMakers = extractDecisionMakers(peopleText, baseUrl);
      logger.info(
        { linkedinUrl, dmCount: result.decisionMakers.length },
        "LinkedIn people scraped"
      );
    } catch (err) {
      logger.warn({ linkedinUrl, err }, "LinkedIn scrape partial failure");
    }
  } finally {
    // Closing the browser also disposes its contexts and pages. A teardown
    // failure is logged rather than thrown so it cannot discard the result.
    await browser.close().catch((closeErr: unknown) => {
      logger.warn({ linkedinUrl, err: closeErr }, "LinkedIn browser close failed");
    });
  }

  return result;
}
/**
 * Builds a Google search URL that looks for a company's LinkedIn page.
 *
 * The query is restricted to `linkedin.com/company` and searches for the
 * company name as an exact phrase. This function performs no network request;
 * the caller is responsible for fetching the URL and picking a result.
 *
 * @param companyName - Company name to search for. Double quotes are removed
 *   (they would break the quoted phrase) and whitespace is collapsed.
 * @returns The Google search URL. When the name is empty after clean-up, only
 *   the `site:` restriction is included.
 */
export function buildLinkedInSearchUrl(companyName: string): string {
  const phrase = companyName.replace(/"/g, " ").replace(/\s+/g, " ").trim();
  const query =
    phrase.length > 0
      ? `site:linkedin.com/company "${phrase}"`
      : "site:linkedin.com/company";
  return `https://www.google.com/search?q=${encodeURIComponent(query)}`;
}
/**
 * Heuristically extracts people from the visible text of a company's People page.
 *
 * The text is split into trimmed lines (lines of two characters or fewer are
 * dropped). A line that looks like a personal name, followed by a line that
 * is either a decision-maker title or shorter than 60 characters, is recorded
 * as a person with that following line as the title.
 *
 * @param text - Visible text of the People page.
 * @param companyUrl - Company page URL; trailing slashes are tolerated.
 * @returns At most 10 people, decision makers first, otherwise in page order.
 */
function extractDecisionMakers(text: string, companyUrl: string): LinkedInPerson[] {
  const MAX_PEOPLE = 10;

  // Trim trailing slashes so the derived link never contains "//people/".
  const peopleUrl = `${companyUrl.replace(/\/+$/, "")}/people/`;

  // A keyword only counts when it ends a word (optionally pluralised), so
  // "coo" no longer fires on "Coordinator" nor "cto" on "Doctor", while
  // prefixed forms such as "SVP" still match "vp".
  const titlePatterns = DECISION_MAKER_TITLES.map(
    (keyword) =>
      new RegExp(`${keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}s?(?![a-z])`, "i")
  );

  // Capitalised first word, a space, then another capital letter.
  // Unicode-aware so accented names such as "José García" are recognised.
  const namePattern = /^\p{Lu}\p{Ll}+ \p{Lu}/u;

  const lines = text
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line.length > 2);

  const people: LinkedInPerson[] = [];
  for (let i = 0; i < lines.length - 1; i++) {
    const nameLine = lines[i] ?? "";
    const titleLine = lines[i + 1] ?? "";

    // Names are typically 2-4 words, with the title on the next line.
    const looksLikeName = namePattern.test(nameLine) && nameLine.split(" ").length <= 4;
    if (!looksLikeName) continue;

    const isDecisionMaker = titlePatterns.some((pattern) => pattern.test(titleLine));
    if (isDecisionMaker || titleLine.length < 60) {
      people.push({
        fullName: nameLine,
        title: titleLine,
        linkedinUrl: peopleUrl, // public people page
        isDecisionMaker,
      });
      i++; // the title line has been consumed; do not re-test it as a name
    }
  }

  // Sort decision makers first, then cap. Capping after the (stable) sort
  // keeps decision makers that appear late on the page from being dropped.
  return people
    .sort((a, b) => Number(b.isDecisionMaker) - Number(a.isDecisionMaker))
    .slice(0, MAX_PEOPLE);
}