import { chromium, Browser, BrowserContext } from "playwright";
import { playwrightLimiter } from "../../shared/utils/rate-limiter";
import { logger } from "../../shared/utils/logger";

// ─── Types ───────────────────────────────────────────────────

/**
 * Signals collected for one company domain by {@link scrapeCompanyWebsite}.
 *
 * `employeeRange`, `industry`, `country` and `recentNews` are initialised to
 * `null` / `[]` and are not filled in by the scraper in this file.
 */
export interface ScrapedCompany {
  domain: string;
  /** First segment of the homepage `<title>`, or `null` when unavailable. */
  name: string | null;
  /** Up to three sentences taken from the `/about` page text. */
  description: string | null;
  employeeRange: string | null;
  /** Head count parsed from the `/about` page text (e.g. "1,200 employees"). */
  employeeCount: number | null;
  industry: string | null;
  country: string | null;
  linkedinUrl: string | null;
  techStack: string[];
  jobPostings: JobPosting[];
  recentNews: string[];
  /** First 3000 characters of the homepage's visible text. */
  websiteText: string;
  /** Raw HTML for pain signal detection (first 10000 characters of the homepage). */
  html: string;
  /** Alias for websiteText (used by auto-discovery). */
  text: string;
  /** Count of AI-related job postings. */
  aiJobCount: number;
}

/** One candidate job title found on a careers/jobs page. */
export interface JobPosting {
  title: string;
  /** URL of the listing page the title was found on (not a per-job link). */
  url: string;
  hasAiSignal: boolean;
}

// ─── AI signal keywords ──────────────────────────────────────

const AI_KEYWORDS = [
  "automation",
  "artificial intelligence",
  "machine learning",
  "ai",
  "llm",
  "workflow automation",
  "robotic process",
  "rpa",
  "data pipeline",
  "digital transformation",
  "predictive analytics",
  "nlp",
];

/**
 * Single compiled matcher for AI_KEYWORDS, intended for lower-cased input.
 *
 * Short acronyms ("ai", "llm", "rpa", "nlp") must stand alone as a word
 * (optionally pluralised): as bare substrings they hit ordinary words such as
 * "email", "maintain", "fulfillment" or "surpass". Longer phrases keep plain
 * substring semantics.
 */
const AI_KEYWORD_PATTERN = new RegExp(
  AI_KEYWORDS.map((keyword) => {
    const escaped = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    return keyword.length <= 3 ? `\\b${escaped}s?\\b` : escaped;
  }).join("|"),
);

/** Returns true when `text` mentions any AI keyword (case-insensitive). */
function containsAiKeyword(text: string): boolean {
  return AI_KEYWORD_PATTERN.test(text.toLowerCase());
}

const TECH_STACK_SIGNALS = [
  "salesforce",
  "hubspot",
  "sap",
  "oracle",
  "dynamics",
  "zendesk",
  "servicenow",
  "workday",
  "netsuite",
  "quickbooks",
  "zoho",
  "slack",
  "jira",
  "notion",
  "monday.com",
  "asana",
];

/** Matches a head count such as "250 employees" or "1,200 team members". */
const EMPLOYEE_COUNT_PATTERN = /(\d[\d,]*)\s*(employees?|people|team members?|staff)/i;

// ─── Browser singleton ───────────────────────────────────────

let _browser: Browser | null = null;

/**
 * Returns the shared headless Chromium instance, launching it when there is
 * none yet or the previous one has disconnected.
 *
 * NOTE(review): two callers that arrive before the first launch resolves will
 * each launch a browser and the earlier handle is overwritten — confirm
 * whether callers can overlap.
 */
async function getBrowser(): Promise<Browser> {
  if (!_browser || !_browser.isConnected()) {
    _browser = await chromium.launch({
      headless: true,
      args: ["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage"],
    });
  }
  return _browser;
}

/** Closes the shared browser, if any. Safe to call when none is open. */
export async function closeBrowser(): Promise<void> {
  const browser = _browser;
  // Drop the cached handle first so a failing close() cannot leave a dead
  // browser cached.
  _browser = null;
  if (browser) {
    await browser.close();
  }
}

// ─── Main scraper ─────────────────────────────────────────────

/**
 * Scrapes a company website for ICP-relevant signals.
 *
 * Waits on `playwrightLimiter` before doing any work and only requests a
 * small fixed set of paths: the homepage, `/about`, and the first of
 * `/careers`, `/jobs`, `/work-with-us` that yields postings. No robots.txt
 * check is performed here.
 *
 * Never rejects because of a page failure: if the homepage cannot be loaded
 * the error is logged as a warning and the result is returned with whatever
 * fields were filled in so far. A missing about or jobs page is ignored.
 *
 * @param domain Bare host name without scheme, e.g. `"example.com"`.
 * @returns The collected signals; fields that could not be determined keep
 *   their `null` / empty defaults.
 */
export async function scrapeCompanyWebsite(domain: string): Promise<ScrapedCompany> {
  await playwrightLimiter.consume("playwright");

  const browser = await getBrowser();
  const context = await browser.newContext({
    userAgent: "Mozilla/5.0 (compatible; ResearchBot/1.0; +https://youragency.com/bot)",
    extraHTTPHeaders: { "Accept-Language": "en-US,en;q=0.9" },
  });

  const result: ScrapedCompany = {
    domain,
    name: null,
    description: null,
    employeeRange: null,
    employeeCount: null,
    industry: null,
    country: null,
    linkedinUrl: null,
    techStack: [],
    jobPostings: [],
    recentNews: [],
    websiteText: "",
    html: "",
    text: "",
    aiJobCount: 0,
  };

  try {
    await scrapeHomepage(context, domain, result);
    await scrapeAboutPage(context, domain, result);
    await scrapeJobsPages(context, domain, result);

    result.aiJobCount = result.jobPostings.filter((job) => job.hasAiSignal).length;

    logger.info(
      { domain, techStack: result.techStack.length, jobs: result.jobPostings.length },
      "Website scraped successfully"
    );
  } catch (err) {
    logger.warn({ domain, err }, "Website scrape partial failure");
  } finally {
    // Closing the context also disposes any page still open in it.
    await context.close();
  }

  return result;
}

// ── Homepage ─────────────────────────────────────────────────

/**
 * Loads the homepage and fills text, name, LinkedIn URL, tech stack and raw
 * HTML on `result`. Navigation errors propagate to the caller.
 */
async function scrapeHomepage(
  context: BrowserContext,
  domain: string,
  result: ScrapedCompany,
): Promise<void> {
  const homePage = await context.newPage();
  try {
    await homePage.goto(`https://${domain}`, {
      waitUntil: "domcontentloaded",
      timeout: 15_000,
    });

    const homeText = await homePage.evaluate(() => document.body.innerText);
    result.websiteText = homeText.slice(0, 3000);
    result.text = result.websiteText; // alias

    // Extract company name from title tag
    result.name = extractCompanyName(await homePage.title());

    // Find LinkedIn link on homepage; $eval rejects when nothing matches.
    result.linkedinUrl = await homePage
      .$eval('a[href*="linkedin.com/company"]', (el) => el.getAttribute("href"))
      .catch(() => null);

    // Tech stack detection from script/link tags
    const pageSource = await homePage.content();
    result.techStack = detectTechStack(pageSource);
    result.html = pageSource.slice(0, 10000); // raw HTML for pain detection
  } finally {
    await homePage.close();
  }
}

// ── About Page ───────────────────────────────────────────────

/**
 * Best-effort read of `/about`: fills description and employee count on
 * `result`. A missing or failing about page is ignored.
 */
async function scrapeAboutPage(
  context: BrowserContext,
  domain: string,
  result: ScrapedCompany,
): Promise<void> {
  const aboutPage = await context.newPage();
  const aboutUrl = `https://${domain}/about`;
  try {
    await aboutPage.goto(aboutUrl, { waitUntil: "domcontentloaded", timeout: 10_000 });
    const aboutText = await aboutPage.evaluate(() => document.body.innerText);

    result.description = extractDescription(aboutText);

    const employeeCount = extractEmployeeCount(aboutText);
    if (employeeCount !== null) {
      result.employeeCount = employeeCount;
    }
  } catch {
    // About page not found — that's fine
  } finally {
    await aboutPage.close();
  }
}

// ── Jobs Page ────────────────────────────────────────────────

/**
 * Tries the known careers paths in order and stores the postings from the
 * first one that yields any. Per-URL failures are ignored.
 */
async function scrapeJobsPages(
  context: BrowserContext,
  domain: string,
  result: ScrapedCompany,
): Promise<void> {
  const jobsUrls = [
    `https://${domain}/careers`,
    `https://${domain}/jobs`,
    `https://${domain}/work-with-us`,
  ];

  const jobsPage = await context.newPage();
  try {
    for (const jobUrl of jobsUrls) {
      try {
        await jobsPage.goto(jobUrl, { waitUntil: "domcontentloaded", timeout: 10_000 });
        const jobsText = await jobsPage.evaluate(() => document.body.innerText);
        result.jobPostings = extractJobPostings(jobsText, jobUrl);
        if (result.jobPostings.length) break;
      } catch {
        // Try next URL
      }
    }
  } finally {
    await jobsPage.close();
  }
}

// ─── Helpers ─────────────────────────────────────────────────

/**
 * Derives a company name from a page title by taking the part before the
 * first separator: a `|`, or a dash surrounded by whitespace. A bare hyphen
 * is not a separator, so "Coca-Cola | Home" yields "Coca-Cola".
 *
 * @returns The trimmed name, or `null` when the title has no usable text.
 */
function extractCompanyName(title: string): string | null {
  const [firstSegment = ""] = title.split(/\||\s[-–—]\s/);
  const name = firstSegment.trim();
  return name.length > 0 ? name : null;
}

/**
 * Parses the first head count mentioned in `text` (thousands separators
 * allowed), or returns `null` when none is found.
 */
function extractEmployeeCount(text: string): number | null {
  const digits = EMPLOYEE_COUNT_PATTERN.exec(text)?.[1];
  if (digits === undefined) return null;
  return parseInt(digits.replace(/,/g, ""), 10);
}

/**
 * Lists every TECH_STACK_SIGNALS entry that occurs anywhere in `html`
 * (case-insensitive substring match), without duplicates.
 */
function detectTechStack(html: string): string[] {
  const lower = html.toLowerCase();
  const found = TECH_STACK_SIGNALS.filter((tech) => lower.includes(tech));
  return [...new Set(found)];
}

/**
 * Builds a short description from page text: the first three sentences that
 * are between 31 and 299 characters long, joined by spaces. Returns an empty
 * string when no sentence qualifies.
 */
function extractDescription(text: string): string {
  // Take first 3 meaningful sentences
  const sentences = text
    .replace(/\n+/g, " ")
    .split(/(?<=[.!?])\s+/)
    .filter((sentence) => sentence.length > 30 && sentence.length < 300);
  return sentences.slice(0, 3).join(" ");
}

/**
 * Heuristically extracts job titles from the visible text of a careers page.
 *
 * Only the first 30 non-trivial lines (more than 5 characters after
 * trimming) are inspected, and at most 15 postings are returned.
 *
 * @param text Visible text of the page.
 * @param sourceUrl URL the text came from; recorded on every posting.
 */
function extractJobPostings(text: string, sourceUrl: string): JobPosting[] {
  const lines = text.split("\n").filter((line) => line.trim().length > 5);
  const postings: JobPosting[] = [];

  for (const line of lines.slice(0, 30)) {
    const title = line.trim();
    // Heuristic: job titles are usually 2-8 words
    const wordCount = title.split(/\s+/).length;
    if (wordCount >= 2 && wordCount <= 8) {
      postings.push({ title, url: sourceUrl, hasAiSignal: containsAiKeyword(line) });
    }
  }

  return postings.slice(0, 15);
}

/**
 * Returns true when the company shows any AI signal: at least one job
 * posting flagged with `hasAiSignal`, or an AI keyword in `websiteText`.
 */
export function hasAiSignals(company: ScrapedCompany): boolean {
  const hasAiJob = company.jobPostings.some((job) => job.hasAiSignal);
  return hasAiJob || containsAiKeyword(company.websiteText);
}