Spaces:
Running
Running
| import { chromium, Browser, BrowserContext } from "playwright"; | |
| import { playwrightLimiter } from "../../shared/utils/rate-limiter"; | |
| import { logger } from "../../shared/utils/logger"; | |
// ─── Types ───────────────────────────────────────────────────
/**
 * Signals collected for a single company by `scrapeCompanyWebsite`.
 *
 * `null` / empty values mean "not found", not "known to be absent".
 */
export interface ScrapedCompany {
  /** Domain that was scraped, exactly as supplied by the caller. */
  domain: string;
  /** Company name derived from the homepage `<title>`, or `null`. */
  name: string | null;
  /** Short description built from the `/about` page text, or `null`. */
  description: string | null;
  /** NOTE(review): not populated by the scraper code visible in this module. */
  employeeRange: string | null;
  /** Head count parsed from the `/about` page text, or `null`. */
  employeeCount: number | null;
  /** NOTE(review): not populated by the scraper code visible in this module. */
  industry: string | null;
  /** NOTE(review): not populated by the scraper code visible in this module. */
  country: string | null;
  /** `href` of the first `linkedin.com/company` link on the homepage, or `null`. */
  linkedinUrl: string | null;
  /** Names from the tech-stack signal list found in the homepage HTML. */
  techStack: string[];
  /** Candidate job postings extracted from the careers/jobs page. */
  jobPostings: JobPosting[];
  /** NOTE(review): not populated by the scraper code visible in this module. */
  recentNews: string[];
  /** Leading slice (3,000 chars) of the homepage's visible text. */
  websiteText: string;
  /** Leading slice (10,000 chars) of the raw homepage HTML, for pain signal detection. */
  html: string;
  /** Alias for `websiteText` (used by auto-discovery). */
  text: string;
  /** Count of entries in `jobPostings` whose `hasAiSignal` is true. */
  aiJobCount: number;
}
/** One candidate job posting extracted from a careers/jobs page. */
export interface JobPosting {
  /** Posting title as it appeared on the page (trimmed line of text). */
  title: string;
  /** URL of the page the title was found on (not a per-posting deep link). */
  url: string;
  /** True when the title mentions one of the AI/automation keywords. */
  hasAiSignal: boolean;
}
// ─── AI signal keywords ──────────────────────────────────────
/**
 * Keywords and phrases treated as AI/automation signals when they appear in
 * job titles or website text. Keep entries lowercase.
 *
 * NOTE(review): short entries such as "ai" also occur inside ordinary words
 * ("email", "maintain"), so consumers should match on word boundaries rather
 * than raw substrings.
 */
const AI_KEYWORDS = [
  "automation",
  "artificial intelligence",
  "machine learning",
  "ai",
  "llm",
  "workflow automation",
  "robotic process",
  "rpa",
  "data pipeline",
  "digital transformation",
  "predictive analytics",
  "nlp",
];
/**
 * Product/vendor names looked for in the homepage HTML to infer which
 * business tools a company uses. Keep entries lowercase.
 */
const TECH_STACK_SIGNALS = [
  // CRM / ERP / support suites
  "salesforce",
  "hubspot",
  "sap",
  "oracle",
  "dynamics",
  "zendesk",
  // HR / finance / back office
  "servicenow",
  "workday",
  "netsuite",
  "quickbooks",
  "zoho",
  // Collaboration / project tracking
  "slack",
  "jira",
  "notion",
  "monday.com",
  "asana",
];
// ─── Browser singleton ───────────────────────────────────────
/** Shared Chromium instance, or `null` when none has been launched / it was closed. */
let _browser: Browser | null = null;

/** In-flight launch shared by concurrent callers; `null` when no launch is pending. */
let _browserLaunch: Promise<Browser> | null = null;

/**
 * Returns the shared headless Chromium instance, launching it on first use
 * or after the previous instance has disconnected.
 *
 * Concurrent callers that arrive while a launch is still pending all await
 * the same launch, so only one browser process is ever started per cycle
 * (previously each caller launched its own and all but the last leaked).
 *
 * @returns The connected shared browser.
 * @throws Whatever `chromium.launch` rejects with; the next call retries.
 */
async function getBrowser(): Promise<Browser> {
  if (_browser?.isConnected()) {
    return _browser;
  }

  if (_browserLaunch) {
    return _browserLaunch;
  }

  const launch = (async (): Promise<Browser> => {
    try {
      const launched = await chromium.launch({
        headless: true,
        args: ["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage"],
      });
      _browser = launched;
      return launched;
    } finally {
      // Clear the pending marker on success and failure alike so a failed
      // launch does not poison every later call.
      _browserLaunch = null;
    }
  })();

  _browserLaunch = launch;
  return launch;
}
/**
 * Closes the shared browser, if one exists. Safe to call when no browser
 * has been launched.
 *
 * The module-level reference is detached *before* awaiting `close()` so that
 * code running while the shutdown is in progress never gets handed a browser
 * that is being torn down, and so the reference is cleared even if `close()`
 * rejects.
 *
 * @throws Whatever `Browser.close()` rejects with.
 */
export async function closeBrowser(): Promise<void> {
  const browser = _browser;
  if (!browser) {
    return;
  }
  _browser = null;
  await browser.close();
}
// ─── Main scraper ────────────────────────────────────────────
/**
 * Scrapes a company website for ICP-relevant signals.
 *
 * Visits, in order: the homepage, `/about`, and then `/careers`, `/jobs`,
 * `/work-with-us` until one of them yields job postings. Scraping is
 * best-effort: a failure on any page is logged or ignored and whatever was
 * collected up to that point is returned.
 *
 * Calls are throttled through `playwrightLimiter`. Only these fixed paths are
 * requested (no sitemap crawling). Note that robots.txt is NOT fetched or
 * evaluated by this function.
 *
 * @param domain - Bare host name such as `example.com` (no scheme, no path).
 * @returns The collected signals; fields that could not be determined keep
 *   their `null` / empty defaults.
 * @throws If the rate limiter, browser launch, or context creation/close
 *   fails. Page-level errors never propagate.
 */
export async function scrapeCompanyWebsite(domain: string): Promise<ScrapedCompany> {
  await playwrightLimiter.consume("playwright");
  const browser = await getBrowser();
  const context = await browser.newContext({
    userAgent:
      "Mozilla/5.0 (compatible; ResearchBot/1.0; +https://youragency.com/bot)",
    extraHTTPHeaders: { "Accept-Language": "en-US,en;q=0.9" },
  });

  const result: ScrapedCompany = {
    domain,
    name: null,
    description: null,
    employeeRange: null,
    employeeCount: null,
    industry: null,
    country: null,
    linkedinUrl: null,
    techStack: [],
    jobPostings: [],
    recentNews: [],
    websiteText: "",
    html: "",
    text: "",
    aiJobCount: 0,
  };

  try {
    await scrapeHomepageInto(context, domain, result);
    await scrapeAboutPageInto(context, domain, result);
    await scrapeJobsPagesInto(context, domain, result);

    result.aiJobCount = result.jobPostings.filter((j) => j.hasAiSignal).length;
    logger.info(
      { domain, techStack: result.techStack.length, jobs: result.jobPostings.length },
      "Website scraped successfully"
    );
  } catch (err: unknown) {
    logger.warn({ domain, err }, "Website scrape partial failure");
  } finally {
    // Closing the context also disposes any page a helper left open.
    await context.close();
  }

  return result;
}

/**
 * Derives a company name from a page title by keeping the part before the
 * first `|`, or before the first dash that is surrounded by whitespace.
 * Dashes inside a word are kept so hyphenated names ("Coca-Cola") survive.
 * Returns `null` when nothing usable remains.
 */
function companyNameFromTitle(title: string): string | null {
  const [head = ""] = title.split(/\s*\|\s*|\s+[-–—]\s+/);
  const name = head.trim();
  return name.length > 0 ? name : null;
}

/**
 * Loads the homepage and fills text, name, LinkedIn URL, tech stack and HTML.
 * Writes into `result` field by field so data gathered before a mid-page
 * failure is kept. Errors propagate to the caller.
 */
async function scrapeHomepageInto(
  context: BrowserContext,
  domain: string,
  result: ScrapedCompany
): Promise<void> {
  const page = await context.newPage();
  try {
    await page.goto(`https://${domain}`, {
      waitUntil: "domcontentloaded",
      timeout: 15_000,
    });

    const homeText = await page.evaluate(() => document.body?.innerText ?? "");
    result.websiteText = homeText.slice(0, 3000);
    result.text = result.websiteText; // alias

    result.name = companyNameFromTitle(await page.title());

    // $eval rejects when no element matches; treat that as "no LinkedIn link".
    result.linkedinUrl = await page
      .$eval('a[href*="linkedin.com/company"]', (el) => el.getAttribute("href"))
      .catch(() => null);

    // Tech stack is detected on the full source; only a prefix is stored.
    const pageSource = await page.content();
    result.techStack = detectTechStack(pageSource);
    result.html = pageSource.slice(0, 10000); // raw HTML for pain detection
  } finally {
    await page.close();
  }
}

/**
 * Loads `/about` and fills description and employee count.
 * A missing or failing about page is expected and silently skipped.
 */
async function scrapeAboutPageInto(
  context: BrowserContext,
  domain: string,
  result: ScrapedCompany
): Promise<void> {
  const page = await context.newPage();
  try {
    const response = await page.goto(`https://${domain}/about`, {
      waitUntil: "domcontentloaded",
      timeout: 10_000,
    });
    // goto resolves normally for 4xx/5xx responses, so a "not found" page
    // must be rejected explicitly or its text becomes the description.
    if (response && !response.ok()) {
      return;
    }

    const aboutText = await page.evaluate(() => document.body?.innerText ?? "");
    result.description = extractDescription(aboutText) || null;

    // Accepts "250 employees", "1,200 people", "10,000+ staff"; the trailing
    // \b keeps "staffing" from matching "staff".
    const empMatch = aboutText.match(
      /(\d[\d,]*)\+?\s*(?:employees?|people|team members?|staff)\b/i
    );
    const digits = empMatch?.[1];
    if (digits !== undefined) {
      result.employeeCount = parseInt(digits.replace(/,/g, ""), 10);
    }
  } catch {
    // About page not found or unreachable — that's fine (best effort).
  } finally {
    await page.close();
  }
}

/**
 * Tries the usual careers paths in order and stores the postings from the
 * first one that yields any. Per-URL failures move on to the next URL.
 */
async function scrapeJobsPagesInto(
  context: BrowserContext,
  domain: string,
  result: ScrapedCompany
): Promise<void> {
  const jobsUrls = [
    `https://${domain}/careers`,
    `https://${domain}/jobs`,
    `https://${domain}/work-with-us`,
  ];

  const page = await context.newPage();
  try {
    for (const jobUrl of jobsUrls) {
      try {
        const response = await page.goto(jobUrl, {
          waitUntil: "domcontentloaded",
          timeout: 10_000,
        });
        // Skip HTTP error pages; otherwise their boilerplate lines are taken
        // for job titles and the remaining URLs are never tried.
        if (response && !response.ok()) {
          continue;
        }

        const jobsText = await page.evaluate(() => document.body?.innerText ?? "");
        result.jobPostings = extractJobPostings(jobsText, jobUrl);
        if (result.jobPostings.length > 0) {
          break;
        }
      } catch {
        // Try next URL
      }
    }
  } finally {
    await page.close();
  }
}
// ─── Helpers ─────────────────────────────────────────────────
/**
 * Returns the entries of `TECH_STACK_SIGNALS` that are mentioned in `html`,
 * in list order and without duplicates.
 *
 * Matching is case-insensitive and requires that the name is not embedded in
 * a longer alphanumeric run, so "sap" matches `sap.com` or `sap-ui` but not
 * "whatsapp" or "disappear". Separators such as `.`, `-`, `_`, `/` and quotes
 * count as boundaries, which keeps script URLs and CSS class names matching.
 *
 * @param html - Raw page source to inspect.
 * @returns Matching signal names (as spelled in `TECH_STACK_SIGNALS`).
 */
function detectTechStack(html: string): string[] {
  return TECH_STACK_SIGNALS.filter((tech) => {
    // Escape regex metacharacters (e.g. the dot in "monday.com").
    const escaped = tech.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    return new RegExp(`(?<![a-z0-9])${escaped}(?![a-z0-9])`, "i").test(html);
  });
}
/**
 * Builds a short description from free-form page text by joining the first
 * three "meaningful" sentences (between 31 and 299 characters long).
 *
 * All whitespace runs (newlines, tabs, CR, repeated spaces) are collapsed to
 * a single space and the text is trimmed first, so the result never starts
 * or ends with whitespace and layout whitespace does not distort the length
 * filter.
 *
 * @param text - Visible text of a page (e.g. `document.body.innerText`).
 * @returns Up to three sentences joined by single spaces, or `""` when no
 *   sentence qualifies.
 */
function extractDescription(text: string): string {
  return text
    .replace(/\s+/g, " ")
    .trim()
    .split(/(?<=[.!?])\s+/) // break after sentence-ending punctuation
    .filter((sentence) => sentence.length > 30 && sentence.length < 300)
    .slice(0, 3)
    .join(" ");
}
/**
 * Builds one case-insensitive pattern that matches any `AI_KEYWORDS` entry
 * as a standalone word or phrase, i.e. not embedded in a longer alphanumeric
 * run. Plain substring matching made "ai" fire on "Email", "Maintenance",
 * "Retail", "Training", and so on.
 */
function buildAiKeywordPattern(): RegExp {
  const alternatives = AI_KEYWORDS
    .map((kw) => kw.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"))
    .join("|");
  return new RegExp(`(?<![a-z0-9])(?:${alternatives})(?![a-z0-9])`, "i");
}

/** Shared matcher for AI keywords; no `g` flag, so `.test()` is stateless. */
const AI_KEYWORD_PATTERN = buildAiKeywordPattern();

/**
 * Heuristically extracts job postings from the visible text of a careers page.
 *
 * Only the first 30 non-trivial lines (more than 5 characters after trimming)
 * are considered; a line is taken as a job title when it has 2–8 words. At
 * most 15 postings are returned.
 *
 * @param text - Visible page text, one candidate per line.
 * @param sourceUrl - URL of the page; stored on every posting.
 * @returns Candidate postings in page order, each flagged with whether its
 *   title mentions an AI keyword as a whole word/phrase.
 */
function extractJobPostings(text: string, sourceUrl: string): JobPosting[] {
  const candidates = text
    .split("\n")
    .filter((line) => line.trim().length > 5)
    .slice(0, 30);

  const postings: JobPosting[] = [];
  for (const line of candidates) {
    const title = line.trim();
    // Heuristic: job titles are usually short — accept 2 to 8 words.
    const wordCount = title.split(/\s+/).length;
    if (wordCount < 2 || wordCount > 8) {
      continue;
    }
    postings.push({
      title,
      url: sourceUrl,
      hasAiSignal: AI_KEYWORD_PATTERN.test(title),
    });
  }
  return postings.slice(0, 15);
}

/**
 * Reports whether a scraped company shows any AI/automation signal: either
 * one of its job postings is flagged, or its homepage text mentions an AI
 * keyword as a whole word/phrase.
 *
 * @param company - Result of a website scrape.
 * @returns `true` when at least one signal is present.
 */
export function hasAiSignals(company: ScrapedCompany): boolean {
  return (
    company.jobPostings.some((posting) => posting.hasAiSignal) ||
    AI_KEYWORD_PATTERN.test(company.websiteText)
  );
}