clienttarget / src /discovery /lib /web-scraper.ts
iDevBuddy
feat: Phase 1 — AI Client Acquisition System
bd28470
import { chromium, Browser, BrowserContext } from "playwright";
import { playwrightLimiter } from "../../shared/utils/rate-limiter";
import { logger } from "../../shared/utils/logger";
// ─── Types ───────────────────────────────────────────────────
/**
 * Signals collected for a single company by `scrapeCompanyWebsite`.
 *
 * Every field has an "empty" default (`null`, `[]`, `""` or `0`) so a
 * partially failed scrape still yields a complete object.
 */
export interface ScrapedCompany {
// Domain that was scraped, exactly as passed in by the caller.
domain: string;
// Company name derived from the homepage <title>; null when unavailable.
name: string | null;
// Short description built from the text of the /about page.
description: string | null;
// Not assigned by the scraper code visible in this file;
// presumably filled in by a later enrichment step — TODO confirm.
employeeRange: string | null;
// Headcount parsed from phrases such as "250 employees" on the /about page.
employeeCount: number | null;
// Not assigned by the scraper code visible in this file — TODO confirm source.
industry: string | null;
// Not assigned by the scraper code visible in this file — TODO confirm source.
country: string | null;
// href of the first homepage link containing "linkedin.com/company".
linkedinUrl: string | null;
// Entries of TECH_STACK_SIGNALS found in the homepage HTML.
techStack: string[];
// Candidate postings extracted from the first careers URL that yielded any.
jobPostings: JobPosting[];
// Not assigned by the scraper code visible in this file — TODO confirm source.
recentNews: string[];
// First 3000 characters of the homepage's visible text.
websiteText: string;
// First 10000 characters of the homepage markup.
html: string; // raw HTML for pain signal detection
// Always set to the same value as websiteText.
text: string; // alias for websiteText (used by auto-discovery)
// Number of entries in jobPostings whose hasAiSignal flag is true.
aiJobCount: number; // count of AI-related job postings
}
/**
 * One candidate job posting found on a careers page.
 */
export interface JobPosting {
// Trimmed line of page text that looked like a job title.
title: string;
// URL of the careers page the title was found on (not a per-posting link).
url: string;
// True when the title matched an entry of AI_KEYWORDS.
hasAiSignal: boolean;
}
// ─── AI signal keywords ──────────────────────────────────────
/**
 * Lower-case words and phrases treated as evidence that a company is
 * interested in AI or automation. Used to flag job postings and website text.
 */
const AI_KEYWORDS = [
  "automation",
  "artificial intelligence",
  "machine learning",
  "ai",
  "llm",
  "workflow automation",
  "robotic process",
  "rpa",
  "data pipeline",
  "digital transformation",
  "predictive analytics",
  "nlp",
];
/**
 * Lower-case product names searched for in page HTML to infer which
 * business tools a company uses. Order determines the order of reported hits.
 */
const TECH_STACK_SIGNALS = [
  "salesforce",
  "hubspot",
  "sap",
  "oracle",
  "dynamics",
  "zendesk",
  "servicenow",
  "workday",
  "netsuite",
  "quickbooks",
  "zoho",
  "slack",
  "jira",
  "notion",
  "monday.com",
  "asana",
];
// ─── Browser singleton ───────────────────────────────────────
// Shared Chromium instance; null until first use and after closeBrowser().
let _browser: Browser | null = null;
// Launch currently in progress, shared so concurrent callers start only one browser.
let _browserLaunch: Promise<Browser> | null = null;

/**
 * Returns the shared headless Chromium instance, launching it on first use
 * or when the previous instance is no longer connected.
 *
 * Callers that arrive while a launch is still pending await that same launch.
 * Previously each of them started its own browser and all but the last one
 * were dropped from `_browser`, so they could never be closed.
 *
 * @returns The connected shared browser.
 * @throws Whatever `chromium.launch` rejects with; the pending launch is
 *   cleared afterwards, so the next call retries.
 */
async function getBrowser(): Promise<Browser> {
  if (_browser?.isConnected()) {
    return _browser;
  }

  if (_browserLaunch === null) {
    _browserLaunch = chromium
      .launch({
        headless: true,
        // NOTE(review): sandbox and /dev/shm flags look aimed at containerised
        // hosts — confirm they are still wanted everywhere this runs.
        args: ["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage"],
      })
      .then((launched) => {
        _browser = launched;
        return launched;
      })
      .finally(() => {
        // Success or failure, the next caller must re-evaluate from scratch.
        _browserLaunch = null;
      });
  }

  return _browserLaunch;
}
/**
 * Shuts down the shared browser if one exists and clears the reference,
 * so a later scrape launches a fresh instance. Does nothing when no browser
 * has been created.
 */
export async function closeBrowser(): Promise<void> {
  if (_browser === null) {
    return;
  }
  await _browser.close();
  _browser = null;
}
// ─── Main scraper ─────────────────────────────────────────────
/**
 * Scrapes a company's public website for ICP-relevant signals.
 *
 * Visits the homepage, `/about`, and the first usable careers page
 * (`/careers`, `/jobs`, `/work-with-us`) inside a dedicated browser context.
 * The scrape is best-effort: a failure is logged and whatever was collected
 * up to that point is returned, so a `ScrapedCompany` is always produced once
 * the context exists.
 *
 * Pages answering with a non-2xx status are ignored. `page.goto()` does not
 * reject for 404/500 responses, so without this check error pages would be
 * parsed as if they were real content.
 *
 * Calls are throttled through `playwrightLimiter`.
 * NOTE(review): robots.txt is not consulted anywhere in this function — add
 * an explicit check if that is a requirement.
 *
 * @param domain Host name without scheme, e.g. `example.com`.
 * @returns Collected signals; fields that could not be determined keep their
 *   empty defaults (`null`, `[]`, `""`, `0`).
 * @throws Errors from the rate limiter, browser launch, or context creation
 *   and teardown propagate to the caller; they occur outside the guarded section.
 */
export async function scrapeCompanyWebsite(domain: string): Promise<ScrapedCompany> {
  await playwrightLimiter.consume("playwright");
  const browser = await getBrowser();
  const context = await browser.newContext({
    userAgent:
      "Mozilla/5.0 (compatible; ResearchBot/1.0; +https://youragency.com/bot)",
    extraHTTPHeaders: { "Accept-Language": "en-US,en;q=0.9" },
  });

  const result: ScrapedCompany = {
    domain,
    name: null,
    description: null,
    employeeRange: null,
    employeeCount: null,
    industry: null,
    country: null,
    linkedinUrl: null,
    techStack: [],
    jobPostings: [],
    recentNews: [],
    websiteText: "",
    html: "",
    text: "",
    aiJobCount: 0,
  };

  try {
    await scrapeHomepage(context, result);
    await scrapeAboutPage(context, result);
    await scrapeJobsPages(context, result);
    result.aiJobCount = result.jobPostings.filter((j) => j.hasAiSignal).length;
    logger.info(
      { domain, techStack: result.techStack.length, jobs: result.jobPostings.length },
      "Website scraped successfully"
    );
  } catch (err) {
    logger.warn({ domain, err }, "Website scrape partial failure");
  } finally {
    // Closing the context also disposes of any page a helper left open.
    await context.close();
  }
  return result;
}

/**
 * Loads the homepage and records its text, company name, LinkedIn link,
 * tech stack and raw HTML on `result`. `result` is mutated in place so that
 * fields gathered before a mid-way failure are kept.
 *
 * @throws When navigation fails or the homepage answers with a non-2xx status.
 */
async function scrapeHomepage(context: BrowserContext, result: ScrapedCompany): Promise<void> {
  const page = await context.newPage();
  try {
    const response = await page.goto(`https://${result.domain}`, {
      waitUntil: "domcontentloaded",
      timeout: 15_000,
    });
    if (!response?.ok()) {
      // An error or block page would only yield misleading signals.
      const status = response ? `HTTP ${response.status()}` : "no response";
      throw new Error(`Homepage not usable (${status})`);
    }

    const bodyText = await page.evaluate(() => document.body.innerText);
    result.websiteText = bodyText.slice(0, 3000);
    result.text = result.websiteText; // alias

    result.name = extractCompanyName(await page.title());

    // $eval rejects when no element matches; treat that as "no link".
    result.linkedinUrl = await page
      .$eval('a[href*="linkedin.com/company"]', (el) => el.getAttribute("href"))
      .catch(() => null);

    // Tech stack detection from script/link tags
    const pageSource = await page.content();
    result.techStack = detectTechStack(pageSource);
    result.html = pageSource.slice(0, 10000); // raw HTML for pain detection
  } finally {
    await page.close();
  }
}

/**
 * Derives a company name from a page title by keeping the part before the
 * first separator. A pipe always separates; a hyphen or dash only does so
 * when surrounded by whitespace, so hyphenated names such as "Rolls-Royce"
 * stay intact.
 *
 * @returns The trimmed name, or null when nothing is left.
 */
function extractCompanyName(title: string): string | null {
  const [head = ""] = title.split(/\||\s[-–—]\s/);
  const name = head.trim();
  return name.length > 0 ? name : null;
}

/**
 * Reads `/about` for a description and an employee count. The page is
 * optional: a missing, failing or non-2xx page leaves `result` untouched.
 */
async function scrapeAboutPage(context: BrowserContext, result: ScrapedCompany): Promise<void> {
  const page = await context.newPage();
  try {
    const response = await page.goto(`https://${result.domain}/about`, {
      waitUntil: "domcontentloaded",
      timeout: 10_000,
    });
    if (!response?.ok()) {
      return;
    }

    const aboutText = await page.evaluate(() => document.body.innerText);
    const description = extractDescription(aboutText);
    result.description = description.length > 0 ? description : null;

    const headcount = /(\d[\d,]*)\s*(?:employees?|people|team members?|staff)/i.exec(aboutText);
    const digits = headcount?.[1];
    if (digits !== undefined) {
      result.employeeCount = parseInt(digits.replace(/,/g, ""), 10);
    }
  } catch {
    // About page not reachable — that's fine, it is optional.
  } finally {
    await page.close();
  }
}

/**
 * Tries the usual careers URLs in order and stores the postings from the
 * first one that responds successfully and yields at least one posting.
 */
async function scrapeJobsPages(context: BrowserContext, result: ScrapedCompany): Promise<void> {
  const candidateUrls = [
    `https://${result.domain}/careers`,
    `https://${result.domain}/jobs`,
    `https://${result.domain}/work-with-us`,
  ];

  const page = await context.newPage();
  try {
    for (const jobsUrl of candidateUrls) {
      try {
        const response = await page.goto(jobsUrl, {
          waitUntil: "domcontentloaded",
          timeout: 10_000,
        });
        if (!response?.ok()) {
          continue; // e.g. a 404 page — its text is not a list of jobs
        }
        const jobsText = await page.evaluate(() => document.body.innerText);
        const postings = extractJobPostings(jobsText, jobsUrl);
        if (postings.length > 0) {
          result.jobPostings = postings;
          return;
        }
      } catch {
        // Navigation failed — try the next URL.
      }
    }
  } finally {
    await page.close();
  }
}
// ─── Helpers ─────────────────────────────────────────────────
/**
 * Lists the entries of TECH_STACK_SIGNALS that appear in the given HTML.
 * The HTML is lower-cased first and each signal is looked up as a plain
 * substring; results keep the order of TECH_STACK_SIGNALS, without duplicates.
 */
function detectTechStack(html: string): string[] {
  const haystack = html.toLowerCase();
  const hits = TECH_STACK_SIGNALS.filter((signal) => haystack.includes(signal));
  return Array.from(new Set(hits));
}
/**
 * Builds a short description from free text: newlines are collapsed to
 * spaces, the text is split after sentence-ending punctuation, and the first
 * three sentences longer than 30 and shorter than 300 characters are joined
 * with single spaces. Returns an empty string when none qualify.
 */
function extractDescription(text: string): string {
  const flattened = text.replace(/\n+/g, " ");
  const chosen: string[] = [];
  for (const candidate of flattened.split(/(?<=[.!?])\s+/)) {
    const tooShort = candidate.length <= 30;
    const tooLong = candidate.length >= 300;
    if (tooShort || tooLong) {
      continue;
    }
    chosen.push(candidate);
    if (chosen.length === 3) {
      break;
    }
  }
  return chosen.join(" ");
}
/** Escapes characters that have a special meaning inside a regular expression. */
function escapeRegExp(literal: string): string {
  return literal.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/**
 * Case-insensitive matcher for the entries of AI_KEYWORDS as whole words or
 * phrases, with an optional plural "s".
 *
 * Plain substring search is too loose for the short keywords: "ai" occurs
 * inside "email", "retail", "training" and "maintenance", which flagged almost
 * every page and posting. Compound spellings such as "GenAI" are not matched
 * by "ai"; they need their own entry in AI_KEYWORDS.
 */
const AI_KEYWORD_PATTERN = new RegExp(
  `\\b(?:${AI_KEYWORDS.map(escapeRegExp).join("|")})s?\\b`,
  "i"
);

/**
 * Heuristically extracts job postings from the visible text of a careers page.
 *
 * Only the first 30 lines with more than 5 non-blank characters are
 * considered; a line counts as a job title when it has 2 to 8 words. At most
 * 15 postings are returned.
 *
 * @param text Visible text of the careers page.
 * @param sourceUrl URL of that page; stored on every posting because
 *   per-posting links are not extracted.
 * @returns Candidate postings, each flagged when its title mentions an AI keyword.
 */
function extractJobPostings(text: string, sourceUrl: string): JobPosting[] {
  const candidateLines = text
    .split("\n")
    .filter((line) => line.trim().length > 5)
    .slice(0, 30);

  const postings: JobPosting[] = [];
  for (const line of candidateLines) {
    const title = line.trim();
    // Heuristic: job titles are usually 2-8 words
    const wordCount = title.split(/\s+/).length;
    if (wordCount < 2 || wordCount > 8) {
      continue;
    }
    postings.push({
      title,
      url: sourceUrl,
      hasAiSignal: AI_KEYWORD_PATTERN.test(title),
    });
  }
  return postings.slice(0, 15);
}

/**
 * Reports whether a scraped company shows any AI interest: either one of its
 * job postings is flagged, or its website text mentions an AI keyword.
 *
 * @param company Result of a website scrape.
 * @returns True when at least one AI signal is present.
 */
export function hasAiSignals(company: ScrapedCompany): boolean {
  return (
    company.jobPostings.some((posting) => posting.hasAiSignal) ||
    AI_KEYWORD_PATTERN.test(company.websiteText)
  );
}