Spaces:

dexakif
/

clienttarget

Running

clienttarget / src /discovery /lib /web-scraper.ts

iDevBuddy

feat: Phase 1 — AI Client Acquisition System

bd28470 3 days ago

7.84 kB

	import { chromium, Browser, BrowserContext } from "playwright";
	import { playwrightLimiter } from "../../shared/utils/rate-limiter";
	import { logger } from "../../shared/utils/logger";

	// ─── Types ───────────────────────────────────────────────────

	/**
	 * Everything `scrapeCompanyWebsite` collects about a single company.
	 *
	 * Every field starts at an empty default (`null`, `[]`, `""` or `0`) and is
	 * only filled in when the corresponding page could be read.
	 */
	export interface ScrapedCompany {
	  /** Domain the scrape was requested for, stored exactly as passed in. */
	  domain: string;
	  /** Company name derived from the homepage title; `null` until the homepage has been read. */
	  name: string | null;
	  /** Up to three sentences taken from the `/about` page text. */
	  description: string | null;
	  /** Not populated by the scraper in this file (presumably a head-count band — confirm with consumers). */
	  employeeRange: string | null;
	  /** Head count parsed from the `/about` page text, when a figure is found. */
	  employeeCount: number | null;
	  /** Not populated by the scraper in this file. */
	  industry: string | null;
	  /** Not populated by the scraper in this file. */
	  country: string | null;
	  /** `href` of the first `linkedin.com/company` link found on the homepage. */
	  linkedinUrl: string | null;
	  /** Entries of `TECH_STACK_SIGNALS` detected in the homepage HTML. */
	  techStack: string[];
	  /** Candidate job titles from the first careers-style page that yields any. */
	  jobPostings: JobPosting[];
	  /** Not populated by the scraper in this file. */
	  recentNews: string[];
	  /** First 3000 characters of the homepage's `document.body.innerText`. */
	  websiteText: string;
	  /** First 10000 characters of the homepage HTML, kept for pain signal detection. */
	  html: string;
	  /** Alias for `websiteText` (used by auto-discovery). */
	  text: string;
	  /** Number of `jobPostings` entries whose `hasAiSignal` is true. */
	  aiJobCount: number;
	}

	/**
	 * One candidate job listing lifted from a careers-style page.
	 */
	export interface JobPosting {
	  /** Trimmed line of page text that is treated as the job title. */
	  title: string;
	  /** True when the title line contains one of the `AI_KEYWORDS`. */
	  hasAiSignal: boolean;
	  /** URL of the page the line was found on (not a per-posting link). */
	  url: string;
	}

	// ─── AI signal keywords ──────────────────────────────────────

	/**
	 * Lower-case words and phrases that count as an AI / automation signal.
	 * They are searched for in lower-cased job-title lines and homepage text.
	 */
	const AI_KEYWORDS = [
	  "automation",
	  "artificial intelligence",
	  "machine learning",
	  "ai",
	  "llm",
	  "workflow automation",
	  "robotic process",
	  "rpa",
	  "data pipeline",
	  "digital transformation",
	  "predictive analytics",
	  "nlp",
	];

	/**
	 * Lower-case product names looked for in the homepage HTML.
	 * The order here is the order in which matches are reported.
	 */
	const TECH_STACK_SIGNALS = [
	  "salesforce",
	  "hubspot",
	  "sap",
	  "oracle",
	  "dynamics",
	  "zendesk",
	  "servicenow",
	  "workday",
	  "netsuite",
	  "quickbooks",
	  "zoho",
	  "slack",
	  "jira",
	  "notion",
	  "monday.com",
	  "asana",
	];

	// ─── Browser singleton ───────────────────────────────────────

	/** The shared Chromium instance, or `null` when none has been launched (or it was closed). */
	let _browser: Browser | null = null;

	/** Launch currently in flight, shared so concurrent callers do not each start a browser. */
	let _browserLaunch: Promise<Browser> | null = null;

	/**
	 * Returns the shared headless Chromium instance, launching it on first use
	 * or when the previous instance is no longer connected.
	 *
	 * Callers that arrive while a launch is still pending receive the same
	 * promise; without this, each of them would launch its own browser and all
	 * but the last would be orphaned.
	 *
	 * @returns The connected shared browser.
	 * @throws Whatever `chromium.launch` rejects with; the next call retries.
	 */
	async function getBrowser(): Promise<Browser> {
	  if (_browser?.isConnected()) {
	    return _browser;
	  }

	  if (!_browserLaunch) {
	    _browserLaunch = chromium
	      .launch({
	        headless: true,
	        args: ["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage"],
	      })
	      .then((launched) => {
	        _browser = launched;
	        return launched;
	      })
	      .finally(() => {
	        // Clear the marker on success and failure so a failed launch can be retried.
	        _browserLaunch = null;
	      });
	  }

	  return _browserLaunch;
	}

	/**
	 * Closes the shared browser, if there is one.
	 *
	 * Waits for a pending launch first so that browser is closed as well, and
	 * clears the shared reference before closing so it is reset even when
	 * `close()` rejects.
	 */
	export async function closeBrowser(): Promise<void> {
	  if (_browserLaunch) {
	    // A failed launch leaves nothing to close; its error belongs to the launcher.
	    await _browserLaunch.catch(() => undefined);
	  }

	  const browser = _browser;
	  _browser = null;
	  if (browser) {
	    await browser.close();
	  }
	}

	// ─── Main scraper ─────────────────────────────────────────────

	/**
	 * Scrapes a company website for ICP-relevant signals.
	 *
	 * Consumes one `playwrightLimiter` token, then visits the homepage, `/about`
	 * and the first of `/careers`, `/jobs`, `/work-with-us` that yields job
	 * postings, all inside one short-lived browser context. Only these fixed
	 * paths are requested; robots.txt is not fetched or evaluated here.
	 *
	 * @param domain Host name to scrape; interpolated as `https://${domain}`.
	 * @returns The collected signals. A navigation or extraction failure is
	 *   logged as a warning and the fields gathered so far are returned.
	 * @throws Whatever the rate limiter, `getBrowser()`, `browser.newContext()`
	 *   or `context.close()` reject with — those run outside the `try` body.
	 */
	export async function scrapeCompanyWebsite(domain: string): Promise<ScrapedCompany> {
	  await playwrightLimiter.consume("playwright");

	  const browser = await getBrowser();
	  const context = await browser.newContext({
	    userAgent:
	      "Mozilla/5.0 (compatible; ResearchBot/1.0; +https://youragency.com/bot)",
	    extraHTTPHeaders: { "Accept-Language": "en-US,en;q=0.9" },
	  });

	  const result: ScrapedCompany = {
	    domain,
	    name: null,
	    description: null,
	    employeeRange: null,
	    employeeCount: null,
	    industry: null,
	    country: null,
	    linkedinUrl: null,
	    techStack: [],
	    jobPostings: [],
	    recentNews: [],
	    websiteText: "",
	    html: "",
	    text: "",
	    aiJobCount: 0,
	  };

	  try {
	    // ── Homepage ─────────────────────────────────────────────
	    const homePage = await context.newPage();
	    await homePage.goto(`https://${domain}`, {
	      waitUntil: "domcontentloaded",
	      timeout: 15_000,
	    });

	    // `document.body` can be null for documents without a <body>; treat that as no text.
	    const homeText = await homePage.evaluate(() => document.body?.innerText ?? "");
	    result.websiteText = homeText.slice(0, 3000);
	    result.text = result.websiteText; // alias

	    // Extract company name from title tag
	    result.name = companyNameFromTitle(await homePage.title());

	    // Find LinkedIn link on homepage; $eval rejects when nothing matches.
	    const linkedinHref = await homePage
	      .$eval('a[href*="linkedin.com/company"]', (el) => el.getAttribute("href"))
	      .catch(() => null);
	    result.linkedinUrl = linkedinHref ?? null;

	    // Tech stack detection from script/link tags
	    const pageSource = await homePage.content();
	    result.techStack = detectTechStack(pageSource);
	    result.html = pageSource.slice(0, 10000); // raw HTML for pain detection

	    await homePage.close();

	    // ── About Page ───────────────────────────────────────────
	    const aboutPage = await context.newPage();
	    try {
	      await aboutPage.goto(`https://${domain}/about`, {
	        waitUntil: "domcontentloaded",
	        timeout: 10_000,
	      });
	      const aboutText = await aboutPage.evaluate(() => document.body?.innerText ?? "");
	      result.description = extractDescription(aboutText);
	      result.employeeCount = parseEmployeeCount(aboutText);
	    } catch {
	      // About page not found — that's fine; the defaults stay in place.
	    } finally {
	      await aboutPage.close();
	    }

	    // ── Jobs Page ────────────────────────────────────────────
	    const jobsUrls = [
	      `https://${domain}/careers`,
	      `https://${domain}/jobs`,
	      `https://${domain}/work-with-us`,
	    ];
	    const jobsPage = await context.newPage();
	    try {
	      for (const jobUrl of jobsUrls) {
	        try {
	          await jobsPage.goto(jobUrl, { waitUntil: "domcontentloaded", timeout: 10_000 });
	          const jobsText = await jobsPage.evaluate(() => document.body?.innerText ?? "");
	          result.jobPostings = extractJobPostings(jobsText, jobUrl);
	          if (result.jobPostings.length > 0) break;
	        } catch {
	          // Try next URL
	        }
	      }
	    } finally {
	      await jobsPage.close();
	    }
	    result.aiJobCount = result.jobPostings.filter((job) => job.hasAiSignal).length;

	    logger.info({ domain, techStack: result.techStack.length, jobs: result.jobPostings.length },
	      "Website scraped successfully"
	    );
	  } catch (err) {
	    logger.warn({ domain, err }, "Website scrape partial failure");
	  } finally {
	    // Closing the context also disposes of any page left open by a failure above.
	    await context.close();
	  }

	  return result;
	}

	/**
	 * Derives a company name from a page title such as "Acme | Home" or
	 * "Acme - Industrial Widgets".
	 *
	 * Splits on `|` and on dashes surrounded by whitespace, so hyphenated names
	 * ("Coca-Cola") stay intact, and returns the first non-empty segment.
	 *
	 * @returns The name, or `null` when the title has no usable text.
	 */
	function companyNameFromTitle(title: string): string | null {
	  for (const segment of title.split(/\s*\|\s*|\s+[-–—]\s+/)) {
	    const candidate = segment.trim();
	    if (candidate.length > 0) return candidate;
	  }
	  return null;
	}

	/**
	 * Finds a head count such as "250 employees", "1,200 people" or "500+ staff"
	 * in free text.
	 *
	 * @returns The parsed number, or `null` when no such phrase is present.
	 */
	function parseEmployeeCount(text: string): number | null {
	  const match = text.match(
	    /(\d[\d,]*)\s*\+?\s*(?:employees?|people|team members?|staff)\b/i
	  );
	  const digits = match?.[1];
	  if (digits === undefined) return null;

	  const count = parseInt(digits.replace(/,/g, ""), 10);
	  return Number.isNaN(count) ? null : count;
	}

	// ─── Helpers ─────────────────────────────────────────────────

	/**
	 * Reports which `TECH_STACK_SIGNALS` products are mentioned in a page's HTML.
	 *
	 * Matching is case-insensitive and token-aware rather than a raw substring
	 * search, so "sap" is not reported for "whatsapp" or "disappear":
	 * - a product name must not be preceded by a letter or digit;
	 * - names of three characters or fewer must not be followed by one either.
	 *   Longer names may be a prefix, because vendors embed them in asset host
	 *   names (e.g. "zohocdn", "hubspotusercontent").
	 *
	 * @param html Raw page source.
	 * @returns Matching product names, de-duplicated, in `TECH_STACK_SIGNALS` order.
	 */
	function detectTechStack(html: string): string[] {
	  const lower = html.toLowerCase();

	  const matches = TECH_STACK_SIGNALS.filter((tech) => {
	    // Escape regex metacharacters so "monday.com" only matches a literal dot.
	    const escaped = tech.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
	    const trailingBoundary = tech.length <= 3 ? "(?![a-z0-9])" : "";
	    return new RegExp(`(?<![a-z0-9])${escaped}${trailingBoundary}`).test(lower);
	  });

	  return [...new Set(matches)];
	}

	/**
	 * Builds a short description from page text.
	 *
	 * Newlines are collapsed to spaces, the text is cut after sentence-ending
	 * punctuation, and the first three pieces longer than 30 and shorter than
	 * 300 characters are joined with single spaces.
	 *
	 * @returns The joined sentences, or an empty string when none qualify.
	 */
	function extractDescription(text: string): string {
	  const flattened = text.replace(/\n+/g, " ");
	  const picked: string[] = [];

	  for (const candidate of flattened.split(/(?<=[.!?])\s+/)) {
	    if (picked.length === 3) break;
	    if (candidate.length > 30 && candidate.length < 300) {
	      picked.push(candidate);
	    }
	  }

	  return picked.join(" ");
	}

	/**
	 * Heuristically pulls job titles out of the visible text of a careers page.
	 *
	 * Considers the first 30 lines that have more than 5 non-blank characters,
	 * keeps those that are 2–8 words long, and returns at most 15 of them.
	 * A posting is flagged with `hasAiSignal` when its title contains one of
	 * `AI_KEYWORDS` as a whole word or phrase (an optional plural "s" is
	 * allowed), so "ai" no longer matches inside "Maintenance" or "Retail".
	 *
	 * @param text Visible page text, one candidate per line.
	 * @param sourceUrl URL of the page; stored on every posting.
	 * @returns The candidate postings in page order.
	 */
	function extractJobPostings(text: string, sourceUrl: string): JobPosting[] {
	  // Build one whole-word matcher up front instead of scanning every keyword per line.
	  const escapedKeywords = AI_KEYWORDS.map((kw) =>
	    kw.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")
	  );
	  const aiPattern =
	    escapedKeywords.length > 0
	      ? new RegExp(`(?<![a-z0-9])(?:${escapedKeywords.join("|")})s?(?![a-z0-9])`)
	      : null;

	  const candidates = text
	    .split("\n")
	    .filter((l) => l.trim().length > 5)
	    .slice(0, 30);

	  const postings: JobPosting[] = [];
	  for (const line of candidates) {
	    const title = line.trim();

	    // Heuristic: job titles are short — accept 2 to 8 words.
	    const wordCount = title.split(/\s+/).length;
	    if (wordCount < 2 || wordCount > 8) continue;

	    const hasAiSignal = aiPattern !== null && aiPattern.test(title.toLowerCase());
	    postings.push({ title, url: sourceUrl, hasAiSignal });
	  }

	  return postings.slice(0, 15);
	}

	/**
	 * Tells whether a scraped company shows any AI / automation interest.
	 *
	 * True when at least one job posting is flagged with `hasAiSignal`, or when
	 * the homepage text contains one of `AI_KEYWORDS` as a whole word or phrase
	 * (case-insensitive, optional plural "s"). Whole-word matching keeps the
	 * short keyword "ai" from matching inside words such as "email" or
	 * "available", which would otherwise make nearly every site qualify.
	 *
	 * @param company Result of a website scrape.
	 * @returns `true` when either signal is present.
	 */
	export function hasAiSignals(company: ScrapedCompany): boolean {
	  if (company.jobPostings.some((job) => job.hasAiSignal)) {
	    return true;
	  }
	  if (AI_KEYWORDS.length === 0) {
	    return false;
	  }

	  const escapedKeywords = AI_KEYWORDS.map((kw) =>
	    kw.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")
	  );
	  const aiPattern = new RegExp(
	    `(?<![a-z0-9])(?:${escapedKeywords.join("|")})s?(?![a-z0-9])`
	  );
	  return aiPattern.test(company.websiteText.toLowerCase());
	}