Spaces:

dexakif
/

clienttarget

Running

File size: 7,843 Bytes

bd28470

import { chromium, Browser, BrowserContext } from "playwright";
import { playwrightLimiter } from "../../shared/utils/rate-limiter";
import { logger } from "../../shared/utils/logger";

// ─── Types ───────────────────────────────────────────────────

/**
 * Everything the website scraper collects for one company.
 *
 * `scrapeCompanyWebsite` starts from an "empty" record (nulls, empty arrays,
 * empty strings, zero) and fills in fields as each page is scraped, so any
 * field may still hold its empty value after a partial failure.
 */
export interface ScrapedCompany {
  // Domain that was passed to the scraper.
  domain: string;
  // Derived from the homepage <title>; null until the homepage has been scraped.
  name: string | null;
  // Up to three sentences taken from the /about page text.
  description: string | null;
  // NOTE(review): not assigned by the scraper code visible in this file —
  // presumably filled in by another enrichment step; confirm.
  employeeRange: string | null;
  // Parsed from phrases such as "1,200 employees" on the /about page.
  employeeCount: number | null;
  // NOTE(review): not assigned by the scraper code visible in this file; confirm source.
  industry: string | null;
  // NOTE(review): not assigned by the scraper code visible in this file; confirm source.
  country: string | null;
  // href of the first homepage link containing "linkedin.com/company", if any.
  linkedinUrl: string | null;
  // Entries of TECH_STACK_SIGNALS found in the homepage HTML.
  techStack: string[];
  // Candidate job titles extracted from the careers/jobs page (at most 15).
  jobPostings: JobPosting[];
  // NOTE(review): not assigned by the scraper code visible in this file; confirm source.
  recentNews: string[];
  // First 3000 characters of the homepage's visible text.
  websiteText: string;
  html: string;           // first 10000 characters of raw homepage HTML, for pain signal detection
  text: string;           // alias for websiteText (used by auto-discovery)
  aiJobCount: number;     // number of entries in jobPostings with hasAiSignal set
}

/** One candidate job posting extracted heuristically from a careers page. */
export interface JobPosting {
  // Trimmed line of page text that looked like a job title.
  title: string;
  // URL of the careers/jobs page the line was found on (not a per-posting link).
  url: string;
  // True when the title mentions one of the AI_KEYWORDS.
  hasAiSignal: boolean;
}

// ─── AI signal keywords ──────────────────────────────────────

/**
 * Lower-case words and phrases treated as an AI / automation signal when they
 * show up in a job title or in the homepage text.
 */
const AI_KEYWORDS = [
  "automation",
  "artificial intelligence",
  "machine learning",
  "ai",
  "llm",
  "workflow automation",
  "robotic process",
  "rpa",
  "data pipeline",
  "digital transformation",
  "predictive analytics",
  "nlp",
];

/**
 * Lower-case vendor / product names that are searched for in the homepage
 * HTML to guess which business tools a company uses.
 */
const TECH_STACK_SIGNALS = [
  "salesforce",
  "hubspot",
  "sap",
  "oracle",
  "dynamics",
  "zendesk",
  "servicenow",
  "workday",
  "netsuite",
  "quickbooks",
  "zoho",
  "slack",
  "jira",
  "notion",
  "monday.com",
  "asana",
];

// ─── Browser singleton ───────────────────────────────────────

// Shared Chromium instance; null until first use and after closeBrowser().
let _browser: Browser | null = null;

// Launch currently in progress, if any. Callers that arrive while Chromium is
// still starting await this instead of starting a second browser.
let _browserLaunch: Promise<Browser> | null = null;

/**
 * Returns the shared headless Chromium instance, launching it on first use or
 * after the previous instance has disconnected.
 *
 * Concurrent callers share a single in-flight launch. Without that, every
 * caller arriving before the first launch resolved would start its own
 * browser and all but the last one assigned to `_browser` would be orphaned.
 *
 * @returns The connected browser.
 * @throws Whatever `chromium.launch()` rejects with; a failed launch is not
 *         cached, so the next call retries.
 */
async function getBrowser(): Promise<Browser> {
  if (_browser?.isConnected()) {
    return _browser;
  }
  if (_browserLaunch) {
    return _browserLaunch;
  }

  const launch = chromium.launch({
    headless: true,
    args: ["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage"],
  });
  _browserLaunch = launch;

  try {
    _browser = await launch;
    return _browser;
  } finally {
    // Cleared on success and on failure so a failed launch can be retried.
    _browserLaunch = null;
  }
}

/**
 * Shuts down the shared browser, if one exists, and forgets the reference so
 * the next request for a browser launches a fresh one. Does nothing when no
 * browser is held.
 */
export async function closeBrowser(): Promise<void> {
  if (_browser === null) {
    return;
  }

  await _browser.close();
  _browser = null;
}

// ─── Main scraper ─────────────────────────────────────────────

/**
 * Scrapes a company website for ICP-relevant signals.
 *
 * Visits the homepage, `/about`, and then `/careers`, `/jobs`, `/work-with-us`
 * in order until one of them yields job postings. Each call first consumes one
 * token from `playwrightLimiter`. Only this fixed set of paths is requested
 * (no crawling); robots.txt is not consulted by this function.
 *
 * The scrape is best-effort: a failure while scraping is logged as a warning
 * and whatever was collected up to that point is returned, with the remaining
 * fields left at their empty defaults.
 *
 * @param domain Bare host name (no scheme or path); it is interpolated into
 *               `https://${domain}`.
 * @returns The populated (possibly partial) company record.
 * @throws If the rate limiter, browser launch, or context creation/teardown
 *         fails — those happen outside the best-effort section.
 */
export async function scrapeCompanyWebsite(domain: string): Promise<ScrapedCompany> {
  await playwrightLimiter.consume("playwright");

  const browser = await getBrowser();
  const context = await browser.newContext({
    userAgent:
      "Mozilla/5.0 (compatible; ResearchBot/1.0; +https://youragency.com/bot)",
    extraHTTPHeaders: { "Accept-Language": "en-US,en;q=0.9" },
  });

  const result: ScrapedCompany = {
    domain,
    name: null,
    description: null,
    employeeRange: null,
    employeeCount: null,
    industry: null,
    country: null,
    linkedinUrl: null,
    techStack: [],
    jobPostings: [],
    recentNews: [],
    websiteText: "",
    html: "",
    text: "",
    aiJobCount: 0,
  };

  try {
    await scrapeHomepage(context, domain, result);
    await scrapeAboutPage(context, domain, result);
    await scrapeJobsPages(context, domain, result);

    logger.info({ domain, techStack: result.techStack.length, jobs: result.jobPostings.length },
      "Website scraped successfully"
    );
  } catch (err) {
    logger.warn({ domain, err }, "Website scrape partial failure");
  } finally {
    // Closing the context also closes any page a helper left open.
    await context.close();
  }

  return result;
}

/**
 * Loads the homepage and records its text, company name, LinkedIn link, tech
 * stack and raw HTML on `result`. Fields are written as soon as they are known
 * so that a later failure still leaves the earlier ones in place.
 */
async function scrapeHomepage(
  context: BrowserContext,
  domain: string,
  result: ScrapedCompany
): Promise<void> {
  const page = await context.newPage();
  try {
    await page.goto(`https://${domain}`, {
      waitUntil: "domcontentloaded",
      timeout: 15_000,
    });

    const homeText = await page.evaluate(() => document.body.innerText);
    result.websiteText = homeText.slice(0, 3000);
    result.text = result.websiteText; // alias

    // Company name = the part of <title> before the first "|" or "-".
    const title = await page.title();
    const [titleHead = ""] = title.split(/[|-]/, 1);
    // `||` is deliberate: an empty title means "unknown", which is null here.
    result.name = titleHead.trim() || null;

    // $eval rejects when no element matches; treat that as "no LinkedIn link".
    result.linkedinUrl = await page
      .$eval('a[href*="linkedin.com/company"]', (el) => el.getAttribute("href"))
      .catch(() => null);

    // Tech stack detection runs over the full page source.
    const pageSource = await page.content();
    result.techStack = detectTechStack(pageSource);
    result.html = pageSource.slice(0, 10000); // raw HTML for pain detection
  } finally {
    await page.close();
  }
}

/**
 * Loads `/about` and records a short description and, when the text mentions
 * one, an employee count. A missing or unreachable page is not an error.
 */
async function scrapeAboutPage(
  context: BrowserContext,
  domain: string,
  result: ScrapedCompany
): Promise<void> {
  const page = await context.newPage();
  try {
    const response = await page.goto(`https://${domain}/about`, {
      waitUntil: "domcontentloaded",
      timeout: 10_000,
    });
    // goto() resolves for 404/500 responses too, so without this check the
    // site's error page would be parsed as the company description.
    if (response && !response.ok()) {
      return;
    }

    const aboutText = await page.evaluate(() => document.body.innerText);
    result.description = extractDescription(aboutText);

    const empMatch = aboutText.match(/(\d[\d,]*)\s*(employees?|people|team members?|staff)/i);
    if (empMatch) {
      result.employeeCount = parseInt(empMatch[1].replace(/,/g, ""), 10);
    }
  } catch {
    // About page unreachable or unreadable — that's fine
  } finally {
    await page.close();
  }
}

/**
 * Tries the usual careers URLs in order and records the postings from the
 * first one that yields any, then derives `aiJobCount` from them.
 */
async function scrapeJobsPages(
  context: BrowserContext,
  domain: string,
  result: ScrapedCompany
): Promise<void> {
  const jobsUrls = [
    `https://${domain}/careers`,
    `https://${domain}/jobs`,
    `https://${domain}/work-with-us`,
  ];

  const page = await context.newPage();
  try {
    for (const jobUrl of jobsUrls) {
      try {
        const response = await page.goto(jobUrl, {
          waitUntil: "domcontentloaded",
          timeout: 10_000,
        });
        // Skip error pages: goto() does not reject on 4xx/5xx, and the lines of
        // a "not found" page would otherwise be recorded as job titles.
        if (response && !response.ok()) {
          continue;
        }

        const jobsText = await page.evaluate(() => document.body.innerText);
        result.jobPostings = extractJobPostings(jobsText, jobUrl);
        if (result.jobPostings.length) break;
      } catch {
        // Try next URL
      }
    }
  } finally {
    await page.close();
  }

  result.aiJobCount = result.jobPostings.filter((j) => j.hasAiSignal).length;
}

// ─── Helpers ─────────────────────────────────────────────────

/**
 * Lists the entries of TECH_STACK_SIGNALS that occur anywhere in the given
 * HTML, compared case-insensitively as plain substrings.
 *
 * @param html Page source to search.
 * @returns Matching signal names, in TECH_STACK_SIGNALS order, without duplicates.
 */
function detectTechStack(html: string): string[] {
  const haystack = html.toLowerCase();
  const hits = TECH_STACK_SIGNALS.filter((signal) => haystack.includes(signal));
  return Array.from(new Set(hits));
}

/**
 * Builds a short description from free text: newlines are collapsed to
 * spaces, the text is split after sentence-ending punctuation, and the first
 * three sentences whose length is strictly between 30 and 300 characters are
 * joined with single spaces.
 *
 * @param text Visible page text.
 * @returns The joined sentences, or an empty string when none qualify.
 */
function extractDescription(text: string): string {
  const flattened = text.replace(/\n+/g, " ");
  const picked: string[] = [];

  for (const candidate of flattened.split(/(?<=[.!?])\s+/)) {
    const tooShort = candidate.length <= 30;
    const tooLong = candidate.length >= 300;
    if (tooShort || tooLong) {
      continue;
    }

    picked.push(candidate);
    if (picked.length === 3) {
      break;
    }
  }

  return picked.join(" ");
}

/** Escapes regular-expression metacharacters so `literal` is matched verbatim. */
function escapeRegExp(literal: string): string {
  return literal.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/**
 * Matches any AI_KEYWORDS entry as a whole word or phrase, case-insensitively,
 * with an optional plural "s" ("LLMs", "automations").
 *
 * Whole-word matching matters because several keywords are very short: as
 * plain substrings "ai" is found in "Retail" and "maintain", and "llm" in
 * "Fulfillment", which would flag unrelated text as an AI signal.
 */
const AI_KEYWORD_PATTERN = new RegExp(
  `\\b(?:${AI_KEYWORDS.map(escapeRegExp).join("|")})s?\\b`,
  "i"
);

/**
 * Heuristically extracts job postings from the visible text of a careers page.
 *
 * Only the first 30 lines longer than 5 characters (after trimming) are
 * considered; of those, lines of 2–8 words are taken to be job titles. At most
 * 15 postings are returned.
 *
 * @param text Visible text of the careers page.
 * @param sourceUrl URL of that page; recorded as the `url` of every posting.
 * @returns Postings in page order, each flagged with whether its title
 *          mentions an AI keyword.
 */
function extractJobPostings(text: string, sourceUrl: string): JobPosting[] {
  const candidates = text
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line.length > 5)
    .slice(0, 30);

  const postings: JobPosting[] = [];
  for (const title of candidates) {
    // Heuristic: job titles are usually 2-8 words
    const wordCount = title.split(/\s+/).length;
    if (wordCount < 2 || wordCount > 8) {
      continue;
    }
    postings.push({
      title,
      url: sourceUrl,
      hasAiSignal: AI_KEYWORD_PATTERN.test(title),
    });
  }

  return postings.slice(0, 15);
}

/**
 * Reports whether a scraped company shows any AI / automation interest.
 *
 * @param company Record produced by the scraper.
 * @returns True when at least one job posting is flagged `hasAiSignal`, or
 *          when the homepage text mentions an AI keyword as a whole word.
 */
export function hasAiSignals(company: ScrapedCompany): boolean {
  return (
    company.jobPostings.some((job) => job.hasAiSignal) ||
    AI_KEYWORD_PATTERN.test(company.websiteText)
  );
}