/**
 * Social Profile Finder
 *
 * Finds company + decision-maker social profiles:
 *   - Instagram (business account)
 *   - Facebook (business page)
 *   - Twitter/X
 *   - YouTube
 *
 * Two sources:
 *   1. Website footer/header scraping (most reliable)
 *   2. Google search fallback
 *
 * Phase 2 uses these for multi-channel outreach.
 */

import { chromium } from "playwright";
import { playwrightLimiter } from "../../shared/utils/rate-limiter";
import { serperLimiter } from "../../shared/utils/rate-limiter";
import { logger } from "../../shared/utils/logger";
import axios from "axios";
import { getEnv } from "../../shared/config/env";

/** Profile URLs discovered for one company, plus where they came from. */
export interface SocialProfiles {
  instagram: string | null;
  facebook: string | null;
  twitter: string | null;
  youtube: string | null;
  /**
   * "website" — every found profile came from the company website (also the
   *             default when nothing was found at all);
   * "google"  — every found profile came from the search fallback;
   * "mixed"   — both sources contributed.
   */
  source: "website" | "google" | "mixed";
}

/** The keys of {@link SocialProfiles} that hold a profile URL. */
type SocialPlatform = Exclude<keyof SocialProfiles, "source">;

/** Platforms in the order they are processed everywhere in this module. */
const SOCIAL_PLATFORMS: readonly SocialPlatform[] = [
  "instagram",
  "facebook",
  "twitter",
  "youtube",
];

/** Subset of a Serper organic search result that this module reads. */
interface SerperOrganicResult {
  link?: string;
  title?: string;
  snippet?: string;
}

/**
 * Find all social profiles for a company.
 * Method 1 first (website scrape), then Google fills gaps.
 *
 * @param domain       Company domain; used as `https://${domain}` when scraping.
 * @param companyName  Company name used to build and verify search results.
 * @param websiteHtml  Already-fetched website HTML. When provided (and
 *                     non-empty) no browser is launched.
 * @returns The discovered profiles; fields stay `null` when nothing was found.
 */
export async function findSocialProfiles(
  domain: string,
  companyName: string,
  websiteHtml?: string
): Promise<SocialProfiles> {
  const profiles: SocialProfiles = {
    instagram: null,
    facebook: null,
    twitter: null,
    youtube: null,
    source: "website",
  };

  // ── Method 1: Extract from website HTML ────────────────────
  if (websiteHtml) {
    extractFromHtml(websiteHtml, profiles);
  } else {
    // Scrape website specifically for social links
    await scrapeWebsiteForSocials(domain, profiles);
  }

  // ── Method 2: Google search for missing profiles ───────────
  const missing = getMissing(profiles);
  const foundOnWebsite = missing.length < SOCIAL_PLATFORMS.length;

  if (missing.length > 0) {
    await searchGoogleForSocials(companyName, domain, profiles, missing);

    // Only platforms that were missing before the search can have been
    // filled by it, so checking those tells us whether Google contributed.
    const foundViaGoogle = missing.some((platform) => Boolean(profiles[platform]));
    if (foundViaGoogle) {
      profiles.source = foundOnWebsite ? "mixed" : "google";
    }
  }

  const found = SOCIAL_PLATFORMS.filter((platform) => Boolean(profiles[platform])).length;
  logger.info({ domain, found }, "Social profiles discovered");

  return profiles;
}

// ─── Method 1: HTML extraction ──────────────────────────────

const SOCIAL_PATTERNS: Record<SocialPlatform, RegExp> = {
  instagram: /https?:\/\/(www\.)?instagram\.com\/[a-zA-Z0-9._]+/gi,
  facebook: /https?:\/\/(www\.)?(facebook|fb)\.com\/[a-zA-Z0-9.]+/gi,
  twitter: /https?:\/\/(www\.)?(twitter|x)\.com\/[a-zA-Z0-9_]+/gi,
  youtube: /https?:\/\/(www\.)?youtube\.com\/(channel|c|@)[\/a-zA-Z0-9._-]+/gi,
};

/**
 * Scan `html` for social links and store the first usable one per platform
 * on `profiles` (mutated in place). Platforms without a usable link are left
 * untouched.
 */
function extractFromHtml(html: string, profiles: SocialProfiles): void {
  for (const platform of SOCIAL_PLATFORMS) {
    const matches = html.match(SOCIAL_PATTERNS[platform]) ?? [];

    // Walk every match rather than only the first: share buttons and
    // tracking URLs frequently precede the real profile link in the markup.
    for (const match of matches) {
      const url = cleanSocialUrl(match, platform);
      if (url && !isGenericSocial(url)) {
        profiles[platform] = url;
        break;
      }
    }
  }
}

// ─── Website scrape (if HTML not already available) ──────────

/**
 * Load the company homepage in headless Chromium, collect every anchor
 * `href`, and run them through {@link extractFromHtml}.
 *
 * Best-effort: any failure is logged at debug level and swallowed so the
 * caller can fall back to the search method.
 */
async function scrapeWebsiteForSocials(domain: string, profiles: SocialProfiles): Promise<void> {
  try {
    await playwrightLimiter.consume("playwright");

    const browser = await chromium.launch({ headless: true, args: ["--no-sandbox"] });
    try {
      const context = await browser.newContext({
        userAgent: "Mozilla/5.0 (compatible; ResearchBot/1.0)",
      });
      const page = await context.newPage();

      await page.goto(`https://${domain}`, { waitUntil: "domcontentloaded", timeout: 12_000 });

      // Get all link hrefs on the page
      const links = await page.$$eval("a[href]", (anchors) =>
        anchors.map((a) => a.getAttribute("href") ?? "")
      );

      extractFromHtml(links.join("\n"), profiles);
    } finally {
      // Always release the browser process, including when navigation or
      // evaluation throws. Closing the browser also closes its contexts/pages.
      await browser.close();
    }
  } catch (err) {
    logger.debug({ domain, err }, "Social scrape failed — trying Google");
  }
}

// ─── Method 2: Google search ────────────────────────────────

/**
 * Query Serper (Google search API) once per missing platform and store the
 * first result that looks like the company's own profile on `profiles`
 * (mutated in place).
 *
 * A result is accepted only when its title/snippet mentions a word of the
 * company name longer than three characters, or the domain without its final
 * label. Failures for one platform are logged at debug level and do not stop
 * the remaining platforms.
 */
async function searchGoogleForSocials(
  companyName: string,
  domain: string,
  profiles: SocialProfiles,
  missing: SocialPlatform[]
): Promise<void> {
  const searchMap: Record<SocialPlatform, string> = {
    instagram: `"${companyName}" site:instagram.com`,
    facebook: `"${companyName}" site:facebook.com`,
    twitter: `"${companyName}" site:twitter.com OR site:x.com`,
    youtube: `"${companyName}" site:youtube.com`,
  };

  // Loop-invariant matching inputs. Short words ("the", "inc", ...) are
  // dropped because they would match almost any snippet.
  const companyWords = companyName
    .toLowerCase()
    .split(/\s+/)
    .filter((word) => word.length > 3);
  // Lowercased because it is compared against lowercased text. An empty stem
  // must be ignored: `includes("")` is always true and would accept anything.
  const domainStem = domain.toLowerCase().replace(/\.\w+$/, "");

  for (const platform of missing) {
    try {
      await serperLimiter.consume("serper");
      const env = getEnv();

      const response = await axios.post<{ organic?: SerperOrganicResult[] }>(
        "https://google.serper.dev/search",
        { q: searchMap[platform], num: 3 },
        {
          headers: {
            "X-API-KEY": env.SERPER_API_KEY,
            "Content-Type": "application/json",
          },
          timeout: 6_000,
        }
      );

      const organic = response.data?.organic;
      if (!Array.isArray(organic)) continue;

      for (const result of organic) {
        if (typeof result.link !== "string") continue;

        const url = cleanSocialUrl(result.link, platform);
        if (!url || isGenericSocial(url)) continue;

        // Verify it mentions company name or domain in snippet
        const snippet = (result.snippet ?? "").toLowerCase();
        const title = (result.title ?? "").toLowerCase();
        const combined = `${snippet} ${title}`;

        const hasCompany = companyWords.some((word) => combined.includes(word));
        const hasDomain = domainStem.length > 0 && combined.includes(domainStem);

        if (hasCompany || hasDomain) {
          profiles[platform] = url;
          break;
        }
      }
    } catch (err) {
      logger.debug({ platform, err }, "Social Google search failed — skipping");
    }
  }
}

// ─── Helpers ─────────────────────────────────────────────────

/** Hosts (subdomains included) on which each platform's profiles live. */
const PLATFORM_HOSTS: ReadonlyMap<string, readonly string[]> = new Map<string, readonly string[]>([
  ["instagram", ["instagram.com"]],
  ["facebook", ["facebook.com", "fb.com"]],
  ["twitter", ["twitter.com", "x.com"]],
  ["youtube", ["youtube.com"]],
]);

/**
 * Path segments that identify platform functionality (sharing, auth, help,
 * tracking, individual videos, ...) rather than an account page.
 */
const GENERIC_PATH_SEGMENTS: ReadonlySet<string> = new Set([
  "share",
  "sharer",
  "login",
  "signup",
  "help",
  "about",
  "policies",
  // Reserved paths that lose their query string in cleanSocialUrl and would
  // otherwise be mistaken for a profile, e.g. Twitter's /intent/tweet share
  // links, Facebook's /tr tracking pixel, YouTube /watch video links.
  "intent",
  "hashtag",
  "search",
  "watch",
  "playlist",
  "results",
  "plugins",
  "dialog",
  "tr",
]);

/** List the platforms for which `profiles` has no URL yet. */
function getMissing(profiles: SocialProfiles): SocialPlatform[] {
  return SOCIAL_PLATFORMS.filter((platform) => !profiles[platform]);
}

/**
 * Normalise a social URL to `protocol//host/path`, dropping the query string,
 * the fragment and one trailing slash.
 *
 * @returns The normalised URL, or `null` when `url` cannot be parsed or its
 *          host does not belong to `platform`. Platform names without a known
 *          host list skip the host check.
 */
function cleanSocialUrl(url: string, platform: string): string | null {
  try {
    const parsed = new URL(url);

    const allowedHosts = PLATFORM_HOSTS.get(platform);
    if (allowedHosts) {
      const onPlatform = allowedHosts.some(
        (host) => parsed.hostname === host || parsed.hostname.endsWith(`.${host}`)
      );
      if (!onPlatform) return null;
    }

    // Remove query params and fragments
    return `${parsed.protocol}//${parsed.hostname}${parsed.pathname.replace(/\/$/, "")}`;
  } catch {
    return null;
  }
}

/**
 * Tell whether `url` is a generic platform link (share dialog, login page,
 * site root, ...) instead of an actual company page.
 *
 * Whole path segments are compared, so handles that merely start with a
 * reserved word (e.g. `/helpinghands`) are not rejected. A trailing `.php`
 * is ignored so `/sharer.php` and `/login.php` are still caught.
 */
function isGenericSocial(url: string): boolean {
  let pathname: string;
  try {
    pathname = new URL(url).pathname;
  } catch {
    // Not a parseable URL, so it cannot be a usable profile link.
    return true;
  }

  const segments = pathname
    .split("/")
    .filter(Boolean)
    .map((segment) => segment.toLowerCase().replace(/\.php$/, ""));

  // The bare site root carries no account information.
  if (segments.length === 0) return true;

  return segments.some((segment) => GENERIC_PATH_SEGMENTS.has(segment));
}