Spaces:
Running
Running
| /** | |
| * Social Profile Finder | |
| * | |
| * Finds company + decision-maker social profiles: | |
| * - Instagram (business account) | |
| * - Facebook (business page) | |
| * - Twitter/X | |
| * - YouTube | |
| * | |
| * Two sources: | |
| * 1. Website footer/header scraping (most reliable) | |
| * 2. Google search fallback | |
| * | |
| * Phase 2 uses these for multi-channel outreach. | |
| */ | |
| import { chromium } from "playwright"; | |
| import { playwrightLimiter } from "../../shared/utils/rate-limiter"; | |
| import { serperLimiter } from "../../shared/utils/rate-limiter"; | |
| import { logger } from "../../shared/utils/logger"; | |
| import axios from "axios"; | |
| import { getEnv } from "../../shared/config/env"; | |
/**
 * Social profile URLs discovered for one company.
 *
 * Each platform field holds the cleaned profile URL, or `null` when no
 * profile was found for that platform.
 */
export interface SocialProfiles {
  /** Instagram profile URL, or `null` if none was found. */
  instagram: string | null;
  /** Facebook page URL, or `null` if none was found. */
  facebook: string | null;
  /** Twitter/X profile URL, or `null` if none was found. */
  twitter: string | null;
  /** YouTube channel URL, or `null` if none was found. */
  youtube: string | null;
  /** Which discovery method(s) produced the URLs above. */
  source:
    | "website"
    | "google"
    | "mixed";
}
/**
 * Find all social profiles for a company.
 *
 * Method 1 runs first: social links are extracted from `websiteHtml` when it
 * is supplied, otherwise the site at `domain` is scraped for them. Method 2
 * (Google search) is then used only for the platforms still missing.
 *
 * @param domain - Company website domain (no scheme), e.g. `acme.com`.
 * @param companyName - Company name used to build and verify search queries.
 * @param websiteHtml - Already-fetched page HTML; when given, no scrape is done.
 * @returns The discovered profiles. `source` is `"website"` when the search
 *   step contributed nothing, `"google"` when every discovered profile came
 *   from search, and `"mixed"` when both methods contributed.
 */
export async function findSocialProfiles(
  domain: string,
  companyName: string,
  websiteHtml?: string
): Promise<SocialProfiles> {
  const profiles: SocialProfiles = {
    instagram: null,
    facebook: null,
    twitter: null,
    youtube: null,
    source: "website",
  };

  const countFound = (): number =>
    [profiles.instagram, profiles.facebook, profiles.twitter, profiles.youtube]
      .filter(Boolean).length;

  // ── Method 1: Extract from website HTML ──
  if (websiteHtml) {
    extractFromHtml(websiteHtml, profiles);
  } else {
    // Scrape website specifically for social links
    await scrapeWebsiteForSocials(domain, profiles);
  }
  const foundOnWebsite = countFound();

  // ── Method 2: Google search for missing profiles ──
  const missing = getMissing(profiles);
  if (missing.length > 0) {
    await searchGoogleForSocials(companyName, domain, profiles, missing);
    const foundViaGoogle = countFound() - foundOnWebsite;
    if (foundViaGoogle > 0) {
      // Label the result "google" when the website contributed nothing,
      // rather than reporting it as "mixed".
      profiles.source = foundOnWebsite > 0 ? "mixed" : "google";
    }
  }

  const found = countFound();
  logger.info({ domain, found }, "Social profiles discovered");
  return profiles;
}
// ─── Method 1: HTML extraction ──────────────────────────────
/**
 * Per-platform patterns that match absolute profile URLs inside arbitrary text.
 *
 * All patterns carry the `g` flag so `String.prototype.match` returns every
 * occurrence. Because global regexes are stateful under `test`/`exec`, use
 * them only through `match`/`matchAll`.
 */
const SOCIAL_PATTERNS = {
  instagram: /https?:\/\/(www\.)?instagram\.com\/[a-zA-Z0-9._]+/gi,
  // Page slugs such as "Acme-Corp-1234567890" contain hyphens; without "-" in
  // the class the match would be cut short at the first hyphen.
  facebook: /https?:\/\/(www\.)?(facebook|fb)\.com\/[a-zA-Z0-9._-]+/gi,
  twitter: /https?:\/\/(www\.)?(twitter|x)\.com\/[a-zA-Z0-9_]+/gi,
  // "user/<name>" is the legacy channel URL form, alongside channel/, c/ and @handle.
  youtube: /https?:\/\/(www\.)?youtube\.com\/(channel|c|user|@)[\/a-zA-Z0-9._-]+/gi,
};
/**
 * Fill `profiles` with the social URLs found in `html`.
 *
 * For each platform in `SOCIAL_PATTERNS`, every match is considered in
 * document order and the first one that cleans successfully and is not a
 * generic link (share/login/…) is stored. Scanning past the first match
 * matters because share buttons often precede the real profile link.
 *
 * @param html - Text to scan (full HTML or a newline-joined list of hrefs).
 * @param profiles - Mutated in place; a platform is overwritten only when a
 *   usable URL is found for it.
 */
function extractFromHtml(html: string, profiles: SocialProfiles): void {
  type PlatformKey = keyof typeof SOCIAL_PATTERNS;

  for (const platform of Object.keys(SOCIAL_PATTERNS) as PlatformKey[]) {
    const candidates = html.match(SOCIAL_PATTERNS[platform]) ?? [];
    for (const candidate of candidates) {
      const url = cleanSocialUrl(candidate, platform);
      if (url && !isGenericSocial(url)) {
        profiles[platform] = url;
        break;
      }
    }
  }
}
// ─── Website scrape (if HTML not already available) ──────────
/**
 * Load `https://<domain>` in headless Chromium and extract social links from
 * the `href` attributes of its anchors.
 *
 * Best-effort: any failure (rate limiter, launch, navigation timeout, …) is
 * logged at debug level and swallowed so the caller can fall back to search.
 *
 * @param domain - Website domain without scheme.
 * @param profiles - Mutated in place with whatever profiles are found.
 */
async function scrapeWebsiteForSocials(domain: string, profiles: SocialProfiles): Promise<void> {
  try {
    await playwrightLimiter.consume("playwright");
    const browser = await chromium.launch({ headless: true, args: ["--no-sandbox"] });
    try {
      const context = await browser.newContext({
        userAgent: "Mozilla/5.0 (compatible; ResearchBot/1.0)",
      });
      const page = await context.newPage();
      await page.goto(`https://${domain}`, { waitUntil: "domcontentloaded", timeout: 12_000 });

      // Get all link hrefs on the page
      const links = await page.$$eval("a[href]", (anchors) =>
        anchors.map((a) => a.getAttribute("href") ?? "")
      );
      extractFromHtml(links.join("\n"), profiles);
    } finally {
      // Always release the browser process, including when navigation or
      // evaluation throws; closing the browser also closes its contexts/pages.
      await browser.close();
    }
  } catch (err) {
    logger.debug({ domain, err }, "Social scrape failed — trying Google");
  }
}
// ─── Method 2: Google search ────────────────────────────────
/**
 * Look up the platforms listed in `missing` through the Serper search API and
 * store the first plausible result for each one in `profiles`.
 *
 * A result is accepted only when its title or snippet mentions a word of the
 * company name (longer than three characters) or the domain stem. Failures
 * for one platform are logged at debug level and do not affect the others.
 *
 * @param companyName - Used in the quoted search query and for verification.
 * @param domain - Company domain; its stem (without leading `www.` and the
 *   final label) is used for verification. Multi-part suffixes such as
 *   `.co.uk` are not specially handled.
 * @param profiles - Mutated in place.
 * @param missing - Platform keys to search for; unknown keys are skipped.
 */
async function searchGoogleForSocials(
  companyName: string,
  domain: string,
  profiles: SocialProfiles,
  missing: string[]
): Promise<void> {
  type PlatformKey = "instagram" | "facebook" | "twitter" | "youtube";
  type SerperOrganicResult = { link?: string; title?: string; snippet?: string };

  const searchMap: Record<PlatformKey, string> = {
    instagram: `"${companyName}" site:instagram.com`,
    facebook: `"${companyName}" site:facebook.com`,
    twitter: `"${companyName}" site:twitter.com OR site:x.com`,
    youtube: `"${companyName}" site:youtube.com`,
  };
  const isPlatformKey = (value: string): value is PlatformKey =>
    Object.prototype.hasOwnProperty.call(searchMap, value);

  // Loop-invariant verification tokens, all lower-cased to match the
  // lower-cased title/snippet text they are compared against.
  const companyWords = companyName
    .toLowerCase()
    .split(/\s+/)
    .filter((word) => word.length > 3);
  const domainStem = domain
    .toLowerCase()
    .replace(/^www\./, "")
    .replace(/\.\w+$/, "");

  for (const platform of missing) {
    // Without this guard an unknown key would send `q: undefined`.
    if (!isPlatformKey(platform)) continue;

    try {
      await serperLimiter.consume("serper");
      const env = getEnv();
      const response = await axios.post<{ organic?: SerperOrganicResult[] }>(
        "https://google.serper.dev/search",
        { q: searchMap[platform], num: 3 },
        {
          headers: {
            "X-API-KEY": env.SERPER_API_KEY,
            "Content-Type": "application/json",
          },
          timeout: 6_000,
        }
      );

      const organic = response.data?.organic ?? [];
      for (const result of organic) {
        if (typeof result.link !== "string") continue;
        const url = cleanSocialUrl(result.link, platform);
        if (!url || isGenericSocial(url)) continue;

        // Verify it mentions company name or domain in snippet/title
        const combined = `${result.snippet ?? ""} ${result.title ?? ""}`.toLowerCase();
        const hasCompany = companyWords.some((word) => combined.includes(word));
        // An empty stem must not count: "".includes-style checks are always true.
        const hasDomain = domainStem.length > 0 && combined.includes(domainStem);
        if (hasCompany || hasDomain) {
          profiles[platform] = url;
          break;
        }
      }
    } catch (err) {
      logger.debug({ platform, err }, "Social Google search failed — skipping");
    }
  }
}
// ─── Helpers ─────────────────────────────────────────────────
/**
 * List the platforms that have no profile yet.
 *
 * @param profiles - Current discovery state.
 * @returns Platform keys, in the order instagram, facebook, twitter, youtube,
 *   whose value in `profiles` is falsy.
 */
function getMissing(profiles: SocialProfiles): string[] {
  const platforms = ["instagram", "facebook", "twitter", "youtube"] as const;
  const absent: string[] = [];
  for (const platform of platforms) {
    if (!profiles[platform]) {
      absent.push(platform);
    }
  }
  return absent;
}
/**
 * Normalize a candidate social URL to `protocol//host/path`.
 *
 * Query string, fragment, port and trailing slashes are dropped.
 *
 * @param url - Candidate absolute URL.
 * @param platform - Platform key. For `instagram`, `facebook`, `twitter` and
 *   `youtube` the host must be that platform's domain or a subdomain of it;
 *   any other key skips the host check.
 * @returns The cleaned URL, or `null` when `url` is not a valid http(s) URL,
 *   is hosted elsewhere than the given known platform, or has no path (a bare
 *   platform home page is not a profile).
 */
function cleanSocialUrl(url: string, platform: string): string | null {
  const platformDomains = new Map<string, readonly string[]>([
    ["instagram", ["instagram.com"]],
    ["facebook", ["facebook.com", "fb.com"]],
    ["twitter", ["twitter.com", "x.com"]],
    ["youtube", ["youtube.com"]],
  ]);

  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return null;
  }

  if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
    return null;
  }

  const host = parsed.hostname.toLowerCase();
  const allowedDomains = platformDomains.get(platform);
  if (
    allowedDomains &&
    !allowedDomains.some((d) => host === d || host.endsWith(`.${d}`))
  ) {
    return null;
  }

  // Remove query params and fragments; strip every trailing slash.
  const path = parsed.pathname.replace(/\/+$/, "");
  if (path === "") {
    return null;
  }
  return `${parsed.protocol}//${parsed.hostname}${path}`;
}
/**
 * Tell whether a social URL points at a generic platform page (share dialog,
 * login, help, …) rather than an actual company profile.
 *
 * Matching is done on whole path segments, case-insensitively and ignoring a
 * trailing `.php`/`.html`/`.aspx`-style extension, so handles that merely
 * start with a generic word (e.g. `/aboutyou`, `/helpscout`) are not rejected.
 *
 * @param url - Absolute URL; a string that does not parse as a URL is treated
 *   as a bare path.
 * @returns `true` when any path segment is a known generic page name.
 */
function isGenericSocial(url: string): boolean {
  // Filter out generic profile links (not actual company pages)
  const genericSegments = new Set([
    "share", "sharer", "login", "signup", "help", "about", "policies",
  ]);

  let path: string;
  try {
    path = new URL(url).pathname;
  } catch {
    path = url;
  }

  return path
    .toLowerCase()
    .split("/")
    .some((segment) => genericSegments.has(segment.replace(/\.(php|html?|aspx?)$/, "")));
}