clienttarget-python / src /discovery /lib /social-finder.ts
iDevBuddy
feat: Phase 1 — AI Client Acquisition System
bd28470
/**
* Social Profile Finder
*
* Finds company + decision-maker social profiles:
* - Instagram (business account)
* - Facebook (business page)
* - Twitter/X
* - YouTube
*
* Two sources:
* 1. Website footer/header scraping (most reliable)
* 2. Google search fallback
*
* Phase 2 uses these for multi-channel outreach.
*/
import { chromium } from "playwright";
import { playwrightLimiter } from "../../shared/utils/rate-limiter";
import { serperLimiter } from "../../shared/utils/rate-limiter";
import { logger } from "../../shared/utils/logger";
import axios from "axios";
import { getEnv } from "../../shared/config/env";
/**
 * Social profile URLs discovered for one company.
 *
 * Each platform field holds a profile URL, or `null` when no profile was
 * found for that platform.
 */
export interface SocialProfiles {
/** Instagram profile URL, or `null` if none was found. */
instagram: string | null;
/** Facebook page URL, or `null` if none was found. */
facebook: string | null;
/** Twitter/X profile URL, or `null` if none was found. */
twitter: string | null;
/** YouTube channel URL, or `null` if none was found. */
youtube: string | null;
/** Which discovery method(s) the result is attributed to. */
source: "website" | "google" | "mixed";
}
/**
 * Find all social profiles for a company.
 *
 * Method 1 runs first (links on the company website), then a Google search
 * fills in any platform that is still missing.
 *
 * @param domain - Company website domain, used for scraping and for
 *   validating search results.
 * @param companyName - Company name used to build and validate search queries.
 * @param websiteHtml - Already-fetched website HTML. When provided (and
 *   non-empty) it is parsed directly instead of launching a browser.
 * @returns The discovered profiles. `source` is `"website"` when every hit
 *   came from the website (or nothing was found at all), `"google"` when every
 *   hit came from the search fallback, and `"mixed"` when both contributed.
 */
export async function findSocialProfiles(
  domain: string,
  companyName: string,
  websiteHtml?: string
): Promise<SocialProfiles> {
  const profiles: SocialProfiles = {
    instagram: null,
    facebook: null,
    twitter: null,
    youtube: null,
    source: "website",
  };

  const countFound = (): number =>
    [profiles.instagram, profiles.facebook, profiles.twitter, profiles.youtube]
      .filter(Boolean).length;

  // ── Method 1: Extract from website HTML ────────────────────
  if (websiteHtml) {
    extractFromHtml(websiteHtml, profiles);
  } else {
    // Scrape website specifically for social links
    await scrapeWebsiteForSocials(domain, profiles);
  }
  const foundOnWebsite = countFound();

  // ── Method 2: Google search for missing profiles ───────────
  const missing = getMissing(profiles);
  if (missing.length > 0) {
    await searchGoogleForSocials(companyName, domain, profiles, missing);
  }

  // The search step only touches platforms that were missing, so any increase
  // in the hit count is attributable to Google.
  const found = countFound();
  if (found > foundOnWebsite) {
    profiles.source = foundOnWebsite > 0 ? "mixed" : "google";
  }

  logger.info({ domain, found }, "Social profiles discovered");
  return profiles;
}
// ─── Method 1: HTML extraction ──────────────────────────────
/**
 * Per-platform patterns for profile URLs embedded in arbitrary text.
 *
 * All patterns carry the `g` flag so `String.prototype.match` returns every
 * occurrence as a plain array of matched strings.
 */
const SOCIAL_PATTERNS = {
  instagram: /https?:\/\/(www\.)?instagram\.com\/[a-zA-Z0-9._]+/gi,
  // Page URLs may contain hyphens/underscores (e.g. /Acme-Corp-123456); without
  // them in the class the match would be cut off at the first hyphen.
  facebook: /https?:\/\/(www\.)?(facebook|fb)\.com\/[a-zA-Z0-9._-]+/gi,
  twitter: /https?:\/\/(www\.)?(twitter|x)\.com\/[a-zA-Z0-9_]+/gi,
  // The slash after channel/c/user is required so that unrelated paths that
  // merely start with "c" (e.g. /creators) are not taken for a channel.
  youtube: /https?:\/\/(www\.)?youtube\.com\/(channel\/|c\/|user\/|@)[\/a-zA-Z0-9._-]+/gi,
};
/**
 * Scan text for social profile links and record them on `profiles`.
 *
 * For each platform in `SOCIAL_PATTERNS`, the first match that cleans to a
 * valid URL and is not a generic link (share/login/etc.) is stored. Earlier
 * generic matches are skipped rather than causing the platform to be dropped,
 * since share buttons often appear before the real profile link.
 *
 * @param html - Text to scan (full HTML or a newline-joined list of hrefs).
 * @param profiles - Result object, mutated in place.
 */
function extractFromHtml(html: string, profiles: SocialProfiles): void {
  for (const [platform, pattern] of Object.entries(SOCIAL_PATTERNS)) {
    // The patterns are global, so match() yields every occurrence (or null).
    const candidates = html.match(pattern) ?? [];
    for (const candidate of candidates) {
      const url = cleanSocialUrl(candidate, platform);
      if (url && !isGenericSocial(url)) {
        (profiles as Record<string, unknown>)[platform] = url;
        break;
      }
    }
  }
}
// ─── Website scrape (if HTML not already available) ──────────
/**
 * Load the company homepage in a headless browser and extract social links
 * from its anchor hrefs.
 *
 * Best-effort: any failure (rate limiter, launch, navigation, timeout) is
 * logged at debug level and swallowed so the caller can fall back to search.
 *
 * @param domain - Domain to visit over HTTPS.
 * @param profiles - Result object, mutated in place with any links found.
 */
async function scrapeWebsiteForSocials(domain: string, profiles: SocialProfiles): Promise<void> {
  try {
    await playwrightLimiter.consume("playwright");
    const browser = await chromium.launch({ headless: true, args: ["--no-sandbox"] });
    try {
      const context = await browser.newContext({
        userAgent: "Mozilla/5.0 (compatible; ResearchBot/1.0)",
      });
      const page = await context.newPage();
      await page.goto(`https://${domain}`, { waitUntil: "domcontentloaded", timeout: 12_000 });
      // Get all link hrefs on the page
      const links = await page.$$eval("a[href]", (anchors) =>
        anchors.map((a) => a.getAttribute("href") ?? "")
      );
      extractFromHtml(links.join("\n"), profiles);
    } finally {
      // Always release the browser process, including when navigation throws
      // or times out. Closing the browser also closes its contexts and pages.
      await browser.close();
    }
  } catch (err) {
    logger.debug({ domain, err }, "Social scrape failed — trying Google");
  }
}
// ─── Method 2: Google search ────────────────────────────────
/**
 * Fill in missing social profiles via Google search (Serper API).
 *
 * One `site:`-restricted query is issued per missing platform. A result is
 * accepted only if its title/snippet mentions a word of the company name
 * (longer than 3 characters) or the domain without its last suffix. Failures
 * for one platform are logged at debug level and do not affect the others.
 *
 * @param companyName - Company name used in the query and for validation.
 * @param domain - Company domain used for validation.
 * @param profiles - Result object, mutated in place.
 * @param missing - Platform keys to search for; unknown keys are skipped.
 */
async function searchGoogleForSocials(
  companyName: string,
  domain: string,
  profiles: SocialProfiles,
  missing: string[]
): Promise<void> {
  const searchMap: Record<string, string> = {
    instagram: `"${companyName}" site:instagram.com`,
    facebook: `"${companyName}" site:facebook.com`,
    twitter: `"${companyName}" site:twitter.com OR site:x.com`,
    youtube: `"${companyName}" site:youtube.com`,
  };

  // Loop-invariant validation inputs. Both are lower-cased because they are
  // compared against lower-cased result text.
  const companyWords = companyName
    .toLowerCase()
    .split(/\s+/)
    .filter((word) => word.length > 3);
  const domainStem = domain.toLowerCase().replace(/\.\w+$/, "");

  for (const platform of missing) {
    const query = searchMap[platform];
    if (!query) {
      continue; // no query defined for this platform key
    }
    try {
      await serperLimiter.consume("serper");
      const env = getEnv();
      const response = await axios.post(
        "https://google.serper.dev/search",
        { q: query, num: 3 },
        {
          headers: {
            "X-API-KEY": env.SERPER_API_KEY,
            "Content-Type": "application/json",
          },
          timeout: 6_000,
        }
      );
      const rawOrganic = response.data?.organic;
      const organic = Array.isArray(rawOrganic) ? rawOrganic : [];
      for (const result of organic) {
        const url = cleanSocialUrl(result.link, platform);
        if (!url || isGenericSocial(url)) {
          continue;
        }
        // Verify it mentions company name or domain in snippet
        const combined = `${result.snippet ?? ""} ${result.title ?? ""}`.toLowerCase();
        const mentionsCompany = companyWords.some((word) => combined.includes(word));
        // An empty stem would match every string, so it must not count.
        const mentionsDomain = domainStem.length > 0 && combined.includes(domainStem);
        if (mentionsCompany || mentionsDomain) {
          (profiles as Record<string, unknown>)[platform] = url;
          break;
        }
      }
    } catch (err) {
      logger.debug({ platform, err }, "Social Google search failed — skipping");
    }
  }
}
// ─── Helpers ─────────────────────────────────────────────────
/**
 * List the platform keys that have no profile recorded yet.
 *
 * @param profiles - Current discovery result.
 * @returns Platform names whose value is falsy, in the fixed order
 *   instagram, facebook, twitter, youtube.
 */
function getMissing(profiles: SocialProfiles): string[] {
  const lookup = profiles as Record<string, unknown>;
  const absent: string[] = [];
  for (const platform of ["instagram", "facebook", "twitter", "youtube"]) {
    if (!lookup[platform]) {
      absent.push(platform);
    }
  }
  return absent;
}
/**
 * Normalize a candidate social URL to `protocol//hostname/path`.
 *
 * Query string, fragment, port and trailing slashes are dropped.
 *
 * @param url - Candidate URL (may be malformed).
 * @param platform - Platform key. For the known platforms (instagram,
 *   facebook, twitter, youtube) the host must be that platform's domain or a
 *   subdomain of it; other keys skip the host check.
 * @returns The cleaned URL, or `null` when the input is not a valid http(s)
 *   URL, is on the wrong host for a known platform, or has no path (a bare
 *   site root cannot identify a profile).
 */
function cleanSocialUrl(url: string, platform: string): string | null {
  const platformHosts = new Map<string, readonly string[]>([
    ["instagram", ["instagram.com"]],
    ["facebook", ["facebook.com", "fb.com"]],
    ["twitter", ["twitter.com", "x.com"]],
    ["youtube", ["youtube.com"]],
  ]);

  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return null;
  }

  if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
    return null;
  }

  const host = parsed.hostname;
  const allowedHosts = platformHosts.get(platform);
  if (
    allowedHosts &&
    // The leading dot keeps look-alikes such as "netflix.com" from passing as "x.com".
    !allowedHosts.some((base) => host === base || host.endsWith(`.${base}`))
  ) {
    return null;
  }

  // Remove query params and fragments (only protocol, host and path are kept)
  const path = parsed.pathname.replace(/\/+$/, "");
  if (path === "") {
    return null;
  }
  return `${parsed.protocol}//${host}${path}`;
}
/**
 * Tell whether a social URL points at a generic platform page (share dialog,
 * login, help, about, policies) rather than an actual company profile.
 *
 * Whole path segments and host labels are compared, so handles that merely
 * start with a generic word (e.g. `/aboutface_studio`) are not rejected.
 * A `.php` suffix on a segment is ignored (`/sharer.php` counts as `sharer`).
 *
 * @param url - URL to inspect; a non-absolute string is treated as a path.
 * @returns `true` when any path segment or host label is a generic name.
 */
function isGenericSocial(url: string): boolean {
  // Filter out generic profile links (not actual company pages)
  const genericNames = new Set([
    "share", "sharer", "login", "signup", "help", "about", "policies",
  ]);

  let hostLabels: string[] = [];
  let pathText = url;
  try {
    const parsed = new URL(url);
    // Covers subdomains such as help.instagram.com or about.fb.com.
    hostLabels = parsed.hostname.toLowerCase().split(".");
    pathText = parsed.pathname;
  } catch {
    // Not an absolute URL: fall back to treating the input as a path,
    // dropping any query string or fragment.
    pathText = url.split(/[?#]/)[0] ?? "";
  }

  const segments = pathText
    .toLowerCase()
    .split("/")
    .filter((segment) => segment.length > 0)
    .map((segment) => segment.replace(/\.php$/, ""));

  return [...hostLabels, ...segments].some((part) => genericNames.has(part));
}