Spaces:

dexakif
/

clienttarget-python

Running

clienttarget-python / src /discovery /lib /social-finder.ts

iDevBuddy

feat: Phase 1 — AI Client Acquisition System

bd28470 5 days ago

6.81 kB

	/**
	* Social Profile Finder
	*
	* Finds company + decision-maker social profiles:
	* - Instagram (business account)
	* - Facebook (business page)
	* - Twitter/X
	* - YouTube
	*
	* Two sources:
	* 1. Website footer/header scraping (most reliable)
	* 2. Google search fallback
	*
	* Phase 2 uses these for multi-channel outreach.
	*/

	import { chromium } from "playwright";
	import { playwrightLimiter } from "../../shared/utils/rate-limiter";
	import { serperLimiter } from "../../shared/utils/rate-limiter";
	import { logger } from "../../shared/utils/logger";
	import axios from "axios";
	import { getEnv } from "../../shared/config/env";

	/**
	 * Social profile URLs discovered for one company.
	 *
	 * Each platform field holds a cleaned profile URL, or `null` when no
	 * profile was found for that platform.
	 */
	export interface SocialProfiles {
	  /** Instagram profile URL, or `null` if not found. */
	  instagram: string | null;
	  /** Facebook page URL, or `null` if not found. */
	  facebook: string | null;
	  /** Twitter/X profile URL, or `null` if not found. */
	  twitter: string | null;
	  /** YouTube channel URL, or `null` if not found. */
	  youtube: string | null;
	  /** Where the profiles came from: the website, Google search, or both. */
	  source: "website" | "google" | "mixed";
	}

	/**
	 * Find all social profiles for a company.
	 *
	 * Method 1 runs first: the supplied `websiteHtml` is scanned, or, when no
	 * HTML is supplied, the website is scraped. Method 2 (Google search) then
	 * tries to fill in whichever platforms are still missing.
	 *
	 * @param domain - Company domain; used for scraping and result matching.
	 * @param companyName - Company name used to build and verify Google queries.
	 * @param websiteHtml - Already-fetched website HTML; skips the scrape when non-empty.
	 * @returns The discovered profiles. `source` is `"website"` when Google
	 *   contributed nothing, `"google"` when every discovered profile came from
	 *   Google, and `"mixed"` when both methods contributed.
	 */
	export async function findSocialProfiles(
	  domain: string,
	  companyName: string,
	  websiteHtml?: string
	): Promise<SocialProfiles> {
	  const profiles: SocialProfiles = {
	    instagram: null,
	    facebook: null,
	    twitter: null,
	    youtube: null,
	    source: "website",
	  };

	  // Number of platforms that currently have a profile URL.
	  const countFound = (): number =>
	    [profiles.instagram, profiles.facebook, profiles.twitter, profiles.youtube]
	      .filter(Boolean).length;

	  // ── Method 1: Extract from website HTML ────────────────────
	  if (websiteHtml) {
	    extractFromHtml(websiteHtml, profiles);
	  } else {
	    // Scrape website specifically for social links
	    await scrapeWebsiteForSocials(domain, profiles);
	  }
	  const foundOnWebsite = countFound();

	  // ── Method 2: Google search for missing profiles ───────────
	  const missing = getMissing(profiles);
	  if (missing.length > 0) {
	    await searchGoogleForSocials(companyName, domain, profiles, missing);

	    // Google only ever fills gaps, so any increase in the count came from it.
	    const foundViaGoogle = countFound() - foundOnWebsite;
	    if (foundViaGoogle > 0) {
	      profiles.source = foundOnWebsite > 0 ? "mixed" : "google";
	    }
	  }

	  const found = countFound();
	  logger.info({ domain, found }, "Social profiles discovered");

	  return profiles;
	}

	// ─── Method 1: HTML extraction ──────────────────────────────

	/**
	 * One URL pattern per supported platform, keyed by the matching
	 * `SocialProfiles` field name.
	 *
	 * All patterns are global and case-insensitive, so `String.prototype.match`
	 * returns every occurrence found in the scanned text.
	 */
	const SOCIAL_PATTERNS = {
	  instagram: /https?:\/\/(www\.)?instagram\.com\/[a-zA-Z0-9._]+/gi,
	  // Alternation must be an unescaped `|`; `\|` would match a literal pipe.
	  facebook: /https?:\/\/(www\.)?(facebook|fb)\.com\/[a-zA-Z0-9.]+/gi,
	  twitter: /https?:\/\/(www\.)?(twitter|x)\.com\/[a-zA-Z0-9_]+/gi,
	  youtube: /https?:\/\/(www\.)?youtube\.com\/(channel|c|@)[\/a-zA-Z0-9._-]+/gi,
	};

	/**
	 * Scan text for social profile URLs and record them on `profiles`.
	 *
	 * For each platform in `SOCIAL_PATTERNS`, every match is tried in document
	 * order and the first one that cleans to a valid, non-generic URL is kept.
	 * Trying all matches matters because pages often place share or login
	 * links ahead of the real profile link.
	 *
	 * @param html - Text to scan (page HTML or a newline-joined list of hrefs).
	 * @param profiles - Result object, mutated in place; a platform field is
	 *   only written when a usable URL is found.
	 */
	function extractFromHtml(html: string, profiles: SocialProfiles): void {
	  for (const [platform, pattern] of Object.entries(SOCIAL_PATTERNS)) {
	    const matches = html.match(pattern) ?? [];
	    for (const candidate of matches) {
	      const url = cleanSocialUrl(candidate, platform);
	      if (url && !isGenericSocial(url)) {
	        (profiles as Record<string, unknown>)[platform] = url;
	        break; // first usable link wins for this platform
	      }
	    }
	  }
	}

	// ─── Website scrape (if HTML not already available) ──────────

	/**
	 * Load the company's home page in headless Chromium and extract social
	 * links from the `href` of every anchor on the page.
	 *
	 * Best-effort: any failure (rate limiter, launch, navigation, evaluation)
	 * is logged at debug level and swallowed so the caller can fall back to
	 * Google search.
	 *
	 * @param domain - Bare domain; the page loaded is `https://<domain>`.
	 * @param profiles - Result object, mutated in place via `extractFromHtml`.
	 */
	async function scrapeWebsiteForSocials(domain: string, profiles: SocialProfiles): Promise<void> {
	  try {
	    await playwrightLimiter.consume("playwright");

	    const browser = await chromium.launch({ headless: true, args: ["--no-sandbox"] });
	    try {
	      const context = await browser.newContext({
	        userAgent: "Mozilla/5.0 (compatible; ResearchBot/1.0)",
	      });
	      const page = await context.newPage();

	      await page.goto(`https://${domain}`, { waitUntil: "domcontentloaded", timeout: 12_000 });

	      // Get all link hrefs on the page
	      const links = await page.$$eval("a[href]", (anchors) =>
	        anchors.map((a) => a.getAttribute("href") ?? "")
	      );

	      // One href per line is enough for the URL patterns to match against.
	      const pageHtml = links.join("\n");
	      extractFromHtml(pageHtml, profiles);
	    } finally {
	      // Always release the browser, even when navigation times out;
	      // closing the browser also closes its contexts and pages.
	      await browser.close();
	    }
	  } catch (err) {
	    logger.debug({ domain, err }, "Social scrape failed — trying Google");
	  }
	}

	// ─── Method 2: Google search ────────────────────────────────

	/**
	 * Look up missing social profiles through the Serper Google-search API.
	 *
	 * One query is issued per missing platform. A result is accepted only when
	 * its link cleans to a non-generic URL and its title or snippet mentions a
	 * company-name word longer than three characters, or the domain with its
	 * last label removed. A failed platform lookup is logged at debug level
	 * and skipped; the remaining platforms are still searched.
	 *
	 * @param companyName - Company name, quoted in the query and used for matching.
	 * @param domain - Company domain, used as a secondary match signal.
	 * @param profiles - Result object, mutated in place.
	 * @param missing - Platform keys (`SocialProfiles` field names) to search for.
	 */
	async function searchGoogleForSocials(
	  companyName: string,
	  domain: string,
	  profiles: SocialProfiles,
	  missing: string[]
	): Promise<void> {
	  const searchMap: Record<string, string> = {
	    instagram: `"${companyName}" site:instagram.com`,
	    facebook: `"${companyName}" site:facebook.com`,
	    twitter: `"${companyName}" site:twitter.com OR site:x.com`,
	    youtube: `"${companyName}" site:youtube.com`,
	  };

	  // Loop-invariant match signals, lowercased because the searched text is lowercased.
	  const companyWords = companyName
	    .toLowerCase()
	    .split(/\s+/)
	    .filter((w) => w.length > 3);
	  const domainStem = domain.toLowerCase().replace(/\.\w+$/, "");

	  for (const platform of missing) {
	    try {
	      await serperLimiter.consume("serper");

	      const env = getEnv();
	      const response = await axios.post(
	        "https://google.serper.dev/search",
	        { q: searchMap[platform], num: 3 },
	        {
	          headers: {
	            "X-API-KEY": env.SERPER_API_KEY,
	            "Content-Type": "application/json",
	          },
	          timeout: 6_000,
	        }
	      );

	      const organic = response.data?.organic ?? [];
	      for (const result of organic) {
	        const url = cleanSocialUrl(result.link, platform);
	        if (!url || isGenericSocial(url)) {
	          continue;
	        }

	        // Verify it mentions company name or domain in snippet
	        const snippet = (result.snippet ?? "").toLowerCase();
	        const title = (result.title ?? "").toLowerCase();
	        const combined = `${snippet} ${title}`;

	        const hasCompany = companyWords.some((w) => combined.includes(w));
	        // An empty stem would match everything, so require a non-empty one.
	        const hasDomain = domainStem.length > 0 && combined.includes(domainStem);

	        if (hasCompany || hasDomain) {
	          (profiles as Record<string, unknown>)[platform] = url;
	          break;
	        }
	      }
	    } catch (err) {
	      logger.debug({ platform, err }, "Social Google search failed — skipping");
	    }
	  }
	}

	// ─── Helpers ─────────────────────────────────────────────────

	/**
	 * List the platforms that do not yet have a profile URL.
	 *
	 * @param profiles - Profiles collected so far.
	 * @returns Platform keys whose value is falsy, in the fixed order
	 *   instagram, facebook, twitter, youtube.
	 */
	function getMissing(profiles: SocialProfiles): string[] {
	  const lookup = profiles as Record<string, unknown>;
	  const unresolved: string[] = [];

	  for (const platform of ["instagram", "facebook", "twitter", "youtube"]) {
	    if (!lookup[platform]) {
	      unresolved.push(platform);
	    }
	  }

	  return unresolved;
	}

	/**
	 * Normalise a social profile URL to `protocol//hostname/path`.
	 *
	 * Query string, fragment, port and credentials are dropped, and trailing
	 * slashes are removed from the path.
	 *
	 * @param url - Candidate URL; must be absolute to parse.
	 * @param platform - Platform key; currently unused, kept for the call signature.
	 * @returns The cleaned URL, or `null` when `url` cannot be parsed.
	 */
	function cleanSocialUrl(url: string, platform: string): string | null {
	  try {
	    const parsed = new URL(url);
	    // Remove query params and fragments
	    return `${parsed.protocol}//${parsed.hostname}${parsed.pathname.replace(/\/+$/, "")}`;
	  } catch {
	    return null;
	  }
	}

	/**
	 * Decide whether a URL is a generic platform page (share dialog, login,
	 * help, …) rather than a company profile.
	 *
	 * Matching is done on whole URL tokens, i.e. hostname labels and path
	 * segments, so a handle that merely starts with a generic word
	 * (`/helpinghands`, `/aboutfacedesign`) is not rejected. A `.php` suffix
	 * is tolerated so `sharer.php` still counts as `/sharer`. Hostname labels
	 * are included so subdomains such as `help.instagram.com` stay filtered.
	 *
	 * @param url - URL to classify.
	 * @returns `true` when the URL is a generic page to ignore.
	 */
	function isGenericSocial(url: string): boolean {
	  // Filter out generic profile links (not actual company pages)
	  const genericPaths = ["/share", "/sharer", "/login", "/signup", "/help", "/about", "/policies"];

	  let parsed: URL;
	  try {
	    parsed = new URL(url);
	  } catch {
	    // Unparseable input: fall back to the plain substring check.
	    return genericPaths.some((p) => url.includes(p));
	  }

	  const tokens = [...parsed.hostname.split("."), ...parsed.pathname.split("/")].map((t) =>
	    t.toLowerCase()
	  );

	  return genericPaths.some((p) => {
	    const name = p.slice(1); // drop the leading "/"
	    return tokens.some((t) => t === name || t === `${name}.php`);
	  });
	}