File size: 6,812 Bytes
bd28470
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
/**
 * Social Profile Finder
 * 
 * Finds company + decision-maker social profiles:
 * - Instagram (business account)
 * - Facebook (business page)
 * - Twitter/X
 * - YouTube
 * 
 * Two sources:
 * 1. Website footer/header scraping (most reliable)
 * 2. Google search fallback
 * 
 * Phase 2 uses these for multi-channel outreach.
 */

import { chromium } from "playwright";
import { playwrightLimiter } from "../../shared/utils/rate-limiter";
import { serperLimiter } from "../../shared/utils/rate-limiter";
import { logger } from "../../shared/utils/logger";
import axios from "axios";
import { getEnv } from "../../shared/config/env";

/**
 * Social profile URLs discovered for one company.
 *
 * Each platform field holds a cleaned profile URL, or `null` when no profile
 * was found for that platform.
 */
export interface SocialProfiles {
  /** Instagram profile URL, or null if none was found. */
  instagram: string | null;
  /** Facebook page URL, or null if none was found. */
  facebook: string | null;
  /** Twitter/X profile URL, or null if none was found. */
  twitter: string | null;
  /** YouTube channel URL, or null if none was found. */
  youtube: string | null;
  /** Which discovery method(s) the profile URLs came from. */
  source: "website" | "google" | "mixed";
}

/**
 * Find all social profiles for a company.
 *
 * Method 1 runs first: links are extracted from `websiteHtml` when the caller
 * already has it, otherwise the company's home page is scraped. Method 2
 * (Google search) is then used only for the platforms that are still missing.
 *
 * @param domain - Company domain without scheme (it is fetched as `https://<domain>`).
 * @param companyName - Company name used to build and verify the search queries.
 * @param websiteHtml - Optional pre-fetched website HTML; skips the browser scrape.
 * @returns The discovered profile URLs (`null` where nothing was found) plus
 *   `source`: `"website"` when nothing came from Google (including the case
 *   where nothing was found at all), `"google"` when every hit came from
 *   Google, and `"mixed"` when both methods contributed.
 */
export async function findSocialProfiles(
  domain: string,
  companyName: string,
  websiteHtml?: string
): Promise<SocialProfiles> {
  const profiles: SocialProfiles = {
    instagram: null,
    facebook: null,
    twitter: null,
    youtube: null,
    source: "website",
  };

  const platforms = ["instagram", "facebook", "twitter", "youtube"] as const;
  const countFound = (): number => platforms.filter((p) => Boolean(profiles[p])).length;

  // ── Method 1: Extract from website HTML ────────────────────
  if (websiteHtml) {
    extractFromHtml(websiteHtml, profiles);
  } else {
    // Scrape website specifically for social links
    await scrapeWebsiteForSocials(domain, profiles);
  }

  // Remember how many profiles the website produced so the Google
  // contribution can be told apart afterwards.
  const foundOnWebsite = countFound();

  // ── Method 2: Google search for missing profiles ───────────
  const missing = getMissing(profiles);
  if (missing.length > 0) {
    await searchGoogleForSocials(companyName, domain, profiles, missing);
    const foundViaGoogle = countFound() - foundOnWebsite;
    if (foundViaGoogle > 0) {
      profiles.source = foundOnWebsite > 0 ? "mixed" : "google";
    }
  }

  const found = countFound();
  logger.info({ domain, found }, "Social profiles discovered");

  return profiles;
}

// ─── Method 1: HTML extraction ──────────────────────────────

/**
 * Per-platform regexes that locate profile URLs inside arbitrary text.
 *
 * All patterns are global + case-insensitive so `String.prototype.match`
 * returns every occurrence. Because of the `g` flag they carry `lastIndex`
 * state; avoid calling `.test()` / `.exec()` on them repeatedly.
 */
const SOCIAL_PATTERNS = {
  instagram: /https?:\/\/(www\.)?instagram\.com\/[a-zA-Z0-9._]+/gi,
  facebook: /https?:\/\/(www\.)?(facebook|fb)\.com\/[a-zA-Z0-9.]+/gi,
  twitter: /https?:\/\/(www\.)?(twitter|x)\.com\/[a-zA-Z0-9_]+/gi,
  // The prefix must be a complete path segment (`channel/`, `c/`, `user/`) or
  // an `@handle`; otherwise unrelated pages such as `/careers` would match via
  // the bare "c". The match stops at the channel identifier so sub-pages like
  // `/@acme/videos` collapse to the channel URL.
  youtube: /https?:\/\/(www\.)?youtube\.com\/(channel\/|c\/|user\/|@)[a-zA-Z0-9._-]+/gi,
};

/**
 * Scan text for social profile URLs and store them on `profiles`.
 *
 * For every platform in `SOCIAL_PATTERNS`, candidates are tried in document
 * order and the first one that survives cleaning and the generic-link filter
 * wins. Trying every candidate matters because share/login widgets often
 * appear before the real profile link (typically in the footer).
 *
 * @param html - Text to scan (full HTML or a newline-joined list of hrefs).
 * @param profiles - Result object, mutated in place; platforms with no usable
 *   match are left untouched.
 */
function extractFromHtml(html: string, profiles: SocialProfiles): void {
  type Platform = Exclude<keyof SocialProfiles, "source">;

  for (const [platform, pattern] of Object.entries(SOCIAL_PATTERNS)) {
    const candidates = html.match(pattern) ?? [];
    for (const candidate of candidates) {
      const url = cleanSocialUrl(candidate, platform);
      if (url && !isGenericSocial(url)) {
        profiles[platform as Platform] = url;
        break;
      }
    }
  }
}

// ─── Website scrape (if HTML not already available) ──────────

/**
 * Load the company's home page in headless Chromium and extract social links
 * from its anchor hrefs.
 *
 * Best-effort: any failure (rate limiter, launch, navigation timeout, ...) is
 * logged at debug level and swallowed so the caller can fall back to Google.
 *
 * @param domain - Domain to load; fetched as `https://<domain>`.
 * @param profiles - Result object, mutated in place with any profiles found.
 */
async function scrapeWebsiteForSocials(domain: string, profiles: SocialProfiles): Promise<void> {
  try {
    await playwrightLimiter.consume("playwright");

    const browser = await chromium.launch({ headless: true, args: ["--no-sandbox"] });
    try {
      const context = await browser.newContext({
        userAgent: "Mozilla/5.0 (compatible; ResearchBot/1.0)",
      });
      const page = await context.newPage();

      await page.goto(`https://${domain}`, { waitUntil: "domcontentloaded", timeout: 12_000 });

      // Get all link hrefs on the page
      const links = await page.$$eval("a[href]", (anchors) =>
        anchors.map((a) => a.getAttribute("href") ?? "")
      );

      extractFromHtml(links.join("\n"), profiles);
    } finally {
      // Always release the browser, including when navigation or evaluation
      // throws; closing the browser also disposes its contexts and pages.
      await browser.close();
    }
  } catch (err) {
    logger.debug({ domain, err }, "Social scrape failed — trying Google");
  }
}

// ─── Method 2: Google search ────────────────────────────────

/**
 * Fill in missing profiles using Google results fetched through the Serper API.
 *
 * One `site:`-restricted query is issued per missing platform. A result is
 * accepted only if its title or snippet mentions a significant word of the
 * company name (longer than 3 characters) or the domain stem. Failures are
 * logged at debug level and the platform is skipped, so this never throws.
 *
 * @param companyName - Company name, used both in the query and for verification.
 * @param domain - Company domain; its stem (last dot-suffix removed) is used for verification.
 * @param profiles - Result object, mutated in place with any profiles found.
 * @param missing - Platform keys to search for; unknown keys are ignored.
 */
async function searchGoogleForSocials(
  companyName: string,
  domain: string,
  profiles: SocialProfiles,
  missing: string[]
): Promise<void> {
  type Platform = Exclude<keyof SocialProfiles, "source">;

  const searchMap: Record<string, string> = {
    instagram: `"${companyName}" site:instagram.com`,
    facebook: `"${companyName}" site:facebook.com`,
    twitter: `"${companyName}" site:twitter.com OR site:x.com`,
    youtube: `"${companyName}" site:youtube.com`,
  };

  // Verification tokens do not depend on the platform or result, so compute
  // them once. Everything is lowercased because it is compared against
  // lowercased result text.
  const companyWords = companyName
    .toLowerCase()
    .split(/\s+/)
    .filter((w) => w.length > 3);
  const domainStem = domain.toLowerCase().replace(/\.\w+$/, "");

  for (const platform of missing) {
    const query = searchMap[platform];
    if (!query) continue; // no query defined for this key; nothing to search

    try {
      await serperLimiter.consume("serper");

      const env = getEnv();
      const response = await axios.post(
        "https://google.serper.dev/search",
        { q: query, num: 3 },
        {
          headers: {
            "X-API-KEY": env.SERPER_API_KEY,
            "Content-Type": "application/json",
          },
          timeout: 6_000,
        }
      );

      const organic: Array<{ link?: string; title?: string; snippet?: string }> =
        response.data?.organic ?? [];

      for (const result of organic) {
        const url = cleanSocialUrl(result.link ?? "", platform);
        if (!url || isGenericSocial(url)) continue;

        // Verify it mentions company name or domain in snippet/title
        const snippet = (result.snippet ?? "").toLowerCase();
        const title = (result.title ?? "").toLowerCase();
        const combined = `${snippet} ${title}`;

        const hasCompany = companyWords.some((w) => combined.includes(w));
        // An empty stem would match every string, so it must not count.
        const hasDomain = domainStem.length > 0 && combined.includes(domainStem);

        if (hasCompany || hasDomain) {
          profiles[platform as Platform] = url;
          break;
        }
      }
    } catch (err) {
      logger.debug({ platform, err }, "Social Google search failed — skipping");
    }
  }
}

// ─── Helpers ─────────────────────────────────────────────────

/**
 * List the platforms that have no profile URL yet.
 *
 * @param profiles - Current discovery results.
 * @returns Platform keys whose value is falsy, in the fixed order
 *   instagram, facebook, twitter, youtube.
 */
function getMissing(profiles: SocialProfiles): string[] {
  const platforms: Array<Exclude<keyof SocialProfiles, "source">> = [
    "instagram",
    "facebook",
    "twitter",
    "youtube",
  ];

  const absent: string[] = [];
  for (const platform of platforms) {
    if (!profiles[platform]) {
      absent.push(platform);
    }
  }
  return absent;
}

/**
 * Normalise a candidate profile URL to `protocol//hostname/path`.
 *
 * Query string, fragment, port and trailing slashes are dropped.
 *
 * @param url - Raw URL as found in HTML or a search result.
 * @param platform - Platform key the URL was found for (currently unused;
 *   kept so call sites do not change).
 * @returns The cleaned URL, or `null` when the input is not a parseable
 *   http(s) URL or has no path (a platform home page is not a profile).
 */
function cleanSocialUrl(url: string, platform: string): string | null {
  try {
    const parsed = new URL(url);
    if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
      return null;
    }

    // Remove query params and fragments; strip every trailing slash
    const path = parsed.pathname.replace(/\/+$/, "");
    if (path === "") {
      // Bare host such as https://facebook.com/ — no profile to point at.
      return null;
    }

    return `${parsed.protocol}//${parsed.hostname}${path}`;
  } catch {
    return null;
  }
}

/**
 * Tell whether a URL points at a generic platform page (share dialog, login,
 * help, ...) rather than an actual company profile.
 *
 * Whole path segments are compared, case-insensitively, so handles that merely
 * start with a generic word (`/aboutyou`, `/helpscout`, `/sharethis`) are not
 * rejected. A script-style extension is ignored so `/sharer.php` still counts.
 *
 * @param url - URL to classify; absolute URLs are parsed, anything else is
 *   treated as a path.
 * @returns `true` if any path segment is one of the generic page names.
 */
function isGenericSocial(url: string): boolean {
  // Filter out generic profile links (not actual company pages)
  const genericSegments = new Set([
    "share",
    "sharer",
    "login",
    "signup",
    "help",
    "about",
    "policies",
  ]);

  let pathname: string;
  try {
    pathname = new URL(url).pathname;
  } catch {
    // Not an absolute URL: fall back to the raw text minus query/fragment.
    pathname = url.split(/[?#]/)[0] ?? "";
  }

  return pathname
    .split("/")
    .map((segment) => segment.toLowerCase().replace(/\.(php|html?|aspx?)$/, ""))
    .some((segment) => genericSegments.has(segment));
}