| import json |
| import re |
| import os |
| import csv |
| import httpx |
| from bs4 import BeautifulSoup |
| import argparse |
| from urllib.parse import urljoin, urlparse |
|
|
| |
# URL path fragments that typically identify contact/about/team pages;
# enrich_lead() crawls links whose href contains one of these to find
# additional emails and social links.
PATH_PATTERNS = ["contact", "about", "team", "our-team", "contact-us", "about-us"]


# Simple email matcher: local-part @ domain with a 2+ letter TLD.
# NOTE(review): also matches emails embedded inside longer tokens
# (e.g. image names like "icon@2x.png" will not match, but "a@b.co" inside
# a URL will) — presumably acceptable for best-effort lead enrichment.
EMAIL_REGEX = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
|
|
def extract_emails(text):
    """Return every distinct email address found in *text*.

    The result is deduplicated via a set, so the order of the returned
    list is arbitrary.
    """
    unique_addresses = set(re.findall(EMAIL_REGEX, text))
    return list(unique_addresses)
|
|
def extract_socials(soup, base_url):
    """
    Extract Facebook, Instagram, and LinkedIn links from a parsed page.

    Parameters:
        soup: BeautifulSoup object (anything exposing ``find_all('a', href=True)``
            yielding elements indexable by ``'href'``).
        base_url (str): URL of the page; used to resolve relative and
            protocol-relative hrefs (e.g. ``//www.facebook.com/x``) into
            absolute URLs. (Previously accepted but unused.)

    Returns:
        dict: keys "facebook", "instagram", "linkedin" mapped to an absolute
        URL, or "" when that network is not linked. When a network is linked
        several times, the last link on the page wins (unchanged behavior).
    """
    socials = {"facebook": "", "instagram": "", "linkedin": ""}

    for a in soup.find_all('a', href=True):
        href = a['href']
        # Match on the raw href (as before), but store the resolved URL so
        # protocol-relative links come out absolute.
        if "facebook.com" in href:
            socials["facebook"] = urljoin(base_url, href)
        elif "instagram.com" in href:
            socials["instagram"] = urljoin(base_url, href)
        elif "linkedin.com" in href:
            socials["linkedin"] = urljoin(base_url, href)

    return socials
|
|
def enrich_lead(lead_item):
    """
    Visit a lead's website and enrich it with email and social media links.

    Fetches the homepage, then up to two likely contact/about pages (per
    PATH_PATTERNS), and scrapes email addresses plus
    Facebook/Instagram/LinkedIn links.

    Parameters:
        lead_item (dict): must contain a "website" URL; "name" is used for
            logging when present.

    Returns:
        dict: a copy of ``lead_item`` with "email", "facebook", "instagram",
        "linkedin" and "status" keys added. "status" is one of
        "No Website", "Success", "Error <code>" or "Failed".
    """
    website = lead_item.get("website")
    if not website or "google.com" in website:
        return {**lead_item, "email": "", "facebook": "", "instagram": "", "linkedin": "", "status": "No Website"}

    # .get() instead of ['name']: a lead without a name should not abort enrichment.
    print(f"[*] Enriching {lead_item.get('name', website)} via {website}...")

    enriched_data = {
        "email": "",
        "facebook": "",
        "instagram": "",
        "linkedin": "",
        "status": "Success"
    }

    try:
        with httpx.Client(timeout=10.0, follow_redirects=True, headers={"User-Agent": "Mozilla/5.0"}) as client:
            response = client.get(website)
            if response.status_code != 200:
                enriched_data["status"] = f"Error {response.status_code}"
                return {**lead_item, **enriched_data}

            soup = BeautifulSoup(response.text, 'html.parser')
            email_list = extract_emails(response.text)
            socials = extract_socials(soup, website)

            # Collect candidate contact/about pages, deduplicated by resolved
            # absolute URL. (Previously the raw href was compared against a
            # list of resolved URLs, so the check never matched; and an href
            # matching two patterns — e.g. "about-us" contains both "about"
            # and "about-us" — was appended twice.)
            found_paths = []
            seen_urls = set()
            for a in soup.find_all('a', href=True):
                href = a['href']
                if any(pattern in href.lower() for pattern in PATH_PATTERNS):
                    full_url = urljoin(website, href)
                    if full_url not in seen_urls:
                        seen_urls.add(full_url)
                        found_paths.append(full_url)

            # Best-effort crawl of the first two candidate pages.
            for path in found_paths[:2]:
                try:
                    res = client.get(path)
                    if res.status_code == 200:
                        email_list.extend(extract_emails(res.text))
                        s = extract_socials(BeautifulSoup(res.text, 'html.parser'), website)
                        for k, v in s.items():
                            if v:
                                socials[k] = v
                except Exception:
                    # Skip unreachable secondary pages, but — unlike the
                    # previous bare except — never swallow
                    # KeyboardInterrupt/SystemExit.
                    continue

            # sorted() makes the joined email string deterministic run-to-run.
            enriched_data["email"] = ", ".join(sorted(set(email_list)))
            enriched_data["facebook"] = socials["facebook"]
            enriched_data["instagram"] = socials["instagram"]
            enriched_data["linkedin"] = socials["linkedin"]

    except Exception as e:
        # Network failures, parse failures, bad URLs: mark the lead Failed
        # but keep the pipeline running for the remaining leads.
        print(f"[!] Critical error enriching {website}: {e}")
        enriched_data["status"] = "Failed"

    return {**lead_item, **enriched_data}
|
|
def process_leads(input_file, output_csv):
    """Read raw leads from a JSON file, enrich each one, and write a CSV.

    Parameters:
        input_file (str): path to the raw leads JSON (a list of dicts).
        output_csv (str): destination CSV path; columns come from the keys
            of the first enriched lead.

    Prints progress and returns None. Does nothing if the input file is
    missing or contains no leads.
    """
    if not os.path.exists(input_file):
        print(f"[!] Input file {input_file} not found.")
        return

    with open(input_file, 'r') as src:
        raw_leads = json.load(src)

    rows = [enrich_lead(lead) for lead in raw_leads]
    if not rows:
        print("[!] No leads to process.")
        return

    # Column order follows the key order of the first enriched row.
    with open(output_csv, 'w', newline='', encoding='utf-8') as dst:
        writer = csv.DictWriter(dst, fieldnames=rows[0].keys())
        writer.writeheader()
        writer.writerows(rows)

    print(f"[+] Enrichment complete. Saved {len(rows)} leads to {output_csv}")
|
|
if __name__ == "__main__":
    # CLI entry point: --input defaults to the scraper's raw dump,
    # --output is mandatory.
    cli = argparse.ArgumentParser(description='Contact Enricher')
    cli.add_argument('--input', type=str, default='.tmp/raw_leads.json', help='Input raw leads JSON')
    cli.add_argument('--output', type=str, required=True, help='Output CSV file')
    ns = cli.parse_args()
    process_leads(ns.input, ns.output)
|
|