# lead-hunter-ai / tools / contact_enricher.py
# Uploaded by agenticworkflowsspace via huggingface_hub (commit 86e0563, verified)
import json
import re
import os
import csv
import httpx
from bs4 import BeautifulSoup
import argparse
from urllib.parse import urljoin, urlparse
# Common contact and about page patterns
# URL path fragments that typically identify contact/about/team pages on a site.
PATH_PATTERNS = ["contact", "about", "team", "our-team", "contact-us", "about-us"]
# Permissive email matcher: local part, "@", domain, dot, 2+ letter TLD.
# May over-match inside longer tokens; good enough for lead scraping.
EMAIL_REGEX = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
def extract_emails(text):
    """Return the unique email addresses found in *text*.

    De-duplication is set-based, so result order is unspecified.
    """
    # Same pattern as the module-level EMAIL_REGEX constant.
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    unique_matches = {match for match in re.findall(pattern, text)}
    return list(unique_matches)
def extract_socials(soup, base_url):
    """
    Extract Facebook, Instagram, and LinkedIn links from a parsed page.

    Args:
        soup: BeautifulSoup document whose anchor tags are scanned.
        base_url: URL of the page, used to resolve relative hrefs.

    Returns:
        dict with keys "facebook", "instagram", "linkedin"; each value is the
        last matching absolute URL found, or "" if no link matched.
    """
    socials = {"facebook": "", "instagram": "", "linkedin": ""}
    for a in soup.find_all('a', href=True):
        # Resolve relative links against the page URL so callers always get
        # an absolute URL. (Fix: `base_url` was previously accepted but never
        # used, so relative hrefs were returned as-is.)
        href = urljoin(base_url, a['href'])
        if "facebook.com" in href:
            socials["facebook"] = href
        elif "instagram.com" in href:
            socials["instagram"] = href
        elif "linkedin.com" in href:
            socials["linkedin"] = href
    return socials
def enrich_lead(lead_item):
    """
    Visit a lead's website and attempt to enrich it with contact details.

    Scrapes the homepage, then up to two contact/about pages linked from it.

    Args:
        lead_item: dict with at least "name"; "website" is optional.

    Returns:
        A new dict: the original lead fields plus "email" (comma-separated
        unique addresses), "facebook", "instagram", "linkedin", and "status"
        ("Success", "No Website", "Error <code>", or "Failed").
    """
    website = lead_item.get("website")
    # Google Maps/search URLs are not real business sites - skip them.
    if not website or "google.com" in website:
        return {**lead_item, "email": "", "facebook": "", "instagram": "", "linkedin": "", "status": "No Website"}
    print(f"[*] Enriching {lead_item['name']} via {website}...")
    enriched_data = {
        "email": "",
        "facebook": "",
        "instagram": "",
        "linkedin": "",
        "status": "Success"
    }
    try:
        with httpx.Client(timeout=10.0, follow_redirects=True, headers={"User-Agent": "Mozilla/5.0"}) as client:
            # 1. Scrape homepage.
            response = client.get(website)
            if response.status_code != 200:
                enriched_data["status"] = f"Error {response.status_code}"
                return {**lead_item, **enriched_data}
            # html.parser is built-in; avoids an lxml dependency.
            soup = BeautifulSoup(response.text, 'html.parser')
            email_list = extract_emails(response.text)
            socials = extract_socials(soup, website)
            # 2. Collect candidate contact/about pages linked from the homepage.
            found_paths = []
            for a in soup.find_all('a', href=True):
                href_lower = a['href'].lower()
                if any(pattern in href_lower for pattern in PATH_PATTERNS):
                    full_url = urljoin(website, a['href'])
                    # Dedupe on the resolved URL. (Fix: the original compared
                    # the raw href against resolved URLs and re-tested every
                    # pattern, so one link matching several patterns — e.g.
                    # "contact-us" matches both "contact" and "contact-us" —
                    # was queued multiple times.)
                    if full_url not in found_paths:
                        found_paths.append(full_url)
            # Visit at most the first two candidates to bound request count.
            for path in found_paths[:2]:
                try:
                    res = client.get(path)
                    if res.status_code == 200:
                        email_list.extend(extract_emails(res.text))
                        page_socials = extract_socials(BeautifulSoup(res.text, 'html.parser'), website)
                        for key, value in page_socials.items():
                            if value:
                                socials[key] = value
                except Exception:
                    # Best-effort: a failing secondary page must not abort the
                    # whole enrichment. (Fix: was a bare `except:`, which also
                    # swallowed KeyboardInterrupt/SystemExit.)
                    continue
            # Merge everything found into the result row.
            enriched_data["email"] = ", ".join(set(email_list))
            enriched_data["facebook"] = socials["facebook"]
            enriched_data["instagram"] = socials["instagram"]
            enriched_data["linkedin"] = socials["linkedin"]
    except Exception as e:
        print(f"[!] Critical error enriching {website}: {e}")
        enriched_data["status"] = "Failed"
    return {**lead_item, **enriched_data}
def process_leads(input_file, output_csv):
    """
    Enrich all leads from a raw JSON file and write them to a CSV.

    Uses the built-in csv module (no pandas dependency).

    Args:
        input_file: Path to a JSON file containing a list of lead dicts.
        output_csv: Destination path for the enriched CSV.

    Side effects:
        Reads input_file, writes output_csv, prints progress to stdout.
    """
    if not os.path.exists(input_file):
        print(f"[!] Input file {input_file} not found.")
        return
    # Explicit utf-8 to match the writer below. (Fix: the default locale
    # encoding can fail on non-ASCII lead data on some platforms.)
    with open(input_file, 'r', encoding='utf-8') as f:
        raw_leads = json.load(f)
    all_enriched = [enrich_lead(lead) for lead in raw_leads]
    if not all_enriched:
        print("[!] No leads to process.")
        return
    # Every enriched row has the same schema (original fields + the fixed
    # enrichment keys), so the first row's keys serve as the header.
    fieldnames = all_enriched[0].keys()
    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_enriched)
    print(f"[+] Enrichment complete. Saved {len(all_enriched)} leads to {output_csv}")
if __name__ == "__main__":
    # CLI entry point: enrich a raw-leads JSON file into an output CSV.
    cli = argparse.ArgumentParser(description='Contact Enricher')
    cli.add_argument('--input', type=str, default='.tmp/raw_leads.json', help='Input raw leads JSON')
    cli.add_argument('--output', type=str, required=True, help='Output CSV file')
    options = cli.parse_args()
    process_leads(options.input, options.output)