| import json |
| import re |
| import os |
| import csv |
| import httpx |
| from bs4 import BeautifulSoup |
| import argparse |
| from urllib.parse import urljoin, urlparse |
|
|
| |
# URL path fragments that typically identify contact/about/team pages;
# enrich_lead() crawls links whose href contains one of these to find
# additional emails and social links.
PATH_PATTERNS = ["contact", "about", "team", "our-team", "contact-us", "about-us"]


# Simple email matcher: local-part @ domain with a 2+ letter TLD.
# NOTE(review): also matches emails embedded inside longer tokens
# (e.g. image names like "icon@2x.png" will not match, but "a@b.co" inside
# a URL will) — presumably acceptable for best-effort lead enrichment.
EMAIL_REGEX = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
|
|
def extract_emails(text):
    """Return every distinct email address found in *text*.

    The result is deduplicated via a set, so the order of the returned
    list is arbitrary.
    """
    unique_addresses = set(re.findall(EMAIL_REGEX, text))
    return list(unique_addresses)
|
|
def extract_socials(soup, base_url):
    """
    Extract Facebook, Instagram, and LinkedIn links from a parsed page.

    Parameters:
        soup: BeautifulSoup object (anything exposing ``find_all('a', href=True)``
            yielding elements indexable by ``'href'``).
        base_url (str): URL of the page; used to resolve relative and
            protocol-relative hrefs (e.g. ``//www.facebook.com/x``) into
            absolute URLs. (Previously accepted but unused.)

    Returns:
        dict: keys "facebook", "instagram", "linkedin" mapped to an absolute
        URL, or "" when that network is not linked. When a network is linked
        several times, the last link on the page wins (unchanged behavior).
    """
    socials = {"facebook": "", "instagram": "", "linkedin": ""}

    for a in soup.find_all('a', href=True):
        href = a['href']
        # Match on the raw href (as before), but store the resolved URL so
        # protocol-relative links come out absolute.
        if "facebook.com" in href:
            socials["facebook"] = urljoin(base_url, href)
        elif "instagram.com" in href:
            socials["instagram"] = urljoin(base_url, href)
        elif "linkedin.com" in href:
            socials["linkedin"] = urljoin(base_url, href)

    return socials
|
|
def enrich_lead(lead_item):
    """
    Visit a lead's website and enrich it with email and social media links.

    Fetches the homepage, then up to two likely contact/about pages (per
    PATH_PATTERNS), and scrapes email addresses plus
    Facebook/Instagram/LinkedIn links.

    Parameters:
        lead_item (dict): must contain a "website" URL; "name" is used for
            logging when present.

    Returns:
        dict: a copy of ``lead_item`` with "email", "facebook", "instagram",
        "linkedin" and "status" keys added. "status" is one of
        "No Website", "Success", "Error <code>" or "Failed".
    """
    website = lead_item.get("website")
    if not website or "google.com" in website:
        return {**lead_item, "email": "", "facebook": "", "instagram": "", "linkedin": "", "status": "No Website"}

    # .get() instead of ['name']: a lead without a name should not abort enrichment.
    print(f"[*] Enriching {lead_item.get('name', website)} via {website}...")

    enriched_data = {
        "email": "",
        "facebook": "",
        "instagram": "",
        "linkedin": "",
        "status": "Success"
    }

    try:
        with httpx.Client(timeout=10.0, follow_redirects=True, headers={"User-Agent": "Mozilla/5.0"}) as client:
            response = client.get(website)
            if response.status_code != 200:
                enriched_data["status"] = f"Error {response.status_code}"
                return {**lead_item, **enriched_data}

            soup = BeautifulSoup(response.text, 'html.parser')
            email_list = extract_emails(response.text)
            socials = extract_socials(soup, website)

            # Collect candidate contact/about pages, deduplicated by resolved
            # absolute URL. (Previously the raw href was compared against a
            # list of resolved URLs, so the check never matched; and an href
            # matching two patterns — e.g. "about-us" contains both "about"
            # and "about-us" — was appended twice.)
            found_paths = []
            seen_urls = set()
            for a in soup.find_all('a', href=True):
                href = a['href']
                if any(pattern in href.lower() for pattern in PATH_PATTERNS):
                    full_url = urljoin(website, href)
                    if full_url not in seen_urls:
                        seen_urls.add(full_url)
                        found_paths.append(full_url)

            # Best-effort crawl of the first two candidate pages.
            for path in found_paths[:2]:
                try:
                    res = client.get(path)
                    if res.status_code == 200:
                        email_list.extend(extract_emails(res.text))
                        s = extract_socials(BeautifulSoup(res.text, 'html.parser'), website)
                        for k, v in s.items():
                            if v:
                                socials[k] = v
                except Exception:
                    # Skip unreachable secondary pages, but — unlike the
                    # previous bare except — never swallow
                    # KeyboardInterrupt/SystemExit.
                    continue

            # sorted() makes the joined email string deterministic run-to-run.
            enriched_data["email"] = ", ".join(sorted(set(email_list)))
            enriched_data["facebook"] = socials["facebook"]
            enriched_data["instagram"] = socials["instagram"]
            enriched_data["linkedin"] = socials["linkedin"]

    except Exception as e:
        # Network failures, parse failures, bad URLs: mark the lead Failed
        # but keep the pipeline running for the remaining leads.
        print(f"[!] Critical error enriching {website}: {e}")
        enriched_data["status"] = "Failed"

    return {**lead_item, **enriched_data}
|
|
def process_leads(input_file, output_csv):
    """Read raw leads from a JSON file, enrich each one, and write a CSV.

    Parameters:
        input_file (str): path to the raw leads JSON (a list of dicts).
        output_csv (str): destination CSV path; columns come from the keys
            of the first enriched lead.

    Prints progress and returns None. Does nothing if the input file is
    missing or contains no leads.
    """
    if not os.path.exists(input_file):
        print(f"[!] Input file {input_file} not found.")
        return

    with open(input_file, 'r') as src:
        raw_leads = json.load(src)

    rows = [enrich_lead(lead) for lead in raw_leads]
    if not rows:
        print("[!] No leads to process.")
        return

    # Column order follows the key order of the first enriched row.
    with open(output_csv, 'w', newline='', encoding='utf-8') as dst:
        writer = csv.DictWriter(dst, fieldnames=rows[0].keys())
        writer.writeheader()
        writer.writerows(rows)

    print(f"[+] Enrichment complete. Saved {len(rows)} leads to {output_csv}")
|
|
if __name__ == "__main__":
    # CLI entry point: --input defaults to the scraper's raw dump,
    # --output is mandatory.
    cli = argparse.ArgumentParser(description='Contact Enricher')
    cli.add_argument('--input', type=str, default='.tmp/raw_leads.json', help='Input raw leads JSON')
    cli.add_argument('--output', type=str, required=True, help='Output CSV file')
    ns = cli.parse_args()
    process_leads(ns.input, ns.output)
|
|