# bulk-link-auditor / audit_engine.py
"""
Core Link Audit Engine
Crawls pages, extracts body-content links, checks status, detects issues.
"""
import requests
from bs4 import BeautifulSoup, Comment
from urllib.parse import urljoin, urlparse
from collections import defaultdict
import concurrent.futures
# Browser-like request headers: some sites block or alter responses for
# the default python-requests User-Agent.
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
}
# CSS selectors tried in order to locate the article body (see
# find_body_content); first match wins, ending with the broad "main"
# fallback. The leading entries look Webflow-specific (w-richtext).
DEFAULT_BODY_SELECTORS = [
"div.blog-rich-text",
"div.w-richtext",
"article .rich-text",
"article",
"div.blog-content",
"div.post-content",
"main",
]
# keyword -> (internal target path, suggested anchor text); consumed by
# generate_suggestions to propose internal links for topics mentioned in
# the body text but not yet linked. Paths are site-relative.
DEFAULT_SUGGESTION_MAP = {
"artificial intelligence": ("/category/artificial-intelligence-training", "artificial intelligence training programs"),
"machine learning": ("/category/artificial-intelligence-training", "machine learning training"),
"leadership": ("/type/leadership-training", "leadership training programs"),
"soft skills": ("/type/behavioral-training", "behavioral training programs"),
"remote employee": ("/blog/how-to-train-remote-employees", "remote employee training"),
"training management": ("/training-management-software", "training management software"),
"instructor-led": ("/instructor-led-training-services", "instructor-led training"),
"corporate training": ("/corporate-training-courses", "corporate training programs"),
"skill matrix": ("/skill-matrix", "skills matrix"),
"stellar ai": ("/stellar-ai", "AI-powered training"),
"book a demo": ("/book-a-demo", "book a demo"),
"compliance": ("/type/compliance-training", "compliance training"),
"cybersecurity": ("/category/cybersecurity-training", "cybersecurity training"),
"data analytics": ("/category/data-analytics-training", "data analytics training"),
"project management": ("/category/project-management-training", "project management training"),
"coaching": ("/coaching-solutions", "coaching solutions"),
"hr training": ("/category/human-resource-training", "HR training programs"),
"employee engagement": ("/blog/how-to-train-remote-employees", "employee training best practices"),
"onboarding": ("/category/human-resource-training", "onboarding training"),
"digital transformation": ("/type/it-technical-training", "IT & technical training"),
}
def is_internal(href, domain):
    """Return True if *href* belongs to *domain* (relative URLs count as internal).

    Bug fix: the original used a substring test (`domain in netloc`), so
    "example.com" wrongly matched hosts like "notexample.com.evil.com".
    We now extract the bare host (dropping userinfo and port) and require
    an exact match or a proper subdomain (host ends with "." + domain).
    """
    if not href:
        return False
    parsed = urlparse(href)
    if not parsed.netloc:
        # No authority component => relative link => same site.
        return True
    # netloc may be "user:pass@host:port"; keep only the host.
    host = parsed.netloc.rsplit('@', 1)[-1].split(':')[0].lower()
    d = domain.lower()
    return host == d or host.endswith('.' + d)
def normalize_url(href, base_url):
    """Resolve *href* against *base_url*; return None for non-navigable links.

    Fragments, mailto:, tel:, and javascript: pseudo-links are filtered out
    because they cannot be audited as HTTP targets.
    """
    if not href:
        return None
    cleaned = href.strip()
    non_http_prefixes = ('#', 'mailto:', 'tel:', 'javascript:')
    if cleaned.startswith(non_http_prefixes):
        return None
    return urljoin(base_url, cleaned)
def get_follow_status(tag):
    """Classify an anchor as 'Nofollow' or 'Dofollow' from its rel attribute.

    bs4 may hand back rel as either a list of tokens or a raw string, so
    both shapes are handled; matching is case-insensitive.
    """
    rel_attr = tag.get('rel', [])
    rel_tokens = rel_attr.split() if isinstance(rel_attr, str) else rel_attr
    has_nofollow = any(token.lower() == 'nofollow' for token in rel_tokens)
    return 'Nofollow' if has_nofollow else 'Dofollow'
def find_body_content(soup, selectors):
    """Locate the main content element of a parsed page.

    Tries each CSS selector in order and returns the first truthy match;
    falls back to the page's <body> tag when none of them hit.
    """
    candidates = (soup.select_one(selector) for selector in selectors)
    hit = next((element for element in candidates if element), None)
    return hit if hit is not None else soup.find('body')
def get_link_location(link_tag, body_el):
    """Describe where *link_tag* sits inside *body_el*.

    Returns "Intro", "Conclusion", "Mid-article (~NN%)", or "Unknown" for an
    empty body, optionally suffixed with the nearest preceding heading, e.g.
    'Mid-article (~42%) · near "Some Heading"'.

    Bug fix: the original broke out of the descendant walk with
    `el == link_tag`, but bs4's Tag.__eq__ compares markup, so two
    identical-looking links are "equal" and the walk stopped at the FIRST
    one — mislocating later duplicates. Identity (`is`) pins the exact node.
    """
    body_text = body_el.get_text()
    total_len = len(body_text)
    if total_len == 0:
        return "Unknown"
    # Accumulate the visible text that precedes the link to estimate its
    # relative position (list + join instead of quadratic string +=).
    parts = []
    for el in body_el.descendants:
        if el is link_tag:
            break
        # Comments are str subclasses in bs4 but are not visible text.
        if isinstance(el, str) and not isinstance(el, Comment):
            parts.append(el)
    pos = sum(len(p) for p in parts)
    ratio = pos / total_len
    # Find the closest heading above the link: walk up the ancestors,
    # scanning each level's preceding siblings for h1-h4.
    heading = ""
    for parent in link_tag.parents:
        for sib in parent.previous_siblings:
            if hasattr(sib, 'name') and sib.name in ['h1', 'h2', 'h3', 'h4']:
                heading = sib.get_text(strip=True)[:60]
                break
        if heading:
            break
    if ratio < 0.1:
        section = "Intro"
    elif ratio > 0.85:
        section = "Conclusion"
    else:
        section = f"Mid-article (~{int(ratio*100)}%)"
    if heading:
        return f'{section} · near "{heading}"'
    return section
def check_url_status(url, timeout=15):
    """Probe *url* and classify it as Active / Redirect / Broken.

    Issues a cheap HEAD request first; when the server rejects HEAD with
    405, retries with a streamed GET (so the body is never downloaded).
    Redirects are not followed so the first hop's status and Location
    target can be reported.

    Bug fix: the original resolved a relative Location header with urljoin
    only on the HEAD path — the 405 GET fallback reported relative redirect
    targets unresolved. Normalization now runs after whichever request won.

    Returns:
        tuple: (url, status_code_or_error_label, link_status, redirect_url)
        where status may be "Timeout"/"ConnError"/"Error" on failure.
    """
    try:
        r = requests.head(url, headers=HEADERS, timeout=timeout, allow_redirects=False)
        status = r.status_code
        if status == 405:
            # Server disallows HEAD; fall back to GET without reading the body.
            r = requests.get(url, headers=HEADERS, timeout=timeout,
                             allow_redirects=False, stream=True)
            status = r.status_code
        redirect_url = ""
        if status in (301, 302, 303, 307, 308):
            redirect_url = r.headers.get('Location', '')
            # Location may be relative; resolve it against the requested URL.
            if redirect_url and not redirect_url.startswith('http'):
                redirect_url = urljoin(url, redirect_url)
        r.close()
        if status in (301, 302, 303, 307, 308):
            link_status = "Redirect"
        elif 200 <= status < 300:
            link_status = "Active"
        else:
            link_status = "Broken"
        return url, status, link_status, redirect_url
    except requests.exceptions.Timeout:
        return url, "Timeout", "Broken", ""
    except requests.exceptions.ConnectionError:
        return url, "ConnError", "Broken", ""
    except Exception:
        # Catch-all keeps one bad URL from killing a thread-pool worker.
        return url, "Error", "Broken", ""
def generate_suggestions(body_text, existing_internal_urls, page_url, suggestion_map=None):
    """Propose internal links for keywords mentioned in the body but not yet linked.

    Skips targets that are already linked on the page or that point back to
    the page itself. Results are ranked High-priority first (keyword appears
    3+ times), then by mention count, and capped at 10.

    Returns a list of dicts with keys: section, target, anchor, priority,
    keyword, count.
    """
    active_map = DEFAULT_SUGGESTION_MAP if suggestion_map is None else suggestion_map
    lowered = body_text.lower()
    linked_paths = {urlparse(u).path.rstrip('/') for u in existing_internal_urls}
    own_path = urlparse(page_url).path.rstrip('/')
    proposals = []
    for keyword, (target_path, anchor_text) in active_map.items():
        trimmed = target_path.rstrip('/')
        # Already linked, or would be a self-link — nothing to suggest.
        if trimmed in linked_paths or trimmed == own_path:
            continue
        needle = keyword.lower()
        occurrences = lowered.count(needle)
        if occurrences == 0:
            continue
        # Locate the first mention to label which part of the article it's in.
        first_pos = lowered.find(needle)
        ratio = first_pos / len(lowered) if lowered else 0
        if ratio < 0.15:
            section = "Intro"
        elif ratio > 0.85:
            section = "Conclusion"
        else:
            section = f"Mid-article (~{int(ratio*100)}%)"
        proposals.append({
            'section': section,
            'target': target_path,
            'anchor': anchor_text,
            'priority': "High" if occurrences >= 3 else "Med",
            'keyword': keyword,
            'count': occurrences,
        })
    # High before Med (False sorts before True), then most mentions first.
    proposals.sort(key=lambda item: (item['priority'] != 'High', -item['count']))
    return proposals[:10]
def audit_page(page_url, domain, body_selectors=None, suggestion_map=None,
               timeout=15, concurrent_workers=5):
    """Run a full link audit on one page.

    Fetches the page, extracts <a href> links from the article body, checks
    each unique URL's HTTP status in a thread pool, flags follow-attribute
    and duplicate issues, and generates internal-link suggestions.

    Args:
        page_url: Fully-qualified URL of the page to audit.
        domain: Site domain used to split links into internal vs external.
        body_selectors: CSS selectors for locating the article body
            (defaults to DEFAULT_BODY_SELECTORS).
        suggestion_map: keyword -> (path, anchor) map forwarded to
            generate_suggestions (None selects the module default).
        timeout: Per-request timeout in seconds (page fetch and each check).
        concurrent_workers: Thread-pool size for parallel status checks.

    Returns:
        dict: Aggregated audit result. On fetch/parse failure, 'error' is
        set and the remaining fields keep their empty defaults.
    """
    if body_selectors is None:
        body_selectors = DEFAULT_BODY_SELECTORS
    # Pre-populated skeleton so callers always receive a uniform shape,
    # even when the audit aborts early with an error.
    result = {
        'url': page_url, 'error': None,
        'internal_links': [], 'external_links': [],
        'broken_internal': [], 'broken_external': [],
        'redirect_internal': [], 'redirect_external': [],
        'follow_flags': [], 'duplicates': [], 'suggestions': [],
        'int_count': 0, 'ext_count': 0,
        'int_df': 0, 'int_nf': 0, 'ext_df': 0, 'ext_nf': 0,
        'broken_int_count': 0, 'broken_ext_count': 0,
        'redirect_int_count': 0, 'redirect_ext_count': 0,
        'follow_flag_count': 0, 'duplicate_count': 0,
    }
    try:
        resp = requests.get(page_url, headers=HEADERS, timeout=timeout)
        resp.raise_for_status()
    except Exception as e:
        # Any fetch failure (network, HTTP error) is reported, not raised.
        result['error'] = str(e)
        return result
    # NOTE: 'lxml' parser — assumes lxml is installed alongside bs4.
    soup = BeautifulSoup(resp.text, 'lxml')
    body_el = find_body_content(soup, body_selectors)
    if not body_el:
        result['error'] = "Could not find body content element"
        return result
    body_text = body_el.get_text(' ', strip=True)
    all_links = body_el.find_all('a', href=True)
    # clean URL (no trailing slash / query / fragment) -> list of locations,
    # used below for duplicate detection.
    url_locations = defaultdict(list)
    raw_links = []
    for tag in all_links:
        href = normalize_url(tag['href'], page_url)
        if not href:
            # Fragment/mailto/tel/javascript links are not auditable.
            continue
        anchor = tag.get_text(strip=True) or "[no text]"
        follow = get_follow_status(tag)
        location = get_link_location(tag, body_el)
        internal = is_internal(href, domain)
        link_type = 'internal' if internal else 'external'
        link_data = {
            'url': href, 'anchor': anchor[:100], 'follow': follow,
            'location': location, 'type': link_type,
            'status_code': None, 'link_status': None,
            'redirect_url': '', 'flags': [],
        }
        raw_links.append(link_data)
        clean_url = href.rstrip('/').split('?')[0].split('#')[0]
        url_locations[clean_url].append(location)
    # Check status in parallel
    # De-duplicate first so each URL is only probed once.
    unique_urls = list(set(l['url'] for l in raw_links))
    status_map = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=concurrent_workers) as executor:
        futures = {executor.submit(check_url_status, u, timeout): u for u in unique_urls}
        for future in concurrent.futures.as_completed(futures):
            # check_url_status never raises; failures come back as labels.
            url, status, link_status, redirect_url = future.result()
            status_map[url] = (status, link_status, redirect_url)
    for link in raw_links:
        if link['url'] in status_map:
            status, link_status, redirect_url = status_map[link['url']]
            link['status_code'] = status
            link['link_status'] = link_status
            link['redirect_url'] = redirect_url
        # SEO policy flags: internal links should pass equity (Dofollow),
        # external links should not (Nofollow).
        if link['type'] == 'internal' and link['follow'] == 'Nofollow':
            link['flags'].append('Internal link is Nofollow — should be Dofollow')
        if link['type'] == 'external' and link['follow'] == 'Dofollow':
            link['flags'].append('External link is Dofollow — should be Nofollow')
    # Detect duplicates
    duplicates = []
    for clean_url, locations in url_locations.items():
        if len(locations) > 1:
            duplicates.append({'url': clean_url, 'count': len(locations), 'locations': locations})
            # Tag every occurrence of the duplicated URL with a flag.
            for link in raw_links:
                link_clean = link['url'].rstrip('/').split('?')[0].split('#')[0]
                if link_clean == clean_url:
                    link['flags'].append(f'Duplicate: appears {len(locations)}x in body')
    # Partition links and tally follow/broken/redirect counters.
    for link in raw_links:
        if link['type'] == 'internal':
            result['internal_links'].append(link)
            if link['follow'] == 'Dofollow': result['int_df'] += 1
            else: result['int_nf'] += 1
            if link['link_status'] == 'Broken': result['broken_internal'].append(link)
            if link['link_status'] == 'Redirect': result['redirect_internal'].append(link)
        else:
            result['external_links'].append(link)
            if link['follow'] == 'Dofollow': result['ext_df'] += 1
            else: result['ext_nf'] += 1
            if link['link_status'] == 'Broken': result['broken_external'].append(link)
            if link['link_status'] == 'Redirect': result['redirect_external'].append(link)
        # Any flag (follow policy or duplicate) lands the link in follow_flags.
        if link['flags']:
            result['follow_flags'].append(link)
    result['int_count'] = len(result['internal_links'])
    result['ext_count'] = len(result['external_links'])
    result['broken_int_count'] = len(result['broken_internal'])
    result['broken_ext_count'] = len(result['broken_external'])
    result['redirect_int_count'] = len(result['redirect_internal'])
    result['redirect_ext_count'] = len(result['redirect_external'])
    result['follow_flag_count'] = len(result['follow_flags'])
    result['duplicates'] = duplicates
    result['duplicate_count'] = len(duplicates)
    existing_int_urls = [l['url'] for l in result['internal_links']]
    result['suggestions'] = generate_suggestions(body_text, existing_int_urls, page_url, suggestion_map)
    return result