# bulk-link-auditor / audit_engine.py
"""
Core Link Audit Engine
Crawls pages, extracts body-content links, checks status, detects issues.
"""
import requests
from bs4 import BeautifulSoup, Comment
from urllib.parse import urljoin, urlparse
from collections import defaultdict
import concurrent.futures
# Browser-like request headers: some sites block or alter responses for
# the default python-requests User-Agent.
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
}
# CSS selectors tried in order to locate the article body (see
# find_body_content); first match wins, ending with the broad "main"
# fallback. The leading entries look Webflow-specific (w-richtext).
DEFAULT_BODY_SELECTORS = [
"div.blog-rich-text",
"div.w-richtext",
"article .rich-text",
"article",
"div.blog-content",
"div.post-content",
"main",
]
# keyword -> (internal target path, suggested anchor text); consumed by
# generate_suggestions to propose internal links for topics mentioned in
# the body text but not yet linked. Paths are site-relative.
DEFAULT_SUGGESTION_MAP = {
"artificial intelligence": ("/category/artificial-intelligence-training", "artificial intelligence training programs"),
"machine learning": ("/category/artificial-intelligence-training", "machine learning training"),
"leadership": ("/type/leadership-training", "leadership training programs"),
"soft skills": ("/type/behavioral-training", "behavioral training programs"),
"remote employee": ("/blog/how-to-train-remote-employees", "remote employee training"),
"training management": ("/training-management-software", "training management software"),
"instructor-led": ("/instructor-led-training-services", "instructor-led training"),
"corporate training": ("/corporate-training-courses", "corporate training programs"),
"skill matrix": ("/skill-matrix", "skills matrix"),
"stellar ai": ("/stellar-ai", "AI-powered training"),
"book a demo": ("/book-a-demo", "book a demo"),
"compliance": ("/type/compliance-training", "compliance training"),
"cybersecurity": ("/category/cybersecurity-training", "cybersecurity training"),
"data analytics": ("/category/data-analytics-training", "data analytics training"),
"project management": ("/category/project-management-training", "project management training"),
"coaching": ("/coaching-solutions", "coaching solutions"),
"hr training": ("/category/human-resource-training", "HR training programs"),
"employee engagement": ("/blog/how-to-train-remote-employees", "employee training best practices"),
"onboarding": ("/category/human-resource-training", "onboarding training"),
"digital transformation": ("/type/it-technical-training", "IT & technical training"),
}
def is_internal(href, domain):
    """Return True if *href* belongs to *domain* (relative URLs count as internal).

    Bug fix: the original used a substring test (`domain in netloc`), so
    "example.com" wrongly matched hosts like "notexample.com.evil.com".
    We now extract the bare host (dropping userinfo and port) and require
    an exact match or a proper subdomain (host ends with "." + domain).
    """
    if not href:
        return False
    parsed = urlparse(href)
    if not parsed.netloc:
        # No authority component => relative link => same site.
        return True
    # netloc may be "user:pass@host:port"; keep only the host.
    host = parsed.netloc.rsplit('@', 1)[-1].split(':')[0].lower()
    d = domain.lower()
    return host == d or host.endswith('.' + d)
def normalize_url(href, base_url):
    """Resolve *href* against *base_url*; return None for non-navigable links.

    Fragments, mailto:, tel:, and javascript: pseudo-links are filtered out
    because they cannot be audited as HTTP targets.
    """
    if not href:
        return None
    cleaned = href.strip()
    non_http_prefixes = ('#', 'mailto:', 'tel:', 'javascript:')
    if cleaned.startswith(non_http_prefixes):
        return None
    return urljoin(base_url, cleaned)
def get_follow_status(tag):
    """Classify an anchor as 'Nofollow' or 'Dofollow' from its rel attribute.

    bs4 may hand back rel as either a list of tokens or a raw string, so
    both shapes are handled; matching is case-insensitive.
    """
    rel_attr = tag.get('rel', [])
    rel_tokens = rel_attr.split() if isinstance(rel_attr, str) else rel_attr
    has_nofollow = any(token.lower() == 'nofollow' for token in rel_tokens)
    return 'Nofollow' if has_nofollow else 'Dofollow'
def find_body_content(soup, selectors):
    """Locate the main content element of a parsed page.

    Tries each CSS selector in order and returns the first truthy match;
    falls back to the page's <body> tag when none of them hit.
    """
    candidates = (soup.select_one(selector) for selector in selectors)
    hit = next((element for element in candidates if element), None)
    return hit if hit is not None else soup.find('body')
def get_link_location(link_tag, body_el):
    """Describe where *link_tag* sits inside *body_el*.

    Returns "Intro", "Conclusion", "Mid-article (~NN%)", or "Unknown" for an
    empty body, optionally suffixed with the nearest preceding heading, e.g.
    'Mid-article (~42%) · near "Some Heading"'.

    Bug fix: the original broke out of the descendant walk with
    `el == link_tag`, but bs4's Tag.__eq__ compares markup, so two
    identical-looking links are "equal" and the walk stopped at the FIRST
    one — mislocating later duplicates. Identity (`is`) pins the exact node.
    """
    body_text = body_el.get_text()
    total_len = len(body_text)
    if total_len == 0:
        return "Unknown"
    # Accumulate the visible text that precedes the link to estimate its
    # relative position (list + join instead of quadratic string +=).
    parts = []
    for el in body_el.descendants:
        if el is link_tag:
            break
        # Comments are str subclasses in bs4 but are not visible text.
        if isinstance(el, str) and not isinstance(el, Comment):
            parts.append(el)
    pos = sum(len(p) for p in parts)
    ratio = pos / total_len
    # Find the closest heading above the link: walk up the ancestors,
    # scanning each level's preceding siblings for h1-h4.
    heading = ""
    for parent in link_tag.parents:
        for sib in parent.previous_siblings:
            if hasattr(sib, 'name') and sib.name in ['h1', 'h2', 'h3', 'h4']:
                heading = sib.get_text(strip=True)[:60]
                break
        if heading:
            break
    if ratio < 0.1:
        section = "Intro"
    elif ratio > 0.85:
        section = "Conclusion"
    else:
        section = f"Mid-article (~{int(ratio*100)}%)"
    if heading:
        return f'{section} · near "{heading}"'
    return section
def check_url_status(url, timeout=15):
    """Probe *url* and classify it as Active / Redirect / Broken.

    Issues a cheap HEAD request first; when the server rejects HEAD with
    405, retries with a streamed GET (so the body is never downloaded).
    Redirects are not followed so the first hop's status and Location
    target can be reported.

    Bug fix: the original resolved a relative Location header with urljoin
    only on the HEAD path — the 405 GET fallback reported relative redirect
    targets unresolved. Normalization now runs after whichever request won.

    Returns:
        tuple: (url, status_code_or_error_label, link_status, redirect_url)
        where status may be "Timeout"/"ConnError"/"Error" on failure.
    """
    try:
        r = requests.head(url, headers=HEADERS, timeout=timeout, allow_redirects=False)
        status = r.status_code
        if status == 405:
            # Server disallows HEAD; fall back to GET without reading the body.
            r = requests.get(url, headers=HEADERS, timeout=timeout,
                             allow_redirects=False, stream=True)
            status = r.status_code
        redirect_url = ""
        if status in (301, 302, 303, 307, 308):
            redirect_url = r.headers.get('Location', '')
            # Location may be relative; resolve it against the requested URL.
            if redirect_url and not redirect_url.startswith('http'):
                redirect_url = urljoin(url, redirect_url)
        r.close()
        if status in (301, 302, 303, 307, 308):
            link_status = "Redirect"
        elif 200 <= status < 300:
            link_status = "Active"
        else:
            link_status = "Broken"
        return url, status, link_status, redirect_url
    except requests.exceptions.Timeout:
        return url, "Timeout", "Broken", ""
    except requests.exceptions.ConnectionError:
        return url, "ConnError", "Broken", ""
    except Exception:
        # Catch-all keeps one bad URL from killing a thread-pool worker.
        return url, "Error", "Broken", ""
def generate_suggestions(body_text, existing_internal_urls, page_url, suggestion_map=None):
    """Propose internal links for keywords mentioned in the body but not yet linked.

    Skips targets that are already linked on the page or that point back to
    the page itself. Results are ranked High-priority first (keyword appears
    3+ times), then by mention count, and capped at 10.

    Returns a list of dicts with keys: section, target, anchor, priority,
    keyword, count.
    """
    active_map = DEFAULT_SUGGESTION_MAP if suggestion_map is None else suggestion_map
    lowered = body_text.lower()
    linked_paths = {urlparse(u).path.rstrip('/') for u in existing_internal_urls}
    own_path = urlparse(page_url).path.rstrip('/')
    proposals = []
    for keyword, (target_path, anchor_text) in active_map.items():
        trimmed = target_path.rstrip('/')
        # Already linked, or would be a self-link — nothing to suggest.
        if trimmed in linked_paths or trimmed == own_path:
            continue
        needle = keyword.lower()
        occurrences = lowered.count(needle)
        if occurrences == 0:
            continue
        # Locate the first mention to label which part of the article it's in.
        first_pos = lowered.find(needle)
        ratio = first_pos / len(lowered) if lowered else 0
        if ratio < 0.15:
            section = "Intro"
        elif ratio > 0.85:
            section = "Conclusion"
        else:
            section = f"Mid-article (~{int(ratio*100)}%)"
        proposals.append({
            'section': section,
            'target': target_path,
            'anchor': anchor_text,
            'priority': "High" if occurrences >= 3 else "Med",
            'keyword': keyword,
            'count': occurrences,
        })
    # High before Med (False sorts before True), then most mentions first.
    proposals.sort(key=lambda item: (item['priority'] != 'High', -item['count']))
    return proposals[:10]
def audit_page(page_url, domain, body_selectors=None, suggestion_map=None,
               timeout=15, concurrent_workers=5):
    """Run a full link audit on one page.

    Fetches the page, extracts <a href> links from the article body, checks
    each unique URL's HTTP status in a thread pool, flags follow-attribute
    and duplicate issues, and generates internal-link suggestions.

    Args:
        page_url: Fully-qualified URL of the page to audit.
        domain: Site domain used to split links into internal vs external.
        body_selectors: CSS selectors for locating the article body
            (defaults to DEFAULT_BODY_SELECTORS).
        suggestion_map: keyword -> (path, anchor) map forwarded to
            generate_suggestions (None selects the module default).
        timeout: Per-request timeout in seconds (page fetch and each check).
        concurrent_workers: Thread-pool size for parallel status checks.

    Returns:
        dict: Aggregated audit result. On fetch/parse failure, 'error' is
        set and the remaining fields keep their empty defaults.
    """
    if body_selectors is None:
        body_selectors = DEFAULT_BODY_SELECTORS
    # Pre-populated skeleton so callers always receive a uniform shape,
    # even when the audit aborts early with an error.
    result = {
        'url': page_url, 'error': None,
        'internal_links': [], 'external_links': [],
        'broken_internal': [], 'broken_external': [],
        'redirect_internal': [], 'redirect_external': [],
        'follow_flags': [], 'duplicates': [], 'suggestions': [],
        'int_count': 0, 'ext_count': 0,
        'int_df': 0, 'int_nf': 0, 'ext_df': 0, 'ext_nf': 0,
        'broken_int_count': 0, 'broken_ext_count': 0,
        'redirect_int_count': 0, 'redirect_ext_count': 0,
        'follow_flag_count': 0, 'duplicate_count': 0,
    }
    try:
        resp = requests.get(page_url, headers=HEADERS, timeout=timeout)
        resp.raise_for_status()
    except Exception as e:
        # Any fetch failure (network, HTTP error) is reported, not raised.
        result['error'] = str(e)
        return result
    # NOTE: 'lxml' parser — assumes lxml is installed alongside bs4.
    soup = BeautifulSoup(resp.text, 'lxml')
    body_el = find_body_content(soup, body_selectors)
    if not body_el:
        result['error'] = "Could not find body content element"
        return result
    body_text = body_el.get_text(' ', strip=True)
    all_links = body_el.find_all('a', href=True)
    # clean URL (no trailing slash / query / fragment) -> list of locations,
    # used below for duplicate detection.
    url_locations = defaultdict(list)
    raw_links = []
    for tag in all_links:
        href = normalize_url(tag['href'], page_url)
        if not href:
            # Fragment/mailto/tel/javascript links are not auditable.
            continue
        anchor = tag.get_text(strip=True) or "[no text]"
        follow = get_follow_status(tag)
        location = get_link_location(tag, body_el)
        internal = is_internal(href, domain)
        link_type = 'internal' if internal else 'external'
        link_data = {
            'url': href, 'anchor': anchor[:100], 'follow': follow,
            'location': location, 'type': link_type,
            'status_code': None, 'link_status': None,
            'redirect_url': '', 'flags': [],
        }
        raw_links.append(link_data)
        clean_url = href.rstrip('/').split('?')[0].split('#')[0]
        url_locations[clean_url].append(location)
    # Check status in parallel
    # De-duplicate first so each URL is only probed once.
    unique_urls = list(set(l['url'] for l in raw_links))
    status_map = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=concurrent_workers) as executor:
        futures = {executor.submit(check_url_status, u, timeout): u for u in unique_urls}
        for future in concurrent.futures.as_completed(futures):
            # check_url_status never raises; failures come back as labels.
            url, status, link_status, redirect_url = future.result()
            status_map[url] = (status, link_status, redirect_url)
    for link in raw_links:
        if link['url'] in status_map:
            status, link_status, redirect_url = status_map[link['url']]
            link['status_code'] = status
            link['link_status'] = link_status
            link['redirect_url'] = redirect_url
        # SEO policy flags: internal links should pass equity (Dofollow),
        # external links should not (Nofollow).
        if link['type'] == 'internal' and link['follow'] == 'Nofollow':
            link['flags'].append('Internal link is Nofollow — should be Dofollow')
        if link['type'] == 'external' and link['follow'] == 'Dofollow':
            link['flags'].append('External link is Dofollow — should be Nofollow')
    # Detect duplicates
    duplicates = []
    for clean_url, locations in url_locations.items():
        if len(locations) > 1:
            duplicates.append({'url': clean_url, 'count': len(locations), 'locations': locations})
            # Tag every occurrence of the duplicated URL with a flag.
            for link in raw_links:
                link_clean = link['url'].rstrip('/').split('?')[0].split('#')[0]
                if link_clean == clean_url:
                    link['flags'].append(f'Duplicate: appears {len(locations)}x in body')
    # Partition links and tally follow/broken/redirect counters.
    for link in raw_links:
        if link['type'] == 'internal':
            result['internal_links'].append(link)
            if link['follow'] == 'Dofollow': result['int_df'] += 1
            else: result['int_nf'] += 1
            if link['link_status'] == 'Broken': result['broken_internal'].append(link)
            if link['link_status'] == 'Redirect': result['redirect_internal'].append(link)
        else:
            result['external_links'].append(link)
            if link['follow'] == 'Dofollow': result['ext_df'] += 1
            else: result['ext_nf'] += 1
            if link['link_status'] == 'Broken': result['broken_external'].append(link)
            if link['link_status'] == 'Redirect': result['redirect_external'].append(link)
        # Any flag (follow policy or duplicate) lands the link in follow_flags.
        if link['flags']:
            result['follow_flags'].append(link)
    result['int_count'] = len(result['internal_links'])
    result['ext_count'] = len(result['external_links'])
    result['broken_int_count'] = len(result['broken_internal'])
    result['broken_ext_count'] = len(result['broken_external'])
    result['redirect_int_count'] = len(result['redirect_internal'])
    result['redirect_ext_count'] = len(result['redirect_external'])
    result['follow_flag_count'] = len(result['follow_flags'])
    result['duplicates'] = duplicates
    result['duplicate_count'] = len(duplicates)
    existing_int_urls = [l['url'] for l in result['internal_links']]
    result['suggestions'] = generate_suggestions(body_text, existing_int_urls, page_url, suggestion_map)
    return result