""" Core Link Audit Engine Crawls pages, extracts body-content links, checks status, detects issues. """ import requests from bs4 import BeautifulSoup, Comment from urllib.parse import urljoin, urlparse from collections import defaultdict import concurrent.futures HEADERS = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', } DEFAULT_BODY_SELECTORS = [ "div.blog-rich-text", "div.w-richtext", "article .rich-text", "article", "div.blog-content", "div.post-content", "main", ] DEFAULT_SUGGESTION_MAP = { "artificial intelligence": ("/category/artificial-intelligence-training", "artificial intelligence training programs"), "machine learning": ("/category/artificial-intelligence-training", "machine learning training"), "leadership": ("/type/leadership-training", "leadership training programs"), "soft skills": ("/type/behavioral-training", "behavioral training programs"), "remote employee": ("/blog/how-to-train-remote-employees", "remote employee training"), "training management": ("/training-management-software", "training management software"), "instructor-led": ("/instructor-led-training-services", "instructor-led training"), "corporate training": ("/corporate-training-courses", "corporate training programs"), "skill matrix": ("/skill-matrix", "skills matrix"), "stellar ai": ("/stellar-ai", "AI-powered training"), "book a demo": ("/book-a-demo", "book a demo"), "compliance": ("/type/compliance-training", "compliance training"), "cybersecurity": ("/category/cybersecurity-training", "cybersecurity training"), "data analytics": ("/category/data-analytics-training", "data analytics training"), "project management": ("/category/project-management-training", "project management training"), "coaching": ("/coaching-solutions", "coaching solutions"), "hr training": ("/category/human-resource-training", "HR training programs"), "employee engagement": ("/blog/how-to-train-remote-employees", "employee training best practices"), "onboarding": ("/category/human-resource-training", "onboarding training"), "digital transformation": ("/type/it-technical-training", "IT & technical training"), } def is_internal(href, domain): if not href: return False parsed = urlparse(href) if not parsed.netloc: return True return domain.lower() in parsed.netloc.lower() def normalize_url(href, base_url): if not href: return None href = href.strip() if href.startswith(('#', 'mailto:', 'tel:', 'javascript:')): return None return urljoin(base_url, href) def get_follow_status(tag): rel = tag.get('rel', []) if isinstance(rel, str): rel = rel.split() return 'Nofollow' if 'nofollow' in [r.lower() for r in rel] else 'Dofollow' def find_body_content(soup, selectors): for sel in selectors: el = soup.select_one(sel) if el: return el return soup.find('body') def get_link_location(link_tag, body_el): body_text = body_el.get_text() total_len = len(body_text) if total_len == 0: return "Unknown" preceding_text = "" for el in body_el.descendants: if el == link_tag: break if isinstance(el, str) and not isinstance(el, Comment): preceding_text += el pos = len(preceding_text) ratio = pos / total_len if total_len > 0 else 0 heading = "" for parent in link_tag.parents: for sib in parent.previous_siblings: if hasattr(sib, 'name') and sib.name in ['h1', 'h2', 'h3', 'h4']: heading = sib.get_text(strip=True)[:60] break if heading: break if ratio < 0.1: section = "Intro" elif ratio > 0.85: section = "Conclusion" else: section = f"Mid-article (~{int(ratio*100)}%)" if heading: return f'{section} · near "{heading}"' return section def check_url_status(url, timeout=15): try: r = requests.head(url, headers=HEADERS, timeout=timeout, allow_redirects=False) status = r.status_code redirect_url = "" if status in (301, 302, 303, 307, 308): redirect_url = r.headers.get('Location', '') if redirect_url and not redirect_url.startswith('http'): redirect_url = urljoin(url, redirect_url) if status == 405: r = requests.get(url, headers=HEADERS, timeout=timeout, allow_redirects=False, stream=True) status = r.status_code if status in (301, 302, 303, 307, 308): redirect_url = r.headers.get('Location', '') r.close() if status in (301, 302, 303, 307, 308): link_status = "Redirect" elif 200 <= status < 300: link_status = "Active" else: link_status = "Broken" return url, status, link_status, redirect_url except requests.exceptions.Timeout: return url, "Timeout", "Broken", "" except requests.exceptions.ConnectionError: return url, "ConnError", "Broken", "" except Exception: return url, "Error", "Broken", "" def generate_suggestions(body_text, existing_internal_urls, page_url, suggestion_map=None): if suggestion_map is None: suggestion_map = DEFAULT_SUGGESTION_MAP suggestions = [] text_lower = body_text.lower() existing_paths = set(urlparse(u).path.rstrip('/') for u in existing_internal_urls) for keyword, (path, anchor) in suggestion_map.items(): clean_path = path.rstrip('/') if clean_path in existing_paths: continue if clean_path == urlparse(page_url).path.rstrip('/'): continue count = text_lower.count(keyword.lower()) if count > 0: pos = text_lower.find(keyword.lower()) ratio = pos / len(text_lower) if len(text_lower) > 0 else 0 if ratio < 0.15: loc = "Intro" elif ratio > 0.85: loc = "Conclusion" else: loc = f"Mid-article (~{int(ratio*100)}%)" priority = "High" if count >= 3 else "Med" suggestions.append({ 'section': loc, 'target': path, 'anchor': anchor, 'priority': priority, 'keyword': keyword, 'count': count }) suggestions.sort(key=lambda x: (0 if x['priority'] == 'High' else 1, -x['count'])) return suggestions[:10] def audit_page(page_url, domain, body_selectors=None, suggestion_map=None, timeout=15, concurrent_workers=5): if body_selectors is None: body_selectors = DEFAULT_BODY_SELECTORS result = { 'url': page_url, 'error': None, 'internal_links': [], 'external_links': [], 'broken_internal': [], 'broken_external': [], 'redirect_internal': [], 'redirect_external': [], 'follow_flags': [], 'duplicates': [], 'suggestions': [], 'int_count': 0, 'ext_count': 0, 'int_df': 0, 'int_nf': 0, 'ext_df': 0, 'ext_nf': 0, 'broken_int_count': 0, 'broken_ext_count': 0, 'redirect_int_count': 0, 'redirect_ext_count': 0, 'follow_flag_count': 0, 'duplicate_count': 0, } try: resp = requests.get(page_url, headers=HEADERS, timeout=timeout) resp.raise_for_status() except Exception as e: result['error'] = str(e) return result soup = BeautifulSoup(resp.text, 'lxml') body_el = find_body_content(soup, body_selectors) if not body_el: result['error'] = "Could not find body content element" return result body_text = body_el.get_text(' ', strip=True) all_links = body_el.find_all('a', href=True) url_locations = defaultdict(list) raw_links = [] for tag in all_links: href = normalize_url(tag['href'], page_url) if not href: continue anchor = tag.get_text(strip=True) or "[no text]" follow = get_follow_status(tag) location = get_link_location(tag, body_el) internal = is_internal(href, domain) link_type = 'internal' if internal else 'external' link_data = { 'url': href, 'anchor': anchor[:100], 'follow': follow, 'location': location, 'type': link_type, 'status_code': None, 'link_status': None, 'redirect_url': '', 'flags': [], } raw_links.append(link_data) clean_url = href.rstrip('/').split('?')[0].split('#')[0] url_locations[clean_url].append(location) # Check status in parallel unique_urls = list(set(l['url'] for l in raw_links)) status_map = {} with concurrent.futures.ThreadPoolExecutor(max_workers=concurrent_workers) as executor: futures = {executor.submit(check_url_status, u, timeout): u for u in unique_urls} for future in concurrent.futures.as_completed(futures): url, status, link_status, redirect_url = future.result() status_map[url] = (status, link_status, redirect_url) for link in raw_links: if link['url'] in status_map: status, link_status, redirect_url = status_map[link['url']] link['status_code'] = status link['link_status'] = link_status link['redirect_url'] = redirect_url if link['type'] == 'internal' and link['follow'] == 'Nofollow': link['flags'].append('Internal link is Nofollow — should be Dofollow') if link['type'] == 'external' and link['follow'] == 'Dofollow': link['flags'].append('External link is Dofollow — should be Nofollow') # Detect duplicates duplicates = [] for clean_url, locations in url_locations.items(): if len(locations) > 1: duplicates.append({'url': clean_url, 'count': len(locations), 'locations': locations}) for link in raw_links: link_clean = link['url'].rstrip('/').split('?')[0].split('#')[0] if link_clean == clean_url: link['flags'].append(f'Duplicate: appears {len(locations)}x in body') for link in raw_links: if link['type'] == 'internal': result['internal_links'].append(link) if link['follow'] == 'Dofollow': result['int_df'] += 1 else: result['int_nf'] += 1 if link['link_status'] == 'Broken': result['broken_internal'].append(link) if link['link_status'] == 'Redirect': result['redirect_internal'].append(link) else: result['external_links'].append(link) if link['follow'] == 'Dofollow': result['ext_df'] += 1 else: result['ext_nf'] += 1 if link['link_status'] == 'Broken': result['broken_external'].append(link) if link['link_status'] == 'Redirect': result['redirect_external'].append(link) if link['flags']: result['follow_flags'].append(link) result['int_count'] = len(result['internal_links']) result['ext_count'] = len(result['external_links']) result['broken_int_count'] = len(result['broken_internal']) result['broken_ext_count'] = len(result['broken_external']) result['redirect_int_count'] = len(result['redirect_internal']) result['redirect_ext_count'] = len(result['redirect_external']) result['follow_flag_count'] = len(result['follow_flags']) result['duplicates'] = duplicates result['duplicate_count'] = len(duplicates) existing_int_urls = [l['url'] for l in result['internal_links']] result['suggestions'] = generate_suggestions(body_text, existing_int_urls, page_url, suggestion_map) return result