"""
Core Link Audit Engine
Crawls pages, extracts body-content links, checks status, detects issues.
"""
import requests
from bs4 import BeautifulSoup, Comment
from urllib.parse import urljoin, urlparse
from collections import defaultdict
import concurrent.futures
# Browser-like request headers so crawled sites are less likely to
# reject the audit requests as coming from a bot.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
}
# CSS selectors tried in order to locate the main article content of a
# page; the first selector that matches wins (used by find_body_content).
# Ordered from most specific (site rich-text wrappers) to most generic.
DEFAULT_BODY_SELECTORS = [
    "div.blog-rich-text",
    "div.w-richtext",
    "article .rich-text",
    "article",
    "div.blog-content",
    "div.post-content",
    "main",
]
# Keyword -> (internal target path, suggested anchor text).
# generate_suggestions() scans page body text for these lowercase keywords
# and proposes an internal link to the mapped path when the page does not
# already link there.  Paths are site-relative.
DEFAULT_SUGGESTION_MAP = {
    "artificial intelligence": ("/category/artificial-intelligence-training", "artificial intelligence training programs"),
    "machine learning": ("/category/artificial-intelligence-training", "machine learning training"),
    "leadership": ("/type/leadership-training", "leadership training programs"),
    "soft skills": ("/type/behavioral-training", "behavioral training programs"),
    "remote employee": ("/blog/how-to-train-remote-employees", "remote employee training"),
    "training management": ("/training-management-software", "training management software"),
    "instructor-led": ("/instructor-led-training-services", "instructor-led training"),
    "corporate training": ("/corporate-training-courses", "corporate training programs"),
    "skill matrix": ("/skill-matrix", "skills matrix"),
    "stellar ai": ("/stellar-ai", "AI-powered training"),
    "book a demo": ("/book-a-demo", "book a demo"),
    "compliance": ("/type/compliance-training", "compliance training"),
    "cybersecurity": ("/category/cybersecurity-training", "cybersecurity training"),
    "data analytics": ("/category/data-analytics-training", "data analytics training"),
    "project management": ("/category/project-management-training", "project management training"),
    "coaching": ("/coaching-solutions", "coaching solutions"),
    "hr training": ("/category/human-resource-training", "HR training programs"),
    "employee engagement": ("/blog/how-to-train-remote-employees", "employee training best practices"),
    "onboarding": ("/category/human-resource-training", "onboarding training"),
    "digital transformation": ("/type/it-technical-training", "IT & technical training"),
}
def is_internal(href, domain):
    """Return True if *href* points at *domain* or one of its subdomains.

    Relative URLs (no netloc) are treated as internal.  BUGFIX: the
    previous substring test (`domain in netloc`) wrongly classified
    hosts like "notexample.com" as internal to "example.com"; matching
    is now exact-host or dot-separated-subdomain.  Ports and userinfo
    are ignored via ``parsed.hostname``.
    """
    if not href:
        return False
    parsed = urlparse(href)
    if not parsed.netloc:
        # Relative link -> same site.
        return True
    host = parsed.hostname or ""  # already lowercased, port stripped
    wanted = domain.lower().lstrip(".")
    return host == wanted or host.endswith("." + wanted)
def normalize_url(href, base_url):
    """Resolve *href* against *base_url*; return None for non-navigable links.

    Filters out pure fragments and mailto:/tel:/javascript: pseudo-links.
    BUGFIX: the scheme check is now case-insensitive ("MAILTO:" was
    slipping through), and a whitespace-only href returns None instead
    of resolving to the page itself via urljoin.
    """
    if not href:
        return None
    href = href.strip()
    if not href:
        # Whitespace-only href would otherwise resolve to base_url.
        return None
    if href.lower().startswith(('#', 'mailto:', 'tel:', 'javascript:')):
        return None
    return urljoin(base_url, href)
def get_follow_status(tag):
    """Classify a link tag as 'Nofollow' or 'Dofollow' from its rel attribute.

    The rel attribute may arrive as a list of tokens or as a single
    space-separated string; both forms are handled, case-insensitively.
    """
    rel_value = tag.get('rel', [])
    tokens = rel_value.split() if isinstance(rel_value, str) else rel_value
    has_nofollow = any(token.lower() == 'nofollow' for token in tokens)
    return 'Nofollow' if has_nofollow else 'Dofollow'
def find_body_content(soup, selectors):
    """Return the first element matching any of *selectors*, else the <body> tag.

    Selectors are tried in order; lookup stops at the first truthy match.
    Falls back to the document's <body> element (which may be None).
    """
    candidates = (soup.select_one(selector) for selector in selectors)
    for candidate in candidates:
        if candidate:
            return candidate
    return soup.find('body')
def get_link_location(link_tag, body_el):
    """Describe where *link_tag* sits inside *body_el*.

    Returns a human-readable label: "Intro" (first 10% of body text),
    "Conclusion" (last 15%), or "Mid-article (~NN%)", optionally suffixed
    with the nearest preceding h1-h4 heading, e.g.
    'Mid-article (~42%) · near "Some Heading"'.  Returns "Unknown" when
    the body has no text at all.
    """
    body_text = body_el.get_text()
    total_len = len(body_text)
    if total_len == 0:
        return "Unknown"
    # Accumulate all text that appears before the link in document order.
    # NOTE(review): `el == link_tag` relies on bs4 equality, which compares
    # tags by *content* — two byte-identical links would stop the scan at
    # the first one; confirm this approximation is acceptable.
    preceding_text = ""
    for el in body_el.descendants:
        if el == link_tag:
            break
        if isinstance(el, str) and not isinstance(el, Comment):
            # NavigableString subclasses str; skip HTML comments.
            preceding_text += el
    pos = len(preceding_text)
    ratio = pos / total_len if total_len > 0 else 0
    # Find the closest heading above the link: walk up the ancestor chain,
    # scanning earlier siblings at each level, and stop at the first hit.
    heading = ""
    for parent in link_tag.parents:
        for sib in parent.previous_siblings:
            if hasattr(sib, 'name') and sib.name in ['h1', 'h2', 'h3', 'h4']:
                heading = sib.get_text(strip=True)[:60]
                break
        if heading:
            break
    # Bucket the relative text position into coarse sections.
    if ratio < 0.1:
        section = "Intro"
    elif ratio > 0.85:
        section = "Conclusion"
    else:
        section = f"Mid-article (~{int(ratio*100)}%)"
    if heading:
        return f'{section} · near "{heading}"'
    return section
def check_url_status(url, timeout=15):
    """Probe *url* and classify its status without following redirects.

    Returns ``(url, status, link_status, redirect_url)`` where *status*
    is the HTTP status code or an error token ("Timeout", "ConnError",
    "Error") and *link_status* is "Active" (2xx), "Redirect" (3xx
    redirect codes), or "Broken" (anything else).  Tries HEAD first and
    falls back to a streamed GET when the server rejects HEAD with 405.

    BUGFIX: a relative Location header is now resolved against *url* in
    the GET-fallback path too — previously only the HEAD path ran
    urljoin, so 405-servers could report relative redirect targets.
    """
    redirect_codes = (301, 302, 303, 307, 308)
    try:
        r = requests.head(url, headers=HEADERS, timeout=timeout, allow_redirects=False)
        status = r.status_code
        if status == 405:
            # Server doesn't allow HEAD; retry with a streamed GET so the
            # response body is never downloaded.
            r = requests.get(url, headers=HEADERS, timeout=timeout,
                             allow_redirects=False, stream=True)
            status = r.status_code
            r.close()  # headers are already available; release the connection
        redirect_url = ""
        if status in redirect_codes:
            link_status = "Redirect"
            redirect_url = r.headers.get('Location', '')
            if redirect_url and not redirect_url.startswith('http'):
                # Resolve relative Location headers to absolute URLs.
                redirect_url = urljoin(url, redirect_url)
        elif 200 <= status < 300:
            link_status = "Active"
        else:
            link_status = "Broken"
        return url, status, link_status, redirect_url
    except requests.exceptions.Timeout:
        return url, "Timeout", "Broken", ""
    except requests.exceptions.ConnectionError:
        return url, "ConnError", "Broken", ""
    except Exception:
        # Any other failure (SSL error, malformed URL, ...) counts as broken.
        return url, "Error", "Broken", ""
def generate_suggestions(body_text, existing_internal_urls, page_url, suggestion_map=None):
    """Suggest internal links worth adding to a page.

    Scans *body_text* (case-insensitively) for keywords from
    *suggestion_map* — mapping keyword -> (target path, anchor text) —
    skipping targets the page already links to (*existing_internal_urls*)
    and the page itself (*page_url*).  Each suggestion records which
    section the keyword first appears in, a priority ("High" when the
    keyword occurs 3+ times, else "Med"), and the occurrence count.
    Returns at most 10 suggestions, High priority first, then by count.
    Falls back to DEFAULT_SUGGESTION_MAP when *suggestion_map* is None.
    """
    if suggestion_map is None:
        suggestion_map = DEFAULT_SUGGESTION_MAP
    suggestions = []
    text_lower = body_text.lower()
    text_len = len(text_lower)
    # Loop invariants hoisted: already-linked paths and this page's path
    # were previously recomputed on every keyword iteration.
    existing_paths = {urlparse(u).path.rstrip('/') for u in existing_internal_urls}
    page_path = urlparse(page_url).path.rstrip('/')
    for keyword, (path, anchor) in suggestion_map.items():
        clean_path = path.rstrip('/')
        if clean_path in existing_paths or clean_path == page_path:
            continue  # already linked, or would self-link
        kw = keyword.lower()
        count = text_lower.count(kw)
        if count == 0:
            continue
        # Label the section where the keyword first appears.
        ratio = text_lower.find(kw) / text_len if text_len > 0 else 0
        if ratio < 0.15:
            loc = "Intro"
        elif ratio > 0.85:
            loc = "Conclusion"
        else:
            loc = f"Mid-article (~{int(ratio*100)}%)"
        suggestions.append({
            'section': loc,
            'target': path,
            'anchor': anchor,
            'priority': "High" if count >= 3 else "Med",
            'keyword': keyword,
            'count': count,
        })
    # High priority first, then the most-mentioned keywords.
    suggestions.sort(key=lambda s: (0 if s['priority'] == 'High' else 1, -s['count']))
    return suggestions[:10]
def audit_page(page_url, domain, body_selectors=None, suggestion_map=None,
               timeout=15, concurrent_workers=5):
    """Audit all body-content links on a single page.

    Fetches *page_url*, locates the main content element via
    *body_selectors* (DEFAULT_BODY_SELECTORS when None), extracts every
    <a href> inside it, checks each unique URL's HTTP status in a thread
    pool of *concurrent_workers*, flags follow/nofollow policy issues and
    duplicate links, and appends internal-link suggestions.

    Returns a dict of link lists, per-category counts, 'suggestions',
    and 'error' (None on success).  On fetch failure or missing body
    content, only 'error' is populated and all counts stay zero.
    """
    if body_selectors is None:
        body_selectors = DEFAULT_BODY_SELECTORS
    result = {
        'url': page_url, 'error': None,
        'internal_links': [], 'external_links': [],
        'broken_internal': [], 'broken_external': [],
        'redirect_internal': [], 'redirect_external': [],
        'follow_flags': [], 'duplicates': [], 'suggestions': [],
        'int_count': 0, 'ext_count': 0,
        'int_df': 0, 'int_nf': 0, 'ext_df': 0, 'ext_nf': 0,
        'broken_int_count': 0, 'broken_ext_count': 0,
        'redirect_int_count': 0, 'redirect_ext_count': 0,
        'follow_flag_count': 0, 'duplicate_count': 0,
    }
    try:
        resp = requests.get(page_url, headers=HEADERS, timeout=timeout)
        resp.raise_for_status()
    except Exception as e:
        # Network/HTTP failure: report the error, return empty results.
        result['error'] = str(e)
        return result
    soup = BeautifulSoup(resp.text, 'lxml')
    body_el = find_body_content(soup, body_selectors)
    if not body_el:
        result['error'] = "Could not find body content element"
        return result
    body_text = body_el.get_text(' ', strip=True)
    all_links = body_el.find_all('a', href=True)
    url_locations = defaultdict(list)  # normalized URL -> locations where it appears
    raw_links = []
    for tag in all_links:
        href = normalize_url(tag['href'], page_url)
        if not href:
            continue  # fragment/mailto/tel/javascript links are skipped
        anchor = tag.get_text(strip=True) or "[no text]"
        follow = get_follow_status(tag)
        # NOTE(review): get_link_location walks the whole body per link,
        # so this loop is O(links * body size).
        location = get_link_location(tag, body_el)
        internal = is_internal(href, domain)
        link_type = 'internal' if internal else 'external'
        link_data = {
            'url': href, 'anchor': anchor[:100], 'follow': follow,
            'location': location, 'type': link_type,
            'status_code': None, 'link_status': None,
            'redirect_url': '', 'flags': [],
        }
        raw_links.append(link_data)
        # Strip trailing slash, query string, and fragment for dedup keys.
        clean_url = href.rstrip('/').split('?')[0].split('#')[0]
        url_locations[clean_url].append(location)
    # Check status in parallel
    unique_urls = list(set(l['url'] for l in raw_links))
    status_map = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=concurrent_workers) as executor:
        futures = {executor.submit(check_url_status, u, timeout): u for u in unique_urls}
        for future in concurrent.futures.as_completed(futures):
            url, status, link_status, redirect_url = future.result()
            status_map[url] = (status, link_status, redirect_url)
    for link in raw_links:
        if link['url'] in status_map:
            status, link_status, redirect_url = status_map[link['url']]
            link['status_code'] = status
            link['link_status'] = link_status
            link['redirect_url'] = redirect_url
        # SEO policy flags: internal links should pass authority (Dofollow),
        # external links should not (Nofollow).
        if link['type'] == 'internal' and link['follow'] == 'Nofollow':
            link['flags'].append('Internal link is Nofollow — should be Dofollow')
        if link['type'] == 'external' and link['follow'] == 'Dofollow':
            link['flags'].append('External link is Dofollow — should be Nofollow')
    # Detect duplicates
    duplicates = []
    for clean_url, locations in url_locations.items():
        if len(locations) > 1:
            duplicates.append({'url': clean_url, 'count': len(locations), 'locations': locations})
            # Flag every raw link whose normalized form matches the duplicate.
            for link in raw_links:
                link_clean = link['url'].rstrip('/').split('?')[0].split('#')[0]
                if link_clean == clean_url:
                    link['flags'].append(f'Duplicate: appears {len(locations)}x in body')
    # Bucket links into internal/external and tally follow/broken/redirect.
    for link in raw_links:
        if link['type'] == 'internal':
            result['internal_links'].append(link)
            if link['follow'] == 'Dofollow': result['int_df'] += 1
            else: result['int_nf'] += 1
            if link['link_status'] == 'Broken': result['broken_internal'].append(link)
            if link['link_status'] == 'Redirect': result['redirect_internal'].append(link)
        else:
            result['external_links'].append(link)
            if link['follow'] == 'Dofollow': result['ext_df'] += 1
            else: result['ext_nf'] += 1
            if link['link_status'] == 'Broken': result['broken_external'].append(link)
            if link['link_status'] == 'Redirect': result['redirect_external'].append(link)
        if link['flags']:
            result['follow_flags'].append(link)
    result['int_count'] = len(result['internal_links'])
    result['ext_count'] = len(result['external_links'])
    result['broken_int_count'] = len(result['broken_internal'])
    result['broken_ext_count'] = len(result['broken_external'])
    result['redirect_int_count'] = len(result['redirect_internal'])
    result['redirect_ext_count'] = len(result['redirect_external'])
    result['follow_flag_count'] = len(result['follow_flags'])
    result['duplicates'] = duplicates
    result['duplicate_count'] = len(duplicates)
    existing_int_urls = [l['url'] for l in result['internal_links']]
    result['suggestions'] = generate_suggestions(body_text, existing_int_urls, page_url, suggestion_map)
    return result