Spaces:

edstellar
/

bulk-link-auditor

Sleeping

File size: 11,957 Bytes

7f10996

"""
Core Link Audit Engine
Crawls pages, extracts body-content links, checks status, detects issues.
"""

import requests
from bs4 import BeautifulSoup, Comment
from urllib.parse import urljoin, urlparse
from collections import defaultdict
import concurrent.futures

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
}

DEFAULT_BODY_SELECTORS = [
    "div.blog-rich-text",
    "div.w-richtext",
    "article .rich-text",
    "article",
    "div.blog-content",
    "div.post-content",
    "main",
]

DEFAULT_SUGGESTION_MAP = {
    "artificial intelligence": ("/category/artificial-intelligence-training", "artificial intelligence training programs"),
    "machine learning": ("/category/artificial-intelligence-training", "machine learning training"),
    "leadership": ("/type/leadership-training", "leadership training programs"),
    "soft skills": ("/type/behavioral-training", "behavioral training programs"),
    "remote employee": ("/blog/how-to-train-remote-employees", "remote employee training"),
    "training management": ("/training-management-software", "training management software"),
    "instructor-led": ("/instructor-led-training-services", "instructor-led training"),
    "corporate training": ("/corporate-training-courses", "corporate training programs"),
    "skill matrix": ("/skill-matrix", "skills matrix"),
    "stellar ai": ("/stellar-ai", "AI-powered training"),
    "book a demo": ("/book-a-demo", "book a demo"),
    "compliance": ("/type/compliance-training", "compliance training"),
    "cybersecurity": ("/category/cybersecurity-training", "cybersecurity training"),
    "data analytics": ("/category/data-analytics-training", "data analytics training"),
    "project management": ("/category/project-management-training", "project management training"),
    "coaching": ("/coaching-solutions", "coaching solutions"),
    "hr training": ("/category/human-resource-training", "HR training programs"),
    "employee engagement": ("/blog/how-to-train-remote-employees", "employee training best practices"),
    "onboarding": ("/category/human-resource-training", "onboarding training"),
    "digital transformation": ("/type/it-technical-training", "IT & technical training"),
}


def is_internal(href, domain):
    if not href:
        return False
    parsed = urlparse(href)
    if not parsed.netloc:
        return True
    return domain.lower() in parsed.netloc.lower()


def normalize_url(href, base_url):
    if not href:
        return None
    href = href.strip()
    if href.startswith(('#', 'mailto:', 'tel:', 'javascript:')):
        return None
    return urljoin(base_url, href)


def get_follow_status(tag):
    rel = tag.get('rel', [])
    if isinstance(rel, str):
        rel = rel.split()
    return 'Nofollow' if 'nofollow' in [r.lower() for r in rel] else 'Dofollow'


def find_body_content(soup, selectors):
    for sel in selectors:
        el = soup.select_one(sel)
        if el:
            return el
    return soup.find('body')


def get_link_location(link_tag, body_el):
    body_text = body_el.get_text()
    total_len = len(body_text)
    if total_len == 0:
        return "Unknown"

    preceding_text = ""
    for el in body_el.descendants:
        if el == link_tag:
            break
        if isinstance(el, str) and not isinstance(el, Comment):
            preceding_text += el

    pos = len(preceding_text)
    ratio = pos / total_len if total_len > 0 else 0

    heading = ""
    for parent in link_tag.parents:
        for sib in parent.previous_siblings:
            if hasattr(sib, 'name') and sib.name in ['h1', 'h2', 'h3', 'h4']:
                heading = sib.get_text(strip=True)[:60]
                break
        if heading:
            break

    if ratio < 0.1:
        section = "Intro"
    elif ratio > 0.85:
        section = "Conclusion"
    else:
        section = f"Mid-article (~{int(ratio*100)}%)"

    if heading:
        return f'{section} · near "{heading}"'
    return section


def check_url_status(url, timeout=15):
    try:
        r = requests.head(url, headers=HEADERS, timeout=timeout, allow_redirects=False)
        status = r.status_code
        redirect_url = ""

        if status in (301, 302, 303, 307, 308):
            redirect_url = r.headers.get('Location', '')
            if redirect_url and not redirect_url.startswith('http'):
                redirect_url = urljoin(url, redirect_url)

        if status == 405:
            r = requests.get(url, headers=HEADERS, timeout=timeout, allow_redirects=False, stream=True)
            status = r.status_code
            if status in (301, 302, 303, 307, 308):
                redirect_url = r.headers.get('Location', '')
            r.close()

        if status in (301, 302, 303, 307, 308):
            link_status = "Redirect"
        elif 200 <= status < 300:
            link_status = "Active"
        else:
            link_status = "Broken"

        return url, status, link_status, redirect_url

    except requests.exceptions.Timeout:
        return url, "Timeout", "Broken", ""
    except requests.exceptions.ConnectionError:
        return url, "ConnError", "Broken", ""
    except Exception:
        return url, "Error", "Broken", ""


def generate_suggestions(body_text, existing_internal_urls, page_url, suggestion_map=None):
    if suggestion_map is None:
        suggestion_map = DEFAULT_SUGGESTION_MAP

    suggestions = []
    text_lower = body_text.lower()
    existing_paths = set(urlparse(u).path.rstrip('/') for u in existing_internal_urls)

    for keyword, (path, anchor) in suggestion_map.items():
        clean_path = path.rstrip('/')
        if clean_path in existing_paths:
            continue
        if clean_path == urlparse(page_url).path.rstrip('/'):
            continue
        count = text_lower.count(keyword.lower())
        if count > 0:
            pos = text_lower.find(keyword.lower())
            ratio = pos / len(text_lower) if len(text_lower) > 0 else 0
            if ratio < 0.15:
                loc = "Intro"
            elif ratio > 0.85:
                loc = "Conclusion"
            else:
                loc = f"Mid-article (~{int(ratio*100)}%)"

            priority = "High" if count >= 3 else "Med"
            suggestions.append({
                'section': loc,
                'target': path,
                'anchor': anchor,
                'priority': priority,
                'keyword': keyword,
                'count': count
            })

    suggestions.sort(key=lambda x: (0 if x['priority'] == 'High' else 1, -x['count']))
    return suggestions[:10]


def audit_page(page_url, domain, body_selectors=None, suggestion_map=None,
               timeout=15, concurrent_workers=5):
    if body_selectors is None:
        body_selectors = DEFAULT_BODY_SELECTORS

    result = {
        'url': page_url, 'error': None,
        'internal_links': [], 'external_links': [],
        'broken_internal': [], 'broken_external': [],
        'redirect_internal': [], 'redirect_external': [],
        'follow_flags': [], 'duplicates': [], 'suggestions': [],
        'int_count': 0, 'ext_count': 0,
        'int_df': 0, 'int_nf': 0, 'ext_df': 0, 'ext_nf': 0,
        'broken_int_count': 0, 'broken_ext_count': 0,
        'redirect_int_count': 0, 'redirect_ext_count': 0,
        'follow_flag_count': 0, 'duplicate_count': 0,
    }

    try:
        resp = requests.get(page_url, headers=HEADERS, timeout=timeout)
        resp.raise_for_status()
    except Exception as e:
        result['error'] = str(e)
        return result

    soup = BeautifulSoup(resp.text, 'lxml')
    body_el = find_body_content(soup, body_selectors)
    if not body_el:
        result['error'] = "Could not find body content element"
        return result

    body_text = body_el.get_text(' ', strip=True)
    all_links = body_el.find_all('a', href=True)
    url_locations = defaultdict(list)

    raw_links = []
    for tag in all_links:
        href = normalize_url(tag['href'], page_url)
        if not href:
            continue
        anchor = tag.get_text(strip=True) or "[no text]"
        follow = get_follow_status(tag)
        location = get_link_location(tag, body_el)
        internal = is_internal(href, domain)
        link_type = 'internal' if internal else 'external'

        link_data = {
            'url': href, 'anchor': anchor[:100], 'follow': follow,
            'location': location, 'type': link_type,
            'status_code': None, 'link_status': None,
            'redirect_url': '', 'flags': [],
        }
        raw_links.append(link_data)
        clean_url = href.rstrip('/').split('?')[0].split('#')[0]
        url_locations[clean_url].append(location)

    # Check status in parallel
    unique_urls = list(set(l['url'] for l in raw_links))
    status_map = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=concurrent_workers) as executor:
        futures = {executor.submit(check_url_status, u, timeout): u for u in unique_urls}
        for future in concurrent.futures.as_completed(futures):
            url, status, link_status, redirect_url = future.result()
            status_map[url] = (status, link_status, redirect_url)

    for link in raw_links:
        if link['url'] in status_map:
            status, link_status, redirect_url = status_map[link['url']]
            link['status_code'] = status
            link['link_status'] = link_status
            link['redirect_url'] = redirect_url

        if link['type'] == 'internal' and link['follow'] == 'Nofollow':
            link['flags'].append('Internal link is Nofollow — should be Dofollow')
        if link['type'] == 'external' and link['follow'] == 'Dofollow':
            link['flags'].append('External link is Dofollow — should be Nofollow')

    # Detect duplicates
    duplicates = []
    for clean_url, locations in url_locations.items():
        if len(locations) > 1:
            duplicates.append({'url': clean_url, 'count': len(locations), 'locations': locations})
            for link in raw_links:
                link_clean = link['url'].rstrip('/').split('?')[0].split('#')[0]
                if link_clean == clean_url:
                    link['flags'].append(f'Duplicate: appears {len(locations)}x in body')

    for link in raw_links:
        if link['type'] == 'internal':
            result['internal_links'].append(link)
            if link['follow'] == 'Dofollow': result['int_df'] += 1
            else: result['int_nf'] += 1
            if link['link_status'] == 'Broken': result['broken_internal'].append(link)
            if link['link_status'] == 'Redirect': result['redirect_internal'].append(link)
        else:
            result['external_links'].append(link)
            if link['follow'] == 'Dofollow': result['ext_df'] += 1
            else: result['ext_nf'] += 1
            if link['link_status'] == 'Broken': result['broken_external'].append(link)
            if link['link_status'] == 'Redirect': result['redirect_external'].append(link)

        if link['flags']:
            result['follow_flags'].append(link)

    result['int_count'] = len(result['internal_links'])
    result['ext_count'] = len(result['external_links'])
    result['broken_int_count'] = len(result['broken_internal'])
    result['broken_ext_count'] = len(result['broken_external'])
    result['redirect_int_count'] = len(result['redirect_internal'])
    result['redirect_ext_count'] = len(result['redirect_external'])
    result['follow_flag_count'] = len(result['follow_flags'])
    result['duplicates'] = duplicates
    result['duplicate_count'] = len(duplicates)

    existing_int_urls = [l['url'] for l in result['internal_links']]
    result['suggestions'] = generate_suggestions(body_text, existing_int_urls, page_url, suggestion_map)

    return result