File size: 2,691 Bytes
e5ab217
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
from transformers import pipeline
import requests
from bs4 import BeautifulSoup
import logging

logger = logging.getLogger(__name__)

class TechnicalAuditorAgent:
    """Audits a web page for basic technical SEO issues.

    Checks performed by :meth:`audit_page`: broken outbound links, images
    missing alt text, and an ML-based content-quality classification.
    """

    def __init__(self):
        # The classifier is optional: model download/load can fail (offline
        # host, missing weights). Degrade gracefully and let
        # assess_content_quality report the error instead of crashing.
        try:
            self.classifier = pipeline(
                "zero-shot-classification",
                model="facebook/bart-large-mnli",
                device=-1  # CPU inference
            )
        except Exception as e:
            logger.warning(f"Could not load ML models for Technical Auditor: {e}")
            self.classifier = None

    def assess_content_quality(self, page_text):
        """Classify page_text against SEO-quality labels.

        Returns the candidate labels ordered most- to least-likely, or a
        one-element error marker when the classifier failed to load.
        """
        if not self.classifier:
            return ["error-model-not-loaded"]

        result = self.classifier(
            page_text[:1024],  # Limit text length for speed
            candidate_labels=["high-quality", "thin-content", "spam", "keyword-stuffing"]
        )
        return result['labels']

    def audit_page(self, url):
        """Fetch url and return a dict of audit findings.

        Returns a dict with keys: "url"; "broken_links" (external hrefs that
        errored or returned status >= 400, deduplicated); "missing_alt"
        (src values of <img> tags lacking alt text); "content_quality"
        (labels from assess_content_quality, or "unknown" if the fetch
        failed before classification). Never raises: fetch/parse failures
        are logged and the partial results dict is returned.
        """
        logger.info(f"Auditing page: {url}")
        results = {
            "url": url,
            "broken_links": [],
            "missing_alt": [],
            "content_quality": "unknown"
        }

        try:
            # Note: In a real scenario, you might need to handle headers/user-agents
            response = requests.get(url, timeout=10)
            if response.status_code != 200:
                logger.error(f"Page returned status {response.status_code}")
                return results

            soup = BeautifulSoup(response.text, 'html.parser')

            # Check content quality
            text_content = soup.get_text(separator=' ', strip=True)
            results['content_quality'] = self.assess_content_quality(text_content)

            # Check images: flag any <img> whose alt attribute is absent or empty.
            for img in soup.find_all('img'):
                if not img.get('alt'):
                    results['missing_alt'].append(img.get('src'))

            # Check links (simplified). Deduplicate hrefs so a link that
            # appears many times on the page is probed only once and cannot
            # be reported as broken more than once.
            checked = set()
            for link in soup.find_all('a'):
                href = link.get('href')
                if not href or not href.startswith('http') or href in checked:
                    continue
                checked.add(href)
                try:
                    # requests.head defaults to allow_redirects=False; follow
                    # redirects so the final target is what gets verified.
                    head = requests.head(href, timeout=5, allow_redirects=True)
                    if head.status_code >= 400:
                        results['broken_links'].append(href)
                except requests.RequestException:
                    # Narrowed from a bare except, which would also swallow
                    # KeyboardInterrupt/SystemExit. Any transport-level
                    # failure still counts the link as broken.
                    results['broken_links'].append(href)

        except Exception as e:
            logger.error(f"Audit failed for {url}: {e}")

        logger.info(f"Audit complete for {url}: {results}")
        # In a real system, you would post these results back to a webhook or database
        return results