|
|
from transformers import pipeline |
|
|
import requests |
|
|
from bs4 import BeautifulSoup |
|
|
import logging |
|
|
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
class TechnicalAuditorAgent:
    """Audit a single web page for basic technical SEO problems.

    An audit fetches the page, classifies its visible text with a zero-shot
    classifier, lists ``<img>`` tags without ``alt`` text and probes every
    absolute HTTP(S) link with a ``HEAD`` request.

    Attributes:
        classifier: The zero-shot classification pipeline, or ``None`` when
            the model could not be loaded in ``__init__``.
    """

    # Candidate labels handed to the zero-shot classifier, in this order.
    _QUALITY_LABELS = ("high-quality", "thin-content", "spam", "keyword-stuffing")
    # Only the first N characters of the page text are sent to the classifier.
    _MAX_CLASSIFIER_CHARS = 1024
    # Timeouts (seconds) for fetching the audited page / probing one link.
    _PAGE_TIMEOUT = 10
    _LINK_TIMEOUT = 5

    def __init__(self):
        """Load the zero-shot classifier; fall back to ``None`` on any failure."""
        try:
            self.classifier = pipeline(
                "zero-shot-classification",
                model="facebook/bart-large-mnli",
                device=-1,
            )
        except Exception as e:
            # Deliberately broad: a missing model must not prevent the agent
            # from being constructed; the other audit checks still work.
            logger.warning("Could not load ML models for Technical Auditor: %s", e)
            self.classifier = None

    def assess_content_quality(self, page_text):
        """Classify page text against the quality labels.

        Args:
            page_text: Visible text of the page. Only the first 1024
                characters are classified.

        Returns:
            The ``'labels'`` list from the classifier result, or
            ``["error-model-not-loaded"]`` when no classifier is available, or
            ``["error-empty-content"]`` when there is no text to classify.
        """
        if self.classifier is None:
            return ["error-model-not-loaded"]

        snippet = (page_text or "")[: self._MAX_CLASSIFIER_CHARS]
        if not snippet.strip():
            # Nothing meaningful to classify. Returning a sentinel here keeps a
            # text-less page from aborting the rest of the audit.
            # NOTE(review): the zero-shot pipeline is expected to reject an
            # empty sequence - confirm against the installed transformers.
            return ["error-empty-content"]

        result = self.classifier(
            snippet,
            candidate_labels=list(self._QUALITY_LABELS),
        )
        return result["labels"]

    @staticmethod
    def _is_http_url(href):
        """Return True when *href* is an absolute ``http://`` or ``https://`` URL."""
        return bool(href) and href.startswith(("http://", "https://"))

    @classmethod
    def _is_link_broken(cls, href):
        """Probe *href* with a HEAD request; True on a >= 400 status or a failed request."""
        try:
            head = requests.head(href, timeout=cls._LINK_TIMEOUT)
        except (requests.RequestException, ValueError):
            # Unreachable or malformed URL: report it as broken, but let
            # KeyboardInterrupt / SystemExit propagate.
            return True
        return head.status_code >= 400

    @staticmethod
    def _collect_missing_alt(soup):
        """Return the ``src`` of every ``<img>`` whose ``alt`` is missing or empty."""
        return [img.get("src") for img in soup.find_all("img") if not img.get("alt")]

    def _iter_broken_links(self, soup):
        """Yield each absolute HTTP(S) link in *soup* that is broken.

        A link that appears several times is yielded each time, but is only
        probed over the network once per audit.
        """
        probed = {}
        for anchor in soup.find_all("a"):
            href = anchor.get("href")
            if not self._is_http_url(href):
                continue
            if href not in probed:
                probed[href] = self._is_link_broken(href)
            if probed[href]:
                yield href

    def audit_page(self, url):
        """Run the full audit for *url*.

        Args:
            url: Address of the page to audit.

        Returns:
            A dict with the keys ``"url"``, ``"broken_links"`` (list of
            hrefs), ``"missing_alt"`` (list of image ``src`` values) and
            ``"content_quality"`` (``"unknown"`` until classified, then the
            value returned by ``assess_content_quality``). When the page does
            not answer with status 200, or the audit fails part-way, the
            results gathered so far are returned.
        """
        logger.info("Auditing page: %s", url)
        results = {
            "url": url,
            "broken_links": [],
            "missing_alt": [],
            "content_quality": "unknown",
        }

        try:
            response = requests.get(url, timeout=self._PAGE_TIMEOUT)
            if response.status_code != 200:
                logger.error("Page returned status %s", response.status_code)
                return results

            soup = BeautifulSoup(response.text, "html.parser")

            text_content = soup.get_text(separator=" ", strip=True)
            results["content_quality"] = self.assess_content_quality(text_content)

            results["missing_alt"].extend(self._collect_missing_alt(soup))

            # Append one at a time so links found before an unexpected failure
            # are still reported.
            for href in self._iter_broken_links(soup):
                results["broken_links"].append(href)
        except Exception as e:
            # Top-level boundary: an audit never raises to the caller.
            logger.error("Audit failed for %s: %s", url, e)

        logger.info("Audit complete for %s: %s", url, results)
        return results
|
|
|