# SEO/agents/technical_auditor.py
from transformers import pipeline
import requests
from bs4 import BeautifulSoup
import logging
logger = logging.getLogger(__name__)
class TechnicalAuditorAgent:
    """Audits a web page for common technical-SEO issues.

    Checks performed by :meth:`audit_page`:
      * zero-shot content-quality classification (when the ML model loaded)
      * ``<img>`` tags missing an ``alt`` attribute
      * broken outbound links (HTTP status >= 400, or the request fails)
    """

    def __init__(self):
        """Load the zero-shot classifier; degrade gracefully if unavailable."""
        try:
            self.classifier = pipeline(
                "zero-shot-classification",
                model="facebook/bart-large-mnli",
                device=-1,  # CPU inference
            )
        except Exception as e:
            # Model download/load can fail (offline, missing weights, OOM);
            # the agent still runs, minus content-quality scoring.
            logger.warning(f"Could not load ML models for Technical Auditor: {e}")
            self.classifier = None

    def assess_content_quality(self, page_text):
        """Classify *page_text* against SEO-quality labels.

        Returns the candidate labels ordered from most to least likely,
        or ``["error-model-not-loaded"]`` when the classifier is absent.
        """
        if not self.classifier:
            return ["error-model-not-loaded"]
        result = self.classifier(
            page_text[:1024],  # Limit text length for speed
            candidate_labels=["high-quality", "thin-content", "spam", "keyword-stuffing"]
        )
        return result['labels']

    def audit_page(self, url):
        """Fetch *url* and return a dict of audit findings.

        Returns a dict with keys ``url``, ``broken_links``, ``missing_alt``
        and ``content_quality``. On fetch failure the (mostly empty)
        results dict is still returned rather than raising.
        """
        logger.info(f"Auditing page: {url}")
        results = {
            "url": url,
            "broken_links": [],
            "missing_alt": [],
            "content_quality": "unknown"
        }
        try:
            # NOTE(review): a production crawler should also send a
            # User-Agent header and honour robots.txt.
            response = requests.get(url, timeout=10)
            if response.status_code != 200:
                logger.error(f"Page returned status {response.status_code}")
                return results
            soup = BeautifulSoup(response.text, 'html.parser')

            # Content quality via zero-shot classification.
            text_content = soup.get_text(separator=' ', strip=True)
            results['content_quality'] = self.assess_content_quality(text_content)

            # Images lacking alt text hurt accessibility and image SEO.
            # Missing/empty alt both count; src may be None for inline images.
            for img in soup.find_all('img'):
                if not img.get('alt'):
                    results['missing_alt'].append(img.get('src'))

            # Outbound link check (simplified). Dedupe so each distinct URL
            # is requested at most once.
            checked = set()
            for link in soup.find_all('a'):
                href = link.get('href')
                if not href or not href.startswith('http') or href in checked:
                    continue
                checked.add(href)
                try:
                    # HEAD keeps the probe cheap; follow redirects so a
                    # 301/302 chain ending in a dead page is caught
                    # (requests.head defaults to allow_redirects=False).
                    head = requests.head(href, timeout=5, allow_redirects=True)
                    if head.status_code >= 400:
                        results['broken_links'].append(href)
                except requests.RequestException:
                    # Narrowed from a bare ``except:`` which would also
                    # swallow KeyboardInterrupt/SystemExit and real bugs.
                    results['broken_links'].append(href)
        except Exception as e:
            logger.error(f"Audit failed for {url}: {e}")
        logger.info(f"Audit complete for {url}: {results}")
        # In a real system, you would post these results back to a webhook or database
        return results