Spaces:

pkm13
/

SEO

Running

App Files Files Community

SEO / agents /technical_auditor.py

pkm13

Upload 10 files

e5ab217 verified about 1 month ago

raw

history blame contribute delete

2.69 kB

	from transformers import pipeline
	import requests
	from bs4 import BeautifulSoup
	import logging

	# Module-level logger, named after this module via __name__.
	logger = logging.getLogger(__name__)

	class TechnicalAuditorAgent:
	    """Audit a single web page for basic technical SEO problems.

	    An audit fetches the page, classifies its visible text with a
	    zero-shot classifier, lists images that have no ``alt`` text and
	    probes every absolute ``http``/``https`` link for error responses.
	    """

	    # Labels offered to the zero-shot classifier.
	    CANDIDATE_LABELS = ("high-quality", "thin-content", "spam", "keyword-stuffing")
	    # Only this many leading characters of the page text are classified,
	    # to keep CPU inference fast.
	    MAX_TEXT_CHARS = 1024
	    # Timeouts (seconds) for fetching the audited page and for each link probe.
	    PAGE_TIMEOUT = 10
	    LINK_TIMEOUT = 5

	    def __init__(self):
	        """Load the zero-shot classification model.

	        If loading fails for any reason the agent stays usable:
	        ``self.classifier`` is set to ``None`` and a warning is logged.
	        """
	        try:
	            self.classifier = pipeline(
	                "zero-shot-classification",
	                model="facebook/bart-large-mnli",
	                device=-1,  # CPU inference
	            )
	        except Exception as e:
	            logger.warning("Could not load ML models for Technical Auditor: %s", e)
	            self.classifier = None

	    def assess_content_quality(self, page_text):
	        """Classify *page_text* against ``CANDIDATE_LABELS``.

	        Args:
	            page_text: Visible text of the page. Only the first
	                ``MAX_TEXT_CHARS`` characters are sent to the classifier.

	        Returns:
	            The ``'labels'`` list from the classifier result, or
	            ``["error-model-not-loaded"]`` when no classifier is available.
	        """
	        if self.classifier is None:
	            return ["error-model-not-loaded"]

	        result = self.classifier(
	            page_text[:self.MAX_TEXT_CHARS],
	            candidate_labels=list(self.CANDIDATE_LABELS),
	        )
	        return result['labels']

	    def _is_link_broken(self, href):
	        """Return True when a HEAD request to *href* fails or answers >= 400."""
	        try:
	            head = requests.head(href, timeout=self.LINK_TIMEOUT)
	        except Exception as e:
	            # Best effort: any failure to reach the link counts as broken.
	            # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt
	            # and SystemExit propagate so a long audit can be interrupted.
	            logger.debug("Link check failed for %s: %s", href, e)
	            return True
	        return head.status_code >= 400

	    def audit_page(self, url):
	        """Fetch *url* and collect technical audit findings.

	        Args:
	            url: Address of the page to audit.

	        Returns:
	            A dict with the keys ``"url"``, ``"broken_links"`` (hrefs that
	            failed or returned status >= 400), ``"missing_alt"`` (``src``
	            values of images without ``alt`` text) and ``"content_quality"``
	            (the result of ``assess_content_quality``, or ``"unknown"`` if
	            the page was not analysed). If the page does not return status
	            200, or an error occurs, the findings gathered so far are
	            returned; errors are logged rather than raised.
	        """
	        logger.info("Auditing page: %s", url)
	        results = {
	            "url": url,
	            "broken_links": [],
	            "missing_alt": [],
	            "content_quality": "unknown",
	        }

	        try:
	            # Note: In a real scenario, you might need to handle headers/user-agents
	            response = requests.get(url, timeout=self.PAGE_TIMEOUT)
	            if response.status_code != 200:
	                logger.error("Page returned status %s", response.status_code)
	                return results

	            soup = BeautifulSoup(response.text, 'html.parser')

	            # Check content quality
	            text_content = soup.get_text(separator=' ', strip=True)
	            results['content_quality'] = self.assess_content_quality(text_content)

	            # Check images: a missing or empty alt attribute is reported.
	            results['missing_alt'].extend(
	                img.get('src') for img in soup.find_all('img') if not img.get('alt')
	            )

	            # Check links (simplified). Each distinct href is probed only
	            # once per audit; repeated occurrences reuse the cached verdict
	            # and are still reported once per occurrence.
	            link_status = {}
	            for link in soup.find_all('a'):
	                href = link.get('href')
	                if not href or not href.startswith('http'):
	                    continue
	                if href not in link_status:
	                    link_status[href] = self._is_link_broken(href)
	                if link_status[href]:
	                    results['broken_links'].append(href)

	        except Exception as e:
	            logger.error("Audit failed for %s: %s", url, e)

	        logger.info("Audit complete for %s: %s", url, results)
	        # In a real system, you would post these results back to a webhook or database
	        return results