"""Technical audit agent: checks pages for broken links, missing alt text,
and (via a zero-shot classifier) overall content quality."""
from transformers import pipeline
import requests
from bs4 import BeautifulSoup
import logging

logger = logging.getLogger(__name__)


class TechnicalAuditorAgent:
    """Audits a single web page for structural and content-quality issues.

    The ML classifier is optional: if it fails to load, structural checks
    (broken links, missing alt attributes) still run and content quality is
    reported with an error sentinel.
    """

    def __init__(self):
        try:
            self.classifier = pipeline(
                "zero-shot-classification",
                model="facebook/bart-large-mnli",
                device=-1,  # CPU inference
            )
        except Exception as e:
            # Model download/load can fail for many reasons (network, disk,
            # missing weights); degrade gracefully instead of crashing.
            logger.warning("Could not load ML models for Technical Auditor: %s", e)
            self.classifier = None

    def assess_content_quality(self, page_text):
        """Classify *page_text* into quality labels, most likely first.

        Returns:
            list[str]: candidate labels ordered by model confidence, or a
            one-element sentinel list when the model is unavailable or the
            page has no text to classify.
        """
        if not self.classifier:
            return ["error-model-not-loaded"]
        if not page_text:
            # The pipeline errors on empty input; an empty page is by
            # definition thin content.
            return ["thin-content"]
        result = self.classifier(
            page_text[:1024],  # Limit text length for speed
            candidate_labels=["high-quality", "thin-content", "spam", "keyword-stuffing"],
        )
        return result['labels']

    def audit_page(self, url):
        """Fetch *url* and audit it; returns a dict of findings.

        Keys: ``url``, ``broken_links`` (list of hrefs that returned >=400 or
        errored), ``missing_alt`` (img srcs lacking alt text), and
        ``content_quality`` (labels from :meth:`assess_content_quality`).
        Network or parse failures are logged and the partial results returned.
        """
        logger.info("Auditing page: %s", url)
        results = {
            "url": url,
            "broken_links": [],
            "missing_alt": [],
            "content_quality": "unknown",
        }
        try:
            # Note: In a real scenario, you might need to handle headers/user-agents
            response = requests.get(url, timeout=10)
            if response.status_code != 200:
                logger.error("Page returned status %s", response.status_code)
                return results

            soup = BeautifulSoup(response.text, 'html.parser')

            # Check content quality
            text_content = soup.get_text(separator=' ', strip=True)
            results['content_quality'] = self.assess_content_quality(text_content)

            # Check images: flag those with no alt attribute (or an empty one)
            for img in soup.find_all('img'):
                if not img.get('alt'):
                    results['missing_alt'].append(img.get('src'))

            # Check links (simplified). Dedupe first so each external URL is
            # HEAD-checked exactly once and appears at most once in the report.
            external_hrefs = {
                link.get('href')
                for link in soup.find_all('a')
                if link.get('href') and link.get('href').startswith('http')
            }
            for href in sorted(external_hrefs):
                try:
                    head = requests.head(href, timeout=5)
                    if head.status_code >= 400:
                        results['broken_links'].append(href)
                except requests.RequestException:
                    # Timeout, DNS failure, connection refused, etc. — treat
                    # as broken, but never swallow KeyboardInterrupt/SystemExit.
                    results['broken_links'].append(href)
        except Exception:
            # Top-level boundary: log with traceback, return partial results.
            logger.exception("Audit failed for %s", url)

        logger.info("Audit complete for %s: %s", url, results)
        # In a real system, you would post these results back to a webhook or database
        return results