Audit_AI / scanner.py
Sakshi2005's picture
Upload folder using huggingface_hub
27697ee verified
import time
from urllib.parse import urlparse

from bs4 import BeautifulSoup

from utils import safe_request
def _strip_www(host):
    """Drop a leading ``www.`` so ``www.example.com`` matches ``example.com``."""
    return host[4:] if host.startswith("www.") else host


def _site_host(url):
    """Return the lower-cased host of ``url`` without ``www.``, or ``""``."""
    try:
        parsed = urlparse(url)
        if not parsed.netloc:
            # Scheme-less input such as "example.com/page" parses as a bare
            # path; re-parse it as a network location to recover the host.
            parsed = urlparse("//" + url)
        host = parsed.hostname or ""
    except ValueError:
        # urlparse rejects malformed authorities (e.g. a broken IPv6 literal).
        return ""
    return _strip_www(host)


def _classify_link(href, site_host):
    """Classify one ``href`` as ``"internal"``, ``"external"`` or ``None``.

    ``None`` means the link is not counted: empty values, same-page anchors
    (``#...``), non-HTTP schemes such as ``mailto:`` / ``tel:`` /
    ``javascript:``, and hrefs that cannot be parsed.
    """
    target = href.strip()
    if not target or target.startswith("#"):
        return None
    try:
        parsed = urlparse(target)
        link_host = _strip_www(parsed.hostname or "")
    except ValueError:
        return None
    if parsed.scheme and parsed.scheme not in ("http", "https"):
        return None
    if not parsed.netloc:
        # No host at all: a relative link, which always stays on this site.
        return "internal"
    # Compare hosts exactly instead of substring-matching the whole URL, so
    # the scanned page's path or a domain inside a query string cannot skew
    # the result.
    if link_host and link_host == site_host:
        return "internal"
    return "external"


def scan_website(url: str) -> dict:
    """Fetch ``url`` and collect basic on-page audit metrics.

    Args:
        url: Address of the page to scan.

    Returns:
        ``{"error": "Unable to fetch URL", "score": 0}`` when ``safe_request``
        returns a falsy value. Otherwise a dict with the keys
        ``status_code``, ``load_time`` (seconds, 2 decimals, covering the
        request and HTML parsing), ``https``, ``title`` (``"Missing"`` when
        absent or empty), ``meta_description``, ``h1_count``, ``h2_count``,
        ``h3_count``, ``headings_count``, ``images_without_alt``,
        ``links_count``, ``internal_links``, ``external_links``,
        ``scripts_count``, ``paragraph_count`` and ``page_size_mb``.
    """
    # perf_counter is monotonic, so the duration cannot be distorted by
    # system clock adjustments the way time.time() can.
    started = time.perf_counter()
    response = safe_request(url)
    if not response:
        return {"error": "Unable to fetch URL", "score": 0}

    soup = BeautifulSoup(response.text, "html.parser")
    load_time = round(time.perf_counter() - started, 2)

    page_size_mb = len(response.content) / (1024 * 1024)

    # Resolve the scanned site's host once; it is invariant across links.
    site_host = _site_host(url)
    anchors = soup.find_all("a")
    internal_links = 0
    external_links = 0
    for anchor in anchors:
        href = anchor.get("href")
        if href is None:
            continue
        kind = _classify_link(href, site_host)
        if kind == "internal":
            internal_links += 1
        elif kind == "external":
            external_links += 1

    headings_count = {
        "H1": len(soup.find_all("h1")),
        "H2": len(soup.find_all("h2")),
        "H3": len(soup.find_all("h3")),
    }

    # get_text() also covers an empty <title> or one with nested markup,
    # where .string would be None and bypass the "Missing" sentinel.
    title_text = soup.title.get_text(strip=True) if soup.title else ""

    try:
        scheme = urlparse(url).scheme
    except ValueError:
        scheme = ""

    return {
        "status_code": response.status_code,
        "load_time": load_time,
        "https": scheme == "https",
        "title": title_text or "Missing",
        "meta_description": bool(soup.find("meta", attrs={"name": "description"})),
        "h1_count": headings_count["H1"],
        "h2_count": headings_count["H2"],
        "h3_count": headings_count["H3"],
        "headings_count": headings_count,
        # An empty alt attribute is counted the same as a missing one.
        "images_without_alt": sum(
            1 for img in soup.find_all("img") if not img.get("alt")
        ),
        "links_count": len(anchors),
        "internal_links": internal_links,
        "external_links": external_links,
        "scripts_count": len(soup.find_all("script")),
        "paragraph_count": len(soup.find_all("p")),
        "page_size_mb": page_size_mb,
    }