Spaces:
Running
Running
| from bs4 import BeautifulSoup | |
| import time | |
| from utils import safe_request | |
def _count_http_links(hrefs, page_url):
    """Return ``(internal, external)`` counts for the absolute http(s) links in *hrefs*.

    A link is internal when its host equals the host of *page_url* (ignoring a
    leading ``www.``) or is a subdomain of it. Relative links and non-HTTP
    schemes (``mailto:``, ``javascript:``, ...) are not counted at all.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urlparse

    def _host(raw_url):
        # Malformed URLs (e.g. a broken IPv6 literal) make urlparse raise.
        try:
            parsed = urlparse(raw_url)
        except ValueError:
            return None, ""
        return parsed.scheme, (parsed.hostname or "").lower()

    _, site_host = _host(page_url)
    if site_host.startswith("www."):
        site_host = site_host[4:]

    internal = external = 0
    for href in hrefs:
        scheme, link_host = _host(href.strip())
        if scheme not in ("http", "https") or not link_host:
            continue
        # Compare hosts, not raw substrings: the path of page_url must not
        # matter, and an external URL that merely mentions the site's domain
        # in its query string must not count as internal.
        same_site = bool(site_host) and (
            link_host == site_host or link_host.endswith("." + site_host)
        )
        if same_site:
            internal += 1
        else:
            external += 1
    return internal, external


def scan_website(url):
    """Fetch *url* and collect basic on-page SEO and performance metrics.

    Args:
        url: URL of the page to scan.

    Returns:
        ``{"error": "Unable to fetch URL", "score": 0}`` when ``safe_request``
        returns a falsy value; otherwise a dict with the keys
        ``status_code``, ``load_time`` (seconds, request plus HTML parsing,
        rounded to 2 decimals), ``https``, ``title`` (``"Missing"`` when the
        page has no usable title text), ``meta_description``, ``h1_count``,
        ``h2_count``, ``h3_count``, ``headings_count``, ``images_without_alt``,
        ``links_count``, ``internal_links``, ``external_links``,
        ``scripts_count``, ``paragraph_count`` and ``page_size_mb``.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments the way time.time() can.
    started = time.perf_counter()
    response = safe_request(url)
    # NOTE(review): if safe_request returns a requests.Response, 4xx/5xx
    # responses are falsy too and end up here -- confirm that is intended.
    if not response:
        return {"error": "Unable to fetch URL", "score": 0}

    soup = BeautifulSoup(response.text, "html.parser")
    load_time = round(time.perf_counter() - started, 2)

    anchors = soup.find_all("a")
    hrefs = [a.get("href") for a in anchors if a.get("href") is not None]
    internal_links, external_links = _count_http_links(hrefs, url)

    # .string is None for an empty <title> or one with nested markup; treat
    # that the same as a missing title and return a plain str.
    title_tag = soup.title
    raw_title = title_tag.string if title_tag is not None else None
    title = str(raw_title).strip() if raw_title else ""

    headings_count = {
        "H1": len(soup.find_all("h1")),
        "H2": len(soup.find_all("h2")),
        "H3": len(soup.find_all("h3")),
    }

    return {
        "status_code": response.status_code,
        "load_time": load_time,
        "https": url.startswith("https"),
        "title": title or "Missing",
        "meta_description": bool(soup.find("meta", attrs={"name": "description"})),
        "h1_count": headings_count["H1"],
        "h2_count": headings_count["H2"],
        "h3_count": headings_count["H3"],
        "headings_count": headings_count,
        "images_without_alt": sum(
            1 for img in soup.find_all("img") if not img.get("alt")
        ),
        "links_count": len(anchors),
        "internal_links": internal_links,
        "external_links": external_links,
        "scripts_count": len(soup.find_all("script")),
        "paragraph_count": len(soup.find_all("p")),
        # Size of the raw response body in MB.
        "page_size_mb": len(response.content) / (1024 * 1024),
    }