Upload folder using huggingface_hub
- .env.example +4 -0
- .gitignore +39 -0
- .gradio/certificate.pem +31 -0
- README.md +94 -12
- README_GRADIO.md +288 -0
- accessibility_checker.py +90 -0
- ai_analyzer.py +78 -0
- app_gradio.py +319 -0
- history_tracker.py +69 -0
- link_checker.py +74 -0
- mobile_checker.py +89 -0
- report_generator.py +163 -0
- requirements.txt +11 -0
- scanner.py +56 -0
- scoring.py +25 -0
- utils.py +24 -0
.env.example
ADDED
@@ -0,0 +1,4 @@
# Google Gemini API Configuration
GEMINI_API_KEY=your_gemini_api_key_here

# Get your API key from: https://aistudio.google.com/app/apikey
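A quick way to confirm the key is picked up is the same python-dotenv loading that `ai_analyzer.py` performs at import time. This is only a sketch, assuming it is run from the project root where `.env` lives:

```python
# Sketch: verify that the key from .env is visible to Python.
# ai_analyzer.py loads it the same way via python-dotenv.
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory
print("GEMINI_API_KEY configured:", bool(os.getenv("GEMINI_API_KEY")))
```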
.gitignore
ADDED
@@ -0,0 +1,39 @@
# Environment variables
.env

# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
*.egg-info/
dist/
build/
*.egg

# Virtual environments
venv/
env/
ENV/
.venv

# IDE
.vscode/
.idea/
*.swp
*.swo
*~

# OS
.DS_Store
Thumbs.db
desktop.ini

# Project specific
audit_history.json
audit_report_*.pdf
*.log

# Gradio
flagged/
.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
-----BEGIN CERTIFICATE-----
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
-----END CERTIFICATE-----
README.md
CHANGED
@@ -1,12 +1,94 @@
----
-title:
-
-
-
-
-
-
-
-
-
-
+---
+title: Audit_AI
+app_file: app_gradio.py
+sdk: gradio
+sdk_version: 5.47.2
+---
+# 🧠 AuditAI — AI Website Auditor
+
+An **Agentic AI-powered web application** built with **Gradio** that audits any website and provides **SEO, performance, accessibility, and security insights**, along with **AI-generated fixes and optimized HTML**.
+
+## 📌 Features
+
+- 🔍 **Website Scanning**
+  - Page load time
+  - HTTPS detection
+  - Page size analysis
+  - Internal vs external links
+  - Headings structure (H1, H2, H3)
+  - Images without ALT attributes
+  - Scripts, paragraphs, and links count
+
+- 🤖 **Agentic AI Analysis**
+  - Automatically detects website issues
+  - Provides actionable AI-powered suggestions
+  - Generates **HTML & SEO fix snippets**
+  - Produces **fully optimized HTML**
+  - Extracts top SEO keywords
+  - Analyzes heading hierarchy
+
+- 📊 **Interactive Dashboard**
+  - Overall website score
+  - SEO, Performance, Accessibility & Security scores
+  - Gauge & radar charts
+  - Bar charts & pie charts
+  - Keyword word cloud
+  - Heading hierarchy treemap
+  - Page element heatmap
+
+- ⬇️ **Download Optimized HTML**
+  - One-click download of AI-improved webpage
+
+---
+
+## 🔍 Usage
+
+1. Run the app locally using Gradio.
+2. Enter a valid website URL.
+3. Click **🚀 Start Audit**.
+4. View:
+   - ⚠️ Detected issues
+   - ✅ AI-generated suggestions
+   - 📊 Visual audit dashboard
+   - 🤖 Agentic AI fixes
+   - 📄 PDF Reports
+5. Download the **optimized HTML** or **PDF report** if available.
+
+---
+
+## 📊 How It Works
+
+1. The app scans the website using **BeautifulSoup & Requests**.
+2. Raw metrics are calculated (SEO, performance, accessibility, mobile, security).
+3. Scan data is sent to **Google Gemini** for agentic analysis.
+4. AI returns:
+   - Issues
+   - Suggestions
+   - Fix snippets
+   - Optimized HTML
+5. Results are visualized in a rich Gradio dashboard.
+
+---
+
+## ⚙️ Tech Stack
+
+- **Python 3.9+**
+- **Gradio** — Web UI
+- **Google Gemini API** — Agentic AI analysis
+- **BeautifulSoup** — HTML parsing
+- **Requests** — Web scraping
+- **Plotly & Matplotlib** — Interactive charts
+- **WordCloud** — Keyword visualization
+- **FPDF** — PDF report generation
+- **dotenv** — Environment variables
+
+---
+
+## 👨‍💻 Author
+**Sakshi Gupta**
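The "How It Works" steps above can also be driven outside the Gradio UI. A minimal sketch, assuming it is run from the project root with a valid `GEMINI_API_KEY` in `.env`; `scanner.py`, `scoring.py`, and `utils.py` are part of this commit but not shown in the diff, so only their imported names are used, and `https://example.com` is a placeholder URL.

```python
# Sketch of the scan -> score -> AI-analysis pipeline from "How It Works".
from scanner import scan_website
from scoring import calculate_score
from utils import normalize_url
from ai_analyzer import analyze_with_ai

url = normalize_url("https://example.com")   # placeholder URL
scan_data = scan_website(url)                # raw metrics via Requests + BeautifulSoup
if "error" not in scan_data:
    scan_data["overall_score"] = calculate_score(scan_data)
    ai_report = analyze_with_ai(scan_data)   # Gemini returns issues/suggestions/fixes
    print(scan_data["overall_score"])
    print(ai_report.get("issues", [])[:3])
```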
README_GRADIO.md
ADDED
@@ -0,0 +1,288 @@
# 🧠 AuditAI — Enhanced Agentic AI Website Auditor (Gradio Edition)

An **Agentic AI-powered web application** built with **Gradio** that provides comprehensive website audits including **SEO, performance, accessibility, security, mobile responsiveness**, and **broken link detection** with **AI-generated insights and PDF reports**.

---

## 🆕 What's New in Gradio Edition

### **Enhanced Features:**
- ✅ **Accessibility Checker** - WCAG 2.1 compliance analysis
- ✅ **Mobile Responsiveness Analyzer** - Viewport, responsive images, touch targets
- ✅ **Broken Link Detection** - Parallel link checking with detailed reports
- ✅ **PDF Report Generation** - Professional downloadable audit reports
- ✅ **Historical Tracking** - Track score improvements over time
- ✅ **Trend Analysis** - Visualize performance changes across audits
- ✅ **Enhanced UI** - Modern Gradio tabbed interface with better UX

### **Original Features (Retained):**
- 🔍 Website scanning (load time, HTTPS, page size, links, headings)
- 🤖 Agentic AI analysis with Google Gemini 1.5 Flash
- 📊 Interactive visualizations (gauges, radar charts, bar charts)
- ⬇️ Downloadable optimized HTML
- 💡 AI-powered suggestions and fix snippets

---

## 🚀 Quick Start

### 1️⃣ Install Dependencies

```bash
pip install -r requirements.txt
```

### 2️⃣ Set Up Gemini API Key

Create a `.env` file in the project root:

```env
GEMINI_API_KEY=your_gemini_api_key_here
```

### 3️⃣ Run the Gradio App

```bash
python app_gradio.py
```

The app will launch at `http://localhost:7860` with a shareable link.

### 4️⃣ Run the Original Streamlit App (Optional)

```bash
streamlit run app.py
```

---

## 📋 New Features Details

### **♿ Accessibility Checker** (`accessibility_checker.py`)
Analyzes WCAG 2.1 compliance:
- Missing alt text on images
- Proper heading hierarchy (H1-H6)
- Form labels and ARIA landmarks
- Link text quality
- Language attributes
- Skip navigation links
- Video captions

### **📱 Mobile Responsiveness** (`mobile_checker.py`)
Checks mobile-friendliness:
- Viewport meta tag validation
- Responsive images (srcset/sizes)
- Page size optimization for mobile
- Flash content detection
- Fixed-width elements
- Touch target sizes
- Media queries analysis
- Relative font sizing

### **🔗 Broken Link Detector** (`link_checker.py`)
Identifies broken links:
- Parallel processing for speed (10 concurrent workers)
- Checks up to 50 links per audit
- HTTP status code validation
- Internal vs external link tracking
- Detailed error reporting

### **📄 PDF Report Generator** (`report_generator.py`)
Creates professional reports:
- Multi-page comprehensive audit summary
- Color-coded scores and metrics
- All detected issues organized by category
- AI recommendations
- Broken link details
- Timestamp and metadata

### **📈 Historical Tracking** (`history_tracker.py`)
Tracks performance over time:
- JSON-based storage (last 100 audits)
- Per-site history retrieval
- Trend data for visualizations
- Score comparison across audits

---

## 🎨 Gradio UI Structure

The new interface uses **5 tabs**:

1. **📊 Overview** - Summary, scores, gauge & radar charts
2. **📈 Metrics & Trends** - Technical metrics and historical trends
3. **⚠️ Issues** - AI, accessibility, mobile, and broken link issues
4. **✅ Recommendations** - AI-powered suggestions
5. **📄 PDF Report** - Download comprehensive report

---

## 📊 Scoring System

### **Overall Score Calculation** (0-100)
Based on:
- HTTPS (15 points)
- Load time (5-15 points)
- Title presence (10 points)
- Meta description (10 points)
- H1 tags (5-10 points)
- Images with alt text (up to 10 points)
- Links & scripts (up to 10 points)
- Paragraph content (up to 10 points)
- HTTP status (10 points)

### **Individual Scores**
- **SEO Score:** `100 - (images_without_alt × 5)`
- **Performance Score:** `100 - (load_time × 10)`
- **Accessibility Score:** WCAG compliance based (0-100)
- **Security Score:** 100 if HTTPS, else 50
- **Mobile Score:** Mobile-friendliness based (0-100)

---

## 🔧 Tech Stack

### **Core Technologies**
- **Python 3.9+**
- **Gradio 5.x** — Modern web UI framework
- **Google Gemini API** — Gemini 1.5 Flash for AI analysis
- **BeautifulSoup4** — HTML parsing
- **Requests** — HTTP client

### **Visualization & Reports**
- **Plotly** — Interactive charts (gauges, radar, bar)
- **Matplotlib** — Word clouds
- **Pandas** — Data manipulation
- **FPDF** — PDF generation

### **Other**
- **python-dotenv** — Environment variables
- **concurrent.futures** — Parallel link checking

---

## 📁 Project Structure

```
AuditAI-main/
├── app.py                     # Original Streamlit app
├── app_gradio.py              # NEW: Gradio app
├── scanner.py                 # Website scanner
├── ai_analyzer.py             # Gemini AI integration
├── scoring.py                 # Score calculation
├── dashboard.py               # Streamlit dashboard
├── utils.py                   # Utility functions
├── accessibility_checker.py   # NEW: Accessibility analysis
├── mobile_checker.py          # NEW: Mobile responsiveness
├── link_checker.py            # NEW: Broken link detection
├── report_generator.py        # NEW: PDF generation
├── history_tracker.py         # NEW: Historical tracking
├── requirements.txt           # Dependencies
├── README.md                  # Original readme
├── README_GRADIO.md           # This file
└── .env                       # API keys (create this)
```

---

## 🎯 Usage Guide

1. **Enter URL:** Input the website URL (e.g., `https://example.com`)
2. **Choose Options:** Check/uncheck "Check for Broken Links" (optional, slower)
3. **Click Audit:** Start the comprehensive analysis
4. **View Results:**
   - Overview tab shows summary and scores
   - Issues tab lists all detected problems
   - Recommendations tab shows AI suggestions
   - PDF tab provides downloadable report
5. **Track Progress:** Re-audit the same site to see trend improvements

---

## ⚡ Performance Notes

- **Broken Link Checking:** Uses parallel processing (10 workers) but can take 30-60s for 50 links
- **AI Analysis:** One Gemini 1.5 Flash call per audit; time depends on the API response
- **PDF Generation:** Instant (<1s)
- **Historical Trends:** Only shown after 2+ audits of the same site

---

## 🔒 Environment Variables

Required in `.env` file:

```env
GEMINI_API_KEY=your-gemini-key-here
```

---

## 🆚 Gradio vs Streamlit

### **Why Gradio?**
- ✅ Easier deployment (built-in sharing)
- ✅ Better tab organization
- ✅ Cleaner API for complex workflows
- ✅ Automatic shareable links
- ✅ Better mobile experience

### **Keeping Streamlit?**
Both versions are maintained. Use:
- `app_gradio.py` for the enhanced version
- `app.py` for the original Streamlit version

---

## 👨‍💻 Author

**Mirza Yasir Abdullah Baig**

- 🌐 [Kaggle](https://www.kaggle.com/mirzayasirabdullah07)
- 💼 [LinkedIn](https://www.linkedin.com/in/mirza-yasir-abdullah-baig/)
- 💻 [GitHub](https://github.com/mirzayasirabdullahbaig07)

---

## 📝 License

For educational purposes only. Not for commercial use without permission.

---

## 🐛 Troubleshooting

**Issue:** Gemini API errors
**Solution:** Check your API key in `.env` and get it from https://aistudio.google.com/app/apikey

**Issue:** Broken link checking takes too long
**Solution:** Uncheck the "Check for Broken Links" option

**Issue:** PDF generation fails
**Solution:** Ensure `fpdf` is installed: `pip install fpdf`

**Issue:** No trend data shown
**Solution:** Audit the same site multiple times to build history

---

## 🚀 Future Enhancements

- [ ] Multi-page website crawling
- [ ] Competitor comparison
- [ ] Lighthouse integration
- [ ] Email report scheduling
- [ ] Database storage (replace JSON)
- [ ] Custom scoring weights
- [ ] Screenshot capture
- [ ] Security header analysis

---

## 📸 Screenshots

Coming soon! Run the app to see the new Gradio interface.

---

**Enjoy auditing! 🎉**
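The per-category formulas listed under "Scoring System" map directly onto code. A small sketch of the individual scores, mirroring what `app_gradio.py` computes inline; the weighted overall score lives in `scoring.py`, which is not reproduced in this commit view, and the sample numbers below are made up.

```python
# Individual category scores as described in "Scoring System";
# app_gradio.py computes SEO, Performance and Security the same way.
def individual_scores(scan_data, accessibility_score, mobile_score):
    return {
        "SEO": max(0, 100 - scan_data.get("images_without_alt", 0) * 5),
        "Performance": max(0, 100 - scan_data.get("load_time", 5) * 10),
        "Accessibility": accessibility_score,   # from accessibility_checker.py
        "Security": 100 if scan_data.get("https") else 50,
        "Mobile": mobile_score,                 # from mobile_checker.py
    }

print(individual_scores({"images_without_alt": 4, "load_time": 2.5, "https": True}, 85, 90))
# {'SEO': 80, 'Performance': 75.0, 'Accessibility': 85, 'Security': 100, 'Mobile': 90}
```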
accessibility_checker.py
ADDED
@@ -0,0 +1,90 @@
from bs4 import BeautifulSoup

def check_accessibility(soup, url):
    """
    Checks WCAG 2.1 accessibility guidelines
    Returns dict with accessibility issues and score
    """
    issues = []
    score = 100

    # Check for missing alt text on images
    images = soup.find_all('img')
    images_without_alt = [img for img in images if not img.get('alt')]
    if images_without_alt:
        issues.append(f"❌ {len(images_without_alt)} images missing alt text")
        score -= min(20, len(images_without_alt) * 2)

    # Check for proper heading hierarchy
    headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    h1_count = len(soup.find_all('h1'))
    if h1_count == 0:
        issues.append("❌ No H1 heading found - important for screen readers")
        score -= 10
    elif h1_count > 1:
        issues.append(f"⚠️ Multiple H1 headings ({h1_count}) - should be unique")
        score -= 5

    # Check for form labels
    forms = soup.find_all('form')
    for form in forms:
        inputs = form.find_all(['input', 'select', 'textarea'])
        for input_elem in inputs:
            if input_elem.get('type') not in ['submit', 'button', 'hidden']:
                label_id = input_elem.get('id')
                if not label_id or not form.find('label', {'for': label_id}):
                    issues.append("❌ Form inputs missing associated labels")
                    score -= 5
                    break

    # Check for color contrast (basic check)
    inline_styles = soup.find_all(style=True)
    if inline_styles:
        issues.append("⚠️ Inline styles detected - may affect accessibility")
        score -= 3

    # Check for ARIA landmarks
    main_tag = soup.find('main')
    nav_tag = soup.find('nav')
    if not main_tag:
        issues.append("⚠️ No <main> landmark - helps screen reader navigation")
        score -= 5
    if not nav_tag:
        issues.append("⚠️ No <nav> landmark found")
        score -= 3

    # Check for link text
    links = soup.find_all('a')
    generic_link_text = ['click here', 'read more', 'here', 'link']
    for link in links:
        text = link.get_text().strip().lower()
        if text in generic_link_text:
            issues.append("❌ Generic link text found (e.g., 'click here') - use descriptive text")
            score -= 5
            break

    # Check for lang attribute
    html_tag = soup.find('html')
    if html_tag and not html_tag.get('lang'):
        issues.append("❌ Missing lang attribute on <html> tag")
        score -= 10

    # Check for skip links
    skip_link = soup.find('a', href='#main') or soup.find('a', href='#content')
    if not skip_link:
        issues.append("⚠️ No skip navigation link found")
        score -= 5

    # Check for video captions
    videos = soup.find_all('video')
    for video in videos:
        if not video.find('track', kind='captions'):
            issues.append("❌ Videos missing captions/subtitles")
            score -= 10
            break

    return {
        'accessibility_score': max(0, score),
        'accessibility_issues': issues if issues else ["✅ No major accessibility issues detected"],
        'wcag_compliance': 'Good' if score >= 80 else 'Needs Improvement' if score >= 60 else 'Poor'
    }
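A usage sketch for `check_accessibility`: it takes a parsed `BeautifulSoup` document plus the page URL and returns the score, issue list, and WCAG label shown above. The HTML snippet here is a made-up example, not a real page.

```python
# Sketch: run the accessibility checks against a small HTML snippet.
from bs4 import BeautifulSoup
from accessibility_checker import check_accessibility

html = """
<html>
  <body>
    <h1>Demo</h1>
    <img src="hero.jpg">
    <a href="/about">About us</a>
  </body>
</html>
"""
result = check_accessibility(BeautifulSoup(html, "html.parser"), "https://example.com")
print(result["accessibility_score"], result["wcag_compliance"])
for issue in result["accessibility_issues"]:
    print("-", issue)
```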
ai_analyzer.py
ADDED
@@ -0,0 +1,78 @@
import google.generativeai as genai
from dotenv import load_dotenv
import os
import json
import re

load_dotenv()
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
model = genai.GenerativeModel('gemini-1.5-flash')

def analyze_with_ai(scan_data):
    """
    Returns:
    - issues: list of problems
    - suggestions: list of improvements
    - fix_snippets: code snippets for fixes
    - optimized_html: full HTML with improvements (agentic AI)
    - keywords: top keywords
    - headings_count: H1/H2/H3 count
    """
    # Generate dummy keywords from title
    keywords = re.findall(r'\b\w+\b', scan_data.get("title", ""))[:10]

    prompt = f"""
    You are a website audit and optimization expert.
    Analyze this website scan data and provide:
    1) issues (list)
    2) suggestions (list)
    3) fix_snippets (list of HTML/SEO fixes)
    4) optimized_html (full HTML content with improvements applied)
    5) keywords (list)
    6) headings_count (dict of H1, H2, H3 counts)

    Respond ONLY in JSON format.

    Scan Data:
    {json.dumps(scan_data, indent=2)}
    """
    try:
        response = model.generate_content(prompt)
        content = response.text

        # Clean markdown code blocks if present
        if '```json' in content:
            content = content.split('```json')[1].split('```')[0].strip()
        elif '```' in content:
            content = content.split('```')[1].split('```')[0].strip()

        ai_report = json.loads(content)

        # Fallbacks
        ai_report.setdefault("keywords", keywords)
        ai_report.setdefault("headings_count", scan_data.get("headings_count", {}))
        ai_report.setdefault("fix_snippets", [])
        ai_report.setdefault("optimized_html", "")
        return ai_report

    except Exception as e:
        # Fallback
        return {
            "issues": [
                f"H1 tags found: {scan_data.get('h1_count', 0)}",
                f"Images without ALT: {scan_data.get('images_without_alt', 0)}",
                f"Page load time: {scan_data.get('load_time', 0)}s"
            ],
            "suggestions": [
                "Add missing meta description",
                "Optimize images and include ALT text",
                "Improve page speed"
            ],
            "fix_snippets": [
                "<meta name='description' content='Your description here'>",
                "<img src='image.jpg' alt='Descriptive text'>"
            ],
            "optimized_html": "<!-- Add optimized HTML here -->",
            "keywords": keywords,
            "headings_count": scan_data.get("headings_count", {})
        }
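A usage sketch for `analyze_with_ai` with a hand-built `scan_data` dictionary; the field names follow what the fallback branch above reads (`title`, `h1_count`, `images_without_alt`, `load_time`, `headings_count`). With no working `GEMINI_API_KEY`, or on any API error, the function returns the static fallback report instead of a Gemini response.

```python
# Sketch: call the analyzer with a hand-built scan_data dict.
from ai_analyzer import analyze_with_ai

scan_data = {
    "title": "Example Domain",              # made-up sample values
    "h1_count": 1,
    "images_without_alt": 3,
    "load_time": 2.1,
    "headings_count": {"h1": 1, "h2": 4, "h3": 2},
}
report = analyze_with_ai(scan_data)
print(report.get("issues", []))
print(report.get("suggestions", []))
```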
app_gradio.py
ADDED
@@ -0,0 +1,319 @@
import gradio as gr
from scanner import scan_website
from ai_analyzer import analyze_with_ai
from utils import normalize_url, is_valid_url
from scoring import calculate_score
from accessibility_checker import check_accessibility
from mobile_checker import check_mobile_responsiveness
from link_checker import check_broken_links
from report_generator import generate_pdf_report
from history_tracker import save_audit, get_trend_data
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
from bs4 import BeautifulSoup
import requests

def create_gauge_chart(score, title):
    """Create a gauge chart for scores"""
    fig = go.Figure(go.Indicator(
        mode="gauge+number",
        value=score,
        title={'text': title},
        gauge={
            'axis': {'range': [0, 100]},
            'bar': {'color': "darkblue"},
            'steps': [
                {'range': [0, 50], 'color': "lightcoral"},
                {'range': [50, 80], 'color': "lightyellow"},
                {'range': [80, 100], 'color': "lightgreen"}
            ],
            'threshold': {
                'line': {'color': "red", 'width': 4},
                'thickness': 0.75,
                'value': 90
            }
        }
    ))
    fig.update_layout(height=300)
    return fig

def create_radar_chart(scores_dict):
    """Create radar chart for all scores"""
    categories = list(scores_dict.keys())
    values = list(scores_dict.values())

    fig = go.Figure()
    fig.add_trace(go.Scatterpolar(
        r=values,
        theta=categories,
        fill='toself',
        name='Audit Scores'
    ))
    fig.update_layout(
        polar=dict(radialaxis=dict(range=[0, 100])),
        title="Overall Website Health Radar",
        height=400
    )
    return fig

def create_metrics_bar_chart(scan_data):
    """Create bar chart for SEO metrics"""
    metrics_data = pd.DataFrame({
        'Metric': ['H1 Tags', 'H2 Tags', 'H3 Tags', 'Images w/o ALT', 'Links', 'Scripts'],
        'Value': [
            scan_data.get('h1_count', 0),
            scan_data.get('h2_count', 0),
            scan_data.get('h3_count', 0),
            scan_data.get('images_without_alt', 0),
            scan_data.get('links_count', 0),
            scan_data.get('scripts_count', 0)
        ]
    })

    fig = px.bar(metrics_data, x='Metric', y='Value',
                 title='SEO & Technical Metrics',
                 color='Value',
                 color_continuous_scale='Viridis')
    fig.update_layout(height=400)
    return fig

def create_trend_chart(url):
    """Create trend chart from history"""
    trend_data = get_trend_data(url)

    if not trend_data:
        return None

    df = pd.DataFrame(trend_data['scores'])
    df['Date'] = trend_data['dates']

    fig = go.Figure()
    for col in df.columns[:-1]:
        fig.add_trace(go.Scatter(x=df['Date'], y=df[col], mode='lines+markers', name=col))

    fig.update_layout(
        title='Score Trends Over Time',
        xaxis_title='Date',
        yaxis_title='Score',
        height=400
    )
    return fig

def audit_website(url, check_links=True):
    """Main audit function"""
    if not url or not is_valid_url(url):
        return ("❌ Invalid URL", None, None, None, None, None, None, None, None, None, None)

    url = normalize_url(url)
    status_msg = f"🔍 Scanning {url}..."

    # Step 1: Scan website
    scan_data = scan_website(url)

    if "error" in scan_data:
        return (f"❌ Error: {scan_data['error']}", None, None, None, None, None, None, None, None, None, None)

    # Step 2: Get page content for additional checks
    try:
        response = requests.get(url, timeout=10, headers={"User-Agent": "AI-Site-Auditor"})
        soup = BeautifulSoup(response.text, 'html.parser')
    except:
        return ("❌ Failed to fetch page content", None, None, None, None, None, None, None, None, None, None)

    # Step 3: Run all checks
    accessibility_data = check_accessibility(soup, url)
    mobile_data = check_mobile_responsiveness(soup, scan_data.get('page_size_mb', 0))

    if check_links:
        link_data = check_broken_links(url, soup, max_links=50)
    else:
        link_data = {'total_links_checked': 0, 'working_links': 0, 'broken_links_count': 0,
                     'broken_links_details': [], 'link_health': 'Skipped'}

    # Step 4: Calculate scores
    overall_score = calculate_score(scan_data)
    scan_data["overall_score"] = overall_score
    scan_data["seo_score"] = max(0, 100 - scan_data.get("images_without_alt", 0) * 5)
    scan_data["performance_score"] = max(0, 100 - scan_data.get("load_time", 5) * 10)
    scan_data["security_score"] = 100 if scan_data.get("https") else 50

    # Step 5: AI Analysis
    ai_report = analyze_with_ai(scan_data)

    # Step 6: Save to history
    save_audit(url, scan_data, ai_report, accessibility_data, mobile_data, link_data)

    # Step 7: Create visualizations
    scores_dict = {
        'SEO': scan_data["seo_score"],
        'Performance': scan_data["performance_score"],
        'Accessibility': accessibility_data['accessibility_score'],
        'Security': scan_data["security_score"],
        'Mobile': mobile_data['mobile_score']
    }

    gauge_overall = create_gauge_chart(overall_score, "Overall Score")
    radar_chart = create_radar_chart(scores_dict)
    metrics_chart = create_metrics_bar_chart(scan_data)
    trend_chart = create_trend_chart(url)

    # Step 8: Format results
    summary = f"""
# 🎯 Audit Summary for {url}

## 📊 Scores
- **Overall Score:** {overall_score}/100
- **SEO Score:** {scan_data['seo_score']}/100
- **Performance Score:** {scan_data['performance_score']}/100
- **Accessibility Score:** {accessibility_data['accessibility_score']}/100
- **Security Score:** {scan_data['security_score']}/100
- **Mobile Score:** {mobile_data['mobile_score']}/100

## 🔧 Technical Metrics
- **Load Time:** {scan_data.get('load_time', 0)}s
- **Page Size:** {scan_data.get('page_size_mb', 0):.2f} MB
- **HTTPS:** {'✅ Yes' if scan_data.get('https') else '❌ No'}
- **Status Code:** {scan_data.get('status_code', 'N/A')}

## 🔗 Link Health
- **Total Links Checked:** {link_data['total_links_checked']}
- **Working Links:** {link_data['working_links']}
- **Broken Links:** {link_data['broken_links_count']}
- **Health Status:** {link_data['link_health']}

## 📱 Mobile Friendliness
- **Status:** {mobile_data['mobile_friendly']}

## ♿ Accessibility
- **WCAG Compliance:** {accessibility_data['wcag_compliance']}
"""

    # Format AI Issues
    ai_issues_text = "## ⚠️ AI Detected Issues\n\n"
    for issue in ai_report.get('issues', [])[:10]:
        ai_issues_text += f"- {issue}\n"

    # Format AI Suggestions
    ai_suggestions_text = "## ✅ AI Recommendations\n\n"
    for suggestion in ai_report.get('suggestions', [])[:10]:
        ai_suggestions_text += f"- {suggestion}\n"

    # Format Accessibility Issues
    accessibility_text = "## ♿ Accessibility Issues\n\n"
    for issue in accessibility_data.get('accessibility_issues', []):
        accessibility_text += f"{issue}\n\n"

    # Format Mobile Issues
    mobile_text = "## 📱 Mobile Issues\n\n"
    for issue in mobile_data.get('mobile_issues', []):
        mobile_text += f"{issue}\n\n"

    # Format Broken Links
    broken_links_text = "## 🔗 Broken Links Details\n\n"
    if link_data['broken_links_details']:
        for broken in link_data['broken_links_details']:
            broken_links_text += f"- **URL:** {broken['url']}\n"
            broken_links_text += f"  **Status:** {broken['status']}\n\n"
    else:
        broken_links_text += "✅ No broken links detected!\n"

    # Generate PDF
    try:
        pdf_path = generate_pdf_report(url, scan_data, ai_report, accessibility_data, mobile_data, link_data)
    except:
        pdf_path = None

    return (
        summary,
        ai_issues_text,
        ai_suggestions_text,
        accessibility_text,
        mobile_text,
        broken_links_text,
        gauge_overall,
        radar_chart,
        metrics_chart,
        trend_chart if trend_chart else None,
        pdf_path
    )

# Create Gradio Interface
with gr.Blocks(title="AuditAI - Agentic Website Auditor", theme=gr.themes.Soft()) as demo:

    gr.Markdown("""
    # 🧠 AuditAI - Agentic AI Website Auditor
    **Powered by Google Gemini 1.5 Flash | Enhanced with Advanced Analytics**

    Comprehensive website auditing with SEO, Performance, Accessibility, Security, and Mobile analysis.
    """)

    with gr.Row():
        with gr.Column(scale=3):
            url_input = gr.Textbox(
                label="Website URL",
                placeholder="https://example.com",
                info="Enter the full URL of the website to audit"
            )
        with gr.Column(scale=1):
            check_links_checkbox = gr.Checkbox(
                label="Check for Broken Links",
                value=True,
                info="May take longer"
            )

    audit_btn = gr.Button("🚀 Start Audit", variant="primary", size="lg")

    with gr.Tabs():
        with gr.Tab("📊 Overview"):
            summary_output = gr.Markdown(label="Audit Summary")

            with gr.Row():
                gauge_plot = gr.Plot(label="Overall Score")
                radar_plot = gr.Plot(label="Health Radar")

        with gr.Tab("📈 Metrics & Trends"):
            metrics_plot = gr.Plot(label="Technical Metrics")
            trend_plot = gr.Plot(label="Historical Trends")

        with gr.Tab("⚠️ Issues"):
            ai_issues_output = gr.Markdown(label="AI Detected Issues")
            accessibility_output = gr.Markdown(label="Accessibility Issues")
            mobile_output = gr.Markdown(label="Mobile Issues")
            broken_links_output = gr.Markdown(label="Broken Links")

        with gr.Tab("✅ Recommendations"):
            ai_suggestions_output = gr.Markdown(label="AI Recommendations")

        with gr.Tab("📄 PDF Report"):
            gr.Markdown("### Download your comprehensive audit report")
            pdf_output = gr.File(label="Download PDF Report")

    # Event handler
    audit_btn.click(
        fn=audit_website,
        inputs=[url_input, check_links_checkbox],
        outputs=[
            summary_output,
            ai_issues_output,
            ai_suggestions_output,
            accessibility_output,
            mobile_output,
            broken_links_output,
            gauge_plot,
            radar_plot,
            metrics_plot,
            trend_plot,
            pdf_output
        ]
    )

    gr.Markdown("""
    ---
    ### 👨‍💻 Built by Sakshi Gupta
    **Features:** SEO Analysis • Performance Metrics • Accessibility Check • Broken Link Detection •
    Mobile Responsiveness • AI-Powered Insights • PDF Reports • Historical Tracking
    """)

if __name__ == "__main__":
    demo.launch(share=True)
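`audit_website` can also be called directly, outside the Gradio interface; it returns the same 11-element tuple that is wired to the UI outputs above. A sketch, assuming the project dependencies and `.env` are in place; `https://example.com` is a placeholder and link checking is skipped to keep the run short.

```python
# Sketch: run one audit programmatically and unpack the UI outputs.
from app_gradio import audit_website

results = audit_website("https://example.com", check_links=False)
(summary, ai_issues, ai_suggestions, accessibility_md, mobile_md,
 broken_links_md, gauge_fig, radar_fig, metrics_fig, trend_fig, pdf_path) = results

print(summary)    # Markdown summary with all scores
print(pdf_path)   # path to the generated PDF, or None if generation failed
```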
history_tracker.py
ADDED
@@ -0,0 +1,69 @@
import json
import os
from datetime import datetime

HISTORY_FILE = "audit_history.json"

def load_history():
    """Load audit history from JSON file"""
    if os.path.exists(HISTORY_FILE):
        try:
            with open(HISTORY_FILE, 'r') as f:
                return json.load(f)
        except:
            return []
    return []

def save_audit(url, scan_data, ai_report, accessibility_data, mobile_data, link_data):
    """Save current audit to history"""
    history = load_history()

    audit_entry = {
        'timestamp': datetime.now().isoformat(),
        'url': url,
        'overall_score': scan_data.get('overall_score', 0),
        'seo_score': scan_data.get('seo_score', 0),
        'performance_score': scan_data.get('performance_score', 0),
        'accessibility_score': accessibility_data.get('accessibility_score', 0),
        'security_score': scan_data.get('security_score', 0),
        'mobile_score': mobile_data.get('mobile_score', 0),
        'load_time': scan_data.get('load_time', 0),
        'page_size_mb': scan_data.get('page_size_mb', 0),
        'broken_links': link_data.get('broken_links_count', 0),
        'https': scan_data.get('https', False)
    }

    history.append(audit_entry)

    # Keep only last 100 audits
    history = history[-100:]

    with open(HISTORY_FILE, 'w') as f:
        json.dump(history, f, indent=2)

    return audit_entry

def get_site_history(url, limit=10):
    """Get history for a specific site"""
    history = load_history()
    site_history = [entry for entry in history if entry['url'] == url]
    return site_history[-limit:]

def get_trend_data(url):
    """Get trend data for charts"""
    site_history = get_site_history(url, limit=20)

    if not site_history:
        return None

    dates = [entry['timestamp'][:10] for entry in site_history]
    scores = {
        'Overall': [entry['overall_score'] for entry in site_history],
        'SEO': [entry['seo_score'] for entry in site_history],
        'Performance': [entry['performance_score'] for entry in site_history],
        'Accessibility': [entry['accessibility_score'] for entry in site_history],
        'Security': [entry['security_score'] for entry in site_history],
        'Mobile': [entry['mobile_score'] for entry in site_history]
    }

    return {'dates': dates, 'scores': scores}
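A usage sketch for the history helpers: after auditing the same URL a few times, `get_site_history` returns the stored entries and `get_trend_data` returns the dates/scores structure that the trend chart plots. The URL below is a placeholder and must match one already present in `audit_history.json`.

```python
# Sketch: read back stored audits for one site.
from history_tracker import get_site_history, get_trend_data

url = "https://example.com"   # placeholder; must match a previously audited URL
for entry in get_site_history(url, limit=5):
    print(entry["timestamp"], entry["overall_score"])

trend = get_trend_data(url)
if trend:
    print(trend["dates"])
    print(trend["scores"]["Overall"])
```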
link_checker.py
ADDED
@@ -0,0 +1,74 @@
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed

def check_broken_links(url, soup, max_links=50, timeout=5):
    """
    Checks for broken links on the page
    Returns dict with broken links, total links checked, and status
    """
    broken_links = []
    working_links = 0
    skipped_links = 0

    # Extract all links
    all_links = soup.find_all('a', href=True)
    links_to_check = []

    for link in all_links[:max_links]:  # Limit to avoid overwhelming
        href = link.get('href')

        # Skip anchors, mailto, tel, javascript
        if href.startswith(('#', 'mailto:', 'tel:', 'javascript:')):
            skipped_links += 1
            continue

        # Convert relative URLs to absolute
        full_url = urljoin(url, href)

        # Only check HTTP/HTTPS
        if full_url.startswith(('http://', 'https://')):
            links_to_check.append((href, full_url))

    # Check links in parallel for speed
    def check_single_link(link_data):
        original_href, full_url = link_data
        try:
            response = requests.head(full_url, timeout=timeout, allow_redirects=True,
                                     headers={"User-Agent": "AI-Site-Auditor"})

            # If HEAD fails, try GET
            if response.status_code >= 400:
                response = requests.get(full_url, timeout=timeout,
                                        headers={"User-Agent": "AI-Site-Auditor"})

            if response.status_code >= 400:
                return {'broken': True, 'url': original_href, 'status': response.status_code}
            else:
                return {'broken': False}
        except requests.exceptions.RequestException as e:
            return {'broken': True, 'url': original_href, 'status': 'Error', 'error': str(e)[:50]}

    # Use ThreadPoolExecutor for parallel checking
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = {executor.submit(check_single_link, link): link for link in links_to_check}

        for future in as_completed(futures):
            result = future.result()
            if result['broken']:
                broken_links.append(result)
            else:
                working_links += 1

    total_checked = len(links_to_check)
    broken_count = len(broken_links)

    return {
        'total_links_checked': total_checked,
        'working_links': working_links,
        'broken_links_count': broken_count,
        'broken_links_details': broken_links[:10],  # Limit details to first 10
        'skipped_links': skipped_links,
        'link_health': 'Excellent' if broken_count == 0 else 'Good' if broken_count <= 2 else 'Needs Attention'
    }
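A usage sketch for `check_broken_links`: fetch and parse the page yourself (as `app_gradio.py` does), then pass the soup in. The URL is a placeholder and `max_links` is kept small so the parallel HEAD/GET requests finish quickly.

```python
# Sketch: check links on one page and print the health summary.
import requests
from bs4 import BeautifulSoup
from link_checker import check_broken_links

url = "https://example.com"   # placeholder
html = requests.get(url, timeout=10, headers={"User-Agent": "AI-Site-Auditor"}).text
report = check_broken_links(url, BeautifulSoup(html, "html.parser"), max_links=20)

print(report["total_links_checked"], "checked,",
      report["broken_links_count"], "broken ->", report["link_health"])
for broken in report["broken_links_details"]:
    print("-", broken["url"], broken["status"])
```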
mobile_checker.py
ADDED
@@ -0,0 +1,89 @@
from bs4 import BeautifulSoup

def check_mobile_responsiveness(soup, page_size_mb):
    """
    Checks mobile-friendliness and responsive design
    Returns dict with mobile issues and score
    """
    issues = []
    score = 100

    # Check viewport meta tag
    viewport = soup.find('meta', attrs={'name': 'viewport'})
    if not viewport:
        issues.append("❌ Missing viewport meta tag - critical for mobile devices")
        score -= 25
    else:
        content = viewport.get('content', '')
        if 'width=device-width' not in content:
            issues.append("⚠️ Viewport should include 'width=device-width'")
            score -= 10
        if 'initial-scale=1' not in content:
            issues.append("⚠️ Viewport should include 'initial-scale=1'")
            score -= 5

    # Check for responsive images
    images = soup.find_all('img')
    responsive_images = [img for img in images if img.get('srcset') or img.get('sizes')]
    if images and len(responsive_images) == 0:
        issues.append("⚠️ No responsive images detected (consider using srcset)")
        score -= 10

    # Check page size for mobile
    if page_size_mb > 3:
        issues.append(f"❌ Page size ({page_size_mb:.2f}MB) too large for mobile - should be <3MB")
        score -= 15
    elif page_size_mb > 1.5:
        issues.append(f"⚠️ Page size ({page_size_mb:.2f}MB) could be optimized for mobile")
        score -= 5

    # Check for mobile-unfriendly elements
    flash = soup.find_all(['embed', 'object'], type='application/x-shockwave-flash')
    if flash:
        issues.append("❌ Flash content detected - not supported on mobile devices")
        score -= 20

    # Check for fixed width elements
    tables = soup.find_all('table')
    for table in tables:
        if table.get('width') and 'px' in str(table.get('width')):
            issues.append("⚠️ Fixed-width tables detected - may not be mobile-friendly")
            score -= 5
            break

    # Check for touch-friendly elements
    buttons = soup.find_all('button')
    links = soup.find_all('a')
    small_touch_targets = 0
    for elem in buttons + links:
        style = elem.get('style', '')
        if 'font-size' in style and any(size in style for size in ['8px', '9px', '10px']):
            small_touch_targets += 1

    if small_touch_targets > 0:
        issues.append(f"⚠️ {small_touch_targets} elements may have small touch targets")
        score -= 10

    # Check for media queries in stylesheets
    styles = soup.find_all('style')
    links_css = soup.find_all('link', rel='stylesheet')
    has_media_queries = False
    for style in styles:
        if '@media' in style.get_text():
            has_media_queries = True
            break

    if not has_media_queries and len(styles) > 0:
        issues.append("⚠️ No media queries detected in inline styles")
        score -= 10

    # Check font sizes
    if not soup.find_all(style=lambda x: x and 'font-size' in x and any(unit in x for unit in ['em', 'rem', '%'])):
        issues.append("⚠️ Consider using relative font sizes (em, rem, %) for better mobile scaling")
        score -= 5

    return {
        'mobile_score': max(0, score),
        'mobile_issues': issues if issues else ["✅ Good mobile responsiveness"],
        'mobile_friendly': 'Yes' if score >= 80 else 'Partially' if score >= 60 else 'No'
    }
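A usage sketch for `check_mobile_responsiveness` on a small hand-written HTML snippet; the second argument is the page size in MB that `scanner.py` normally supplies, so a made-up value is passed here.

```python
# Sketch: run the mobile checks against a tiny inline document.
from bs4 import BeautifulSoup
from mobile_checker import check_mobile_responsiveness

html = """
<html>
  <head><meta name="viewport" content="width=device-width, initial-scale=1"></head>
  <body><img src="banner.jpg"><p>Hello</p></body>
</html>
"""
result = check_mobile_responsiveness(BeautifulSoup(html, "html.parser"), page_size_mb=0.4)
print(result["mobile_score"], result["mobile_friendly"])
for issue in result["mobile_issues"]:
    print("-", issue)
```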
report_generator.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from fpdf import FPDF
from datetime import datetime
import json

def _latin1(text):
    # fpdf's built-in fonts are latin-1 only; replace characters they cannot
    # encode (e.g. the emoji used in issue strings) so pdf.output() does not crash
    return str(text).encode('latin-1', 'replace').decode('latin-1')

class PDFReport(FPDF):
    def header(self):
        self.set_font('Arial', 'B', 16)
        self.cell(0, 10, 'AuditAI - Website Audit Report', 0, 1, 'C')
        self.set_font('Arial', 'I', 10)
        self.cell(0, 5, f'Generated on {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}', 0, 1, 'C')
        self.ln(5)

    def footer(self):
        self.set_y(-15)
        self.set_font('Arial', 'I', 8)
        self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')

def generate_pdf_report(url, scan_data, ai_report, accessibility_data, mobile_data, link_data):
    """
    Generates a comprehensive PDF audit report.
    Returns: path of the generated PDF file
    """
    pdf = PDFReport()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)

    # Website URL
    pdf.set_font('Arial', 'B', 14)
    pdf.cell(0, 10, 'Website Analyzed:', 0, 1)
    pdf.set_font('Arial', '', 12)
    pdf.cell(0, 8, _latin1(url), 0, 1)
    pdf.ln(5)

    # Overall Scores Section
    pdf.set_font('Arial', 'B', 14)
    pdf.set_fill_color(200, 220, 255)
    pdf.cell(0, 10, 'Overall Performance Scores', 0, 1, 'L', True)
    pdf.ln(2)

    pdf.set_font('Arial', '', 11)
    scores = [
        ('Overall Score', scan_data.get('overall_score', 0)),
        ('SEO Score', scan_data.get('seo_score', 0)),
        ('Performance Score', scan_data.get('performance_score', 0)),
        ('Accessibility Score', accessibility_data.get('accessibility_score', 0)),
        ('Security Score', scan_data.get('security_score', 0)),
        ('Mobile Score', mobile_data.get('mobile_score', 0))
    ]

    for label, score in scores:
        # Green for >= 80, orange for >= 60, red below
        color = (0, 200, 0) if score >= 80 else (255, 165, 0) if score >= 60 else (255, 0, 0)
        pdf.set_text_color(*color)
        pdf.cell(100, 8, f'{label}:', 0, 0)
        pdf.set_font('Arial', 'B', 11)
        pdf.cell(0, 8, f'{score}/100', 0, 1)
        pdf.set_font('Arial', '', 11)

    pdf.set_text_color(0, 0, 0)
    pdf.ln(5)

    # Technical Metrics
    pdf.set_font('Arial', 'B', 14)
    pdf.set_fill_color(200, 220, 255)
    pdf.cell(0, 10, 'Technical Metrics', 0, 1, 'L', True)
    pdf.ln(2)

    pdf.set_font('Arial', '', 11)
    metrics = [
        ('Load Time', f"{scan_data.get('load_time', 0)} seconds"),
        ('Page Size', f"{scan_data.get('page_size_mb', 0):.2f} MB"),
        ('HTTPS Enabled', 'Yes' if scan_data.get('https') else 'No'),
        ('Status Code', str(scan_data.get('status_code', 'N/A'))),
        ('Total Links', str(scan_data.get('links_count', 0))),
        ('Internal Links', str(scan_data.get('internal_links', 0))),
        ('External Links', str(scan_data.get('external_links', 0))),
        ('Images without ALT', str(scan_data.get('images_without_alt', 0))),
        ('H1 Tags', str(scan_data.get('h1_count', 0))),
        ('Scripts', str(scan_data.get('scripts_count', 0)))
    ]

    for label, value in metrics:
        pdf.cell(95, 7, f'{label}:', 0, 0)
        pdf.cell(0, 7, value, 0, 1)

    pdf.ln(5)

    # Link Health
    pdf.set_font('Arial', 'B', 14)
    pdf.set_fill_color(200, 220, 255)
    pdf.cell(0, 10, 'Link Health Check', 0, 1, 'L', True)
    pdf.ln(2)

    pdf.set_font('Arial', '', 11)
    pdf.cell(95, 7, 'Total Links Checked:', 0, 0)
    pdf.cell(0, 7, str(link_data.get('total_links_checked', 0)), 0, 1)
    pdf.cell(95, 7, 'Working Links:', 0, 0)
    pdf.cell(0, 7, str(link_data.get('working_links', 0)), 0, 1)
    pdf.cell(95, 7, 'Broken Links:', 0, 0)
    if link_data.get('broken_links_count', 0) > 0:
        pdf.set_text_color(255, 0, 0)
    else:
        pdf.set_text_color(0, 200, 0)
    pdf.cell(0, 7, str(link_data.get('broken_links_count', 0)), 0, 1)
    pdf.set_text_color(0, 0, 0)
    pdf.ln(5)

    # Broken Links Details
    if link_data.get('broken_links_details'):
        pdf.set_font('Arial', 'B', 12)
        pdf.cell(0, 8, 'Broken Links Found:', 0, 1)
        pdf.set_font('Arial', '', 9)
        for broken in link_data['broken_links_details'][:10]:
            pdf.multi_cell(0, 5, _latin1(f"- {broken['url']} (Status: {broken['status']})"))
        pdf.ln(3)

    # AI Detected Issues
    pdf.add_page()
    pdf.set_font('Arial', 'B', 14)
    pdf.set_fill_color(255, 200, 200)
    pdf.cell(0, 10, 'AI Detected Issues', 0, 1, 'L', True)
    pdf.ln(2)

    pdf.set_font('Arial', '', 10)
    for issue in ai_report.get('issues', [])[:15]:
        pdf.multi_cell(0, 6, _latin1(f'- {issue}'))
    pdf.ln(5)

    # Accessibility Issues
    pdf.set_font('Arial', 'B', 14)
    pdf.set_fill_color(255, 230, 200)
    pdf.cell(0, 10, 'Accessibility Issues', 0, 1, 'L', True)
    pdf.ln(2)

    pdf.set_font('Arial', '', 10)
    for issue in accessibility_data.get('accessibility_issues', [])[:15]:
        pdf.multi_cell(0, 6, _latin1(issue))
    pdf.ln(5)

    # Mobile Issues
    pdf.set_font('Arial', 'B', 14)
    pdf.set_fill_color(230, 200, 255)
    pdf.cell(0, 10, 'Mobile Responsiveness Issues', 0, 1, 'L', True)
    pdf.ln(2)

    pdf.set_font('Arial', '', 10)
    for issue in mobile_data.get('mobile_issues', [])[:15]:
        pdf.multi_cell(0, 6, _latin1(issue))
    pdf.ln(5)

    # AI Suggestions
    pdf.add_page()
    pdf.set_font('Arial', 'B', 14)
    pdf.set_fill_color(200, 255, 200)
    pdf.cell(0, 10, 'AI Recommendations', 0, 1, 'L', True)
    pdf.ln(2)

    pdf.set_font('Arial', '', 10)
    for suggestion in ai_report.get('suggestions', [])[:20]:
        pdf.multi_cell(0, 6, _latin1(f'- {suggestion}'))

    # Save PDF
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"audit_report_{timestamp}.pdf"
    pdf.output(filename)

    return filename
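For a quick smoke test of the report layout, generate_pdf_report can be fed hand-built dictionaries. In the app the real dictionaries come from the scanner, AI analyzer, accessibility, mobile and link-checker modules; the values below are made-up placeholders that only illustrate the expected keys:

from report_generator import generate_pdf_report

scan_data = {"overall_score": 78, "seo_score": 70, "performance_score": 82,
             "security_score": 90, "load_time": 1.9, "page_size_mb": 1.42,
             "https": True, "status_code": 200, "links_count": 54,
             "internal_links": 40, "external_links": 14,
             "images_without_alt": 3, "h1_count": 1, "scripts_count": 12}
ai_report = {"issues": ["Missing meta description"],
             "suggestions": ["Add a meta description tag"]}
accessibility_data = {"accessibility_score": 74,
                      "accessibility_issues": ["3 images missing alt text"]}
mobile_data = {"mobile_score": 85, "mobile_issues": ["Good mobile responsiveness"]}
link_data = {"total_links_checked": 20, "working_links": 19, "broken_links_count": 1,
             "broken_links_details": [{"url": "https://example.com/old", "status": 404}]}

print(generate_pdf_report("https://example.com", scan_data, ai_report,
                          accessibility_data, mobile_data, link_data))
# Prints something like audit_report_YYYYMMDD_HHMMSS.pdf, written to the working directory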
requirements.txt
ADDED
@@ -0,0 +1,11 @@
streamlit
gradio
requests
beautifulsoup4
google-generativeai
python-dotenv
plotly
pandas
wordcloud
matplotlib
fpdf
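None of the dependencies are version-pinned, so a fresh build resolves to the latest releases. To reproduce the environment locally:

pip install -r requirements.txt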
scanner.py
ADDED
@@ -0,0 +1,56 @@
from bs4 import BeautifulSoup
import time
from utils import safe_request

def scan_website(url):
    data = {}

    # Measure total load time including HTTP request
    start = time.time()
    response = safe_request(url)
    if not response:
        return {"error": "Unable to fetch URL", "score": 0}

    soup = BeautifulSoup(response.text, "html.parser")
    load_time = round(time.time() - start, 2)

    # Page size in MB
    page_size_mb = len(response.content) / (1024 * 1024)

    # Count internal vs external links
    internal_links = 0
    external_links = 0
    for link in soup.find_all("a", href=True):
        href = link.get("href")
        if href.startswith("http") and url.split("//")[1] in href:
            internal_links += 1
        elif href.startswith("http"):
            external_links += 1

    # Heading counts
    headings_count = {
        "H1": len(soup.find_all("h1")),
        "H2": len(soup.find_all("h2")),
        "H3": len(soup.find_all("h3"))
    }

    data.update({
        "status_code": response.status_code,
        "load_time": load_time,
        "https": url.startswith("https"),
        "title": soup.title.string if soup.title else "Missing",
        "meta_description": bool(soup.find("meta", attrs={"name": "description"})),
        "h1_count": headings_count["H1"],
        "h2_count": headings_count["H2"],
        "h3_count": headings_count["H3"],
        "headings_count": headings_count,
        "images_without_alt": len([img for img in soup.find_all("img") if not img.get("alt")]),
        "links_count": len(soup.find_all("a")),
        "internal_links": internal_links,
        "external_links": external_links,
        "scripts_count": len(soup.find_all("script")),
        "paragraph_count": len(soup.find_all("p")),
        "page_size_mb": page_size_mb
    })

    return data
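The scanner can be exercised on its own before wiring it into the Gradio UI. A minimal sketch (it performs a live HTTP request; the target URL is only an example):

from utils import normalize_url
from scanner import scan_website

data = scan_website(normalize_url("example.com"))
if "error" in data:
    print(data["error"])
else:
    print(f"Status: {data['status_code']}, load time: {data['load_time']}s, "
          f"{data['images_without_alt']} images missing alt text")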
scoring.py
ADDED
@@ -0,0 +1,25 @@
def calculate_score(scan_data):
    score = 0
    score += 15 if scan_data.get("https") else 0

    load_time = scan_data.get("load_time", 5)
    if load_time <= 1:
        score += 15
    elif load_time <= 3:
        score += 10
    else:
        score += 5

    score += 10 if scan_data.get("title") != "Missing" else 0
    score += 10 if scan_data.get("meta_description") else 0
    score += 10 if scan_data.get("h1_count", 0) >= 1 else 5

    missing_alt = scan_data.get("images_without_alt", 0)
    score += max(0, 10 - missing_alt * 2)

    score += min(5, scan_data.get("links_count", 0) * 0.1)
    score += min(5, scan_data.get("scripts_count", 0) * 0.1)

    paragraphs = scan_data.get("paragraph_count", 0)
    score += 10 if paragraphs >= 3 else max(0, paragraphs * 3)

    score += 10 if scan_data.get("status_code") == 200 else 0

    return round(min(score, 100), 2)
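As a sanity check on the weighting, a hypothetical scan result can be scored by hand (all values below are made up; the per-item contributions are noted in the comments):

from scoring import calculate_score

sample = {
    "https": True,              # +15
    "load_time": 2.4,           # +10 (between 1s and 3s)
    "title": "Home",            # +10 (not "Missing")
    "meta_description": True,   # +10
    "h1_count": 1,              # +10
    "images_without_alt": 2,    # +6  (10 - 2*2)
    "links_count": 40,          # +4.0 (40 * 0.1, capped at 5)
    "scripts_count": 12,        # +1.2
    "paragraph_count": 5,       # +10 (>= 3 paragraphs)
    "status_code": 200          # +10
}
print(calculate_score(sample))  # 86.2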
utils.py
ADDED
@@ -0,0 +1,24 @@
import re
import requests

def normalize_url(url):
    # Default to HTTPS when the user omits the scheme
    if not url.startswith(("http://", "https://")):
        return "https://" + url
    return url

def is_valid_url(url):
    # Basic sanity check; this pattern does not accept query strings or uppercase hosts
    regex = re.compile(
        r'^(https?:\/\/)?([\da-z.-]+)\.([a-z.]{2,6})([\/\w .-]*)*\/?$'
    )
    return re.match(regex, url) is not None

def safe_request(url, timeout=10):
    # Wrap requests.get so callers get None instead of an exception on network errors
    try:
        response = requests.get(
            url,
            timeout=timeout,
            headers={"User-Agent": "AI-Site-Auditor"}
        )
        return response
    except requests.exceptions.RequestException:
        return None
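The two URL helpers are meant to be used together before a scan: validate first, then normalize. A small sketch (the input string is only an example):

from utils import normalize_url, is_valid_url

raw = "example.com/about"
if is_valid_url(raw):
    print(normalize_url(raw))   # https://example.com/about
else:
    print("Please enter a valid URL")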