Sakshi2005 committed on
Commit
27697ee
·
verified ·
1 Parent(s): 710b938

Upload folder using huggingface_hub

Browse files
.env.example ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # Google Gemini API Configuration
2
+ GEMINI_API_KEY=your_gemini_api_key_here
3
+
4
+ # Get your API key from: https://aistudio.google.com/app/apikey
.gitignore ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Environment variables
2
+ .env
3
+
4
+ # Python
5
+ __pycache__/
6
+ *.py[cod]
7
+ *$py.class
8
+ *.so
9
+ .Python
10
+ *.egg-info/
11
+ dist/
12
+ build/
13
+ *.egg
14
+
15
+ # Virtual environments
16
+ venv/
17
+ env/
18
+ ENV/
19
+ .venv
20
+
21
+ # IDE
22
+ .vscode/
23
+ .idea/
24
+ *.swp
25
+ *.swo
26
+ *~
27
+
28
+ # OS
29
+ .DS_Store
30
+ Thumbs.db
31
+ desktop.ini
32
+
33
+ # Project specific
34
+ audit_history.json
35
+ audit_report_*.pdf
36
+ *.log
37
+
38
+ # Gradio
39
+ flagged/
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
README.md CHANGED
@@ -1,12 +1,94 @@
1
- ---
2
- title: Audit AI
3
- emoji: 👀
4
- colorFrom: blue
5
- colorTo: blue
6
- sdk: gradio
7
- sdk_version: 6.6.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Audit_AI
3
+ app_file: app_gradio.py
4
+ sdk: gradio
5
+ sdk_version: 5.47.2
6
+ ---
7
+ # 🧠 AuditAI — AI Website Auditor
8
+
9
+ An **Agentic AI-powered web application** built with **Gradio** that audits any website and provides **SEO, performance, accessibility, and security insights**, along with **AI-generated fixes and optimized HTML**.
10
+
11
+
12
+ ## 📌 Features
13
+
14
+ - 🔍 **Website Scanning**
15
+ - Page load time
16
+ - HTTPS detection
17
+ - Page size analysis
18
+ - Internal vs external links
19
+ - Headings structure (H1, H2, H3)
20
+ - Images without ALT attributes
21
+ - Scripts, paragraphs, and links count
22
+
23
+ - 🤖 **Agentic AI Analysis**
24
+ - Automatically detects website issues
25
+ - Provides actionable AI-powered suggestions
26
+ - Generates **HTML & SEO fix snippets**
27
+ - Produces **fully optimized HTML**
28
+ - Extracts top SEO keywords
29
+ - Analyzes heading hierarchy
30
+
31
+ - 📊 **Interactive Dashboard**
32
+ - Overall website score
33
+ - SEO, Performance, Accessibility & Security scores
34
+ - Gauge & radar charts
35
+ - Bar charts & pie charts
36
+ - Keyword word cloud
37
+ - Heading hierarchy treemap
38
+ - Page element heatmap
39
+
40
+ - ⬇️ **Download Optimized HTML**
41
+ - One-click download of AI-improved webpage
42
+
43
+ ---
44
+
45
+ ## 🔍 Usage
46
+
47
+ 1. Run the app locally using Gradio.
48
+ 2. Enter a valid website URL.
49
+ 3. Click **🚀 Start Audit**.
50
+ 4. View:
51
+ - ⚠️ Detected issues
52
+ - ✅ AI-generated suggestions
53
+ - 📊 Visual audit dashboard
54
+ - 🤖 Agentic AI fixes
55
+ - 📄 PDF Reports
56
+ 5. Download the **optimized HTML** or **PDF report** if available.
57
+
58
+ ---
59
+
60
+
61
+ ---
62
+
63
+ ## 📊 How It Works
64
+
65
+ 1. The app scans the website using **BeautifulSoup & Requests**.
66
+ 2. Raw metrics are calculated (SEO, performance, accessibility, mobile, security).
67
+ 3. Scan data is sent to **Google Gemini** for agentic analysis.
68
+ 4. AI returns:
69
+ - Issues
70
+ - Suggestions
71
+ - Fix snippets
72
+ - Optimized HTML
73
+ 5. Results are visualized in a rich Gradio dashboard.
74
+
75
+ ---
76
+
77
+ ## ⚙️ Tech Stack
78
+
79
+ - **Python 3.9+**
80
+ - **Gradio** — Web UI
81
+ - **Google Gemini API** — Agentic AI analysis
82
+ - **BeautifulSoup** — HTML parsing
83
+ - **Requests** — Web scraping
84
+ - **Plotly & Matplotlib** — Interactive charts
85
+ - **WordCloud** — Keyword visualization
86
+ - **FPDF** — PDF report generation
87
+ - **dotenv** — Environment variables
88
+
89
+ ---
90
+
91
+ ---
92
+
93
+ ## 👨‍💻 Author
94
+ **Sakshi Gupta**
README_GRADIO.md ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🧠 AuditAI — Enhanced Agentic AI Website Auditor (Gradio Edition)
2
+
3
+ An **Agentic AI-powered web application** built with **Gradio** that provides comprehensive website audits including **SEO, performance, accessibility, security, mobile responsiveness**, and **broken link detection** with **AI-generated insights and PDF reports**.
4
+
5
+ ---
6
+
7
+ ## 🆕 What's New in Gradio Edition
8
+
9
+ ### **Enhanced Features:**
10
+ - ✅ **Accessibility Checker** - WCAG 2.1 compliance analysis
11
+ - ✅ **Mobile Responsiveness Analyzer** - Viewport, responsive images, touch targets
12
+ - ✅ **Broken Link Detection** - Parallel link checking with detailed reports
13
+ - ✅ **PDF Report Generation** - Professional downloadable audit reports
14
+ - ✅ **Historical Tracking** - Track score improvements over time
15
+ - ✅ **Trend Analysis** - Visualize performance changes across audits
16
+ - ✅ **Enhanced UI** - Modern Gradio tabbed interface with better UX
17
+
18
+ ### **Original Features (Retained):**
19
+ - 🔍 Website scanning (load time, HTTPS, page size, links, headings)
20
+ - 🤖 Agentic AI analysis with Google Gemini 1.5 Flash
21
+ - 📊 Interactive visualizations (gauges, radar charts, bar charts)
22
+ - ⬇️ Downloadable optimized HTML
23
+ - 💡 AI-powered suggestions and fix snippets
24
+
25
+ ---
26
+
27
+ ## 🚀 Quick Start
28
+
29
+ ### 1️⃣ Install Dependencies
30
+
31
+ ```bash
32
+ pip install -r requirements.txt
33
+ ```
34
+
35
+ ### 2️⃣ Set Up Gemini API Key
36
+
37
+ Create a `.env` file in the project root:
38
+
39
+ ```env
40
+ GEMINI_API_KEY=your_gemini_api_key_here
41
+ ```
42
+
43
+ ### 3️⃣ Run the Gradio App
44
+
45
+ ```bash
46
+ python app_gradio.py
47
+ ```
48
+
49
+ The app will launch at `http://localhost:7860` with a shareable link.
50
+
51
+ ### 4️⃣ Run the Original Streamlit App (Optional)
52
+
53
+ ```bash
54
+ streamlit run app.py
55
+ ```
56
+
57
+ ---
58
+
59
+ ## 📋 New Features Details
60
+
61
+ ### **♿ Accessibility Checker** (`accessibility_checker.py`)
62
+ Analyzes WCAG 2.1 compliance:
63
+ - Missing alt text on images
64
+ - Proper heading hierarchy (H1-H6)
65
+ - Form labels and ARIA landmarks
66
+ - Link text quality
67
+ - Language attributes
68
+ - Skip navigation links
69
+ - Video captions
70
+
71
+ ### **📱 Mobile Responsiveness** (`mobile_checker.py`)
72
+ Checks mobile-friendliness:
73
+ - Viewport meta tag validation
74
+ - Responsive images (srcset/sizes)
75
+ - Page size optimization for mobile
76
+ - Flash content detection
77
+ - Fixed-width elements
78
+ - Touch target sizes
79
+ - Media queries analysis
80
+ - Relative font sizing
81
+
82
+ ### **🔗 Broken Link Detector** (`link_checker.py`)
83
+ Identifies broken links:
84
+ - Parallel processing for speed (10 concurrent workers)
85
+ - Checks up to 50 links per audit
86
+ - HTTP status code validation
87
+ - Internal vs external link tracking
88
+ - Detailed error reporting
89
+
90
+ ### **📄 PDF Report Generator** (`report_generator.py`)
91
+ Creates professional reports:
92
+ - Multi-page comprehensive audit summary
93
+ - Color-coded scores and metrics
94
+ - All detected issues organized by category
95
+ - AI recommendations
96
+ - Broken link details
97
+ - Timestamp and metadata
98
+
99
+ ### **📈 Historical Tracking** (`history_tracker.py`)
100
+ Tracks performance over time:
101
+ - JSON-based storage (last 100 audits)
102
+ - Per-site history retrieval
103
+ - Trend data for visualizations
104
+ - Score comparison across audits
105
+
106
+ ---
107
+
108
+ ## 🎨 Gradio UI Structure
109
+
110
+ The new interface uses **5 tabs**:
111
+
112
+ 1. **📊 Overview** - Summary, scores, gauge & radar charts
113
+ 2. **📈 Metrics & Trends** - Technical metrics and historical trends
114
+ 3. **⚠️ Issues** - AI, accessibility, mobile, and broken link issues
115
+ 4. **✅ Recommendations** - AI-powered suggestions
116
+ 5. **📄 PDF Report** - Download comprehensive report
117
+
118
+ ---
119
+
120
+ ## 📊 Scoring System
121
+
122
+ ### **Overall Score Calculation** (0-100)
123
+ Based on:
124
+ - HTTPS (15 points)
125
+ - Load time (5-15 points)
126
+ - Title presence (10 points)
127
+ - Meta description (10 points)
128
+ - H1 tags (5-10 points)
129
+ - Images with alt text (up to 10 points)
130
+ - Links & scripts (up to 10 points)
131
+ - Paragraph content (up to 10 points)
132
+ - HTTP status (10 points)
133
+
134
+ ### **Individual Scores**
135
+ - **SEO Score:** `100 - (images_without_alt × 5)`
136
+ - **Performance Score:** `100 - (load_time × 10)`
137
+ - **Accessibility Score:** WCAG compliance based (0-100)
138
+ - **Security Score:** 100 if HTTPS, else 50
139
+ - **Mobile Score:** Mobile-friendliness based (0-100)
140
+
141
+ ---
142
+
143
+ ## 🔧 Tech Stack
144
+
145
+ ### **Core Technologies**
146
+ - **Python 3.9+**
147
+ - **Gradio 4.x** — Modern web UI framework
148
+ - **Google Gemini API** — Gemini 1.5 Flash for AI analysis
149
+ - **BeautifulSoup4** — HTML parsing
150
+ - **Requests** — HTTP client
151
+
152
+ ### **Visualization & Reports**
153
+ - **Plotly** — Interactive charts (gauges, radar, bar)
154
+ - **Matplotlib** — Word clouds
155
+ - **Pandas** — Data manipulation
156
+ - **FPDF** — PDF generation
157
+
158
+ ### **Other**
159
+ - **python-dotenv** — Environment variables
160
+ - **concurrent.futures** — Parallel link checking
161
+
162
+ ---
163
+
164
+ ## 📁 Project Structure
165
+
166
+ ```
167
+ AuditAI-main/
168
+ ├── app.py # Original Streamlit app
169
+ ├── app_gradio.py # NEW: Gradio app
170
+ ├── scanner.py # Website scanner
171
+ ├── ai_analyzer.py # Google Gemini AI integration
172
+ ├── scoring.py # Score calculation
173
+ ├── dashboard.py # Streamlit dashboard
174
+ ├── utils.py # Utility functions
175
+ ├── accessibility_checker.py # NEW: Accessibility analysis
176
+ ├── mobile_checker.py # NEW: Mobile responsiveness
177
+ ├── link_checker.py # NEW: Broken link detection
178
+ ├── report_generator.py # NEW: PDF generation
179
+ ├── history_tracker.py # NEW: Historical tracking
180
+ ├── requirements.txt # Dependencies
181
+ ├── README.md # Original readme
182
+ ├── README_GRADIO.md # This file
183
+ └── .env # API keys (create this)
184
+ ```
185
+
186
+ ---
187
+
188
+ ## 🎯 Usage Guide
189
+
190
+ 1. **Enter URL:** Input the website URL (e.g., `https://example.com`)
191
+ 2. **Choose Options:** Check/uncheck "Check for Broken Links" (optional, slower)
192
+ 3. **Click Audit:** Start the comprehensive analysis
193
+ 4. **View Results:**
194
+ - Overview tab shows summary and scores
195
+ - Issues tab lists all detected problems
196
+ - Recommendations tab shows AI suggestions
197
+ - PDF tab provides downloadable report
198
+ 5. **Track Progress:** Re-audit the same site to see trend improvements
199
+
200
+ ---
201
+
202
+ ## ⚡ Performance Notes
203
+
204
+ - **Broken Link Checking:** Uses parallel processing (10 workers) but can take 30-60s for 50 links
205
+ - **AI Analysis:** Powered by Google Gemini AI | Enhanced with Advanced Analytics
206
+ - **PDF Generation:** Instant (<1s)
207
+ - **Historical Trends:** Only show after 2+ audits of the same site
208
+
209
+ ---
210
+
211
+ ## 🔒 Environment Variables
212
+
213
+ Required in `.env` file:
214
+
215
+ ```env
216
+ GEMINI_API_KEY=your-gemini-key-here
217
+ ```
218
+
219
+ ---
220
+
221
+ ## 🆚 Gradio vs Streamlit
222
+
223
+ ### **Why Gradio?**
224
+ - ✅ Easier deployment (built-in sharing)
225
+ - ✅ Better tab organization
226
+ - ✅ Cleaner API for complex workflows
227
+ - ✅ Automatic shareable links
228
+ - ✅ Better mobile experience
229
+
230
+ ### **Keeping Streamlit?**
231
+ Both versions are maintained. Use:
232
+ - `app_gradio.py` for the enhanced version
233
+ - `app.py` for the original Streamlit version
234
+
235
+ ---
236
+
237
+ ## 👨‍💻 Author
238
+
239
+ **Mirza Yasir Abdullah Baig**
240
+
241
+ - 🌐 [Kaggle](https://www.kaggle.com/mirzayasirabdullah07)
242
+ - 💼 [LinkedIn](https://www.linkedin.com/in/mirza-yasir-abdullah-baig/)
243
+ - 💻 [GitHub](https://github.com/mirzayasirabdullahbaig07)
244
+
245
+ ---
246
+
247
+ ## 📝 License
248
+
249
+ Educational purposes. Not for commercial use without permission.
250
+
251
+ ---
252
+
253
+ ## 🐛 Troubleshooting
254
+
255
+ **Issue:** Gemini API errors
256
+ **Solution:** Check your API key in `.env` and get it from https://aistudio.google.com/app/apikey
257
+
258
+ **Issue:** Broken link checking takes too long
259
+ **Solution:** Uncheck the "Check for Broken Links" option
260
+
261
+ **Issue:** PDF generation fails
262
+ **Solution:** Ensure `fpdf` is installed: `pip install fpdf`
263
+
264
+ **Issue:** No trend data shown
265
+ **Solution:** Audit the same site multiple times to build history
266
+
267
+ ---
268
+
269
+ ## 🚀 Future Enhancements
270
+
271
+ - [ ] Multi-page website crawling
272
+ - [ ] Competitor comparison
273
+ - [ ] Lighthouse integration
274
+ - [ ] Email report scheduling
275
+ - [ ] Database storage (replace JSON)
276
+ - [ ] Custom scoring weights
277
+ - [ ] Screenshot capture
278
+ - [ ] Security header analysis
279
+
280
+ ---
281
+
282
+ ## 📸 Screenshots
283
+
284
+ Coming soon! Run the app to see the beautiful new Gradio interface.
285
+
286
+ ---
287
+
288
+ **Enjoy auditing! 🎉**
accessibility_checker.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bs4 import BeautifulSoup
2
+
3
def check_accessibility(soup, url):
    """
    Run a set of heuristic WCAG 2.1 checks against a parsed page.

    Args:
        soup: BeautifulSoup document for the page.
        url: page URL (currently unused; kept for interface stability).

    Returns:
        dict with:
            accessibility_score (int, clamped to 0-100),
            accessibility_issues (list[str], emoji-prefixed messages),
            wcag_compliance ('Good' | 'Needs Improvement' | 'Poor')
    """
    issues = []
    score = 100

    # Images must carry alt text for screen readers.
    # NOTE(review): an empty alt="" (valid for decorative images) is counted
    # as missing here too — confirm whether decorative images should be exempt.
    images = soup.find_all('img')
    images_without_alt = [img for img in images if not img.get('alt')]
    if images_without_alt:
        issues.append(f"❌ {len(images_without_alt)} images missing alt text")
        score -= min(20, len(images_without_alt) * 2)  # cap this deduction at 20

    # Exactly one H1 is expected for a clear document outline.
    h1_count = len(soup.find_all('h1'))
    if h1_count == 0:
        issues.append("❌ No H1 heading found - important for screen readers")
        score -= 10
    elif h1_count > 1:
        issues.append(f"⚠️ Multiple H1 headings ({h1_count}) - should be unique")
        score -= 5

    # Every non-button form control should have an associated <label for=...>.
    # The inner `break` reports at most once per form, but each offending form
    # still adds its own issue and deduction.
    forms = soup.find_all('form')
    for form in forms:
        inputs = form.find_all(['input', 'select', 'textarea'])
        for input_elem in inputs:
            if input_elem.get('type') not in ['submit', 'button', 'hidden']:
                label_id = input_elem.get('id')
                if not label_id or not form.find('label', {'for': label_id}):
                    issues.append("❌ Form inputs missing associated labels")
                    score -= 5
                    break

    # Inline style attributes often hard-code colors/sizes that override
    # user stylesheets; flagged once regardless of how many are found.
    inline_styles = soup.find_all(style=True)
    if inline_styles:
        issues.append("⚠️ Inline styles detected - may affect accessibility")
        score -= 3

    # HTML5 landmark elements help assistive tech navigate the page.
    main_tag = soup.find('main')
    nav_tag = soup.find('nav')
    if not main_tag:
        issues.append("⚠️ No <main> landmark - helps screen reader navigation")
        score -= 5
    if not nav_tag:
        issues.append("⚠️ No <nav> landmark found")
        score -= 3

    # Generic link text gives no context when read out of line.
    links = soup.find_all('a')
    generic_link_text = ['click here', 'read more', 'here', 'link']
    for link in links:
        text = link.get_text().strip().lower()
        if text in generic_link_text:
            issues.append("❌ Generic link text found (e.g., 'click here') - use descriptive text")
            score -= 5
            break  # one report is enough

    # <html lang="..."> lets screen readers pick the right voice.
    html_tag = soup.find('html')
    if html_tag and not html_tag.get('lang'):
        issues.append("❌ Missing lang attribute on <html> tag")
        score -= 10

    # Skip-navigation links let keyboard users jump past repeated headers.
    skip_link = soup.find('a', href='#main') or soup.find('a', href='#content')
    if not skip_link:
        issues.append("⚠️ No skip navigation link found")
        score -= 5

    # Videos should ship a captions track.
    # NOTE(review): only <track kind="captions"> is accepted; a page using
    # kind="subtitles" would still be flagged — confirm that is intended.
    videos = soup.find_all('video')
    for video in videos:
        if not video.find('track', kind='captions'):
            issues.append("❌ Videos missing captions/subtitles")
            score -= 10
            break

    return {
        'accessibility_score': max(0, score),  # never report below 0
        'accessibility_issues': issues if issues else ["✅ No major accessibility issues detected"],
        'wcag_compliance': 'Good' if score >= 80 else 'Needs Improvement' if score >= 60 else 'Poor'
    }
ai_analyzer.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import google.generativeai as genai
2
+ from dotenv import load_dotenv
3
+ import os
4
+ import json
5
+ import re
6
+
7
# Configure the Gemini client once at import time; the key is read from the
# .env file (see .env.example) via GEMINI_API_KEY.
load_dotenv()
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
# Shared model handle reused for every analyze_with_ai() call.
model = genai.GenerativeModel('gemini-1.5-flash')
10
+
11
def analyze_with_ai(scan_data):
    """
    Ask Gemini for an agentic audit of the scanned site.

    Args:
        scan_data: dict of raw scan metrics (title, counts, load_time, ...).

    Returns:
        dict with keys: issues, suggestions, fix_snippets, optimized_html,
        keywords, headings_count. Falls back to a heuristic report when the
        API call or JSON parsing fails.
    """
    # Cheap keyword fallback derived from the page title.
    fallback_keywords = re.findall(r'\b\w+\b', scan_data.get("title", ""))[:10]

    prompt = f"""
    You are a website audit and optimization expert.
    Analyze this website scan data and provide:
    1) issues (list)
    2) suggestions (list)
    3) fix_snippets (list of HTML/SEO fixes)
    4) optimized_html (full HTML content with improvements applied)
    5) keywords (list)
    6) headings_count (dict of H1, H2, H3 counts)

    Respond ONLY in JSON format.

    Scan Data:
    {json.dumps(scan_data, indent=2)}
    """
    try:
        raw = model.generate_content(prompt).text

        # Gemini often wraps its JSON answer in markdown fences; strip them.
        if '```json' in raw:
            raw = raw.split('```json')[1].split('```')[0].strip()
        elif '```' in raw:
            raw = raw.split('```')[1].split('```')[0].strip()

        report = json.loads(raw)

        # Guarantee every expected key exists even if the model omitted some.
        defaults = {
            "keywords": fallback_keywords,
            "headings_count": scan_data.get("headings_count", {}),
            "fix_snippets": [],
            "optimized_html": "",
        }
        for key, value in defaults.items():
            report.setdefault(key, value)
        return report

    except Exception:
        # Best-effort heuristic report when the API or JSON path fails.
        return {
            "issues": [
                f"H1 tags found: {scan_data.get('h1_count',0)}",
                f"Images without ALT: {scan_data.get('images_without_alt',0)}",
                f"Page load time: {scan_data.get('load_time',0)}s"
            ],
            "suggestions": [
                "Add missing meta description",
                "Optimize images and include ALT text",
                "Improve page speed"
            ],
            "fix_snippets": [
                "<meta name='description' content='Your description here'>",
                "<img src='image.jpg' alt='Descriptive text'>"
            ],
            "optimized_html": "<!-- Add optimized HTML here -->",
            "keywords": fallback_keywords,
            "headings_count": scan_data.get("headings_count", {})
        }
app_gradio.py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from scanner import scan_website
3
+ from ai_analyzer import analyze_with_ai
4
+ from utils import normalize_url, is_valid_url
5
+ from scoring import calculate_score
6
+ from accessibility_checker import check_accessibility
7
+ from mobile_checker import check_mobile_responsiveness
8
+ from link_checker import check_broken_links
9
+ from report_generator import generate_pdf_report
10
+ from history_tracker import save_audit, get_trend_data
11
+ import plotly.graph_objects as go
12
+ import plotly.express as px
13
+ import pandas as pd
14
+ from bs4 import BeautifulSoup
15
+ import requests
16
+
17
def create_gauge_chart(score, title):
    """Render a single 0-100 gauge indicator for one score."""
    # Band the dial: red-ish below 50, yellow to 80, green above.
    gauge_config = {
        'axis': {'range': [0, 100]},
        'bar': {'color': "darkblue"},
        'steps': [
            {'range': [0, 50], 'color': "lightcoral"},
            {'range': [50, 80], 'color': "lightyellow"},
            {'range': [80, 100], 'color': "lightgreen"},
        ],
        'threshold': {
            'line': {'color': "red", 'width': 4},
            'thickness': 0.75,
            'value': 90,
        },
    }
    indicator = go.Indicator(
        mode="gauge+number",
        value=score,
        title={'text': title},
        gauge=gauge_config,
    )
    chart = go.Figure(indicator)
    chart.update_layout(height=300)
    return chart
40
+
41
def create_radar_chart(scores_dict):
    """Plot every audit score on a single filled radar (spider) chart."""
    fig = go.Figure(
        go.Scatterpolar(
            r=list(scores_dict.values()),
            theta=list(scores_dict.keys()),
            fill='toself',
            name='Audit Scores',
        )
    )
    fig.update_layout(
        polar=dict(radialaxis=dict(range=[0, 100])),
        title="Overall Website Health Radar",
        height=400,
    )
    return fig
59
+
60
def create_metrics_bar_chart(scan_data):
    """Bar chart of raw on-page counts (headings, links, scripts, ...)."""
    # Display label -> scan_data key, in the order the bars should appear.
    label_key_pairs = [
        ('H1 Tags', 'h1_count'),
        ('H2 Tags', 'h2_count'),
        ('H3 Tags', 'h3_count'),
        ('Images w/o ALT', 'images_without_alt'),
        ('Links', 'links_count'),
        ('Scripts', 'scripts_count'),
    ]
    frame = pd.DataFrame({
        'Metric': [label for label, _ in label_key_pairs],
        'Value': [scan_data.get(key, 0) for _, key in label_key_pairs],
    })

    chart = px.bar(
        frame,
        x='Metric',
        y='Value',
        title='SEO & Technical Metrics',
        color='Value',
        color_continuous_scale='Viridis',
    )
    chart.update_layout(height=400)
    return chart
80
+
81
def create_trend_chart(url):
    """Line chart of historical scores for *url*; None when no history yet."""
    trend_data = get_trend_data(url)
    if not trend_data:
        return None

    df = pd.DataFrame(trend_data['scores'])
    df['Date'] = trend_data['dates']

    fig = go.Figure()
    # Every column except the Date column we just appended is a score series.
    for series_name in df.columns[:-1]:
        fig.add_trace(
            go.Scatter(
                x=df['Date'],
                y=df[series_name],
                mode='lines+markers',
                name=series_name,
            )
        )

    fig.update_layout(
        title='Score Trends Over Time',
        xaxis_title='Date',
        yaxis_title='Score',
        height=400,
    )
    return fig
102
+
103
def audit_website(url, check_links=True):
    """
    Run the full audit pipeline for one URL.

    Args:
        url: site to audit; normalized via normalize_url() below.
        check_links: when True, also probe links for breakage (up to 50,
            slower — the UI exposes this as a checkbox).

    Returns:
        An 11-tuple matching the Gradio outputs list:
        (summary_md, ai_issues_md, ai_suggestions_md, accessibility_md,
         mobile_md, broken_links_md, gauge_fig, radar_fig, metrics_fig,
         trend_fig_or_None, pdf_path_or_None).
        On failure the first element is an error string, the rest are None.
    """
    # Error returns must stay 11 elements long to match the outputs wiring.
    empty = (None,) * 10

    if not url or not is_valid_url(url):
        return ("❌ Invalid URL",) + empty

    url = normalize_url(url)

    # Step 1: Scan website
    scan_data = scan_website(url)

    if "error" in scan_data:
        return (f"❌ Error: {scan_data['error']}",) + empty

    # Step 2: Fetch the page again for the soup-based checkers.
    try:
        response = requests.get(url, timeout=10, headers={"User-Agent": "AI-Site-Auditor"})
        soup = BeautifulSoup(response.text, 'html.parser')
    except Exception:  # narrowed from bare except: keep SystemExit/KeyboardInterrupt alive
        return ("❌ Failed to fetch page content",) + empty

    # Step 3: Run all checks
    accessibility_data = check_accessibility(soup, url)
    mobile_data = check_mobile_responsiveness(soup, scan_data.get('page_size_mb', 0))

    if check_links:
        link_data = check_broken_links(url, soup, max_links=50)
    else:
        # Same shape as check_broken_links() output so the formatting below works.
        link_data = {'total_links_checked': 0, 'working_links': 0, 'broken_links_count': 0,
                     'broken_links_details': [], 'link_health': 'Skipped'}

    # Step 4: Calculate scores
    overall_score = calculate_score(scan_data)
    scan_data["overall_score"] = overall_score
    scan_data["seo_score"] = max(0, 100 - scan_data.get("images_without_alt", 0) * 5)
    # NOTE(review): a missing load_time defaults to 5 here (i.e. score 50) —
    # confirm that pessimistic default is intended.
    scan_data["performance_score"] = max(0, 100 - scan_data.get("load_time", 5) * 10)
    scan_data["security_score"] = 100 if scan_data.get("https") else 50

    # Step 5: AI Analysis
    ai_report = analyze_with_ai(scan_data)

    # Step 6: Persist this audit so trend charts have data next time.
    save_audit(url, scan_data, ai_report, accessibility_data, mobile_data, link_data)

    # Step 7: Create visualizations
    scores_dict = {
        'SEO': scan_data["seo_score"],
        'Performance': scan_data["performance_score"],
        'Accessibility': accessibility_data['accessibility_score'],
        'Security': scan_data["security_score"],
        'Mobile': mobile_data['mobile_score']
    }

    gauge_overall = create_gauge_chart(overall_score, "Overall Score")
    radar_chart = create_radar_chart(scores_dict)
    metrics_chart = create_metrics_bar_chart(scan_data)
    trend_chart = create_trend_chart(url)

    # Step 8: Format results as markdown for the Gradio tabs.
    summary = f"""
# 🎯 Audit Summary for {url}

## 📊 Scores
- **Overall Score:** {overall_score}/100
- **SEO Score:** {scan_data['seo_score']}/100
- **Performance Score:** {scan_data['performance_score']}/100
- **Accessibility Score:** {accessibility_data['accessibility_score']}/100
- **Security Score:** {scan_data['security_score']}/100
- **Mobile Score:** {mobile_data['mobile_score']}/100

## 🔧 Technical Metrics
- **Load Time:** {scan_data.get('load_time', 0)}s
- **Page Size:** {scan_data.get('page_size_mb', 0):.2f} MB
- **HTTPS:** {'✅ Yes' if scan_data.get('https') else '❌ No'}
- **Status Code:** {scan_data.get('status_code', 'N/A')}

## 🔗 Link Health
- **Total Links Checked:** {link_data['total_links_checked']}
- **Working Links:** {link_data['working_links']}
- **Broken Links:** {link_data['broken_links_count']}
- **Health Status:** {link_data['link_health']}

## 📱 Mobile Friendliness
- **Status:** {mobile_data['mobile_friendly']}

## ♿ Accessibility
- **WCAG Compliance:** {accessibility_data['wcag_compliance']}
"""

    # Format AI Issues (capped at 10 to keep the tab readable)
    ai_issues_text = "## ⚠️ AI Detected Issues\n\n"
    for issue in ai_report.get('issues', [])[:10]:
        ai_issues_text += f"- {issue}\n"

    # Format AI Suggestions (same cap)
    ai_suggestions_text = "## ✅ AI Recommendations\n\n"
    for suggestion in ai_report.get('suggestions', [])[:10]:
        ai_suggestions_text += f"- {suggestion}\n"

    # Format Accessibility Issues
    accessibility_text = "## ♿ Accessibility Issues\n\n"
    for issue in accessibility_data.get('accessibility_issues', []):
        accessibility_text += f"{issue}\n\n"

    # Format Mobile Issues
    mobile_text = "## 📱 Mobile Issues\n\n"
    for issue in mobile_data.get('mobile_issues', []):
        mobile_text += f"{issue}\n\n"

    # Format Broken Links
    broken_links_text = "## 🔗 Broken Links Details\n\n"
    if link_data['broken_links_details']:
        for broken in link_data['broken_links_details']:
            broken_links_text += f"- **URL:** {broken['url']}\n"
            broken_links_text += f"  **Status:** {broken['status']}\n\n"
    else:
        broken_links_text += "✅ No broken links detected!\n"

    # Generate PDF — best-effort: the audit is still useful without it.
    try:
        pdf_path = generate_pdf_report(url, scan_data, ai_report, accessibility_data, mobile_data, link_data)
    except Exception:  # narrowed from bare except
        pdf_path = None

    return (
        summary,
        ai_issues_text,
        ai_suggestions_text,
        accessibility_text,
        mobile_text,
        broken_links_text,
        gauge_overall,
        radar_chart,
        metrics_chart,
        trend_chart if trend_chart else None,
        pdf_path
    )
240
+
241
+ # Create Gradio Interface
242
# --- Gradio UI definition (built at import time; launched under __main__) ---
with gr.Blocks(title="AuditAI - Agentic Website Auditor", theme=gr.themes.Soft()) as demo:

    gr.Markdown("""
    # 🧠 AuditAI - Agentic AI Website Auditor
    **Powered by Google Gemini 1.5 Flash | Enhanced with Advanced Analytics**

    Comprehensive website auditing with SEO, Performance, Accessibility, Security, and Mobile analysis.
    """)

    # Input row: URL box plus the (slower) broken-link toggle.
    with gr.Row():
        with gr.Column(scale=3):
            url_input = gr.Textbox(
                label="Website URL",
                placeholder="https://example.com",
                info="Enter the full URL of the website to audit"
            )
        with gr.Column(scale=1):
            check_links_checkbox = gr.Checkbox(
                label="Check for Broken Links",
                value=True,
                info="May take longer"
            )

    audit_btn = gr.Button("🚀 Start Audit", variant="primary", size="lg")

    # Results are organized into five tabs; the components below are filled
    # by audit_website() in the exact order of the outputs list further down.
    with gr.Tabs():
        with gr.Tab("📊 Overview"):
            summary_output = gr.Markdown(label="Audit Summary")

            with gr.Row():
                gauge_plot = gr.Plot(label="Overall Score")
                radar_plot = gr.Plot(label="Health Radar")

        with gr.Tab("📈 Metrics & Trends"):
            metrics_plot = gr.Plot(label="Technical Metrics")
            trend_plot = gr.Plot(label="Historical Trends")

        with gr.Tab("⚠️ Issues"):
            ai_issues_output = gr.Markdown(label="AI Detected Issues")
            accessibility_output = gr.Markdown(label="Accessibility Issues")
            mobile_output = gr.Markdown(label="Mobile Issues")
            broken_links_output = gr.Markdown(label="Broken Links")

        with gr.Tab("✅ Recommendations"):
            ai_suggestions_output = gr.Markdown(label="AI Recommendations")

        with gr.Tab("📄 PDF Report"):
            gr.Markdown("### Download your comprehensive audit report")
            pdf_output = gr.File(label="Download PDF Report")

    # Event handler: outputs order must match audit_website()'s return tuple.
    audit_btn.click(
        fn=audit_website,
        inputs=[url_input, check_links_checkbox],
        outputs=[
            summary_output,
            ai_issues_output,
            ai_suggestions_output,
            accessibility_output,
            mobile_output,
            broken_links_output,
            gauge_plot,
            radar_plot,
            metrics_plot,
            trend_plot,
            pdf_output
        ]
    )

    gr.Markdown("""
    ---
    ### 👨‍💻 Built by Sakshi Gupta
    **Features:** SEO Analysis • Performance Metrics • Accessibility Check • Broken Link Detection •
    Mobile Responsiveness • AI-Powered Insights • PDF Reports • Historical Tracking
    """)

if __name__ == "__main__":
    # share=True publishes a temporary public gradio.live URL in addition to localhost.
    demo.launch(share=True)
history_tracker.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from datetime import datetime
4
+
5
HISTORY_FILE = "audit_history.json"

def load_history():
    """Load the audit history list from HISTORY_FILE.

    Returns:
        list: previously saved audit entries, or an empty list when the
        file is absent, unreadable, or does not contain valid JSON.
    """
    if not os.path.exists(HISTORY_FILE):
        return []
    try:
        with open(HISTORY_FILE, 'r') as fh:
            return json.load(fh)
    # Narrowed from a bare `except:` which also swallowed SystemExit and
    # KeyboardInterrupt. ValueError covers json.JSONDecodeError (its
    # subclass); OSError covers read failures and races on the file.
    except (OSError, ValueError):
        return []
16
+
17
def save_audit(url, scan_data, ai_report, accessibility_data, mobile_data, link_data):
    """Append the current audit results to the persistent history file.

    Builds a flat summary record from the individual analysis dicts,
    stores it with the existing history (capped at the 100 most recent
    audits) and returns the record that was written.
    """
    record = {
        'timestamp': datetime.now().isoformat(),
        'url': url,
        'overall_score': scan_data.get('overall_score', 0),
        'seo_score': scan_data.get('seo_score', 0),
        'performance_score': scan_data.get('performance_score', 0),
        'accessibility_score': accessibility_data.get('accessibility_score', 0),
        'security_score': scan_data.get('security_score', 0),
        'mobile_score': mobile_data.get('mobile_score', 0),
        'load_time': scan_data.get('load_time', 0),
        'page_size_mb': scan_data.get('page_size_mb', 0),
        'broken_links': link_data.get('broken_links_count', 0),
        'https': scan_data.get('https', False),
    }

    history = load_history()
    history.append(record)
    # Keep only the 100 most recent audits on disk.
    trimmed = history[-100:]

    with open(HISTORY_FILE, 'w') as fh:
        json.dump(trimmed, fh, indent=2)

    return record
45
+
46
def get_site_history(url, limit=10):
    """Return up to *limit* most recent audit entries recorded for *url*."""
    matching = [record for record in load_history() if record['url'] == url]
    return matching[-limit:]
51
+
52
def get_trend_data(url):
    """Assemble per-metric score series for charting trends of *url*.

    Returns None when there is no recorded history; otherwise a dict with
    the audit dates (YYYY-MM-DD prefixes of the timestamps) and one score
    list per tracked metric.
    """
    entries = get_site_history(url, limit=20)
    if not entries:
        return None

    # (chart label, history record key) pairs, in display order.
    metric_keys = [
        ('Overall', 'overall_score'),
        ('SEO', 'seo_score'),
        ('Performance', 'performance_score'),
        ('Accessibility', 'accessibility_score'),
        ('Security', 'security_score'),
        ('Mobile', 'mobile_score'),
    ]

    return {
        'dates': [entry['timestamp'][:10] for entry in entries],
        'scores': {label: [entry[key] for entry in entries]
                   for label, key in metric_keys},
    }
link_checker.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ from urllib.parse import urljoin, urlparse
4
+ from concurrent.futures import ThreadPoolExecutor, as_completed
5
+
6
def check_broken_links(url, soup, max_links=50, timeout=5):
    """
    Scan anchor tags on the page and probe each linked URL for availability.

    Args:
        url: base page URL, used to resolve relative hrefs.
        soup: parsed BeautifulSoup document.
        max_links: cap on how many anchors are considered.
        timeout: per-request timeout in seconds.

    Returns a dict with counts of checked/working/broken/skipped links,
    details of the first ten broken ones, and a coarse health label.
    """
    request_headers = {"User-Agent": "AI-Site-Auditor"}
    candidates = []
    skipped = 0

    # Gather at most `max_links` anchors, keeping only real HTTP(S) targets.
    for anchor in soup.find_all('a', href=True)[:max_links]:
        href = anchor.get('href')

        # Anchors, mail/phone links and javascript: pseudo-URLs are not checkable.
        if href.startswith(('#', 'mailto:', 'tel:', 'javascript:')):
            skipped += 1
            continue

        absolute = urljoin(url, href)
        if absolute.startswith(('http://', 'https://')):
            candidates.append((href, absolute))

    def probe(pair):
        """Check one (original href, absolute url) pair; return a result dict."""
        original, absolute = pair
        try:
            resp = requests.head(absolute, timeout=timeout, allow_redirects=True,
                                 headers=request_headers)
            # Some servers reject HEAD; retry with GET before declaring broken.
            if resp.status_code >= 400:
                resp = requests.get(absolute, timeout=timeout,
                                    headers=request_headers)
            if resp.status_code >= 400:
                return {'broken': True, 'url': original, 'status': resp.status_code}
            return {'broken': False}
        except requests.exceptions.RequestException as exc:
            return {'broken': True, 'url': original, 'status': 'Error',
                    'error': str(exc)[:50]}

    # Probe links concurrently to keep the audit fast.
    broken = []
    ok_count = 0
    with ThreadPoolExecutor(max_workers=10) as pool:
        pending = [pool.submit(probe, pair) for pair in candidates]
        for done in as_completed(pending):
            outcome = done.result()
            if outcome['broken']:
                broken.append(outcome)
            else:
                ok_count += 1

    broken_total = len(broken)
    if broken_total == 0:
        health = 'Excellent'
    elif broken_total <= 2:
        health = 'Good'
    else:
        health = 'Needs Attention'

    return {
        'total_links_checked': len(candidates),
        'working_links': ok_count,
        'broken_links_count': broken_total,
        'broken_links_details': broken[:10],  # cap detail payload
        'skipped_links': skipped,
        'link_health': health,
    }
mobile_checker.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bs4 import BeautifulSoup
2
+
3
def check_mobile_responsiveness(soup, page_size_mb):
    """
    Checks mobile-friendliness and responsive design via static heuristics.

    Args:
        soup: BeautifulSoup-parsed HTML document.
        page_size_mb: total page weight in megabytes.

    Returns:
        dict with 'mobile_score' (0-100), 'mobile_issues' (list of
        human-readable findings) and a 'mobile_friendly' verdict.
    """
    issues = []
    score = 100

    # Viewport meta tag - the single most important mobile rendering signal.
    viewport = soup.find('meta', attrs={'name': 'viewport'})
    if not viewport:
        issues.append("❌ Missing viewport meta tag - critical for mobile devices")
        score -= 25
    else:
        content = viewport.get('content', '')
        if 'width=device-width' not in content:
            issues.append("⚠️ Viewport should include 'width=device-width'")
            score -= 10
        if 'initial-scale=1' not in content:
            issues.append("⚠️ Viewport should include 'initial-scale=1'")
            score -= 5

    # Responsive images: look for srcset/sizes attributes.
    images = soup.find_all('img')
    responsive_images = [img for img in images if img.get('srcset') or img.get('sizes')]
    if images and len(responsive_images) == 0:
        issues.append("⚠️ No responsive images detected (consider using srcset)")
        score -= 10

    # Page weight against common mobile data budgets.
    if page_size_mb > 3:
        issues.append(f"❌ Page size ({page_size_mb:.2f}MB) too large for mobile - should be <3MB")
        score -= 15
    elif page_size_mb > 1.5:
        issues.append(f"⚠️ Page size ({page_size_mb:.2f}MB) could be optimized for mobile")
        score -= 5

    # Flash is not supported by any mobile browser.
    flash = soup.find_all(['embed', 'object'], type='application/x-shockwave-flash')
    if flash:
        issues.append("❌ Flash content detected - not supported on mobile devices")
        score -= 20

    # Fixed-width tables: one hit is enough to flag (and penalize) once.
    tables = soup.find_all('table')
    for table in tables:
        if table.get('width') and 'px' in str(table.get('width')):
            issues.append("⚠️ Fixed-width tables detected - may not be mobile-friendly")
            score -= 5
            break

    # Touch targets: very small inline font sizes on buttons/links.
    buttons = soup.find_all('button')
    links = soup.find_all('a')
    small_touch_targets = 0
    for elem in buttons + links:
        style = elem.get('style', '')
        if 'font-size' in style and any(size in style for size in ['8px', '9px', '10px']):
            small_touch_targets += 1

    if small_touch_targets > 0:
        issues.append(f"⚠️ {small_touch_targets} elements may have small touch targets")
        score -= 10

    # Media queries: only inline <style> blocks are inspected — linked
    # external stylesheets are not fetched, so this heuristic can miss
    # sites that keep all CSS external. (Removed an unused lookup of
    # <link rel="stylesheet"> elements that was never consulted.)
    styles = soup.find_all('style')
    has_media_queries = False
    for style in styles:
        if '@media' in style.get_text():
            has_media_queries = True
            break

    if not has_media_queries and len(styles) > 0:
        issues.append("⚠️ No media queries detected in inline styles")
        score -= 10

    # Relative font sizes (em/rem/%) scale better on small screens.
    if not soup.find_all(style=lambda x: x and 'font-size' in x and any(unit in x for unit in ['em', 'rem', '%'])):
        issues.append("⚠️ Consider using relative font sizes (em, rem, %) for better mobile scaling")
        score -= 5

    return {
        'mobile_score': max(0, score),
        'mobile_issues': issues if issues else ["✅ Good mobile responsiveness"],
        'mobile_friendly': 'Yes' if score >= 80 else 'Partially' if score >= 60 else 'No'
    }
report_generator.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fpdf import FPDF
2
+ from datetime import datetime
3
+ import json
4
+
5
class PDFReport(FPDF):
    """PDF document with a branded title header and page-number footer."""

    def header(self):
        # Report title line, bold and centered.
        self.set_font('Arial', 'B', 16)
        self.cell(0, 10, 'AuditAI - Website Audit Report', 0, 1, 'C')
        # Generation timestamp beneath it, in small italics.
        stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        self.set_font('Arial', 'I', 10)
        self.cell(0, 5, f'Generated on {stamp}', 0, 1, 'C')
        self.ln(5)

    def footer(self):
        # Centered page number, 15mm above the bottom edge.
        self.set_y(-15)
        self.set_font('Arial', 'I', 8)
        self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')
17
+
18
def generate_pdf_report(url, scan_data, ai_report, accessibility_data, mobile_data, link_data):
    """
    Generates a comprehensive PDF audit report.

    Args:
        url: audited website URL.
        scan_data: technical metrics and scores from the scanner.
        ai_report: dict with 'issues' and 'suggestions' lists.
        accessibility_data: dict with 'accessibility_score' / 'accessibility_issues'.
        mobile_data: dict with 'mobile_score' / 'mobile_issues'.
        link_data: broken-link check results.

    Returns: path of the timestamped PDF file written to the working directory.
    """

    def _latin1(text):
        # Classic fpdf only renders latin-1 text. Issue strings produced by
        # the checkers contain emoji markers (e.g. warning/cross symbols)
        # that would raise UnicodeEncodeError inside cell()/multi_cell();
        # replace unencodable characters instead of crashing.
        return str(text).encode('latin-1', 'replace').decode('latin-1')

    pdf = PDFReport()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)

    # Website URL
    pdf.set_font('Arial', 'B', 14)
    pdf.cell(0, 10, 'Website Analyzed:', 0, 1)
    pdf.set_font('Arial', '', 12)
    pdf.cell(0, 8, _latin1(url), 0, 1)
    pdf.ln(5)

    # Overall Scores Section
    pdf.set_font('Arial', 'B', 14)
    pdf.set_fill_color(200, 220, 255)
    pdf.cell(0, 10, 'Overall Performance Scores', 0, 1, 'L', True)
    pdf.ln(2)

    pdf.set_font('Arial', '', 11)
    scores = [
        ('Overall Score', scan_data.get('overall_score', 0)),
        ('SEO Score', scan_data.get('seo_score', 0)),
        ('Performance Score', scan_data.get('performance_score', 0)),
        ('Accessibility Score', accessibility_data.get('accessibility_score', 0)),
        ('Security Score', scan_data.get('security_score', 0)),
        ('Mobile Score', mobile_data.get('mobile_score', 0))
    ]

    # Color-code each score: green >= 80, orange >= 60, red below.
    for label, score in scores:
        color = (0, 200, 0) if score >= 80 else (255, 165, 0) if score >= 60 else (255, 0, 0)
        pdf.set_text_color(*color)
        pdf.cell(100, 8, f'{label}:', 0, 0)
        pdf.set_font('Arial', 'B', 11)
        pdf.cell(0, 8, f'{score}/100', 0, 1)
        pdf.set_font('Arial', '', 11)

    pdf.set_text_color(0, 0, 0)
    pdf.ln(5)

    # Technical Metrics
    pdf.set_font('Arial', 'B', 14)
    pdf.set_fill_color(200, 220, 255)
    pdf.cell(0, 10, 'Technical Metrics', 0, 1, 'L', True)
    pdf.ln(2)

    pdf.set_font('Arial', '', 11)
    metrics = [
        ('Load Time', f"{scan_data.get('load_time', 0)} seconds"),
        ('Page Size', f"{scan_data.get('page_size_mb', 0):.2f} MB"),
        ('HTTPS Enabled', 'Yes' if scan_data.get('https') else 'No'),
        ('Status Code', str(scan_data.get('status_code', 'N/A'))),
        ('Total Links', str(scan_data.get('links_count', 0))),
        ('Internal Links', str(scan_data.get('internal_links', 0))),
        ('External Links', str(scan_data.get('external_links', 0))),
        ('Images without ALT', str(scan_data.get('images_without_alt', 0))),
        ('H1 Tags', str(scan_data.get('h1_count', 0))),
        ('Scripts', str(scan_data.get('scripts_count', 0)))
    ]

    for label, value in metrics:
        pdf.cell(95, 7, f'{label}:', 0, 0)
        pdf.cell(0, 7, _latin1(value), 0, 1)

    pdf.ln(5)

    # Link Health
    pdf.set_font('Arial', 'B', 14)
    pdf.set_fill_color(200, 220, 255)
    pdf.cell(0, 10, 'Link Health Check', 0, 1, 'L', True)
    pdf.ln(2)

    pdf.set_font('Arial', '', 11)
    pdf.cell(95, 7, 'Total Links Checked:', 0, 0)
    pdf.cell(0, 7, str(link_data.get('total_links_checked', 0)), 0, 1)
    pdf.cell(95, 7, 'Working Links:', 0, 0)
    pdf.cell(0, 7, str(link_data.get('working_links', 0)), 0, 1)
    pdf.cell(95, 7, 'Broken Links:', 0, 0)
    # Red if any link is broken, green otherwise. (Was a side-effecting
    # conditional expression; rewritten as a plain if/else.)
    if link_data.get('broken_links_count', 0) > 0:
        pdf.set_text_color(255, 0, 0)
    else:
        pdf.set_text_color(0, 200, 0)
    pdf.cell(0, 7, str(link_data.get('broken_links_count', 0)), 0, 1)
    pdf.set_text_color(0, 0, 0)
    pdf.ln(5)

    # Broken Links Details
    if link_data.get('broken_links_details'):
        pdf.set_font('Arial', 'B', 12)
        pdf.cell(0, 8, 'Broken Links Found:', 0, 1)
        pdf.set_font('Arial', '', 9)
        for broken in link_data['broken_links_details'][:10]:
            pdf.multi_cell(0, 5, _latin1(f"- {broken['url']} (Status: {broken['status']})"))
        pdf.ln(3)

    # AI Detected Issues
    pdf.add_page()
    pdf.set_font('Arial', 'B', 14)
    pdf.set_fill_color(255, 200, 200)
    pdf.cell(0, 10, 'AI Detected Issues', 0, 1, 'L', True)
    pdf.ln(2)

    pdf.set_font('Arial', '', 10)
    for issue in ai_report.get('issues', [])[:15]:
        pdf.multi_cell(0, 6, _latin1(f'- {issue}'))
    pdf.ln(5)

    # Accessibility Issues
    pdf.set_font('Arial', 'B', 14)
    pdf.set_fill_color(255, 230, 200)
    pdf.cell(0, 10, 'Accessibility Issues', 0, 1, 'L', True)
    pdf.ln(2)

    pdf.set_font('Arial', '', 10)
    for issue in accessibility_data.get('accessibility_issues', [])[:15]:
        pdf.multi_cell(0, 6, _latin1(issue))
    pdf.ln(5)

    # Mobile Issues
    pdf.set_font('Arial', 'B', 14)
    pdf.set_fill_color(230, 200, 255)
    pdf.cell(0, 10, 'Mobile Responsiveness Issues', 0, 1, 'L', True)
    pdf.ln(2)

    pdf.set_font('Arial', '', 10)
    for issue in mobile_data.get('mobile_issues', [])[:15]:
        pdf.multi_cell(0, 6, _latin1(issue))
    pdf.ln(5)

    # AI Suggestions
    pdf.add_page()
    pdf.set_font('Arial', 'B', 14)
    pdf.set_fill_color(200, 255, 200)
    pdf.cell(0, 10, 'AI Recommendations', 0, 1, 'L', True)
    pdf.ln(2)

    pdf.set_font('Arial', '', 10)
    for suggestion in ai_report.get('suggestions', [])[:20]:
        pdf.multi_cell(0, 6, _latin1(f'- {suggestion}'))

    # Save PDF with a timestamped filename.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"audit_report_{timestamp}.pdf"
    pdf.output(filename)

    return filename
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ gradio
3
+ requests
4
+ beautifulsoup4
5
+ google-generativeai
6
+ python-dotenv
7
+ plotly
8
+ pandas
9
+ wordcloud
10
+ matplotlib
11
+ fpdf
scanner.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bs4 import BeautifulSoup
2
+ import time
3
+ from utils import safe_request
4
+
5
def scan_website(url):
    """
    Fetch *url* and extract technical / SEO metrics from its HTML.

    Returns a dict of metrics on success, or {"error": ..., "score": 0}
    when the page cannot be fetched.
    """
    # Local import keeps this fix self-contained within the function.
    from urllib.parse import urljoin, urlparse

    data = {}

    # Measure total load time including the HTTP request itself.
    start = time.time()
    response = safe_request(url)
    if not response:
        return {"error": "Unable to fetch URL", "score": 0}

    soup = BeautifulSoup(response.text, "html.parser")
    load_time = round(time.time() - start, 2)

    # Page size in MB
    page_size_mb = len(response.content) / (1024 * 1024)

    # Classify links as internal/external by comparing hostnames. The
    # previous substring test (`url.split("//")[1] in href`) misclassified
    # external URLs whose path merely contained this site's domain, raised
    # IndexError for scheme-less URLs, and ignored relative links entirely
    # (which are always internal).
    base_host = urlparse(url).netloc.lower()
    internal_links = 0
    external_links = 0
    for link in soup.find_all("a", href=True):
        href = link.get("href")
        # Skip in-page anchors and non-navigational schemes.
        if href.startswith(("#", "mailto:", "tel:", "javascript:")):
            continue
        target_host = urlparse(urljoin(url, href)).netloc.lower()
        if not target_host:
            continue
        if target_host == base_host:
            internal_links += 1
        else:
            external_links += 1

    # Heading counts
    headings_count = {
        "H1": len(soup.find_all("h1")),
        "H2": len(soup.find_all("h2")),
        "H3": len(soup.find_all("h3"))
    }

    # Guard against a <title> tag that exists but is empty: .string is None
    # then, and downstream scoring would treat None as a real title.
    title = soup.title.string if soup.title and soup.title.string else "Missing"

    data.update({
        "status_code": response.status_code,
        "load_time": load_time,
        "https": url.startswith("https"),
        "title": title,
        "meta_description": bool(soup.find("meta", attrs={"name": "description"})),
        "h1_count": headings_count["H1"],
        "h2_count": headings_count["H2"],
        "h3_count": headings_count["H3"],
        "headings_count": headings_count,
        "images_without_alt": len([img for img in soup.find_all("img") if not img.get("alt")]),
        "links_count": len(soup.find_all("a")),
        "internal_links": internal_links,
        "external_links": external_links,
        "scripts_count": len(soup.find_all("script")),
        "paragraph_count": len(soup.find_all("p")),
        "page_size_mb": page_size_mb
    })

    return data
scoring.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ def calculate_score(scan_data):
2
+ score = 0
3
+ score += 15 if scan_data.get("https") else 0
4
+
5
+ load_time = scan_data.get("load_time", 5)
6
+ if load_time <= 1: score += 15
7
+ elif load_time <= 3: score += 10
8
+ else: score += 5
9
+
10
+ score += 10 if scan_data.get("title") != "Missing" else 0
11
+ score += 10 if scan_data.get("meta_description") else 0
12
+ score += 10 if scan_data.get("h1_count", 0) >= 1 else 5
13
+
14
+ missing_alt = scan_data.get("images_without_alt", 0)
15
+ score += max(0, 10 - missing_alt*2)
16
+
17
+ score += min(5, scan_data.get("links_count", 0)*0.1)
18
+ score += min(5, scan_data.get("scripts_count", 0)*0.1)
19
+
20
+ paragraphs = scan_data.get("paragraph_count", 0)
21
+ score += 10 if paragraphs >= 3 else max(0, paragraphs*3)
22
+
23
+ score += 10 if scan_data.get("status_code") == 200 else 0
24
+
25
+ return round(min(score, 100), 2)
utils.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import requests
3
+
4
def normalize_url(url):
    """Ensure *url* carries a scheme, defaulting to HTTPS when absent."""
    has_scheme = url.startswith(("http://", "https://"))
    return url if has_scheme else "https://" + url
8
+
9
def is_valid_url(url):
    """Loosely validate that *url* looks like a web address (scheme optional)."""
    pattern = re.compile(
        r'^(https?:\/\/)?([\da-z.-]+)\.([a-z.]{2,6})([\/\w .-]*)*\/?$'
    )
    return pattern.match(url) is not None
14
+
15
def safe_request(url, timeout=10):
    """GET *url* with a custom User-Agent; return None on any request error."""
    request_headers = {"User-Agent": "AI-Site-Auditor"}
    try:
        return requests.get(url, timeout=timeout, headers=request_headers)
    except requests.exceptions.RequestException:
        return None
+ return None