diamond-in committed on
Commit
492241f
·
verified ·
1 Parent(s): a8acddd

Update features/analysis.py

Browse files
Files changed (1) hide show
  1. features/analysis.py +433 -0
features/analysis.py CHANGED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Advanced analysis features: visual testing, link extraction, structured data
3
+ """
4
+ import json
5
+ import time
6
+ import logging
7
+ from datetime import datetime
8
+ from browser.driver import get_driver, cleanup_driver, create_driver
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
def extract_structured_data(url: str, use_persistent: bool = False) -> str:
    """Extract structured data (JSON-LD, microdata, meta tags) from page.

    Loads ``url`` via ``get_driver`` and runs an in-page script that
    collects, into one object:

    * ``jsonld``     - parsed ``<script type="application/ld+json">`` blocks
    * ``schema_org`` - the subset of JSON-LD whose ``@context`` mentions
      schema.org
    * ``meta`` / ``opengraph`` / ``twitter`` - meta tags split by the
      ``og:`` / ``twitter:`` name prefixes
    * ``microdata``  - ``itemscope`` elements and their ``itemprop`` values

    A ``summary`` section with has/count booleans is appended on the
    Python side before serialization.

    Args:
        url: Page to analyse.
        use_persistent: Forwarded to ``get_driver``/``cleanup_driver``;
            presumably selects a reusable driver instance — semantics are
            defined in ``browser.driver`` (not visible here).

    Returns:
        A pretty-printed JSON string of the collected data, or an
        ``"Error: ..."`` string on any failure (this function never raises).
    """
    driver = None
    try:
        driver = get_driver(url, use_persistent)

        # Extract various types of structured data.
        # NOTE(review): inside the JS try-block below, a JSON-LD item is
        # pushed to data.jsonld BEFORE the @context check; if @context is
        # not a string/array, .includes throws and the item lands in
        # jsonld but not schema_org — confirm this asymmetry is intended.
        structured_data = driver.execute_script("""
            const data = {
                jsonld: [],
                meta: {},
                opengraph: {},
                twitter: {},
                microdata: [],
                schema_org: []
            };

            // Extract JSON-LD
            document.querySelectorAll('script[type="application/ld+json"]').forEach(script => {
                try {
                    const parsed = JSON.parse(script.textContent);
                    data.jsonld.push(parsed);
                    // Also add to schema.org if it's schema.org data
                    if (parsed['@context'] && parsed['@context'].includes('schema.org')) {
                        data.schema_org.push(parsed);
                    }
                } catch(e) {
                    console.error('Failed to parse JSON-LD:', e);
                }
            });

            // Extract meta tags
            document.querySelectorAll('meta').forEach(meta => {
                const name = meta.getAttribute('name') || meta.getAttribute('property');
                const content = meta.getAttribute('content');
                if (name && content) {
                    if (name.startsWith('og:')) {
                        data.opengraph[name] = content;
                    } else if (name.startsWith('twitter:')) {
                        data.twitter[name] = content;
                    } else {
                        data.meta[name] = content;
                    }
                }
            });

            // Extract microdata
            document.querySelectorAll('[itemscope]').forEach(item => {
                const itemData = {
                    type: item.getAttribute('itemtype'),
                    properties: {}
                };
                item.querySelectorAll('[itemprop]').forEach(prop => {
                    const propName = prop.getAttribute('itemprop');
                    const propValue = prop.getAttribute('content') ||
                                      prop.getAttribute('href') ||
                                      prop.textContent.trim();
                    itemData.properties[propName] = propValue;
                });
                data.microdata.push(itemData);
            });

            return data;
        """)

        # Add summary — quick booleans/counts so callers can check presence
        # without walking the payload.
        structured_data['summary'] = {
            'has_jsonld': len(structured_data['jsonld']) > 0,
            'has_opengraph': len(structured_data['opengraph']) > 0,
            'has_twitter_cards': len(structured_data['twitter']) > 0,
            'has_microdata': len(structured_data['microdata']) > 0,
            'total_meta_tags': len(structured_data['meta'])
        }

        return json.dumps(structured_data, indent=2)
    except Exception as e:
        logger.error(f"Error in extract_structured_data: {e}")
        return f"Error: {e}"
    finally:
        # Driver is released (or kept, if persistent) in all paths.
        cleanup_driver(driver, use_persistent)
92
+
93
def visual_regression_test(url1: str, url2: str, threshold: float = 0.98) -> str:
    """Capture screenshots and page dimensions of two URLs for comparison.

    Loads each URL in a fresh (non-persistent) driver, waits briefly for
    the page to stabilize, saves a full-window screenshot to ``/tmp``, and
    records title/URL and document/viewport dimensions per page. Actual
    pixel diffing is out of scope (see ``note`` in the result); this
    function gathers the artifacts for external processing.

    Args:
        url1: Baseline page.
        url2: Candidate page to compare against the baseline.
        threshold: Similarity threshold, echoed into the result for the
            downstream image-comparison step (not used here).

    Returns:
        A pretty-printed JSON string describing both pages, the screenshot
        paths, per-page dimensions, and a boolean ``dimensions_match``, or
        an ``"Error: ..."`` string on failure (this function never raises).
    """
    driver = None
    try:
        driver = create_driver(persistent=False)

        def _capture(url: str, path: str):
            """Load *url*, settle, screenshot to *path*; return (info, dims)."""
            driver.get(url)
            time.sleep(3)  # crude settle wait; dynamic pages may need longer
            driver.save_screenshot(path)
            info = {
                "title": driver.title,
                "url": driver.current_url
            }
            dims = driver.execute_script("""
                return {
                    width: document.documentElement.scrollWidth,
                    height: document.documentElement.scrollHeight,
                    viewport: {
                        width: window.innerWidth,
                        height: window.innerHeight
                    }
                }
            """)
            return info, dims

        screenshot1_path = "/tmp/screenshot1.png"
        screenshot2_path = "/tmp/screenshot2.png"

        # BUG FIX: dimensions were previously read only once, AFTER
        # navigating to url2, yet labelled as page 1's ("dimensions_match"
        # carried page-2 geometry). Capture per page instead.
        page1_info, dimensions1 = _capture(url1, screenshot1_path)
        page2_info, dimensions2 = _capture(url2, screenshot2_path)

        result = {
            "url1": url1,
            "url2": url2,
            "page1_info": page1_info,
            "page2_info": page2_info,
            "screenshots": {
                "screenshot1": screenshot1_path,
                "screenshot2": screenshot2_path
            },
            "dimensions": {
                "page1": dimensions1,
                "page2": dimensions2
            },
            # True when full-document width and height agree; viewport size
            # is a property of the shared driver window, not the page.
            "dimensions_match": (
                dimensions1["width"] == dimensions2["width"]
                and dimensions1["height"] == dimensions2["height"]
            ),
            "threshold": threshold,
            "timestamp": datetime.now().isoformat(),
            "note": "Visual comparison requires external image processing. Screenshots saved for manual review."
        }

        return json.dumps(result, indent=2)
    except Exception as e:
        logger.error(f"Error in visual_regression_test: {e}")
        return f"Error: {e}"
    finally:
        # Single cleanup path (original duplicated quit logic in the try
        # body and the except handler, leaking the driver on some paths).
        if driver:
            try:
                driver.quit()
            except Exception:
                pass
158
+
159
def extract_all_links(url: str, include_external: bool = True, use_persistent: bool = False) -> str:
    """Extract all links from a page with categorization.

    Loads ``url`` via ``get_driver`` and runs an in-page script that walks
    every ``a[href]`` element, bucketing links into: ``internal``,
    ``external``, ``email`` (mailto:), ``phone`` (tel:), ``javascript``,
    ``anchor`` (#fragment), and ``file_downloads`` (by extension). Each
    entry records href, truncated text, title, target and rel; resolvable
    URLs also get an ``absoluteUrl``.

    Args:
        url: Page to analyse.
        include_external: When False, links on other hostnames are dropped
            entirely (they are not counted in any bucket).
        use_persistent: Forwarded to ``get_driver``/``cleanup_driver``;
            semantics are defined in ``browser.driver`` (not visible here).

    Returns:
        A pretty-printed JSON string with ``links``, per-bucket ``summary``
        counts, and ``page_info``, or an ``"Error: ..."`` string on failure
        (this function never raises).
    """
    driver = None
    try:
        driver = get_driver(url, use_persistent)

        # Extract and categorize links.
        # NOTE: this is an f-string, so every literal JS brace is doubled
        # ({{ }}). The ONLY interpolation is {str(include_external).lower()},
        # which bakes the Python bool into the script as a JS `true`/`false`
        # literal gating the external-link branch.
        links_data = driver.execute_script(f"""
            const currentDomain = new URL(window.location.href).hostname;
            const links = {{
                internal: [],
                external: [],
                email: [],
                phone: [],
                javascript: [],
                anchor: [],
                file_downloads: []
            }};

            // Common file extensions for downloads
            const fileExtensions = ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.zip', '.rar', '.csv', '.txt'];

            document.querySelectorAll('a[href]').forEach(a => {{
                const href = a.getAttribute('href');
                const text = a.textContent.trim();
                const linkData = {{
                    href: href,
                    text: text.substring(0, 100),
                    title: a.title,
                    target: a.target,
                    rel: a.rel
                }};

                if (href.startsWith('mailto:')) {{
                    links.email.push(linkData);
                }} else if (href.startsWith('tel:')) {{
                    links.phone.push(linkData);
                }} else if (href.startsWith('javascript:')) {{
                    links.javascript.push(linkData);
                }} else if (href.startsWith('#')) {{
                    links.anchor.push(linkData);
                }} else {{
                    try {{
                        const linkUrl = new URL(href, window.location.href);

                        // Check if it's a file download
                        const isFileDownload = fileExtensions.some(ext =>
                            linkUrl.pathname.toLowerCase().endsWith(ext)
                        );

                        if (isFileDownload) {{
                            links.file_downloads.push({{...linkData, absoluteUrl: linkUrl.href}});
                        }} else if (linkUrl.hostname === currentDomain) {{
                            links.internal.push({{...linkData, absoluteUrl: linkUrl.href}});
                        }} else if ({str(include_external).lower()}) {{
                            links.external.push({{...linkData, absoluteUrl: linkUrl.href}});
                        }}
                    }} catch(e) {{
                        // Invalid URL, add to javascript category
                        links.javascript.push(linkData);
                    }}
                }}
            }});

            return {{
                links: links,
                summary: {{
                    total: document.querySelectorAll('a[href]').length,
                    internal: links.internal.length,
                    external: links.external.length,
                    email: links.email.length,
                    phone: links.phone.length,
                    javascript: links.javascript.length,
                    anchor: links.anchor.length,
                    file_downloads: links.file_downloads.length
                }},
                page_info: {{
                    title: document.title,
                    url: window.location.href,
                    domain: currentDomain
                }}
            }};
        """)

        return json.dumps(links_data, indent=2)
    except Exception as e:
        logger.error(f"Error in extract_all_links: {e}")
        return f"Error: {e}"
    finally:
        # Driver is released (or kept, if persistent) in all paths.
        cleanup_driver(driver, use_persistent)
249
+
250
def seo_analysis(url: str, use_persistent: bool = False) -> str:
    """Perform SEO analysis on a page.

    Loads ``url`` via ``get_driver`` and runs an in-page script that checks
    common on-page SEO signals: title length (30-60 chars), meta description
    length (120-160 chars), H1 count and heading hierarchy, images missing
    alt text, external/nofollow link counts, canonical link, robots meta,
    html lang attribute, and JSON-LD block count. A numeric score and a
    recommendation list are computed on the Python side.

    Args:
        url: Page to analyse.
        use_persistent: Forwarded to ``get_driver``/``cleanup_driver``;
            semantics are defined in ``browser.driver`` (not visible here).

    Returns:
        A pretty-printed JSON string with ``seo_score`` (0-100),
        ``analysis``, ``total_issues`` and ``recommendations``, or an
        ``"Error: ..."`` string on failure (this function never raises).
    """
    driver = None
    try:
        driver = get_driver(url, use_persistent)

        # Perform SEO analysis entirely in the page context.
        seo_data = driver.execute_script("""
            const analysis = {
                title: {
                    content: document.title,
                    length: document.title.length,
                    issues: []
                },
                meta_description: {
                    content: null,
                    length: 0,
                    issues: []
                },
                headings: {
                    h1_count: 0,
                    h1_texts: [],
                    hierarchy: [],
                    issues: []
                },
                images: {
                    total: 0,
                    without_alt: 0,
                    issues: []
                },
                links: {
                    total: 0,
                    external: 0,
                    nofollow: 0
                },
                canonical: null,
                robots: null,
                lang: document.documentElement.lang,
                structured_data_count: 0
            };

            // Check title
            if (analysis.title.length < 30) {
                analysis.title.issues.push('Title too short (recommended: 30-60 characters)');
            } else if (analysis.title.length > 60) {
                analysis.title.issues.push('Title too long (recommended: 30-60 characters)');
            }

            // Check meta description
            const metaDesc = document.querySelector('meta[name="description"]');
            if (metaDesc) {
                analysis.meta_description.content = metaDesc.content;
                analysis.meta_description.length = metaDesc.content.length;

                if (metaDesc.content.length < 120) {
                    analysis.meta_description.issues.push('Description too short (recommended: 120-160 characters)');
                } else if (metaDesc.content.length > 160) {
                    analysis.meta_description.issues.push('Description too long (recommended: 120-160 characters)');
                }
            } else {
                analysis.meta_description.issues.push('No meta description found');
            }

            // Check headings
            const h1s = document.querySelectorAll('h1');
            analysis.headings.h1_count = h1s.length;
            h1s.forEach(h1 => {
                analysis.headings.h1_texts.push(h1.textContent.trim());
            });

            if (h1s.length === 0) {
                analysis.headings.issues.push('No H1 tag found');
            } else if (h1s.length > 1) {
                analysis.headings.issues.push('Multiple H1 tags found (recommended: 1)');
            }

            // Get heading hierarchy
            const allHeadings = document.querySelectorAll('h1, h2, h3, h4, h5, h6');
            allHeadings.forEach(h => {
                analysis.headings.hierarchy.push({
                    level: h.tagName,
                    text: h.textContent.trim().substring(0, 50)
                });
            });

            // Check images
            const images = document.querySelectorAll('img');
            analysis.images.total = images.length;
            images.forEach(img => {
                if (!img.alt) {
                    analysis.images.without_alt++;
                }
            });

            if (analysis.images.without_alt > 0) {
                analysis.images.issues.push(`${analysis.images.without_alt} images without alt text`);
            }

            // Check links
            const links = document.querySelectorAll('a[href]');
            analysis.links.total = links.length;
            links.forEach(link => {
                try {
                    const linkUrl = new URL(link.href, window.location.href);
                    if (linkUrl.hostname !== window.location.hostname) {
                        analysis.links.external++;
                    }
                    if (link.rel && link.rel.includes('nofollow')) {
                        analysis.links.nofollow++;
                    }
                } catch(e) {}
            });

            // Check canonical
            const canonical = document.querySelector('link[rel="canonical"]');
            if (canonical) {
                analysis.canonical = canonical.href;
            }

            // Check robots meta
            const robots = document.querySelector('meta[name="robots"]');
            if (robots) {
                analysis.robots = robots.content;
            }

            // Count structured data
            analysis.structured_data_count = document.querySelectorAll('script[type="application/ld+json"]').length;

            return analysis;
        """)

        # Calculate SEO score: start at 100, deduct 10 points per issue
        # across the four issue-bearing sections, floor at 0.
        score = 100
        total_issues = 0

        for key in ['title', 'meta_description', 'headings', 'images']:
            if key in seo_data and 'issues' in seo_data[key]:
                issues = len(seo_data[key]['issues'])
                total_issues += issues
                score -= (issues * 10)

        score = max(0, score)

        result = {
            "url": url,
            "seo_score": score,
            "analysis": seo_data,
            "total_issues": total_issues,
            # Flat, human-readable action list derived from the analysis.
            "recommendations": get_seo_recommendations(seo_data)
        }

        return json.dumps(result, indent=2)
    except Exception as e:
        logger.error(f"Error in seo_analysis: {e}")
        return f"Error: {e}"
    finally:
        # Driver is released (or kept, if persistent) in all paths.
        cleanup_driver(driver, use_persistent)
407
+
408
def get_seo_recommendations(seo_data):
    """Build a flat list of SEO recommendations from an analysis dict.

    Collects the per-section ``issues`` lists produced by ``seo_analysis``
    (title, meta description, headings, images — in that order), then
    appends page-level advice for a missing canonical URL, missing html
    ``lang`` attribute, and absent structured data.

    Args:
        seo_data: Analysis dict as produced by ``seo_analysis``'s in-page
            script. Missing sections/keys are tolerated and simply skipped
            (the original raised ``KeyError`` on partial data).

    Returns:
        list[str]: Recommendation strings, possibly empty.
    """
    recommendations = []

    # Per-section issues, in a fixed report-friendly order.
    for section in ('title', 'meta_description', 'headings', 'images'):
        issues = seo_data.get(section, {}).get('issues') or []
        recommendations.extend(issues)

    if not seo_data.get('canonical'):
        recommendations.append("Add canonical URL to prevent duplicate content issues")

    if not seo_data.get('lang'):
        recommendations.append("Add lang attribute to HTML tag for better internationalization")

    # Covers both an explicit count of 0 and a missing key.
    if not seo_data.get('structured_data_count'):
        recommendations.append("Add structured data (JSON-LD) for better search engine understanding")

    return recommendations