moizshah956 committed on
Commit
7c77f56
·
verified ·
1 Parent(s): a38a440

Create seo_bot.py

Browse files
Files changed (1) hide show
  1. seo_bot.py +392 -0
seo_bot.py ADDED
@@ -0,0 +1,392 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # seo_bot.py
2
+ import os
3
+ import csv
4
+ import json
5
+ import re
6
+ import time
7
+ import uuid
8
+ import requests
9
+ from bs4 import BeautifulSoup
10
+ from urllib.parse import urljoin, urlparse
11
+ from collections import Counter
12
+ import textstat
13
+
14
+ # Optional grammar check
15
+ try:
16
+ import language_tool_python
17
+ LT_AVAILABLE = True
18
+ except Exception:
19
+ LT_AVAILABLE = False
20
+
21
+ # Optional OpenAI client (modern package)
22
+ try:
23
+ from openai import OpenAI
24
+ OPENAI_AVAILABLE = True
25
+ except Exception:
26
+ OPENAI_AVAILABLE = False
27
+
28
+ HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
29
+
30
+
31
# ==============================
# OpenAI Client & Suggestion
# ==============================
def make_client():
    """
    Build an OpenAI client when both the API key and the package are present.

    Returns:
        An ``OpenAI`` client instance, or ``None`` when the ``OPENAI_API_KEY``
        environment variable is missing, the ``openai`` package could not be
        imported, or client construction raises. Callers treat ``None`` as
        "AI suggestions disabled" and fall back gracefully.
    """
    key = os.getenv("OPENAI_API_KEY")
    if not key:
        print("⚠️ OPENAI_API_KEY not set — AI suggestions will be disabled.")
        return None
    if not OPENAI_AVAILABLE:
        print("⚠️ OpenAI package not available in environment — AI suggestions disabled.")
        return None

    # Modern OpenAI client initialization (no deprecated kwargs such as `proxies`).
    try:
        client = OpenAI(api_key=key)
    except Exception as err:
        print("⚠️ Error initializing OpenAI:", str(err))
        return None
    print("✅ OpenAI client initialized.")
    return client
55
+
56
+
57
def generate_ai_suggestion(client, title, meta_description, keywords, issue_type):
    """
    Ask the OpenAI chat API for a short, practical SEO suggestion.

    Args:
        client: OpenAI client from make_client(), or None when AI is disabled.
        title: Current page title.
        meta_description: Current meta description.
        keywords: Top-keyword summary string for the page.
        issue_type: Short description of the problem to address.

    Returns:
        A 1-2 sentence suggestion string; when `client` is None or the API
        call/response is unusable, a parenthesised fallback message is
        returned instead of raising, so one failure never aborts a scan.
    """
    if client is None:
        return "(AI disabled — set OPENAI_API_KEY to enable suggestions)"

    try:
        prompt = f"""
You are a professional SEO consultant. Provide a concise (1-2 sentences) practical suggestion.
Title: {title}
Meta Description: {meta_description}
Keywords: {keywords}
Problem: {issue_type}
"""
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are an expert SEO consultant."},
                {"role": "user", "content": prompt},
            ],
            max_tokens=120,
            temperature=0.7,
            top_p=0.95,
        )
        # Defensive access: the response shape may not match expectations.
        try:
            return response.choices[0].message.content.strip()
        except Exception:
            return "(AI suggestion unavailable: malformed response)"
    except Exception as err:
        # Transient API failures must not crash the whole scan.
        return f"(AI suggestion unavailable: {str(err)})"
90
+
91
+
92
# ==============================
# Utility Functions
# ==============================
def keyword_density(text, top_n=10, min_len=4):
    """
    Summarise the most frequent "significant" words in `text`.

    Tokenises on word boundaries (case-insensitive), keeps words of at least
    `min_len` characters, and reports the `top_n` words that occur more than
    once, each as a percentage of the significant-word total.

    Args:
        text: Raw page text; None/empty is tolerated and yields "".
        top_n: Maximum number of keywords to report (default 10, matching
            the original hard-coded cutoff).
        min_len: Minimum word length counted as significant (default 4,
            matching the original ``len(w) > 3`` filter).

    Returns:
        A string like "seo:12.5%, marketing:8.33%", or "" when no word
        repeats.
    """
    words = re.findall(r'\b\w+\b', (text or "").lower())
    freq = Counter(w for w in words if len(w) >= min_len)
    # `or 1` guards against division by zero on empty/short input.
    total = sum(freq.values()) or 1
    items = sorted(
        ((w, round(n / total * 100, 2)) for w, n in freq.items() if n > 1),
        key=lambda kv: -kv[1],
    )[:top_n]
    return ", ".join(f"{w}:{p}%" for w, p in items)
102
+
103
+
104
def get_image_size_kb(src_url, base_url):
    """
    Download an image and report its size in kilobytes (one decimal place).

    `src_url` may be relative; it is resolved against `base_url` first. Any
    network failure or non-200 status yields 0.0 so that one unreadable
    image never breaks a page scan.
    """
    try:
        resolved = urljoin(base_url, src_url)
        response = requests.get(resolved, headers=HEADERS, timeout=5)
        if response.status_code == 200:
            return round(len(response.content) / 1024, 1)
    except Exception:
        # Best-effort: treat unreachable images as size 0.
        pass
    return 0.0
114
+
115
+
116
# ==============================
# Main SEO Analyzer
# ==============================
def run_seo_and_suggestions(base_url, max_pages=30, tmp_dir="/tmp"):
    """
    Crawl a site (seeded from its sitemap when available), analyze each page
    for on-page SEO signals, score every page, and attach improvement
    suggestions (AI-generated when an OpenAI key is configured).

    Args:
        base_url: Root URL of the site to scan (required).
        max_pages: Upper bound on the number of pages analyzed.
        tmp_dir: Directory where the CSV report is written.

    Returns:
        (results_list, csv_path) — per-page metric dicts and the path of the
        CSV report that was written.

    Raises:
        ValueError: if base_url is falsy.
    """
    if not base_url:
        raise ValueError("base_url is required")

    domain = urlparse(base_url).netloc
    sitemap_links = set()
    visited = set()

    def get_sitemap_links():
        # Seed the crawl from sitemap.xml when the site publishes one.
        sitemap_url = urljoin(base_url, "sitemap.xml")
        try:
            r = requests.get(sitemap_url, headers=HEADERS, timeout=8)
            if r.status_code == 200 and r.text:
                soup = BeautifulSoup(r.text, "xml")
                for loc in soup.find_all("loc"):
                    href = loc.text.strip()
                    if href:
                        sitemap_links.add(href)
        except Exception:
            # A missing/broken sitemap is not fatal — fall back to crawling.
            pass

    def get_robots_txt():
        # Fetch robots.txt text; "" on any failure.
        robots_url = urljoin(base_url, "robots.txt")
        try:
            r = requests.get(robots_url, headers=HEADERS, timeout=5)
            if r.status_code == 200:
                return r.text
        except Exception:
            pass
        return ""

    def crawl_site():
        # Breadth-first crawl over same-domain links, capped at max_pages
        # successfully fetched pages.
        to_visit = list(sitemap_links) if sitemap_links else [base_url]
        all_urls = []
        while to_visit and len(all_urls) < max_pages:
            u = to_visit.pop(0)
            if u in visited:
                continue
            visited.add(u)
            try:
                r = requests.get(u, headers=HEADERS, timeout=10)
                if r.status_code != 200 or not r.text:
                    continue
                soup = BeautifulSoup(r.text, "html.parser")
                all_urls.append(u)
                # Queue same-domain links (fragment and query stripped).
                for a in soup.find_all("a", href=True):
                    href = urljoin(u, a["href"]).split("#")[0].split("?")[0]
                    parsed = urlparse(href)
                    if parsed.netloc == domain and href not in visited and href not in to_visit:
                        to_visit.append(href)
            except Exception:
                # Skip on any error (timeout, connection error, bad HTML).
                continue
        return all_urls

    # --- start
    get_sitemap_links()
    robots_txt = get_robots_txt()  # NOTE(review): fetched but not yet used in scoring
    pages = crawl_site()

    # Optional grammar tool (best-effort; requires language_tool_python).
    grammar_tool = None
    if LT_AVAILABLE:
        try:
            # Instantiate default LanguageTool (local server not required).
            grammar_tool = language_tool_python.LanguageTool('en-US')
        except Exception:
            grammar_tool = None

    # Optional OpenAI client; None disables AI suggestions gracefully.
    openai_client = make_client()

    results = []

    for page_url in pages:
        try:
            r = requests.get(page_url, headers=HEADERS, timeout=12)
            if r.status_code != 200 or not r.text:
                continue
            html = r.text
            soup = BeautifulSoup(html, "html.parser")

            title_tag = soup.title
            meta_desc_tag = soup.find("meta", attrs={"name": "description"})
            canonical_tag = soup.find("link", rel="canonical")
            robots_tag = soup.find("meta", attrs={"name": "robots"})
            viewport_tag = soup.find("meta", attrs={"name": "viewport"})
            text = soup.get_text(separator=" ", strip=True)
            html_str = str(soup)

            # Link profile. BUGFIX: compare the parsed netloc instead of a
            # substring test (`domain in href`), which wrongly counted
            # external URLs that merely contain the domain string as internal.
            anchors = soup.find_all("a", href=True)
            internal = external = 0
            for a in anchors:
                href = urljoin(page_url, a['href'])
                if urlparse(href).netloc == domain:
                    internal += 1
                else:
                    external += 1

            # Image audit: alt text plus rough size buckets (KB thresholds).
            imgs = soup.find_all("img")
            missing_alt = small_images = large_images = ideal_images = 0
            for img in imgs:
                if not img.get("alt"):
                    missing_alt += 1
                src = img.get("src")
                if not src:
                    continue
                size_kb = get_image_size_kb(src, page_url)
                if size_kb < 5:
                    small_images += 1
                elif size_kb > 250:
                    large_images += 1
                else:
                    ideal_images += 1

            # Heading structure.
            heading_tags = soup.find_all(re.compile('^h[1-6]$'))
            heading_order = [h.name for h in heading_tags]
            h1_count = len(soup.find_all("h1"))

            # Structured data: collect JSON-LD @type values.
            schema_types = []
            for tag in soup.find_all("script", type="application/ld+json"):
                try:
                    if not tag.string:
                        continue
                    data = json.loads(tag.string)
                    if isinstance(data, dict) and "@type" in data:
                        schema_types.append(data["@type"])
                    elif isinstance(data, list):
                        for d in data:
                            if isinstance(d, dict) and "@type" in d:
                                schema_types.append(d["@type"])
                except Exception:
                    continue

            # Content metrics.
            try:
                readability_score = textstat.flesch_reading_ease(text)
            except Exception:
                readability_score = 0

            word_count = len((text or "").split())
            grammar_errors = 0
            try:
                if grammar_tool and text:
                    # Only the first 1000 chars — grammar checking is slow.
                    grammar_errors = len(grammar_tool.check(text[:1000]))
            except Exception:
                grammar_errors = 0

            top_keywords = keyword_density(text)
            ratio = round((len(text) / len(html_str)) if html_str else 0, 3)

            page = {
                "url": page_url,
                "title": (title_tag.text.strip() if title_tag and title_tag.text else ""),
                "meta_description": (meta_desc_tag.get("content", "").strip() if meta_desc_tag else ""),
                "h1_count": h1_count,
                "heading_order": ", ".join(heading_order),
                "missing_alt_tags": missing_alt,
                "total_images": len(imgs),
                "small_images": small_images,
                "large_images": large_images,
                "ideal_images": ideal_images,
                "internal_links": internal,
                "external_links": external,
                "canonical_tag": bool(canonical_tag),
                "robots_meta": (robots_tag.get("content", "") if robots_tag else ""),
                "viewport_present": ("width=device-width" in viewport_tag.get("content", "") if viewport_tag else False),
                "schema_types": ", ".join(schema_types),
                "opengraph_tags": len(soup.find_all("meta", property=re.compile("^og:"))),
                "twitter_tags": len(soup.find_all("meta", attrs={"name": re.compile("^twitter:")})),
                "word_count": word_count,
                "readability_score": readability_score,
                "grammar_errors": grammar_errors,
                "text_to_html_ratio": ratio,
                "top_keywords": top_keywords
            }

            results.append(page)

        except Exception:
            # Keep scanning other pages even if one fails.
            continue

    def calculate_seo_score(page):
        # Additive rubric over the collected signals; capped at 100.
        score = 0
        if page.get('title'): score += 10
        if page.get('meta_description'): score += 10
        if page.get('h1_count', 0) == 1: score += 5
        if page.get('viewport_present', False): score += 5
        if page.get('missing_alt_tags', 0) == 0: score += 5
        if page.get('canonical_tag', False): score += 5
        if page.get('robots_meta', False): score += 3
        if page.get('schema_types'): score += 5
        if page.get('readability_score', 0) > 50: score += 5
        if page.get('top_keywords'): score += 5
        return min(score, 100)

    # Attach scores and generate suggestions (AI when a client is available).
    for p in results:
        p["seo_score"] = calculate_seo_score(p)

        title = str(p.get("title", "") or "")
        meta = str(p.get("meta_description", "") or "")
        keywords = str(p.get("top_keywords", "") or "")

        suggestions = []

        # Title suggestion (missing, too short, or too long).
        if not title or len(title) < 30 or len(title) > 65:
            suggestions.append("Suggested Title: " + generate_ai_suggestion(openai_client, title, meta, keywords, "Title length issue"))

        # Meta description suggestion (missing, too short, or too long).
        if not meta or len(meta) < 70 or len(meta) > 160:
            suggestions.append("Suggested Meta Description: " + generate_ai_suggestion(openai_client, title, meta, keywords, "Meta description length issue"))

        # Readability suggestion (Flesch score below 50).
        try:
            if float(p.get("readability_score", 0) or 0) < 50:
                suggestions.append("Readability: " + generate_ai_suggestion(openai_client, title, meta, keywords, "Improve readability"))
        except Exception:
            pass

        # Missing alt tags.
        if int(p.get("missing_alt_tags", 0) or 0) > 0:
            suggestions.append(f"{int(p.get('missing_alt_tags', 0))} images missing alt tags. Example: 'Product image showing [keyword]'")

        # Structured data.
        if not str(p.get("schema_types", "") or "").strip():
            suggestions.append("Add structured data (schema.org): Product/Article/BreadcrumbList")

        # Thin content.
        try:
            if int(p.get("word_count", 0) or 0) < 300:
                suggestions.append("Page has low content. Expand to 300+ words with keyword-rich helpful content.")
        except Exception:
            pass

        p["seo_suggestions"] = " | ".join(suggestions) if suggestions else "No major suggestions."

    # Release the LanguageTool subprocess if one was started (resource leak fix).
    if grammar_tool is not None:
        try:
            grammar_tool.close()
        except Exception:
            pass

    # Persist CSV (safe: unique filename, directory created on demand).
    os.makedirs(tmp_dir, exist_ok=True)
    filename = os.path.join(tmp_dir, f"seo_report_{uuid.uuid4().hex}.csv")

    if not results:
        empty_msg = [{
            "url": base_url,
            "error": "No pages analyzed. Site may block crawlers or sitemap was empty.",
            "seo_suggestions": "Try allowing bots or check robots.txt configuration."
        }]
        with open(filename, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=empty_msg[0].keys())
            writer.writeheader()
            writer.writerows(empty_msg)
        return empty_msg, filename

    keys = list(results[0].keys())
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        writer.writerows(results)

    return results, filename