# SEO / seo_bot.py
# Uploaded by moizshah956 — "Create seo_bot.py" (commit 7c77f56, verified)
# seo_bot.py
import os
import csv
import json
import re
import time
import uuid
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from collections import Counter
import textstat
# Optional grammar check
try:
import language_tool_python
LT_AVAILABLE = True
except Exception:
LT_AVAILABLE = False
# Optional OpenAI client (modern package)
try:
from openai import OpenAI
OPENAI_AVAILABLE = True
except Exception:
OPENAI_AVAILABLE = False
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
# ==============================
# OpenAI Client & Suggestion
# ==============================
def make_client():
"""
Initialize OpenAI client if OPENAI_API_KEY is present and OpenAI package available.
Returns None if not available — code will gracefully fall back.
"""
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
print("⚠️ OPENAI_API_KEY not set — AI suggestions will be disabled.")
return None
if not OPENAI_AVAILABLE:
print("⚠️ OpenAI package not available in environment — AI suggestions disabled.")
return None
try:
# Use the modern OpenAI client initialization (no proxies kwarg)
client = OpenAI(api_key=api_key)
print("✅ OpenAI client initialized.")
return client
except Exception as e:
print("⚠️ Error initializing OpenAI:", str(e))
return None
def generate_ai_suggestion(client, title, meta_description, keywords, issue_type):
"""
Returns a short AI suggestion string using the provided OpenAI client.
If client is None or API call fails, returns a friendly fallback string.
"""
if client is None:
return "(AI disabled — set OPENAI_API_KEY to enable suggestions)"
try:
prompt = f"""
You are a professional SEO consultant. Provide a concise (1-2 sentences) practical suggestion.
Title: {title}
Meta Description: {meta_description}
Keywords: {keywords}
Problem: {issue_type}
"""
resp = client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{"role": "system", "content": "You are an expert SEO consultant."},
{"role": "user", "content": prompt}
],
max_tokens=120,
temperature=0.7,
top_p=0.95,
)
# defensive access
try:
return resp.choices[0].message.content.strip()
except Exception:
return "(AI suggestion unavailable: malformed response)"
except Exception as e:
# don't crash the whole scan if OpenAI fails temporarily
return f"(AI suggestion unavailable: {str(e)})"
# ==============================
# Utility Functions
# ==============================
def keyword_density(text):
words = re.findall(r'\b\w+\b', (text or "").lower())
freq = Counter(w for w in words if len(w) > 3)
total = sum(freq.values()) or 1
items = sorted([(k, round(v / total * 100, 2)) for k, v in freq.items() if v > 1],
key=lambda x: -x[1])[:10]
return ", ".join([f"{k}:{p}%" for k, p in items])
def get_image_size_kb(src_url, base_url):
try:
full_url = urljoin(base_url, src_url)
res = requests.get(full_url, headers=HEADERS, timeout=5)
if res.status_code != 200:
return 0.0
size_kb = len(res.content) / 1024
return round(size_kb, 1)
except Exception:
return 0.0
# ==============================
# Main SEO Analyzer
# ==============================
def run_seo_and_suggestions(base_url, max_pages=30, tmp_dir="/tmp"):
"""
Crawl site (uses sitemap if present), analyze each page, compute SEO score,
and produce AI suggestions (if OpenAI key is configured).
Returns: (results_list, csv_path)
"""
if not base_url:
raise ValueError("base_url is required")
domain = urlparse(base_url).netloc
sitemap_links = set()
visited = set()
def get_sitemap_links():
sitemap_url = urljoin(base_url, "sitemap.xml")
try:
r = requests.get(sitemap_url, headers=HEADERS, timeout=8)
if r.status_code == 200 and r.text:
soup = BeautifulSoup(r.text, "xml")
for loc in soup.find_all("loc"):
href = loc.text.strip()
if href:
sitemap_links.add(href)
except Exception:
# ignore sitemap errors
pass
def get_robots_txt():
robots_url = urljoin(base_url, "robots.txt")
try:
r = requests.get(robots_url, headers=HEADERS, timeout=5)
if r.status_code == 200:
return r.text
except Exception:
pass
return ""
def crawl_site():
to_visit = list(sitemap_links) if sitemap_links else [base_url]
all_urls = []
while to_visit and len(all_urls) < max_pages:
u = to_visit.pop(0)
if u in visited:
continue
visited.add(u)
try:
r = requests.get(u, headers=HEADERS, timeout=10)
if r.status_code != 200 or not r.text:
continue
soup = BeautifulSoup(r.text, "html.parser")
all_urls.append(u)
# extract same-domain links
for a in soup.find_all("a", href=True):
href = urljoin(u, a["href"]).split("#")[0].split("?")[0]
parsed = urlparse(href)
if parsed.netloc == domain and href not in visited and href not in to_visit:
to_visit.append(href)
except Exception:
# skip on any error (timeout, connection error, bad HTML)
continue
return all_urls
# --- start
get_sitemap_links()
robots_txt = get_robots_txt()
pages = crawl_site()
# prepare optional grammar tool
grammar_tool = None
if LT_AVAILABLE:
try:
# instantiate default LanguageTool (locally installed server not required)
grammar_tool = language_tool_python.LanguageTool('en-US')
except Exception:
grammar_tool = None
# prepare OpenAI client if available
openai_client = make_client()
results = []
for i, page_url in enumerate(pages):
try:
r = requests.get(page_url, headers=HEADERS, timeout=12)
if r.status_code != 200 or not r.text:
continue
html = r.text
soup = BeautifulSoup(html, "html.parser")
title_tag = soup.title
meta_desc_tag = soup.find("meta", attrs={"name": "description"})
canonical_tag = soup.find("link", rel="canonical")
robots_tag = soup.find("meta", attrs={"name": "robots"})
viewport_tag = soup.find("meta", attrs={"name": "viewport"})
text = soup.get_text(separator=" ", strip=True)
html_str = str(soup)
# links
anchors = soup.find_all("a", href=True)
internal = external = 0
for a in anchors:
href = urljoin(page_url, a['href'])
if domain in href:
internal += 1
else:
external += 1
# images
imgs = soup.find_all("img")
missing_alt = small_images = large_images = ideal_images = 0
for img in imgs:
if not img.get("alt"):
missing_alt += 1
src = img.get("src")
if not src:
continue
size_kb = get_image_size_kb(src, page_url)
if size_kb < 5:
small_images += 1
elif size_kb > 250:
large_images += 1
else:
ideal_images += 1
# headings
heading_tags = soup.find_all(re.compile('^h[1-6]$'))
heading_order = [h.name for h in heading_tags]
h1_count = len(soup.find_all("h1"))
# schema
schema_types = []
for tag in soup.find_all("script", type="application/ld+json"):
try:
if not tag.string:
continue
data = json.loads(tag.string)
if isinstance(data, dict) and "@type" in data:
schema_types.append(data["@type"])
elif isinstance(data, list):
for d in data:
if isinstance(d, dict) and "@type" in d:
schema_types.append(d["@type"])
except Exception:
continue
# metrics
try:
readability_score = textstat.flesch_reading_ease(text)
except Exception:
readability_score = 0
word_count = len((text or "").split())
grammar_errors = 0
try:
if grammar_tool and text:
grammar_errors = len(grammar_tool.check(text[:1000]))
except Exception:
grammar_errors = 0
top_keywords = keyword_density(text)
ratio = round((len(text) / len(html_str)) if html_str else 0, 3)
page = {
"url": page_url,
"title": (title_tag.text.strip() if title_tag and title_tag.text else ""),
"meta_description": (meta_desc_tag.get("content", "").strip() if meta_desc_tag else ""),
"h1_count": h1_count,
"heading_order": ", ".join(heading_order),
"missing_alt_tags": missing_alt,
"total_images": len(imgs),
"small_images": small_images,
"large_images": large_images,
"ideal_images": ideal_images,
"internal_links": internal,
"external_links": external,
"canonical_tag": bool(canonical_tag),
"robots_meta": (robots_tag.get("content", "") if robots_tag else ""),
"viewport_present": ("width=device-width" in viewport_tag.get("content", "") if viewport_tag else False),
"schema_types": ", ".join(schema_types),
"opengraph_tags": len(soup.find_all("meta", property=re.compile("^og:"))),
"twitter_tags": len(soup.find_all("meta", attrs={"name": re.compile("^twitter:")})),
"word_count": word_count,
"readability_score": readability_score,
"grammar_errors": grammar_errors,
"text_to_html_ratio": ratio,
"top_keywords": top_keywords
}
results.append(page)
except Exception:
# keep scanning other pages even if one fails
continue
# scoring function
def calculate_seo_score(page):
score = 0
if page.get('title'): score += 10
if page.get('meta_description'): score += 10
if page.get('h1_count', 0) == 1: score += 5
if page.get('viewport_present', False): score += 5
if page.get('missing_alt_tags', 0) == 0: score += 5
if page.get('canonical_tag', False): score += 5
if page.get('robots_meta', False): score += 3
if page.get('schema_types'): score += 5
if page.get('readability_score', 0) > 50: score += 5
if page.get('top_keywords'): score += 5
return min(score, 100)
# Attach scores and generate suggestions (AI if available)
for p in results:
p["seo_score"] = calculate_seo_score(p)
title = str(p.get("title", "") or "")
meta = str(p.get("meta_description", "") or "")
keywords = str(p.get("top_keywords", "") or "")
suggestions = []
# Title suggestion
if not title or len(title) < 30 or len(title) > 65:
suggestions.append("Suggested Title: " + generate_ai_suggestion(openai_client, title, meta, keywords, "Title length issue"))
# Meta suggestion
if not meta or len(meta) < 70 or len(meta) > 160:
suggestions.append("Suggested Meta Description: " + generate_ai_suggestion(openai_client, title, meta, keywords, "Meta description length issue"))
# Readability suggestion
try:
if float(p.get("readability_score", 0) or 0) < 50:
suggestions.append("Readability: " + generate_ai_suggestion(openai_client, title, meta, keywords, "Improve readability"))
except Exception:
pass
# Missing alt tags
if int(p.get("missing_alt_tags", 0) or 0) > 0:
suggestions.append(f"{int(p.get('missing_alt_tags', 0))} images missing alt tags. Example: 'Product image showing [keyword]'")
# Schema
if not str(p.get("schema_types", "") or "").strip():
suggestions.append("Add structured data (schema.org): Product/Article/BreadcrumbList")
# Word count
try:
if int(p.get("word_count", 0) or 0) < 300:
suggestions.append("Page has low content. Expand to 300+ words with keyword-rich helpful content.")
except Exception:
pass
p["seo_suggestions"] = " | ".join(suggestions) if suggestions else "No major suggestions."
# persist CSV (safe)
os.makedirs(tmp_dir, exist_ok=True)
filename = os.path.join(tmp_dir, f"seo_report_{uuid.uuid4().hex}.csv")
if not results:
empty_msg = [{
"url": base_url,
"error": "No pages analyzed. Site may block crawlers or sitemap was empty.",
"seo_suggestions": "Try allowing bots or check robots.txt configuration."
}]
with open(filename, "w", newline="", encoding="utf-8") as f:
writer = csv.DictWriter(f, fieldnames=empty_msg[0].keys())
writer.writeheader()
writer.writerows(empty_msg)
return empty_msg, filename
keys = list(results[0].keys())
with open(filename, "w", newline="", encoding="utf-8") as f:
writer = csv.DictWriter(f, fieldnames=keys)
writer.writeheader()
writer.writerows(results)
return results, filename