Spaces:
Build error
Build error
| # seo_bot.py | |
| import os | |
| import csv | |
| import json | |
| import re | |
| import time | |
| import uuid | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from urllib.parse import urljoin, urlparse | |
| from collections import Counter | |
| import textstat | |
# Optional grammar check: LanguageTool is heavyweight, so its absence is tolerated.
try:
    import language_tool_python
except Exception:
    LT_AVAILABLE = False
else:
    LT_AVAILABLE = True
# Optional OpenAI client (modern package); absence simply disables AI suggestions.
try:
    from openai import OpenAI
except Exception:
    OPENAI_AVAILABLE = False
else:
    OPENAI_AVAILABLE = True
# Browser-like UA so sites are less likely to reject the crawler outright.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
| # ============================== | |
| # OpenAI Client & Suggestion | |
| # ============================== | |
def make_client():
    """Build and return an OpenAI client, or None when that is impossible.

    A client is only created when the OPENAI_API_KEY environment variable is
    set AND the modern ``openai`` package was importable.  Every failure path
    prints a warning and returns None so callers can degrade gracefully
    instead of crashing the scan.
    """
    key = os.getenv("OPENAI_API_KEY")
    if not key:
        print("⚠️ OPENAI_API_KEY not set — AI suggestions will be disabled.")
        return None
    if not OPENAI_AVAILABLE:
        print("⚠️ OpenAI package not available in environment — AI suggestions disabled.")
        return None
    try:
        # Modern client constructor: api_key only (no legacy kwargs such as proxies).
        client = OpenAI(api_key=key)
    except Exception as exc:
        print("⚠️ Error initializing OpenAI:", str(exc))
        return None
    else:
        print("✅ OpenAI client initialized.")
        return client
def generate_ai_suggestion(client, title, meta_description, keywords, issue_type):
    """Return a short AI-written SEO suggestion for one detected issue.

    Falls back to a descriptive placeholder string whenever *client* is None,
    the API call fails, or the response is malformed — the scan must never
    abort because of the AI layer.
    """
    if client is None:
        return "(AI disabled — set OPENAI_API_KEY to enable suggestions)"
    prompt = f"""
You are a professional SEO consultant. Provide a concise (1-2 sentences) practical suggestion.
Title: {title}
Meta Description: {meta_description}
Keywords: {keywords}
Problem: {issue_type}
"""
    try:
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are an expert SEO consultant."},
                {"role": "user", "content": prompt},
            ],
            max_tokens=120,
            temperature=0.7,
            top_p=0.95,
        )
    except Exception as exc:
        # Temporary API failures must not crash the whole scan.
        return f"(AI suggestion unavailable: {str(exc)})"
    # Defensive access: the response shape is outside our control.
    try:
        return resp.choices[0].message.content.strip()
    except Exception:
        return "(AI suggestion unavailable: malformed response)"
| # ============================== | |
| # Utility Functions | |
| # ============================== | |
def keyword_density(text):
    """Summarize the top repeated 4+ letter words as "word:pct%" pairs.

    Only words appearing more than once are reported (at most ten, highest
    percentage first); percentages are relative to the total count of all
    4+ letter words.  Returns "" for empty/None input.
    """
    tokens = re.findall(r'\b\w+\b', (text or "").lower())
    counts = Counter(tok for tok in tokens if len(tok) > 3)
    denom = sum(counts.values()) or 1  # avoid division by zero on empty pages
    ranked = sorted(
        ((word, round(n / denom * 100, 2)) for word, n in counts.items() if n > 1),
        key=lambda pair: -pair[1],
    )[:10]
    return ", ".join(f"{word}:{pct}%" for word, pct in ranked)
def get_image_size_kb(src_url, base_url):
    """Download one image and report its size in kilobytes (one decimal).

    *src_url* may be relative; it is resolved against *base_url*.  Any
    network error or non-200 response yields 0.0 so image audits never
    interrupt a page scan.
    """
    target = urljoin(base_url, src_url)
    try:
        response = requests.get(target, headers=HEADERS, timeout=5)
        if response.status_code != 200:
            return 0.0
        return round(len(response.content) / 1024, 1)
    except Exception:
        return 0.0
| # ============================== | |
| # Main SEO Analyzer | |
| # ============================== | |
def run_seo_and_suggestions(base_url, max_pages=30, tmp_dir="/tmp"):
    """Crawl *base_url*, audit every page for on-page SEO, and write a CSV report.

    The crawl seeds itself from ``sitemap.xml`` when one exists, otherwise it
    starts at *base_url* and follows same-domain links, visiting at most
    *max_pages* pages.  Each page gets a heuristic SEO score and, when an
    OpenAI key is configured, AI-generated improvement suggestions.

    Args:
        base_url: Root URL of the site to audit (required).
        max_pages: Upper bound on the number of pages analyzed.
        tmp_dir: Directory where the CSV report is written.

    Returns:
        Tuple ``(results, csv_path)``: ``results`` is a list of per-page metric
        dicts (or a single-entry error dict when nothing could be analyzed) and
        ``csv_path`` is the path of the written CSV report.

    Raises:
        ValueError: If *base_url* is falsy.
    """
    if not base_url:
        raise ValueError("base_url is required")

    domain = urlparse(base_url).netloc
    sitemap_links = set()
    visited = set()

    def get_sitemap_links():
        """Seed ``sitemap_links`` from /sitemap.xml; all errors are ignored."""
        sitemap_url = urljoin(base_url, "sitemap.xml")
        try:
            r = requests.get(sitemap_url, headers=HEADERS, timeout=8)
            if r.status_code == 200 and r.text:
                soup = BeautifulSoup(r.text, "xml")
                for loc in soup.find_all("loc"):
                    href = loc.text.strip()
                    if href:
                        sitemap_links.add(href)
        except Exception:
            # A missing or broken sitemap is not fatal; we fall back to crawling.
            pass

    def get_robots_txt():
        """Return the site's robots.txt body, or "" when unavailable."""
        robots_url = urljoin(base_url, "robots.txt")
        try:
            r = requests.get(robots_url, headers=HEADERS, timeout=5)
            if r.status_code == 200:
                return r.text
        except Exception:
            pass
        return ""

    def crawl_site():
        """Breadth-first crawl of same-domain pages, capped at *max_pages*."""
        to_visit = list(sitemap_links) if sitemap_links else [base_url]
        all_urls = []
        while to_visit and len(all_urls) < max_pages:
            u = to_visit.pop(0)
            if u in visited:
                continue
            visited.add(u)
            try:
                r = requests.get(u, headers=HEADERS, timeout=10)
                if r.status_code != 200 or not r.text:
                    continue
                soup = BeautifulSoup(r.text, "html.parser")
                all_urls.append(u)
                # Queue unseen same-domain links (fragment and query stripped).
                for a in soup.find_all("a", href=True):
                    href = urljoin(u, a["href"]).split("#")[0].split("?")[0]
                    if urlparse(href).netloc == domain and href not in visited and href not in to_visit:
                        to_visit.append(href)
            except Exception:
                # Skip on any error (timeout, connection error, bad HTML).
                continue
        return all_urls

    # --- discovery phase ---
    get_sitemap_links()
    # NOTE(review): robots.txt is fetched but currently unused downstream;
    # kept so the request behavior (and future reporting hook) is preserved.
    robots_txt = get_robots_txt()
    pages = crawl_site()

    # Optional grammar checker; silently disabled when LanguageTool is missing
    # or fails to start (it spawns a local Java server).
    grammar_tool = None
    if LT_AVAILABLE:
        try:
            grammar_tool = language_tool_python.LanguageTool('en-US')
        except Exception:
            grammar_tool = None

    # Optional AI client; None disables suggestions but not the audit.
    openai_client = make_client()

    results = []
    try:
        for page_url in pages:
            try:
                r = requests.get(page_url, headers=HEADERS, timeout=12)
                if r.status_code != 200 or not r.text:
                    continue
                html = r.text
                soup = BeautifulSoup(html, "html.parser")

                title_tag = soup.title
                meta_desc_tag = soup.find("meta", attrs={"name": "description"})
                canonical_tag = soup.find("link", rel="canonical")
                robots_tag = soup.find("meta", attrs={"name": "robots"})
                viewport_tag = soup.find("meta", attrs={"name": "viewport"})
                text = soup.get_text(separator=" ", strip=True)
                html_str = str(soup)

                # Link audit.  BUGFIX: compare resolved host against the site
                # host instead of the old substring test ("domain in href"),
                # which miscounted external URLs that merely contained the
                # domain string (e.g. https://other.com/path/example.com).
                internal = external = 0
                for a in soup.find_all("a", href=True):
                    href = urljoin(page_url, a['href'])
                    if urlparse(href).netloc == domain:
                        internal += 1
                    else:
                        external += 1

                # Image audit: alt-text coverage and rough size buckets.
                imgs = soup.find_all("img")
                missing_alt = small_images = large_images = ideal_images = 0
                for img in imgs:
                    if not img.get("alt"):
                        missing_alt += 1
                    src = img.get("src")
                    if not src:
                        continue
                    size_kb = get_image_size_kb(src, page_url)
                    if size_kb < 5:
                        small_images += 1   # tiny assets or failed fetches (0.0)
                    elif size_kb > 250:
                        large_images += 1   # likely hurting page speed
                    else:
                        ideal_images += 1

                # Heading structure.
                heading_tags = soup.find_all(re.compile('^h[1-6]$'))
                heading_order = [h.name for h in heading_tags]
                h1_count = len(soup.find_all("h1"))

                # JSON-LD schema types.  BUGFIX: "@type" may itself be a list
                # (e.g. ["Product", "Offer"]); coerce every entry to str so the
                # ", ".join(schema_types) below cannot raise TypeError and
                # silently drop the entire page via the outer except.
                schema_types = []
                for tag in soup.find_all("script", type="application/ld+json"):
                    try:
                        if not tag.string:
                            continue
                        data = json.loads(tag.string)
                        candidates = data if isinstance(data, list) else [data]
                        for d in candidates:
                            if isinstance(d, dict) and "@type" in d:
                                t = d["@type"]
                                schema_types.append(
                                    ", ".join(map(str, t)) if isinstance(t, list) else str(t)
                                )
                    except Exception:
                        continue

                # Content metrics.
                try:
                    readability_score = textstat.flesch_reading_ease(text)
                except Exception:
                    readability_score = 0
                word_count = len((text or "").split())

                grammar_errors = 0
                try:
                    if grammar_tool and text:
                        # Only the first 1000 chars — LanguageTool is slow.
                        grammar_errors = len(grammar_tool.check(text[:1000]))
                except Exception:
                    grammar_errors = 0

                top_keywords = keyword_density(text)
                ratio = round((len(text) / len(html_str)) if html_str else 0, 3)

                results.append({
                    "url": page_url,
                    "title": (title_tag.text.strip() if title_tag and title_tag.text else ""),
                    "meta_description": (meta_desc_tag.get("content", "").strip() if meta_desc_tag else ""),
                    "h1_count": h1_count,
                    "heading_order": ", ".join(heading_order),
                    "missing_alt_tags": missing_alt,
                    "total_images": len(imgs),
                    "small_images": small_images,
                    "large_images": large_images,
                    "ideal_images": ideal_images,
                    "internal_links": internal,
                    "external_links": external,
                    "canonical_tag": bool(canonical_tag),
                    "robots_meta": (robots_tag.get("content", "") if robots_tag else ""),
                    "viewport_present": ("width=device-width" in viewport_tag.get("content", "") if viewport_tag else False),
                    "schema_types": ", ".join(schema_types),
                    "opengraph_tags": len(soup.find_all("meta", property=re.compile("^og:"))),
                    "twitter_tags": len(soup.find_all("meta", attrs={"name": re.compile("^twitter:")})),
                    "word_count": word_count,
                    "readability_score": readability_score,
                    "grammar_errors": grammar_errors,
                    "text_to_html_ratio": ratio,
                    "top_keywords": top_keywords,
                })
            except Exception:
                # Keep scanning other pages even if one fails.
                continue
    finally:
        # FIX: shut down the LanguageTool server (it holds a Java subprocess);
        # the original leaked it for the life of the process.
        if grammar_tool is not None:
            try:
                grammar_tool.close()
            except Exception:
                pass

    def calculate_seo_score(page):
        """Heuristic 0-100 score built from the presence/quality checks above."""
        score = 0
        if page.get('title'):
            score += 10
        if page.get('meta_description'):
            score += 10
        if page.get('h1_count', 0) == 1:
            score += 5
        if page.get('viewport_present', False):
            score += 5
        if page.get('missing_alt_tags', 0) == 0:
            score += 5
        if page.get('canonical_tag', False):
            score += 5
        if page.get('robots_meta', False):  # truthy: non-empty robots meta string
            score += 3
        if page.get('schema_types'):
            score += 5
        if page.get('readability_score', 0) > 50:
            score += 5
        if page.get('top_keywords'):
            score += 5
        return min(score, 100)

    # Attach scores and generate suggestions (AI-backed when a client exists).
    for p in results:
        p["seo_score"] = calculate_seo_score(p)
        title = str(p.get("title", "") or "")
        meta = str(p.get("meta_description", "") or "")
        keywords = str(p.get("top_keywords", "") or "")
        suggestions = []
        # Title outside the ~30-65 char sweet spot (or missing).
        if not title or len(title) < 30 or len(title) > 65:
            suggestions.append("Suggested Title: " + generate_ai_suggestion(openai_client, title, meta, keywords, "Title length issue"))
        # Meta description outside the ~70-160 char sweet spot (or missing).
        if not meta or len(meta) < 70 or len(meta) > 160:
            suggestions.append("Suggested Meta Description: " + generate_ai_suggestion(openai_client, title, meta, keywords, "Meta description length issue"))
        # Flesch reading ease below 50 reads as "difficult".
        try:
            if float(p.get("readability_score", 0) or 0) < 50:
                suggestions.append("Readability: " + generate_ai_suggestion(openai_client, title, meta, keywords, "Improve readability"))
        except Exception:
            pass
        if int(p.get("missing_alt_tags", 0) or 0) > 0:
            suggestions.append(f"{int(p.get('missing_alt_tags', 0))} images missing alt tags. Example: 'Product image showing [keyword]'")
        if not str(p.get("schema_types", "") or "").strip():
            suggestions.append("Add structured data (schema.org): Product/Article/BreadcrumbList")
        try:
            if int(p.get("word_count", 0) or 0) < 300:
                suggestions.append("Page has low content. Expand to 300+ words with keyword-rich helpful content.")
        except Exception:
            pass
        p["seo_suggestions"] = " | ".join(suggestions) if suggestions else "No major suggestions."

    # Persist the CSV report under a collision-proof random name.
    os.makedirs(tmp_dir, exist_ok=True)
    filename = os.path.join(tmp_dir, f"seo_report_{uuid.uuid4().hex}.csv")
    if not results:
        # Nothing analyzed — still write a one-row report explaining why.
        empty_msg = [{
            "url": base_url,
            "error": "No pages analyzed. Site may block crawlers or sitemap was empty.",
            "seo_suggestions": "Try allowing bots or check robots.txt configuration."
        }]
        with open(filename, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=empty_msg[0].keys())
            writer.writeheader()
            writer.writerows(empty_msg)
        return empty_msg, filename

    keys = list(results[0].keys())
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        writer.writerows(results)
    return results, filename