Spaces:
Build error
Build error
| # seo_bot.py | |
| import os | |
| import csv | |
| import json | |
| import re | |
| import time | |
| import uuid | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from urllib.parse import urljoin, urlparse | |
| from collections import Counter | |
| import textstat | |
# Optional grammar check: LanguageTool is heavyweight, so its absence is tolerated.
try:
    import language_tool_python
except Exception:
    LT_AVAILABLE = False
else:
    LT_AVAILABLE = True
# Optional OpenAI client (modern package); absence simply disables AI suggestions.
try:
    from openai import OpenAI
except Exception:
    OPENAI_AVAILABLE = False
else:
    OPENAI_AVAILABLE = True
# Browser-like UA so sites are less likely to reject the crawler outright.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
| # ============================== | |
| # OpenAI Client & Suggestion | |
| # ============================== | |
def make_client():
    """Build and return an OpenAI client, or None when that is impossible.

    A client is only created when the OPENAI_API_KEY environment variable is
    set AND the modern ``openai`` package was importable.  Every failure path
    prints a warning and returns None so callers can degrade gracefully
    instead of crashing the scan.
    """
    key = os.getenv("OPENAI_API_KEY")
    if not key:
        print("⚠️ OPENAI_API_KEY not set — AI suggestions will be disabled.")
        return None
    if not OPENAI_AVAILABLE:
        print("⚠️ OpenAI package not available in environment — AI suggestions disabled.")
        return None
    try:
        # Modern client constructor: api_key only (no legacy kwargs such as proxies).
        client = OpenAI(api_key=key)
    except Exception as exc:
        print("⚠️ Error initializing OpenAI:", str(exc))
        return None
    else:
        print("✅ OpenAI client initialized.")
        return client
def generate_ai_suggestion(client, title, meta_description, keywords, issue_type):
    """Return a short AI-written SEO suggestion for one detected issue.

    Falls back to a descriptive placeholder string whenever *client* is None,
    the API call fails, or the response is malformed — the scan must never
    abort because of the AI layer.
    """
    if client is None:
        return "(AI disabled — set OPENAI_API_KEY to enable suggestions)"
    prompt = f"""
You are a professional SEO consultant. Provide a concise (1-2 sentences) practical suggestion.
Title: {title}
Meta Description: {meta_description}
Keywords: {keywords}
Problem: {issue_type}
"""
    try:
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are an expert SEO consultant."},
                {"role": "user", "content": prompt},
            ],
            max_tokens=120,
            temperature=0.7,
            top_p=0.95,
        )
    except Exception as exc:
        # Temporary API failures must not crash the whole scan.
        return f"(AI suggestion unavailable: {str(exc)})"
    # Defensive access: the response shape is outside our control.
    try:
        return resp.choices[0].message.content.strip()
    except Exception:
        return "(AI suggestion unavailable: malformed response)"
| # ============================== | |
| # Utility Functions | |
| # ============================== | |
def keyword_density(text):
    """Summarize the top repeated 4+ letter words as "word:pct%" pairs.

    Only words appearing more than once are reported (at most ten, highest
    percentage first); percentages are relative to the total count of all
    4+ letter words.  Returns "" for empty/None input.
    """
    tokens = re.findall(r'\b\w+\b', (text or "").lower())
    counts = Counter(tok for tok in tokens if len(tok) > 3)
    denom = sum(counts.values()) or 1  # avoid division by zero on empty pages
    ranked = sorted(
        ((word, round(n / denom * 100, 2)) for word, n in counts.items() if n > 1),
        key=lambda pair: -pair[1],
    )[:10]
    return ", ".join(f"{word}:{pct}%" for word, pct in ranked)
def get_image_size_kb(src_url, base_url):
    """Download one image and report its size in kilobytes (one decimal).

    *src_url* may be relative; it is resolved against *base_url*.  Any
    network error or non-200 response yields 0.0 so image audits never
    interrupt a page scan.
    """
    target = urljoin(base_url, src_url)
    try:
        response = requests.get(target, headers=HEADERS, timeout=5)
        if response.status_code != 200:
            return 0.0
        return round(len(response.content) / 1024, 1)
    except Exception:
        return 0.0
| # ============================== | |
| # Main SEO Analyzer | |
| # ============================== | |
def run_seo_and_suggestions(base_url, max_pages=30, tmp_dir="/tmp"):
    """Crawl *base_url*, audit every page for on-page SEO, and write a CSV report.

    The crawl seeds itself from ``sitemap.xml`` when one exists, otherwise it
    starts at *base_url* and follows same-domain links, visiting at most
    *max_pages* pages.  Each page gets a heuristic SEO score and, when an
    OpenAI key is configured, AI-generated improvement suggestions.

    Args:
        base_url: Root URL of the site to audit (required).
        max_pages: Upper bound on the number of pages analyzed.
        tmp_dir: Directory where the CSV report is written.

    Returns:
        Tuple ``(results, csv_path)``: ``results`` is a list of per-page metric
        dicts (or a single-entry error dict when nothing could be analyzed) and
        ``csv_path`` is the path of the written CSV report.

    Raises:
        ValueError: If *base_url* is falsy.
    """
    if not base_url:
        raise ValueError("base_url is required")

    domain = urlparse(base_url).netloc
    sitemap_links = set()
    visited = set()

    def get_sitemap_links():
        """Seed ``sitemap_links`` from /sitemap.xml; all errors are ignored."""
        sitemap_url = urljoin(base_url, "sitemap.xml")
        try:
            r = requests.get(sitemap_url, headers=HEADERS, timeout=8)
            if r.status_code == 200 and r.text:
                soup = BeautifulSoup(r.text, "xml")
                for loc in soup.find_all("loc"):
                    href = loc.text.strip()
                    if href:
                        sitemap_links.add(href)
        except Exception:
            # A missing or broken sitemap is not fatal; we fall back to crawling.
            pass

    def get_robots_txt():
        """Return the site's robots.txt body, or "" when unavailable."""
        robots_url = urljoin(base_url, "robots.txt")
        try:
            r = requests.get(robots_url, headers=HEADERS, timeout=5)
            if r.status_code == 200:
                return r.text
        except Exception:
            pass
        return ""

    def crawl_site():
        """Breadth-first crawl of same-domain pages, capped at *max_pages*."""
        to_visit = list(sitemap_links) if sitemap_links else [base_url]
        all_urls = []
        while to_visit and len(all_urls) < max_pages:
            u = to_visit.pop(0)
            if u in visited:
                continue
            visited.add(u)
            try:
                r = requests.get(u, headers=HEADERS, timeout=10)
                if r.status_code != 200 or not r.text:
                    continue
                soup = BeautifulSoup(r.text, "html.parser")
                all_urls.append(u)
                # Queue unseen same-domain links (fragment and query stripped).
                for a in soup.find_all("a", href=True):
                    href = urljoin(u, a["href"]).split("#")[0].split("?")[0]
                    if urlparse(href).netloc == domain and href not in visited and href not in to_visit:
                        to_visit.append(href)
            except Exception:
                # Skip on any error (timeout, connection error, bad HTML).
                continue
        return all_urls

    # --- discovery phase ---
    get_sitemap_links()
    # NOTE(review): robots.txt is fetched but currently unused downstream;
    # kept so the request behavior (and future reporting hook) is preserved.
    robots_txt = get_robots_txt()
    pages = crawl_site()

    # Optional grammar checker; silently disabled when LanguageTool is missing
    # or fails to start (it spawns a local Java server).
    grammar_tool = None
    if LT_AVAILABLE:
        try:
            grammar_tool = language_tool_python.LanguageTool('en-US')
        except Exception:
            grammar_tool = None

    # Optional AI client; None disables suggestions but not the audit.
    openai_client = make_client()

    results = []
    try:
        for page_url in pages:
            try:
                r = requests.get(page_url, headers=HEADERS, timeout=12)
                if r.status_code != 200 or not r.text:
                    continue
                html = r.text
                soup = BeautifulSoup(html, "html.parser")

                title_tag = soup.title
                meta_desc_tag = soup.find("meta", attrs={"name": "description"})
                canonical_tag = soup.find("link", rel="canonical")
                robots_tag = soup.find("meta", attrs={"name": "robots"})
                viewport_tag = soup.find("meta", attrs={"name": "viewport"})
                text = soup.get_text(separator=" ", strip=True)
                html_str = str(soup)

                # Link audit.  BUGFIX: compare resolved host against the site
                # host instead of the old substring test ("domain in href"),
                # which miscounted external URLs that merely contained the
                # domain string (e.g. https://other.com/path/example.com).
                internal = external = 0
                for a in soup.find_all("a", href=True):
                    href = urljoin(page_url, a['href'])
                    if urlparse(href).netloc == domain:
                        internal += 1
                    else:
                        external += 1

                # Image audit: alt-text coverage and rough size buckets.
                imgs = soup.find_all("img")
                missing_alt = small_images = large_images = ideal_images = 0
                for img in imgs:
                    if not img.get("alt"):
                        missing_alt += 1
                    src = img.get("src")
                    if not src:
                        continue
                    size_kb = get_image_size_kb(src, page_url)
                    if size_kb < 5:
                        small_images += 1   # tiny assets or failed fetches (0.0)
                    elif size_kb > 250:
                        large_images += 1   # likely hurting page speed
                    else:
                        ideal_images += 1

                # Heading structure.
                heading_tags = soup.find_all(re.compile('^h[1-6]$'))
                heading_order = [h.name for h in heading_tags]
                h1_count = len(soup.find_all("h1"))

                # JSON-LD schema types.  BUGFIX: "@type" may itself be a list
                # (e.g. ["Product", "Offer"]); coerce every entry to str so the
                # ", ".join(schema_types) below cannot raise TypeError and
                # silently drop the entire page via the outer except.
                schema_types = []
                for tag in soup.find_all("script", type="application/ld+json"):
                    try:
                        if not tag.string:
                            continue
                        data = json.loads(tag.string)
                        candidates = data if isinstance(data, list) else [data]
                        for d in candidates:
                            if isinstance(d, dict) and "@type" in d:
                                t = d["@type"]
                                schema_types.append(
                                    ", ".join(map(str, t)) if isinstance(t, list) else str(t)
                                )
                    except Exception:
                        continue

                # Content metrics.
                try:
                    readability_score = textstat.flesch_reading_ease(text)
                except Exception:
                    readability_score = 0
                word_count = len((text or "").split())

                grammar_errors = 0
                try:
                    if grammar_tool and text:
                        # Only the first 1000 chars — LanguageTool is slow.
                        grammar_errors = len(grammar_tool.check(text[:1000]))
                except Exception:
                    grammar_errors = 0

                top_keywords = keyword_density(text)
                ratio = round((len(text) / len(html_str)) if html_str else 0, 3)

                results.append({
                    "url": page_url,
                    "title": (title_tag.text.strip() if title_tag and title_tag.text else ""),
                    "meta_description": (meta_desc_tag.get("content", "").strip() if meta_desc_tag else ""),
                    "h1_count": h1_count,
                    "heading_order": ", ".join(heading_order),
                    "missing_alt_tags": missing_alt,
                    "total_images": len(imgs),
                    "small_images": small_images,
                    "large_images": large_images,
                    "ideal_images": ideal_images,
                    "internal_links": internal,
                    "external_links": external,
                    "canonical_tag": bool(canonical_tag),
                    "robots_meta": (robots_tag.get("content", "") if robots_tag else ""),
                    "viewport_present": ("width=device-width" in viewport_tag.get("content", "") if viewport_tag else False),
                    "schema_types": ", ".join(schema_types),
                    "opengraph_tags": len(soup.find_all("meta", property=re.compile("^og:"))),
                    "twitter_tags": len(soup.find_all("meta", attrs={"name": re.compile("^twitter:")})),
                    "word_count": word_count,
                    "readability_score": readability_score,
                    "grammar_errors": grammar_errors,
                    "text_to_html_ratio": ratio,
                    "top_keywords": top_keywords,
                })
            except Exception:
                # Keep scanning other pages even if one fails.
                continue
    finally:
        # FIX: shut down the LanguageTool server (it holds a Java subprocess);
        # the original leaked it for the life of the process.
        if grammar_tool is not None:
            try:
                grammar_tool.close()
            except Exception:
                pass

    def calculate_seo_score(page):
        """Heuristic 0-100 score built from the presence/quality checks above."""
        score = 0
        if page.get('title'):
            score += 10
        if page.get('meta_description'):
            score += 10
        if page.get('h1_count', 0) == 1:
            score += 5
        if page.get('viewport_present', False):
            score += 5
        if page.get('missing_alt_tags', 0) == 0:
            score += 5
        if page.get('canonical_tag', False):
            score += 5
        if page.get('robots_meta', False):  # truthy: non-empty robots meta string
            score += 3
        if page.get('schema_types'):
            score += 5
        if page.get('readability_score', 0) > 50:
            score += 5
        if page.get('top_keywords'):
            score += 5
        return min(score, 100)

    # Attach scores and generate suggestions (AI-backed when a client exists).
    for p in results:
        p["seo_score"] = calculate_seo_score(p)
        title = str(p.get("title", "") or "")
        meta = str(p.get("meta_description", "") or "")
        keywords = str(p.get("top_keywords", "") or "")
        suggestions = []
        # Title outside the ~30-65 char sweet spot (or missing).
        if not title or len(title) < 30 or len(title) > 65:
            suggestions.append("Suggested Title: " + generate_ai_suggestion(openai_client, title, meta, keywords, "Title length issue"))
        # Meta description outside the ~70-160 char sweet spot (or missing).
        if not meta or len(meta) < 70 or len(meta) > 160:
            suggestions.append("Suggested Meta Description: " + generate_ai_suggestion(openai_client, title, meta, keywords, "Meta description length issue"))
        # Flesch reading ease below 50 reads as "difficult".
        try:
            if float(p.get("readability_score", 0) or 0) < 50:
                suggestions.append("Readability: " + generate_ai_suggestion(openai_client, title, meta, keywords, "Improve readability"))
        except Exception:
            pass
        if int(p.get("missing_alt_tags", 0) or 0) > 0:
            suggestions.append(f"{int(p.get('missing_alt_tags', 0))} images missing alt tags. Example: 'Product image showing [keyword]'")
        if not str(p.get("schema_types", "") or "").strip():
            suggestions.append("Add structured data (schema.org): Product/Article/BreadcrumbList")
        try:
            if int(p.get("word_count", 0) or 0) < 300:
                suggestions.append("Page has low content. Expand to 300+ words with keyword-rich helpful content.")
        except Exception:
            pass
        p["seo_suggestions"] = " | ".join(suggestions) if suggestions else "No major suggestions."

    # Persist the CSV report under a collision-proof random name.
    os.makedirs(tmp_dir, exist_ok=True)
    filename = os.path.join(tmp_dir, f"seo_report_{uuid.uuid4().hex}.csv")
    if not results:
        # Nothing analyzed — still write a one-row report explaining why.
        empty_msg = [{
            "url": base_url,
            "error": "No pages analyzed. Site may block crawlers or sitemap was empty.",
            "seo_suggestions": "Try allowing bots or check robots.txt configuration."
        }]
        with open(filename, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=empty_msg[0].keys())
            writer.writeheader()
            writer.writerows(empty_msg)
        return empty_msg, filename

    keys = list(results[0].keys())
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        writer.writerows(results)
    return results, filename