moizshah956 committed on
Commit
7c77f56
·
verified ·
1 Parent(s): a38a440

Create seo_bot.py

Browse files
Files changed (1) hide show
  1. seo_bot.py +392 -0
seo_bot.py ADDED
@@ -0,0 +1,392 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # seo_bot.py
2
+ import os
3
+ import csv
4
+ import json
5
+ import re
6
+ import time
7
+ import uuid
8
+ import requests
9
+ from bs4 import BeautifulSoup
10
+ from urllib.parse import urljoin, urlparse
11
+ from collections import Counter
12
+ import textstat
13
+
14
+ # Optional grammar check
15
+ try:
16
+ import language_tool_python
17
+ LT_AVAILABLE = True
18
+ except Exception:
19
+ LT_AVAILABLE = False
20
+
21
+ # Optional OpenAI client (modern package)
22
+ try:
23
+ from openai import OpenAI
24
+ OPENAI_AVAILABLE = True
25
+ except Exception:
26
+ OPENAI_AVAILABLE = False
27
+
28
+ HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
29
+
30
+
31
# ==============================
# OpenAI Client & Suggestion
# ==============================
def make_client():
    """
    Build an OpenAI client when both the API key and the package are present.

    Returns:
        An ``OpenAI`` client instance, or ``None`` when the ``OPENAI_API_KEY``
        environment variable is missing, the ``openai`` package could not be
        imported, or client construction raises. Callers treat ``None`` as
        "AI suggestions disabled" and fall back gracefully.
    """
    key = os.getenv("OPENAI_API_KEY")
    if not key:
        print("⚠️ OPENAI_API_KEY not set — AI suggestions will be disabled.")
        return None
    if not OPENAI_AVAILABLE:
        print("⚠️ OpenAI package not available in environment — AI suggestions disabled.")
        return None

    # Modern OpenAI client initialization (no deprecated kwargs such as `proxies`).
    try:
        client = OpenAI(api_key=key)
    except Exception as err:
        print("⚠️ Error initializing OpenAI:", str(err))
        return None
    print("✅ OpenAI client initialized.")
    return client
55
+
56
+
57
def generate_ai_suggestion(client, title, meta_description, keywords, issue_type):
    """
    Ask the OpenAI chat API for a short, practical SEO suggestion.

    Args:
        client: OpenAI client from make_client(), or None when AI is disabled.
        title: Current page title.
        meta_description: Current meta description.
        keywords: Top-keyword summary string for the page.
        issue_type: Short description of the problem to address.

    Returns:
        A 1-2 sentence suggestion string; when `client` is None or the API
        call/response is unusable, a parenthesised fallback message is
        returned instead of raising, so one failure never aborts a scan.
    """
    if client is None:
        return "(AI disabled — set OPENAI_API_KEY to enable suggestions)"

    try:
        prompt = f"""
You are a professional SEO consultant. Provide a concise (1-2 sentences) practical suggestion.
Title: {title}
Meta Description: {meta_description}
Keywords: {keywords}
Problem: {issue_type}
"""
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are an expert SEO consultant."},
                {"role": "user", "content": prompt},
            ],
            max_tokens=120,
            temperature=0.7,
            top_p=0.95,
        )
        # Defensive access: the response shape may not match expectations.
        try:
            return response.choices[0].message.content.strip()
        except Exception:
            return "(AI suggestion unavailable: malformed response)"
    except Exception as err:
        # Transient API failures must not crash the whole scan.
        return f"(AI suggestion unavailable: {str(err)})"
90
+
91
+
92
# ==============================
# Utility Functions
# ==============================
def keyword_density(text, top_n=10, min_len=4):
    """
    Summarise the most frequent "significant" words in `text`.

    Tokenises on word boundaries (case-insensitive), keeps words of at least
    `min_len` characters, and reports the `top_n` words that occur more than
    once, each as a percentage of the significant-word total.

    Args:
        text: Raw page text; None/empty is tolerated and yields "".
        top_n: Maximum number of keywords to report (default 10, matching
            the original hard-coded cutoff).
        min_len: Minimum word length counted as significant (default 4,
            matching the original ``len(w) > 3`` filter).

    Returns:
        A string like "seo:12.5%, marketing:8.33%", or "" when no word
        repeats.
    """
    words = re.findall(r'\b\w+\b', (text or "").lower())
    freq = Counter(w for w in words if len(w) >= min_len)
    # `or 1` guards against division by zero on empty/short input.
    total = sum(freq.values()) or 1
    items = sorted(
        ((w, round(n / total * 100, 2)) for w, n in freq.items() if n > 1),
        key=lambda kv: -kv[1],
    )[:top_n]
    return ", ".join(f"{w}:{p}%" for w, p in items)
102
+
103
+
104
def get_image_size_kb(src_url, base_url):
    """
    Download an image and report its size in kilobytes (one decimal place).

    `src_url` may be relative; it is resolved against `base_url` first. Any
    network failure or non-200 status yields 0.0 so that one unreadable
    image never breaks a page scan.
    """
    try:
        resolved = urljoin(base_url, src_url)
        response = requests.get(resolved, headers=HEADERS, timeout=5)
        if response.status_code == 200:
            return round(len(response.content) / 1024, 1)
    except Exception:
        # Best-effort: treat unreachable images as size 0.
        pass
    return 0.0
114
+
115
+
116
# ==============================
# Main SEO Analyzer
# ==============================
def run_seo_and_suggestions(base_url, max_pages=30, tmp_dir="/tmp"):
    """
    Crawl a site (seeded from its sitemap when available), analyze each page
    for on-page SEO signals, score every page, and attach improvement
    suggestions (AI-generated when an OpenAI key is configured).

    Args:
        base_url: Root URL of the site to scan (required).
        max_pages: Upper bound on the number of pages analyzed.
        tmp_dir: Directory where the CSV report is written.

    Returns:
        (results_list, csv_path) — per-page metric dicts and the path of the
        CSV report that was written.

    Raises:
        ValueError: if base_url is falsy.
    """
    if not base_url:
        raise ValueError("base_url is required")

    domain = urlparse(base_url).netloc
    sitemap_links = set()
    visited = set()

    def get_sitemap_links():
        # Seed the crawl from sitemap.xml when the site publishes one.
        sitemap_url = urljoin(base_url, "sitemap.xml")
        try:
            r = requests.get(sitemap_url, headers=HEADERS, timeout=8)
            if r.status_code == 200 and r.text:
                soup = BeautifulSoup(r.text, "xml")
                for loc in soup.find_all("loc"):
                    href = loc.text.strip()
                    if href:
                        sitemap_links.add(href)
        except Exception:
            # A missing/broken sitemap is not fatal — fall back to crawling.
            pass

    def get_robots_txt():
        # Fetch robots.txt text; "" on any failure.
        robots_url = urljoin(base_url, "robots.txt")
        try:
            r = requests.get(robots_url, headers=HEADERS, timeout=5)
            if r.status_code == 200:
                return r.text
        except Exception:
            pass
        return ""

    def crawl_site():
        # Breadth-first crawl over same-domain links, capped at max_pages
        # successfully fetched pages.
        to_visit = list(sitemap_links) if sitemap_links else [base_url]
        all_urls = []
        while to_visit and len(all_urls) < max_pages:
            u = to_visit.pop(0)
            if u in visited:
                continue
            visited.add(u)
            try:
                r = requests.get(u, headers=HEADERS, timeout=10)
                if r.status_code != 200 or not r.text:
                    continue
                soup = BeautifulSoup(r.text, "html.parser")
                all_urls.append(u)
                # Queue same-domain links (fragment and query stripped).
                for a in soup.find_all("a", href=True):
                    href = urljoin(u, a["href"]).split("#")[0].split("?")[0]
                    parsed = urlparse(href)
                    if parsed.netloc == domain and href not in visited and href not in to_visit:
                        to_visit.append(href)
            except Exception:
                # Skip on any error (timeout, connection error, bad HTML).
                continue
        return all_urls

    # --- start
    get_sitemap_links()
    robots_txt = get_robots_txt()  # NOTE(review): fetched but not yet used in scoring
    pages = crawl_site()

    # Optional grammar tool (best-effort; requires language_tool_python).
    grammar_tool = None
    if LT_AVAILABLE:
        try:
            # Instantiate default LanguageTool (local server not required).
            grammar_tool = language_tool_python.LanguageTool('en-US')
        except Exception:
            grammar_tool = None

    # Optional OpenAI client; None disables AI suggestions gracefully.
    openai_client = make_client()

    results = []

    for page_url in pages:
        try:
            r = requests.get(page_url, headers=HEADERS, timeout=12)
            if r.status_code != 200 or not r.text:
                continue
            html = r.text
            soup = BeautifulSoup(html, "html.parser")

            title_tag = soup.title
            meta_desc_tag = soup.find("meta", attrs={"name": "description"})
            canonical_tag = soup.find("link", rel="canonical")
            robots_tag = soup.find("meta", attrs={"name": "robots"})
            viewport_tag = soup.find("meta", attrs={"name": "viewport"})
            text = soup.get_text(separator=" ", strip=True)
            html_str = str(soup)

            # Link profile. BUGFIX: compare the parsed netloc instead of a
            # substring test (`domain in href`), which wrongly counted
            # external URLs that merely contain the domain string as internal.
            anchors = soup.find_all("a", href=True)
            internal = external = 0
            for a in anchors:
                href = urljoin(page_url, a['href'])
                if urlparse(href).netloc == domain:
                    internal += 1
                else:
                    external += 1

            # Image audit: alt text plus rough size buckets (KB thresholds).
            imgs = soup.find_all("img")
            missing_alt = small_images = large_images = ideal_images = 0
            for img in imgs:
                if not img.get("alt"):
                    missing_alt += 1
                src = img.get("src")
                if not src:
                    continue
                size_kb = get_image_size_kb(src, page_url)
                if size_kb < 5:
                    small_images += 1
                elif size_kb > 250:
                    large_images += 1
                else:
                    ideal_images += 1

            # Heading structure.
            heading_tags = soup.find_all(re.compile('^h[1-6]$'))
            heading_order = [h.name for h in heading_tags]
            h1_count = len(soup.find_all("h1"))

            # Structured data: collect JSON-LD @type values.
            schema_types = []
            for tag in soup.find_all("script", type="application/ld+json"):
                try:
                    if not tag.string:
                        continue
                    data = json.loads(tag.string)
                    if isinstance(data, dict) and "@type" in data:
                        schema_types.append(data["@type"])
                    elif isinstance(data, list):
                        for d in data:
                            if isinstance(d, dict) and "@type" in d:
                                schema_types.append(d["@type"])
                except Exception:
                    continue

            # Content metrics.
            try:
                readability_score = textstat.flesch_reading_ease(text)
            except Exception:
                readability_score = 0

            word_count = len((text or "").split())
            grammar_errors = 0
            try:
                if grammar_tool and text:
                    # Only the first 1000 chars — grammar checking is slow.
                    grammar_errors = len(grammar_tool.check(text[:1000]))
            except Exception:
                grammar_errors = 0

            top_keywords = keyword_density(text)
            ratio = round((len(text) / len(html_str)) if html_str else 0, 3)

            page = {
                "url": page_url,
                "title": (title_tag.text.strip() if title_tag and title_tag.text else ""),
                "meta_description": (meta_desc_tag.get("content", "").strip() if meta_desc_tag else ""),
                "h1_count": h1_count,
                "heading_order": ", ".join(heading_order),
                "missing_alt_tags": missing_alt,
                "total_images": len(imgs),
                "small_images": small_images,
                "large_images": large_images,
                "ideal_images": ideal_images,
                "internal_links": internal,
                "external_links": external,
                "canonical_tag": bool(canonical_tag),
                "robots_meta": (robots_tag.get("content", "") if robots_tag else ""),
                "viewport_present": ("width=device-width" in viewport_tag.get("content", "") if viewport_tag else False),
                "schema_types": ", ".join(schema_types),
                "opengraph_tags": len(soup.find_all("meta", property=re.compile("^og:"))),
                "twitter_tags": len(soup.find_all("meta", attrs={"name": re.compile("^twitter:")})),
                "word_count": word_count,
                "readability_score": readability_score,
                "grammar_errors": grammar_errors,
                "text_to_html_ratio": ratio,
                "top_keywords": top_keywords
            }

            results.append(page)

        except Exception:
            # Keep scanning other pages even if one fails.
            continue

    def calculate_seo_score(page):
        # Additive rubric over the collected signals; capped at 100.
        score = 0
        if page.get('title'): score += 10
        if page.get('meta_description'): score += 10
        if page.get('h1_count', 0) == 1: score += 5
        if page.get('viewport_present', False): score += 5
        if page.get('missing_alt_tags', 0) == 0: score += 5
        if page.get('canonical_tag', False): score += 5
        if page.get('robots_meta', False): score += 3
        if page.get('schema_types'): score += 5
        if page.get('readability_score', 0) > 50: score += 5
        if page.get('top_keywords'): score += 5
        return min(score, 100)

    # Attach scores and generate suggestions (AI when a client is available).
    for p in results:
        p["seo_score"] = calculate_seo_score(p)

        title = str(p.get("title", "") or "")
        meta = str(p.get("meta_description", "") or "")
        keywords = str(p.get("top_keywords", "") or "")

        suggestions = []

        # Title suggestion (missing, too short, or too long).
        if not title or len(title) < 30 or len(title) > 65:
            suggestions.append("Suggested Title: " + generate_ai_suggestion(openai_client, title, meta, keywords, "Title length issue"))

        # Meta description suggestion (missing, too short, or too long).
        if not meta or len(meta) < 70 or len(meta) > 160:
            suggestions.append("Suggested Meta Description: " + generate_ai_suggestion(openai_client, title, meta, keywords, "Meta description length issue"))

        # Readability suggestion (Flesch score below 50).
        try:
            if float(p.get("readability_score", 0) or 0) < 50:
                suggestions.append("Readability: " + generate_ai_suggestion(openai_client, title, meta, keywords, "Improve readability"))
        except Exception:
            pass

        # Missing alt tags.
        if int(p.get("missing_alt_tags", 0) or 0) > 0:
            suggestions.append(f"{int(p.get('missing_alt_tags', 0))} images missing alt tags. Example: 'Product image showing [keyword]'")

        # Structured data.
        if not str(p.get("schema_types", "") or "").strip():
            suggestions.append("Add structured data (schema.org): Product/Article/BreadcrumbList")

        # Thin content.
        try:
            if int(p.get("word_count", 0) or 0) < 300:
                suggestions.append("Page has low content. Expand to 300+ words with keyword-rich helpful content.")
        except Exception:
            pass

        p["seo_suggestions"] = " | ".join(suggestions) if suggestions else "No major suggestions."

    # Release the LanguageTool subprocess if one was started (resource leak fix).
    if grammar_tool is not None:
        try:
            grammar_tool.close()
        except Exception:
            pass

    # Persist CSV (safe: unique filename, directory created on demand).
    os.makedirs(tmp_dir, exist_ok=True)
    filename = os.path.join(tmp_dir, f"seo_report_{uuid.uuid4().hex}.csv")

    if not results:
        empty_msg = [{
            "url": base_url,
            "error": "No pages analyzed. Site may block crawlers or sitemap was empty.",
            "seo_suggestions": "Try allowing bots or check robots.txt configuration."
        }]
        with open(filename, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=empty_msg[0].keys())
            writer.writeheader()
            writer.writerows(empty_msg)
        return empty_msg, filename

    keys = list(results[0].keys())
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        writer.writerows(results)

    return results, filename