Spaces:
Running
Running
| # app.py | |
| import re | |
| from dataclasses import dataclass, asdict | |
| from typing import Dict, List, Optional, Tuple | |
| from urllib.parse import urlparse, urljoin | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from urllib.robotparser import RobotFileParser | |
| import gradio as gr | |
# --- User-agent tokens of known / common AI crawlers (extend as needed) ---
DEFAULT_AI_AGENTS = [
    # Major AI-specific or AI-extended crawlers
    "GPTBot",           # OpenAI
    "Google-Extended",  # Google's opt-out token for AI training
    "Anthropic-ai",     # Anthropic
    "PerplexityBot",    # Perplexity
    "Claude-Web",       # Anthropic variant seen in the wild
    "CCBot",            # Common Crawl (often used for model datasets)
    "DuckAssistBot",    # DuckDuckGo AI assistant
    "Bytespider",       # ByteDance (ML datasets)
    "Amazonbot",        # Amazon web crawler (AI datasets reported)
    "FacebookBot",      # Meta (generic)
    # Catch-all names that some sites target
    "ai-bot",
    "aibot",
]

# Non-standard (but widely used) directives that opt a page out of AI use.
AI_BLOCK_TOKENS = {
    "noai",
    "noimageai",
}

# Standard robots directives that restrict indexing, following or archiving.
ROBOTS_BLOCK_TOKENS = {
    "noindex",
    "nofollow",
    "none",
    "noarchive",
}

# Media types that are treated as an HTML page.
ALLOWED_CONTENT_TYPES = {
    "text/html",
    "application/xhtml+xml",
}
@dataclass
class RobotsCheck:
    """Result of evaluating a site's robots.txt for a set of user agents.

    Declared as a dataclass because the rest of the module builds it with
    keyword arguments and serialises it with ``dataclasses.asdict``.
    """

    # True when the robots.txt retrieval step completed without an error.
    fetched: bool
    # URL of the robots.txt file that was consulted.
    url: Optional[str]
    # Maps each analysed user agent to whether it may fetch the page.
    allowed_map: Dict[str, bool]
    # Maps each analysed user agent to the Disallow lines found for it
    # (informational only).
    disallow_matches: Dict[str, List[str]]
    # Human-readable problems encountered while fetching or parsing.
    errors: List[str]
@dataclass
class MetaRobotsCheck:
    """Robots directives collected from HTML meta tags or an HTTP header.

    Declared as a dataclass because the rest of the module builds it with
    keyword arguments and serialises it with ``dataclasses.asdict``.
    """

    # True when at least one directive source was found.
    found: bool
    # De-duplicated, lower-cased directives.
    directives: List[str]
    # Subset of ``directives`` that are AI opt-out tokens.
    ai_block_tokens: List[str]
    # Subset of ``directives`` that are standard restrictive robots tokens.
    robots_block_tokens: List[str]
    # Where the directives came from: "html" or "header".
    source: str
    # The unparsed directive values joined with " | ".
    raw: str
@dataclass
class HttpCheck:
    """Outcome of the initial HTTP request for the page under analysis.

    Declared as a dataclass because the rest of the module builds it with
    keyword arguments and serialises it with ``dataclasses.asdict``.
    """

    # HTTP status code of the final response; 0 when the request itself failed.
    status: int
    # URL after following redirects (the requested URL on failure).
    final_url: str
    # Redirect chain as (status code, location) pairs.
    redirects: List[Tuple[int, str]]
    # Lower-cased media type without parameters. NOTE: the fetch helper in this
    # module stores the exception text here when the request fails.
    content_type: str
@dataclass
class CrawlabilityResult:
    """Typed shape of a complete crawlability report.

    Declared as a dataclass for consistency with the other record types in
    this module, so it can be built with keyword arguments and serialised with
    ``dataclasses.asdict``.
    """

    # URL that was analysed.
    url: str
    # One of "AI-crawlable", "AI-blocked", "Indeterminate".
    verdict: str
    # Explanations supporting the verdict.
    reasons: List[str]
    # Result of the page request.
    http: HttpCheck
    # Result of the robots.txt evaluation.
    robots: RobotsCheck
    # Directives found in HTML meta tags, if any were parsed.
    meta_html: Optional[MetaRobotsCheck]
    # Directives found in the X-Robots-Tag header, if present.
    xrobots: Optional[MetaRobotsCheck]
    # User agents that were evaluated.
    analyzed_agents: List[str]
    # One-line human-readable summary.
    summary: str
def _get_robots_url(page_url: str) -> str:
    """Return the robots.txt URL for the site that serves *page_url*.

    Only the scheme and network location of *page_url* are kept; path, query
    and fragment are discarded.
    """
    parts = urlparse(page_url)
    origin = parts.scheme + "://" + parts.netloc
    return urljoin(origin, "/robots.txt")
def _fetch_with_redirects(url: str, timeout: int = 15) -> HttpCheck:
    """Request *url*, following redirects, and describe the final response.

    Args:
        url: Page URL to request.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        An ``HttpCheck`` with the final status, final URL, redirect chain and
        media type. When the request raises ``requests.RequestException`` the
        status is 0, the final URL is *url*, and ``content_type`` carries the
        exception text.
    """
    request_headers = {"User-Agent": "Mozilla/5.0 AI-Crawlability/1.0"}
    try:
        response = requests.get(
            url,
            timeout=timeout,
            allow_redirects=True,
            headers=request_headers,
        )
    except requests.RequestException as exc:
        # NOTE: the error message travels in content_type; callers must treat
        # status == 0 as "request failed" rather than as a media type.
        return HttpCheck(status=0, final_url=url, redirects=[], content_type=str(exc))

    # One (status, target) pair per hop of the redirect chain.
    hops = [
        (hop.status_code, hop.headers.get("Location", hop.url))
        for hop in response.history
    ]
    # Keep only the media type: drop parameters such as "; charset=utf-8".
    media_type = response.headers.get("Content-Type", "").split(";")[0].strip().lower()
    return HttpCheck(
        status=response.status_code,
        final_url=response.url,
        redirects=hops,
        content_type=media_type,
    )
# Meta tag names whose content is treated as robots directives (compiled once).
_META_ROBOTS_NAME_RE = re.compile(
    r"^(robots|googlebot|gptbot|anthropic-ai|perplexitybot)$", re.I)


def _parse_meta_robots_from_html(html: str) -> MetaRobotsCheck:
    """Collect robots directives from the ``<meta>`` tags of an HTML document.

    Tags named ``robots`` or one of the recognised crawler names are read;
    their comma-separated ``content`` values are lower-cased and merged.

    Args:
        html: HTML source of the page.

    Returns:
        A ``MetaRobotsCheck`` with ``source="html"``. ``found`` is True only
        when at least one non-empty directive was present.
    """
    soup = BeautifulSoup(html, "html.parser")
    directives = []
    raw = []
    for tag in soup.find_all("meta", attrs={"name": _META_ROBOTS_NAME_RE}):
        content = (tag.get("content") or "").strip()
        if not content:
            continue
        raw.append(content)
        for piece in content.split(","):
            token = piece.strip().lower()
            # Skip empties produced by stray or trailing commas so that a value
            # such as "," does not count as a directive.
            if token:
                directives.append(token)

    unique = sorted(set(directives))
    return MetaRobotsCheck(
        found=bool(unique),
        directives=unique,
        ai_block_tokens=[d for d in unique if d in AI_BLOCK_TOKENS],
        robots_block_tokens=[d for d in unique if d in ROBOTS_BLOCK_TOKENS],
        source="html",
        raw=" | ".join(raw),
    )
# X-Robots-Tag directives that legitimately have the form "name: value"; a
# colon after one of these does not introduce a user-agent prefix.
_XROBOTS_VALUED_DIRECTIVES = (
    "unavailable_after",
    "max-snippet",
    "max-image-preview",
    "max-video-preview",
)


def _parse_xrobots_from_headers(headers: Dict[str, str]) -> Optional[MetaRobotsCheck]:
    """Collect robots directives from ``X-Robots-Tag`` response headers.

    Each header value is a comma-separated rule list, optionally prefixed with
    a user agent (``"<agent>: <rules>"``). The agent prefix is stripped so the
    rule itself is recognised; agent scoping is not tracked, matching how the
    HTML meta parser merges crawler-specific tags.

    Args:
        headers: Response headers (any mapping with ``items()``).

    Returns:
        ``None`` when no ``X-Robots-Tag`` header is present, otherwise a
        ``MetaRobotsCheck`` with ``source="header"``.
    """
    values = [v for k, v in headers.items() if k.lower() == "x-robots-tag"]
    if not values:
        return None

    directives = []
    for value in values:
        for piece in value.split(","):
            token = piece.strip().lower()
            if not token:
                continue  # stray or trailing comma
            prefix, colon, rest = token.partition(":")
            prefix = prefix.strip()
            # A single-word prefix that is not a valued directive is a user
            # agent: "googlebot: noindex" -> "noindex". Prefixes containing
            # whitespace (e.g. fragments of a date) are left untouched.
            is_agent_prefix = (
                bool(colon)
                and len(prefix.split()) == 1
                and prefix not in _XROBOTS_VALUED_DIRECTIVES
            )
            if is_agent_prefix:
                token = rest.strip()
                if not token:
                    continue
            directives.append(token)

    unique = sorted(set(directives))
    return MetaRobotsCheck(
        found=True,
        directives=unique,
        ai_block_tokens=[d for d in unique if d in AI_BLOCK_TOKENS],
        robots_block_tokens=[d for d in unique if d in ROBOTS_BLOCK_TOKENS],
        source="header",
        raw=" | ".join(values),
    )
def _fetch_text(url: str, timeout: int = 15) -> Tuple[Optional[str], Dict[str, str], int]:
    """GET *url* and return ``(body, headers, status)``.

    The body is returned only for a 200 response and is ``None`` otherwise.
    When the request raises ``requests.RequestException`` the result is
    ``(None, {}, 0)``.
    """
    try:
        response = requests.get(
            url,
            timeout=timeout,
            headers={"User-Agent": "Mozilla/5.0 AI-Crawlability/1.0"},
        )
    except requests.RequestException:
        return (None, {}, 0)

    body = None
    if response.status_code == 200:
        body = response.text
    return (body, response.headers, response.status_code)
def _disallow_lines_for_agent(robots_txt: str, agent: str) -> List[str]:
    """Return the ``Disallow`` lines of the robots.txt group(s) naming *agent*.

    Falls back to the ``*`` group(s) when no group names the agent. Consecutive
    ``User-agent`` lines are treated as one group sharing the rules that follow.
    """
    groups: List[Tuple[List[str], List[str]]] = []  # (agent names, Disallow lines)
    agents_in_group: List[str] = []
    rules_in_group: List[str] = []
    reading_agents = False
    for raw_line in robots_txt.splitlines():
        line = raw_line.strip()
        record = line.split("#", 1)[0].strip()  # ignore comments when classifying
        if ":" not in record:
            continue
        field, _, value = record.partition(":")
        field = field.strip().lower()
        if field == "user-agent":
            if not reading_agents:
                # First User-agent line after rules starts a new group.
                agents_in_group, rules_in_group = [], []
                groups.append((agents_in_group, rules_in_group))
                reading_agents = True
            agents_in_group.append(value.strip().lower())
        elif field != "sitemap":
            reading_agents = False
            if field == "disallow" and groups:
                rules_in_group.append(line)

    for wanted in (agent.strip().lower(), "*"):
        matching = [rules for names, rules in groups if wanted in names]
        if matching:
            return [rule for rules in matching for rule in rules]
    return []


def _check_robots(page_url: str, ai_agents: List[str], timeout: int = 15) -> RobotsCheck:
    """Evaluate the site's robots.txt for each agent in *ai_agents*.

    robots.txt is downloaded once, with this app's User-Agent and a timeout,
    and fed to ``RobotFileParser.parse``. This avoids ``RobotFileParser.read``,
    which has no timeout and identifies itself with the default urllib
    User-Agent.

    HTTP status handling mirrors ``RobotFileParser.read``:
      * 200: rules are parsed.
      * 401/403: everything is disallowed.
      * other 4xx: everything is allowed.
      * any other status: everything is disallowed.
      * request failure (status 0): ``fetched`` is False and every agent
        defaults to allowed.

    Args:
        page_url: URL of the page whose crawlability is being checked.
        ai_agents: User-agent tokens to evaluate.
        timeout: Timeout in seconds for the robots.txt request.

    Returns:
        A ``RobotsCheck`` with per-agent allowances, the informational
        ``Disallow`` lines per agent, and any errors encountered.
    """
    robots_url = _get_robots_url(page_url)
    errors: List[str] = []
    allowed_map: Dict[str, bool] = {}
    disallow_matches: Dict[str, List[str]] = {}

    rp = RobotFileParser()
    rp.set_url(robots_url)

    robots_txt, _, status = _fetch_text(robots_url, timeout=timeout)
    fetched = status != 0
    if not fetched:
        errors.append("robots.txt fetch error: request failed (network error or timeout)")
    elif status == 200:
        rp.parse((robots_txt or "").splitlines())
    elif status in (401, 403):
        rp.disallow_all = True
        errors.append(f"robots.txt unexpected HTTP {status}")
    elif 400 <= status < 500:
        rp.allow_all = True
        if status == 404:
            errors.append("robots.txt not found (404) — defaulting to allowed unless meta/header blocks apply.")
        else:
            errors.append(f"robots.txt unexpected HTTP {status}")
    else:
        # e.g. 5xx: rules are unknown, so assume nothing may be fetched.
        rp.disallow_all = True
        errors.append(f"robots.txt unexpected HTTP {status}")

    for agent in ai_agents:
        try:
            allowed_map[agent] = rp.can_fetch(agent, page_url) if fetched else True
        except Exception as e:  # best effort: one bad agent must not abort the report
            allowed_map[agent] = True
            errors.append(f"robotparser error for agent '{agent}': {e}")
        # Collect Disallow lines for display only; the verdict uses allowed_map.
        disallow_matches[agent] = (
            _disallow_lines_for_agent(robots_txt, agent) if robots_txt else []
        )

    return RobotsCheck(
        fetched=fetched,
        url=robots_url,
        allowed_map=allowed_map,
        disallow_matches=disallow_matches,
        errors=errors,
    )
def check_ai_crawlability(
    url: str,
    ai_agents: Optional[List[str]] = None,
    timeout: int = 15
) -> Dict:
    """Assess whether common AI crawlers may fetch and use the page at *url*.

    Combines the HTTP response, robots.txt rules, HTML meta robots tags and
    the ``X-Robots-Tag`` header into a single verdict.

    Args:
        url: Page URL to analyse.
        ai_agents: User-agent tokens to evaluate; defaults to
            ``DEFAULT_AI_AGENTS`` when omitted or empty.
        timeout: Timeout in seconds for the page requests.

    Returns:
        A dict with keys:
          - url: the URL that was analysed
          - verdict: "AI-crawlable" | "AI-blocked" | "Indeterminate"
          - reasons: list of strings explaining the decision
          - http, robots, meta_html, xrobots: detailed sub-objects
          - analyzed_agents: list of agents considered
          - summary: one-line summary
    """
    # Copy so the returned list never aliases the caller's list or the
    # module-level default.
    ai_agents = list(ai_agents) if ai_agents else list(DEFAULT_AI_AGENTS)

    # Step 1: HTTP fetch (final URL, status, content type)
    http_info = _fetch_with_redirects(url, timeout=timeout)
    reasons = []
    if http_info.status == 0:
        # On a failed request the fetch helper stores the exception text in
        # content_type; report it as a failure rather than as a media type.
        reasons.append(f"Request failed for {http_info.final_url}: {http_info.content_type}")
        is_html = False
    else:
        is_html = (not http_info.content_type
                   or http_info.content_type in ALLOWED_CONTENT_TYPES)
        if http_info.status != 200:
            reasons.append(f"Non-200 status ({http_info.status}) for final URL: {http_info.final_url}")
        if not is_html:
            reasons.append(f"Non-HTML content-type '{http_info.content_type}'")

    # Step 2: robots.txt checks
    robots_info = _check_robots(http_info.final_url, ai_agents)

    # Step 3: fetch final page to read HTML meta + headers (for X-Robots-Tag)
    page_text, headers, _ = _fetch_text(http_info.final_url, timeout=timeout)
    # Only parse meta tags out of HTML; a PDF or image body has none.
    meta_html = _parse_meta_robots_from_html(page_text) if (page_text and is_html) else None
    xrobots = _parse_xrobots_from_headers(headers) if headers else None

    # Decision logic
    denied_agents = [a for a, ok in robots_info.allowed_map.items() if not ok]
    any_disallowed = bool(denied_agents)
    any_allowed = any(robots_info.allowed_map.values())
    if any_disallowed:
        reasons.append(f"robots.txt disallows for agents: {', '.join(denied_agents)}")
    if meta_html and (meta_html.ai_block_tokens or meta_html.robots_block_tokens):
        reasons.append(f"HTML meta robots contains: {', '.join(meta_html.ai_block_tokens + meta_html.robots_block_tokens)}")
    if xrobots and (xrobots.ai_block_tokens or xrobots.robots_block_tokens):
        reasons.append(f"X-Robots-Tag contains: {', '.join(xrobots.ai_block_tokens + xrobots.robots_block_tokens)}")

    # Only AI opt-out tokens and robots.txt denials block; standard robots
    # tokens (noindex etc.) are reported as reasons but do not decide.
    ai_opt_out = bool(
        (meta_html and meta_html.ai_block_tokens)
        or (xrobots and xrobots.ai_block_tokens)
    )
    if ai_opt_out or any_disallowed:
        verdict = "AI-blocked"
        summary = "Page is NOT AI-crawlable due to robots/meta/header restrictions."
    elif http_info.status == 200 and (any_allowed or robots_info.fetched is False):
        verdict = "AI-crawlable"
        summary = "Page appears AI-crawlable."
    else:
        verdict = "Indeterminate"
        summary = "AI crawlability is indeterminate; manual review recommended."

    return {
        "url": url,
        "verdict": verdict,
        "reasons": reasons or ["No explicit blocks found."],
        "http": asdict(http_info),
        "robots": asdict(robots_info),
        "meta_html": asdict(meta_html) if meta_html else None,
        "xrobots": asdict(xrobots) if xrobots else None,
        "analyzed_agents": ai_agents,
        "summary": summary,
    }
# -------- Gradio UI --------
def _format_report(result: dict) -> str:
    """Render the result dict of a crawlability check as plain text."""
    lines = [
        f"Verdict: {result['verdict']}",
        f"Summary: {result['summary']}",
        "—",
    ]

    http = result.get("http", {})
    lines.append(f"HTTP: status={http.get('status')} final_url={http.get('final_url')} content_type={http.get('content_type')}")
    redirects = http.get("redirects") or []
    if redirects:
        lines.append("Redirect chain:")
        lines.extend(f" {code} -> {loc}" for code, loc in redirects)

    robots = result.get("robots", {})
    lines.append(f"robots.txt: fetched={robots.get('fetched')} url={robots.get('url')}")
    allowed_map = robots.get("allowed_map") or {}
    if allowed_map:
        lines.append("Agent allowances:")
        lines.extend(
            f" {agent}: {'allowed' if allowed else 'DISALLOWED'}"
            for agent, allowed in allowed_map.items()
        )
    errors = robots.get("errors") or []
    if errors:
        lines.append("robots.txt errors:")
        lines.extend(f" - {e}" for e in errors)

    reasons = result.get("reasons") or []
    if reasons:
        lines.append("Decision reasons:")
        lines.extend(f" - {r}" for r in reasons)
    return "\n".join(lines)


def run_check(url: str, agents_csv: str, timeout: int):
    """Gradio callback: run the crawlability check and format the outcome.

    Args:
        url: Page URL typed by the user. A URL without a scheme (for example
            ``example.com/page``) is assumed to be ``https://``.
        agents_csv: Optional comma-separated user-agent tokens; when blank the
            default agent list is used.
        timeout: Request timeout in seconds.

    Returns:
        A ``(text, result)`` pair: a human-readable report and the raw result
        dict. On empty input or any error the dict is empty and the text
        explains the problem.
    """
    url = (url or "").strip()
    if not url:
        return "Please enter a URL.", {}
    # Users commonly paste bare host names; without a scheme the HTTP request
    # cannot be made at all.
    if not re.match(r"[a-zA-Z][a-zA-Z0-9+.\-]*://", url):
        url = "https://" + url

    # None (rather than an empty list) selects the default agents.
    agents = [a.strip() for a in (agents_csv or "").split(",") if a.strip()] or None

    try:
        result = check_ai_crawlability(url, ai_agents=agents, timeout=timeout)
        return _format_report(result), result
    except Exception as e:  # UI boundary: show the error instead of crashing
        return f"Error: {e}", {}
# Page-level stylesheet handed to gr.Blocks.
CSS = """
.wrap { max-width: 1200px; margin: 0 auto; }
.small { font-size: 0.9em; color: #555; }
"""

# Introductory text rendered at the top of the page.
_INTRO_MARKDOWN = """
# 🔍 Generative Engine Optimization Checker
Enter a page URL to inspect its AI crawlability for common AI user agents.
The app checks redirects, `robots.txt`, HTML meta robots, and `X-Robots-Tag` headers.
**Note:** Some sites block crawlers; network errors may occur.
"""

# Pre-filled example rows: (page URL, agents CSV, timeout seconds).
_EXAMPLE_ROWS = [
    ["https://openai.com/", "GPTBot,Google-Extended", 15],
    ["https://example.com/", "", 15],
    ["https://www.anthropic.com/", "Anthropic-ai", 15],
]

with gr.Blocks(css=CSS, title="Generative Engine Optimization Checker") as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    # Inputs: target URL, optional agent list, request timeout.
    with gr.Row():
        url = gr.Textbox(
            label="Page URL",
            placeholder="https://example.com/article",
            scale=3,
        )
    with gr.Row():
        agents = gr.Textbox(
            label="AI agents (comma-separated, optional)",
            placeholder="GPTBot,Anthropic-ai,PerplexityBot",
            value="",
            scale=2,
        )
        timeout = gr.Slider(5, 30, value=15, step=1, label="Timeout (seconds)", scale=1)
    with gr.Row():
        run_btn = gr.Button("Run Check ✅", variant="primary")

    # Outputs: readable report and the raw result.
    with gr.Row():
        summary = gr.Textbox(label="Summary", lines=12)
    with gr.Row():
        json_out = gr.JSON(label="Raw JSON Result")

    examples = gr.Examples(
        examples=_EXAMPLE_ROWS,
        inputs=[url, agents, timeout],
        label="Examples",
    )

    # Wire the button to the check callback.
    run_btn.click(
        run_check,
        inputs=[url, agents, timeout],
        outputs=[summary, json_out],
    )

if __name__ == "__main__":
    demo.launch()