Spaces:

Alinabil1
/

last_edit

Sleeping

last_edit / src /robots_check.py

Moharek

Deploy Moharek GEO Platform

a74b879 about 1 month ago

1.8 kB

	import requests
	from urllib.parse import urljoin

	# User-agent tokens of the AI crawlers looked up in a site's robots.txt.
	AI_USER_AGENTS = [
	    'GPTBot',
	    'Google-Extended',
	    'PerplexityBot',
	    'ClaudeBot',
	    'CCBot',
	]


	def is_ai_allowed(base_url: str) -> dict:
	    """Fetch a site's robots.txt and report whether common AI crawlers are blocked.

	    For every name in ``AI_USER_AGENTS`` the group addressed to that crawler
	    is consulted (matched case-insensitively), falling back to the ``*``
	    group. A crawler counts as blocked only when that group disallows the
	    whole site (``Disallow: /`` or ``Disallow: /*``) without an equivalent
	    ``Allow`` rule; partial rules such as ``Disallow: /private`` are not
	    reported as a block.

	    Args:
	        base_url: Any URL on the target site; ``/robots.txt`` is resolved
	            against it.

	    Returns:
	        On success ``{'robots_url': str, 'allowed': {agent: True/False/None},
	        'raw': str}``, where ``None`` means no group applies to the agent.
	        On a non-200 response or a failed request
	        ``{'robots_url': str, 'allowed': {}, 'raw': None, 'reason': str}``.
	    """
	    robots_url = urljoin(base_url, '/robots.txt')
	    try:
	        resp = requests.get(robots_url, timeout=8)
	        if resp.status_code != 200:
	            return {
	                'robots_url': robots_url,
	                'allowed': {},
	                'raw': None,
	                'reason': f'status {resp.status_code}',
	            }
	        txt = resp.text
	    except Exception as e:
	        # Deliberately broad: a failed fetch is reported in the result
	        # instead of being raised to the caller.
	        return {'robots_url': robots_url, 'allowed': {}, 'raw': None, 'reason': str(e)}

	    groups = _parse_robots_groups(txt)
	    wildcard = groups.get('*')
	    allowed = {}
	    for agent in AI_USER_AGENTS:
	        # An explicit group for the agent wins over the wildcard group.
	        group = groups.get(agent.lower(), wildcard)
	        allowed[agent] = None if group is None else not _blocks_whole_site(group)

	    return {'robots_url': robots_url, 'allowed': allowed, 'raw': txt}


	def _parse_robots_groups(txt: str) -> dict:
	    """Map each lower-cased user-agent token to its list of (directive, path) rules."""
	    groups = {}
	    current_agents = []
	    seen_rule = False  # True once the current group has received a rule
	    for raw_line in txt.lstrip('\ufeff').splitlines():
	        # '#' starts a comment anywhere on the line, not only at column 0.
	        line = raw_line.split('#', 1)[0].strip()
	        field, sep, value = line.partition(':')
	        if not sep:
	            continue
	        field = field.strip().lower()
	        value = value.strip()
	        if field == 'user-agent':
	            if seen_rule:
	                # A user-agent line that follows rules opens a new group;
	                # consecutive user-agent lines share the same group.
	                current_agents = []
	                seen_rule = False
	            token = value.lower()
	            current_agents.append(token)
	            groups.setdefault(token, [])
	        elif field in ('allow', 'disallow') and current_agents:
	            seen_rule = True
	            for token in current_agents:
	                groups[token].append((field, value))
	    return groups


	def _blocks_whole_site(group: list) -> bool:
	    """Return True when the rules disallow the entire site with no equivalent allow."""
	    site_wide = ('/', '/*')
	    disallowed = any(kind == 'disallow' and path in site_wide for kind, path in group)
	    # An equally specific Allow rule takes precedence over the Disallow.
	    re_allowed = any(kind == 'allow' and path in site_wide for kind, path in group)
	    return disallowed and not re_allowed