File size: 1,638 Bytes
88bff1a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from urllib.parse import urlsplit
from urllib.robotparser import RobotFileParser

import requests

def hygiene_score(entity: str) -> int:
    """Score the technical "AI hygiene" of *entity* on a 0-100 scale.

    If 'entity' is a URL (starts with "http"):
      • robots.txt allows GPTBot → +30
      • Response time < 0.3s → +20
      • <link rel="canonical"> present → +20
      • <img alt="..."> present → +20
      • No paywall ("subscribe"/"login" not found) → +10
      Cap at 100.
    Else (non-URL), return default 50.

    Returns 40 when the page itself cannot be fetched.
    """
    if not entity.startswith("http"):
        return 50

    score = 0

    # 1) robots.txt — fetch from the site root (appending "/robots.txt"
    # to the page URL, as before, produced a wrong path for non-root
    # pages) and actually parse the rules: the old substring test
    # `"GPTBot" in r.text` awarded +30 even for an explicit
    # "User-agent: GPTBot / Disallow: /" block, and awarded nothing when
    # robots.txt never mentioned GPTBot (which permits crawling).
    split = urlsplit(entity)
    robots_url = f"{split.scheme}://{split.netloc}/robots.txt"
    try:
        robots_resp = requests.get(robots_url, timeout=5)
        if robots_resp.status_code == 200:
            parser = RobotFileParser()
            parser.parse(robots_resp.text.splitlines())
            if parser.can_fetch("GPTBot", entity):
                score += 30
    except requests.RequestException:
        # A robots.txt fetch failure should not wipe out the page checks
        # below (the old blanket `except Exception: return 40` did).
        pass

    try:
        page_resp = requests.get(entity, timeout=5)
    except requests.RequestException:
        # Page unreachable: keep the original partial-credit fallback.
        return 40

    # 2) Page response time as measured by requests (server latency).
    if page_resp.elapsed.total_seconds() < 0.3:
        score += 20

    html = page_resp.text

    # 3) Canonical tag (coarse substring check; assumes double quotes).
    if 'rel="canonical"' in html:
        score += 20

    # 4) Alt-text on images (coarse: any <img> plus any alt= attribute).
    if "<img" in html and "alt=" in html:
        score += 20

    # 5) Paywall heuristic: reward pages not mentioning subscribe/login.
    lower = html.lower()
    if "subscribe" not in lower and "login" not in lower:
        score += 10

    return min(score, 100)

def hygiene_recommendation(entity: str, score: int) -> str:
    """Return an actionable hygiene tip for *entity* given its *score*.

    Scores of 50 and above get a maintenance message; anything lower
    gets concrete remediation steps.
    """
    if score >= 50:
        return "Technical hygiene is solid; continue monitoring robots.txt and server speed."
    return (
        "Ensure robots.txt allows GPTBot, add a canonical tag, "
        "include alt-text for all images, and remove any paywalls."
    )