File size: 4,400 Bytes
23a3b80
66db317
0da71ca
23a3b80
 
 
 
66db317
 
23a3b80
 
 
 
 
 
 
 
 
 
 
66db317
23a3b80
 
 
 
 
 
 
 
 
 
 
 
0d6cf2b
23a3b80
 
 
 
 
 
 
 
0da71ca
23a3b80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0da71ca
23a3b80
 
0da71ca
23a3b80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import { singleton } from 'tsyringe';
import { URL } from 'url';
import { AssertionFailureError, DownstreamServiceFailureError, ResourcePolicyDenyError } from 'civkit/civ-rpc';
import { AsyncService } from 'civkit/async-service';
import { HashManager } from 'civkit/hash';
import { marshalErrorLike } from 'civkit/lang';

import { GlobalLogger } from './logger';
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
import { Threaded } from '../services/threaded';


/**
 * Shared hasher constructed with the 'md5' and 'hex' arguments (presumably MD5 digests
 * rendered as hex strings — confirm against civkit's HashManager).
 * RobotsTxtService hashes the lower-cased site origin with it to build cache object names.
 */
export const md5Hasher = new HashManager('md5', 'hex');

/**
 * Loads a site's robots.txt (through a storage-bucket cache) and checks whether a URL
 * may be fetched by a given user agent.
 */
@singleton()
export class RobotsTxtService extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: GlobalLogger,
        protected firebaseStorageBucketControl: FirebaseStorageBucketControl,
    ) {
        super(...arguments);
    }

    /** Waits for injected dependencies, then signals readiness. */
    override async init() {
        await this.dependencyReady();
        this.emit('ready');
    }

    /**
     * Returns the robots.txt body for `origin`, preferring the cached copy.
     *
     * The cache object is `robots-txt/<hash of lower-cased origin>`. Any error while
     * reading the cache is treated as a miss. On a miss the file is fetched from
     * `<origin>/robots.txt` with a 5 second timeout and written back to the cache in the
     * background; a failed write is only logged. No expiry logic is visible in this class.
     *
     * @param origin Site origin, e.g. `https://example.com`.
     * @returns The robots.txt content as a string.
     * @throws DownstreamServiceFailureError when the server answers with a non-OK status.
     */
    async getCachedRobotTxt(origin: string) {
        const digest = md5Hasher.hash(origin.toLowerCase());
        const cacheLoc = `robots-txt/${digest}`;

        // Best effort: a cache read failure must not prevent the live fetch.
        const cached = await this.firebaseStorageBucketControl.downloadFile(cacheLoc).catch(() => undefined);
        if (cached) {
            return cached.toString();
        }

        const r = await fetch(new URL('robots.txt', origin).href, { signal: AbortSignal.timeout(5000) });
        if (!r.ok) {
            throw new DownstreamServiceFailureError(`Failed to fetch robots.txt from ${origin}: ${r.status} ${r.statusText}`);
        }
        const fetched = Buffer.from(await r.arrayBuffer());

        // Fire-and-forget cache write; the caller does not wait for it.
        this.firebaseStorageBucketControl.saveFile(cacheLoc, fetched, {
            contentType: 'text/plain'
        }).catch((err) => {
            this.logger.warn(`Failed to save robots.txt to cache: ${err}`, { err: marshalErrorLike(err) });
        });

        return fetched.toString();
    }

    /**
     * Tests a robots.txt path pattern against a request target.
     *
     * Patterns are prefix matches in which `*` stands for any (possibly empty) run of
     * characters and a trailing `$` anchors the pattern to the end of the target.
     * Matching is done with linear `indexOf` scans rather than a generated regular
     * expression, because the pattern comes from an untrusted remote file.
     *
     * @param pattern Value of an `Allow` / `Disallow` directive (non-empty).
     * @param target Path (optionally with query string) to test.
     * @returns Whether the pattern matches the target.
     */
    protected matchesRulePattern(pattern: string, target: string): boolean {
        const anchored = pattern.endsWith('$');
        const [head = '', ...rest] = (anchored ? pattern.slice(0, -1) : pattern).split('*');
        if (!target.startsWith(head)) {
            return false;
        }

        const tail = rest.pop();
        if (tail === undefined) {
            // No wildcard at all: plain prefix match, or exact match when anchored.
            return !anchored || target.length === head.length;
        }

        // Leftmost placement of each middle segment leaves the most room for later ones.
        let cursor = head.length;
        for (const segment of rest) {
            const found = target.indexOf(segment, cursor);
            if (found === -1) {
                return false;
            }
            cursor = found + segment.length;
        }

        if (anchored) {
            // The final segment must be a suffix that does not overlap what was consumed.
            return target.length - tail.length >= cursor && target.endsWith(tail);
        }

        return target.indexOf(tail, cursor) !== -1;
    }

    /**
     * Checks `url` against the site's robots.txt for the given user agent.
     *
     * Evaluation rules implemented here:
     * - Groups headed by `User-agent: *` or by an agent equal (case-insensitively) to
     *   `inputMyUa` apply; consecutive `User-agent` lines form one group.
     * - Directives appearing before any `User-agent` line are applied as well.
     * - Text after `#` on a line is treated as a comment.
     * - Applicable `Allow` / `Disallow` lines are evaluated in file order and the first
     *   one that matches decides; an empty value on either directive allows access.
     * - Patterns support `*` and a trailing `$`, and are tested against path plus query.
     *
     * @param url Target URL.
     * @param inputMyUa User agent token to evaluate for; defaults to `*`.
     * @returns `true` when access is allowed.
     * @throws ResourcePolicyDenyError when a matching `Disallow` rule is found.
     * @throws AssertionFailureError when robots.txt could not be loaded for a reason
     *         other than a non-OK response from the site.
     */
    @Threaded()
    async assertAccessAllowed(url: URL, inputMyUa = '*') {
        let robotTxt: string;
        try {
            robotTxt = await this.getCachedRobotTxt(url.origin);
        } catch (err) {
            if (err instanceof DownstreamServiceFailureError) {
                // Remote server is reachable but cannot provide a robots.txt; this is treated as public access
                return true;
            }
            throw new AssertionFailureError(`Failed to load robots.txt from ${url.origin}: ${err}`);
        }

        const myUa = inputMyUa.toLowerCase();

        // Rules are matched against path + query. When the URL has no query, the bare path
        // is tried first and `path?` is kept as a second candidate so that rules written
        // as `/path?` keep matching as they did before query strings were considered.
        const targets = url.search
            ? [`${url.pathname}${url.search}`]
            : [url.pathname, `${url.pathname}?`];

        let groupApplies = true;    // directives before the first User-agent line apply to us
        let inUaHeader = false;     // true while reading consecutive User-agent lines
        let uaLine = 'User-Agent: *';

        for (const line of robotTxt.split(/\r?\n/g)) {
            const hashAt = line.indexOf('#');
            const content = (hashAt === -1 ? line : line.slice(0, hashAt)).trim();
            if (!content) {
                continue;
            }

            const sep = content.indexOf(':');
            const key = (sep === -1 ? content : content.slice(0, sep)).trim().toLowerCase();
            const value = sep === -1 ? '' : content.slice(sep + 1).trim();

            if (key === 'user-agent') {
                if (!inUaHeader) {
                    // First User-agent line after rules: a new group starts here.
                    inUaHeader = true;
                    groupApplies = false;
                }
                const agent = value.toLowerCase();
                if (agent === '*' || agent === myUa) {
                    groupApplies = true;
                    uaLine = line;
                }
                continue;
            }

            if (key !== 'disallow' && key !== 'allow') {
                // Other directives (e.g. Sitemap) neither carry rules nor end a group header.
                continue;
            }
            inUaHeader = false;

            if (!groupApplies) {
                continue;
            }
            if (!value) {
                return true;
            }
            if (!targets.some((target) => this.matchesRulePattern(value, target))) {
                continue;
            }
            if (key === 'allow') {
                return true;
            }

            throw new ResourcePolicyDenyError(`Access to ${url.href} is disallowed by site robots.txt: For ${uaLine}, ${line}`);
        }

        return true;
    }

}