File size: 5,961 Bytes
f316cce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
"use strict";
/**
 * Decorator-application helper (reuses an existing `this.__decorate` when one is present).
 *
 * The number of arguments actually passed selects the mode:
 *   - fewer than 3: class decoration — each decorator receives the current result;
 *   - exactly 3:    decorators are called with (target, key);
 *   - more than 3:  decorators are called with (target, key, descriptor) and the final
 *                   truthy result is installed on `target` via Object.defineProperty.
 * Decorators run from the last array entry to the first; falsy entries are skipped.
 * If `Reflect.decorate` exists, the whole job is delegated to it instead.
 */
var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) {
    const argCount = arguments.length;
    let result;
    if (argCount < 3) {
        result = target;
    }
    else {
        // An explicit `null` descriptor means "look up the current one on the target".
        if (desc === null) {
            desc = Object.getOwnPropertyDescriptor(target, key);
        }
        result = desc;
    }
    const canDelegate = typeof Reflect === "object" && typeof Reflect.decorate === "function";
    if (canDelegate) {
        result = Reflect.decorate(decorators, target, key, desc);
    }
    else {
        for (let idx = decorators.length - 1; idx >= 0; idx--) {
            const decorator = decorators[idx];
            if (!decorator) {
                continue;
            }
            let produced;
            if (argCount < 3) {
                produced = decorator(result);
            }
            else if (argCount > 3) {
                produced = decorator(target, key, result);
            }
            else {
                produced = decorator(target, key);
            }
            // A decorator that returns nothing leaves the previous result in place.
            result = produced || result;
        }
    }
    if (argCount > 3 && result) {
        Object.defineProperty(target, key, result);
    }
    return result;
};
/**
 * Metadata helper (reuses an existing `this.__metadata` when one is present).
 * Returns `Reflect.metadata(k, v)` when that function exists; otherwise returns
 * undefined, which `__decorate` skips as a falsy decorator entry.
 */
var __metadata = (this && this.__metadata) || function (k, v) {
    const reflectMetadataAvailable = typeof Reflect === "object" && typeof Reflect.metadata === "function";
    return reflectMetadataAvailable ? Reflect.metadata(k, v) : undefined;
};
// Temporary used by the `design:paramtypes` metadata expression near the end of the file.
var _a;
// Flag the CommonJS exports object as coming from an ES module.
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare the exported names; their real values are assigned further down.
exports.RobotsTxtService = exports.md5Hasher = void 0;
const tsyringe_1 = require("tsyringe");
const url_1 = require("url");
const civ_rpc_1 = require("civkit/civ-rpc");
const async_service_1 = require("civkit/async-service");
const hash_1 = require("civkit/hash");
const lang_1 = require("civkit/lang");
const logger_1 = require("./logger");
const firebase_storage_bucket_1 = require("../shared/services/firebase-storage-bucket");
const threaded_1 = require("../services/threaded");
// Shared hasher constructed with 'md5' and 'hex'; getCachedRobotTxt uses it to turn a
// site origin into a storage key. Presumably it yields hex-encoded MD5 digests —
// confirm against civkit/hash.
exports.md5Hasher = new hash_1.HashManager('md5', 'hex');
/**
 * Test a robots.txt path pattern against a URL path (optionally including its query string).
 *
 * Supported syntax: `*` matches any run of characters (including none) and a
 * trailing `$` anchors the pattern to the end of the target. Without `$` the
 * pattern only has to match a prefix of the target. A `$` anywhere else is literal.
 *
 * Implemented with plain string scanning instead of a dynamically built RegExp
 * because patterns come from untrusted remote files; taking the leftmost
 * occurrence of each segment is sufficient when `*` is the only wildcard.
 *
 * @param {string} pattern - Non-empty value of an Allow/Disallow line.
 * @param {string} target - Path (plus query) to test.
 * @returns {boolean} Whether the pattern matches the target.
 */
function robotsPatternMatches(pattern, target) {
    const anchored = pattern.endsWith('$');
    const segments = (anchored ? pattern.slice(0, -1) : pattern).split('*');
    const head = segments[0];
    if (!target.startsWith(head)) {
        return false;
    }
    if (segments.length === 1) {
        // No wildcard: plain prefix match, or exact match when anchored.
        return !anchored || target.length === head.length;
    }
    let cursor = head.length;
    const lastIndex = segments.length - 1;
    for (let i = 1; i < lastIndex; i++) {
        const found = target.indexOf(segments[i], cursor);
        if (found === -1) {
            return false;
        }
        cursor = found + segments[i].length;
    }
    const tail = segments[lastIndex];
    if (!anchored) {
        return target.indexOf(tail, cursor) !== -1;
    }
    // Anchored: the tail must sit at the very end without overlapping what was already consumed.
    return target.length - tail.length >= cursor && target.endsWith(tail);
}
/**
 * Fetches, caches and evaluates robots.txt files to decide whether a URL may be accessed.
 */
let RobotsTxtService = class RobotsTxtService extends async_service_1.AsyncService {
    /**
     * @param globalLogger - Root logger; a child logger tagged with this service's name is derived from it.
     * @param firebaseStorageBucketControl - Storage bucket client used as the robots.txt cache.
     */
    constructor(globalLogger, firebaseStorageBucketControl) {
        super(...arguments);
        this.globalLogger = globalLogger;
        this.firebaseStorageBucketControl = firebaseStorageBucketControl;
        this.logger = this.globalLogger.child({ service: this.constructor.name });
    }
    /**
     * Wait for dependencies, then announce readiness by emitting 'ready'.
     */
    async init() {
        await this.dependencyReady();
        this.emit('ready');
    }
    /**
     * Return the robots.txt body for an origin, using the storage bucket as a cache.
     *
     * A cached copy is returned as-is whenever one can be downloaded; any download
     * failure is treated as a cache miss. On a miss the file is fetched from the
     * origin with a 5 second timeout and written back to the cache in the
     * background (a failed write is only logged).
     *
     * @param {string} origin - Site origin, e.g. `https://example.com`.
     * @returns {Promise<string>} The robots.txt content.
     * @throws DownstreamServiceFailureError when the origin answers with a non-OK status.
     *         Errors raised by `fetch` itself (network failure, timeout) propagate unchanged.
     */
    async getCachedRobotTxt(origin) {
        const digest = exports.md5Hasher.hash(origin.toLowerCase());
        const cacheLoc = `robots-txt/${digest}`;
        let buff;
        // Deliberate best-effort: a failed cache read simply falls through to a live fetch.
        buff = await this.firebaseStorageBucketControl.downloadFile(cacheLoc).catch(() => undefined);
        if (buff) {
            return buff.toString();
        }
        const r = await fetch(new url_1.URL('robots.txt', origin).href, { signal: AbortSignal.timeout(5000) });
        if (!r.ok) {
            throw new civ_rpc_1.DownstreamServiceFailureError(`Failed to fetch robots.txt from ${origin}: ${r.status} ${r.statusText}`);
        }
        buff = Buffer.from(await r.arrayBuffer());
        // Not awaited on purpose: caching must not delay or fail the caller.
        this.firebaseStorageBucketControl.saveFile(cacheLoc, buff, {
            contentType: 'text/plain'
        }).catch((err) => {
            this.logger.warn(`Failed to save robots.txt to cache: ${err}`, { err: (0, lang_1.marshalErrorLike)(err) });
        });
        return buff.toString();
    }
    /**
     * Check a URL against its site's robots.txt.
     *
     * Parsing rules:
     *   - `#` starts a comment anywhere on a line.
     *   - Consecutive `User-agent` lines form one group; the group applies when any
     *     of them is `*` or equals the given user agent (case-insensitive).
     *     Only Allow/Disallow lines close a group header.
     *   - Rules that appear before any `User-agent` line apply to every non-empty user agent.
     *   - Allow/Disallow values support `*` wildcards and a trailing `$` anchor and
     *     are matched against path + query string.
     *   - Rules are evaluated in file order and the first applicable match decides.
     *   - An empty Allow/Disallow value in an applicable group allows access immediately.
     *
     * @param {URL} url - The URL about to be accessed.
     * @param {string} [inputMyUa='*'] - User-agent token to evaluate the rules for.
     * @returns {Promise<boolean>} Resolves to `true` when access is allowed.
     * @throws ResourcePolicyDenyError when a matching Disallow rule is hit.
     * @throws AssertionFailureError when robots.txt could not be loaded for a reason
     *         other than a non-OK HTTP response.
     */
    async assertAccessAllowed(url, inputMyUa = '*') {
        let robotTxt = '';
        try {
            robotTxt = await this.getCachedRobotTxt(url.origin);
        }
        catch (err) {
            if (err instanceof civ_rpc_1.DownstreamServiceFailureError) {
                // Remote server is reachable but cannot provide a robots.txt; this is treated as public access
                return true;
            }
            throw new civ_rpc_1.AssertionFailureError(`Failed to load robots.txt from ${url.origin}: ${err}`);
        }
        const myUa = inputMyUa.toLowerCase();
        const lines = robotTxt.split(/\r?\n/g);
        const actualTarget = `${url.pathname}${url.search}`;
        // Without a query string, also test `<path>?` so that rules written as
        // `/path?` keep matching the bare path, as they always have here.
        const targets = url.search ? [actualTarget] : [actualTarget, `${url.pathname}?`];
        // Rules ahead of the first User-agent line apply unless the caller passed an empty agent.
        let groupApplies = myUa !== '';
        // True while the most recent directive was a User-agent line (i.e. inside a group header).
        let inUaHeader = false;
        let uaLine = 'User-Agent: *';
        for (const line of lines) {
            const content = line.split('#', 1)[0].trim();
            if (!content) {
                continue;
            }
            const [k, ...rest] = content.split(':');
            const key = k.trim().toLowerCase();
            const value = rest.join(':').trim();
            if (key === 'user-agent') {
                const matchesMe = value === '*' || value.toLowerCase() === myUa;
                // Stacked User-agent lines extend the current group; otherwise a new group starts.
                groupApplies = inUaHeader ? (groupApplies || matchesMe) : matchesMe;
                if (matchesMe) {
                    uaLine = line;
                }
                inUaHeader = true;
                continue;
            }
            if (key !== 'allow' && key !== 'disallow') {
                // Other directives (Sitemap, Crawl-delay, ...) neither grant nor deny access.
                continue;
            }
            inUaHeader = false;
            if (!groupApplies) {
                continue;
            }
            if (!value) {
                return true;
            }
            if (!targets.some((target) => robotsPatternMatches(value, target))) {
                continue;
            }
            if (key === 'allow') {
                return true;
            }
            throw new civ_rpc_1.ResourcePolicyDenyError(`Access to ${url.href} is disallowed by site robots.txt: For ${uaLine}, ${line}`);
        }
        return true;
    }
};
// Export the class before decorators are applied.
exports.RobotsTxtService = RobotsTxtService;
// Method-level decoration of `assertAccessAllowed`: the Threaded() decorator plus
// design-time type metadata (parameter types URL and Object, return type Promise).
// NOTE(review): Threaded presumably off-loads the call to a worker thread — confirm in ../services/threaded.
__decorate([
    (0, threaded_1.Threaded)(),
    __metadata("design:type", Function),
    // Falls back to Object if `URL` from the "url" module is not a function at load time.
    __metadata("design:paramtypes", [typeof (_a = typeof url_1.URL !== "undefined" && url_1.URL) === "function" ? _a : Object, Object]),
    __metadata("design:returntype", Promise)
], RobotsTxtService.prototype, "assertAccessAllowed", null);
// Class-level decoration: tsyringe's singleton() plus constructor parameter types.
// Both the local binding and the export are re-assigned to whatever __decorate returns.
exports.RobotsTxtService = RobotsTxtService = __decorate([
    (0, tsyringe_1.singleton)(),
    __metadata("design:paramtypes", [logger_1.GlobalLogger,
        firebase_storage_bucket_1.FirebaseStorageBucketControl])
], RobotsTxtService);
//# sourceMappingURL=robots-text.js.map