Spaces:
Build error
Build error
File size: 5,961 Bytes
f316cce | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 | "use strict";
/**
 * TypeScript decorator helper.
 *
 * Applies `decorators` (last to first) to a class (2 arguments), a property
 * (3 arguments) or a member with a descriptor (4 arguments). Delegates to
 * `Reflect.decorate` when that function exists.
 *
 * @param {Array<Function|null|undefined>} decorators - decorators to apply; falsy entries are skipped.
 * @param {object|Function} target - the class, or the prototype/constructor owning the member.
 * @param {string|symbol} [key] - member name.
 * @param {PropertyDescriptor|null} [desc] - member descriptor; `null` means "look it up on target".
 * @returns {*} the decorated class, the resulting descriptor, or `undefined`.
 */
var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) {
    const argCount = arguments.length;
    let descriptor = desc;
    let result;
    if (argCount < 3) {
        // Class decoration: the running result is the constructor itself.
        result = target;
    }
    else {
        if (desc === null) {
            descriptor = Object.getOwnPropertyDescriptor(target, key);
        }
        result = descriptor;
    }
    const canDelegate = typeof Reflect === "object" && typeof Reflect.decorate === "function";
    if (canDelegate) {
        result = Reflect.decorate(decorators, target, key, descriptor);
    }
    else {
        for (let idx = decorators.length - 1; idx >= 0; idx--) {
            const decorator = decorators[idx];
            if (!decorator) {
                continue;
            }
            let produced;
            if (argCount < 3) {
                produced = decorator(result);
            }
            else if (argCount > 3) {
                produced = decorator(target, key, result);
            }
            else {
                produced = decorator(target, key);
            }
            // A decorator that returns nothing leaves the running result untouched.
            result = produced || result;
        }
    }
    if (argCount > 3 && result) {
        Object.defineProperty(target, key, result);
    }
    return result;
};
/**
 * TypeScript metadata helper.
 *
 * Returns `Reflect.metadata(k, v)` when `Reflect.metadata` is a function;
 * otherwise returns `undefined`.
 *
 * @param {*} k - metadata key (e.g. "design:type").
 * @param {*} v - metadata value.
 * @returns {*} whatever `Reflect.metadata` returns, or `undefined`.
 */
var __metadata = (this && this.__metadata) || function (k, v) {
    const metadataAvailable = typeof Reflect === "object" && typeof Reflect.metadata === "function";
    if (!metadataAvailable) {
        return undefined;
    }
    return Reflect.metadata(k, v);
};
// Scratch variable used further down when emitting the "design:paramtypes"
// metadata for assertAccessAllowed.
var _a;
// Mark this CommonJS module as transpiled from an ES module.
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare the exports; the real values are assigned below.
exports.RobotsTxtService = exports.md5Hasher = void 0;
const tsyringe_1 = require("tsyringe");
const url_1 = require("url");
const civ_rpc_1 = require("civkit/civ-rpc");
const async_service_1 = require("civkit/async-service");
const hash_1 = require("civkit/hash");
const lang_1 = require("civkit/lang");
const logger_1 = require("./logger");
const firebase_storage_bucket_1 = require("../shared/services/firebase-storage-bucket");
const threaded_1 = require("../services/threaded");
// HashManager constructed with 'md5' and 'hex'; RobotsTxtService uses it to
// turn a lower-cased origin into the cache object name.
exports.md5Hasher = new hash_1.HashManager('md5', 'hex');
/**
 * Fetches, caches and evaluates robots.txt files.
 *
 * robots.txt bodies are cached in the storage bucket under
 * `robots-txt/<md5 of lower-cased origin>`.
 */
let RobotsTxtService = class RobotsTxtService extends async_service_1.AsyncService {
    /**
     * @param globalLogger - logger from which a child logger tagged with this service's name is derived.
     * @param firebaseStorageBucketControl - storage bucket accessor used as the robots.txt cache.
     */
    constructor(globalLogger, firebaseStorageBucketControl) {
        super(...arguments);
        this.globalLogger = globalLogger;
        this.firebaseStorageBucketControl = firebaseStorageBucketControl;
        this.logger = this.globalLogger.child({ service: this.constructor.name });
    }
    /**
     * Waits for dependencies, then emits `ready`.
     */
    async init() {
        await this.dependencyReady();
        this.emit('ready');
    }
    /**
     * Returns the robots.txt body for `origin`, preferring the cached copy.
     *
     * @param {string} origin - URL origin, e.g. `https://example.com`.
     * @returns {Promise<string>} the robots.txt text.
     * @throws {DownstreamServiceFailureError} when the remote answers with a non-2xx status.
     *   Network errors and the 5 s timeout propagate as thrown by `fetch`.
     */
    async getCachedRobotTxt(origin) {
        const digest = exports.md5Hasher.hash(origin.toLowerCase());
        const cacheLoc = `robots-txt/${digest}`;
        // Any download failure is deliberately treated as a cache miss.
        const cached = await this.firebaseStorageBucketControl.downloadFile(cacheLoc).catch(() => undefined);
        if (cached) {
            return cached.toString();
        }
        const r = await fetch(new url_1.URL('robots.txt', origin).href, { signal: AbortSignal.timeout(5000) });
        if (!r.ok) {
            throw new civ_rpc_1.DownstreamServiceFailureError(`Failed to fetch robots.txt from ${origin}: ${r.status} ${r.statusText}`);
        }
        const fetched = Buffer.from(await r.arrayBuffer());
        // Best-effort cache write: not awaited, failures are only logged.
        this.firebaseStorageBucketControl.saveFile(cacheLoc, fetched, {
            contentType: 'text/plain'
        }).catch((err) => {
            this.logger.warn(`Failed to save robots.txt to cache: ${err}`, { err: (0, lang_1.marshalErrorLike)(err) });
        });
        return fetched.toString();
    }
    /**
     * Asserts that `url` may be accessed by user agent `inputMyUa` according
     * to the site's robots.txt.
     *
     * Rule evaluation follows RFC 9309:
     *  - consecutive `User-agent` lines form one group sharing the rules below them;
     *  - `*` matches any run of characters and a trailing `$` anchors the end;
     *  - rules are matched against path + query string;
     *  - the longest matching pattern wins, and Allow wins a tie;
     *  - rules with an empty value are ignored.
     * Groups for `User-agent: *` and groups naming the agent exactly
     * (case-insensitive) are both applied.
     *
     * @param {URL} url - the URL about to be accessed.
     * @param {string} [inputMyUa='*'] - user agent token to evaluate rules for.
     * @returns {Promise<true>} resolves to `true` when access is allowed.
     * @throws {ResourcePolicyDenyError} when a Disallow rule wins.
     * @throws {AssertionFailureError} when robots.txt could not be loaded for a
     *   reason other than a non-2xx response.
     */
    async assertAccessAllowed(url, inputMyUa = '*') {
        let robotTxt = '';
        try {
            robotTxt = await this.getCachedRobotTxt(url.origin);
        }
        catch (err) {
            if (err instanceof civ_rpc_1.DownstreamServiceFailureError) {
                // Remote server is reachable but cannot provide a robots.txt; this is treated as public access
                return true;
            }
            throw new civ_rpc_1.AssertionFailureError(`Failed to load robots.txt from ${url.origin}: ${err}`);
        }
        const myUa = inputMyUa.toLowerCase();
        const fullPath = `${url.pathname}${url.search}`;
        // Plain prefix rules are tested against a path that always carries a "?",
        // so a rule such as "/search?" keeps covering the bare "/search" path.
        const literalPath = url.search ? fullPath : `${url.pathname}?`;
        // Linear glob matcher; deliberately not a RegExp, since patterns come
        // from an untrusted remote file.
        const ruleMatches = (pattern) => {
            const anchored = pattern.endsWith('$');
            const body = anchored ? pattern.slice(0, -1) : pattern;
            const [head, ...pieces] = body.split('*');
            if (pieces.length === 0 && !anchored) {
                return literalPath.startsWith(head);
            }
            if (!fullPath.startsWith(head)) {
                return false;
            }
            if (pieces.length === 0) {
                // "literal$": must be the whole path.
                return fullPath.length === head.length;
            }
            const tail = anchored ? pieces.pop() : undefined;
            let cursor = head.length;
            for (const piece of pieces) {
                const at = fullPath.indexOf(piece, cursor);
                if (at === -1) {
                    return false;
                }
                cursor = at + piece.length;
            }
            if (tail === undefined) {
                return true;
            }
            return fullPath.length - tail.length >= cursor && fullPath.endsWith(tail);
        };
        // Rules that precede any User-agent line apply to every non-empty agent.
        let groupApplies = myUa !== '';
        // True while consecutive User-agent lines of one group are being read.
        let readingAgents = false;
        let uaLine = 'User-Agent: *';
        let allowedLength = -1;
        let denial;
        for (const line of robotTxt.split(/\r?\n/g)) {
            const hashAt = line.indexOf('#');
            const content = (hashAt === -1 ? line : line.slice(0, hashAt)).trim();
            if (!content) {
                continue;
            }
            const colonAt = content.indexOf(':');
            if (colonAt === -1) {
                continue;
            }
            const key = content.slice(0, colonAt).trim().toLowerCase();
            const value = content.slice(colonAt + 1).trim();
            if (key === 'user-agent') {
                if (!readingAgents) {
                    // First agent line after rules: a new group begins.
                    groupApplies = false;
                    readingAgents = true;
                }
                if (value === '*' || value.toLowerCase() === myUa) {
                    groupApplies = true;
                    uaLine = line;
                }
                continue;
            }
            if (key !== 'allow' && key !== 'disallow') {
                // Other records (Sitemap, Crawl-delay, ...) do not affect grouping.
                continue;
            }
            readingAgents = false;
            if (!groupApplies || !value || !ruleMatches(value)) {
                continue;
            }
            if (key === 'allow') {
                allowedLength = Math.max(allowedLength, value.length);
            }
            else if (!denial || value.length > denial.length) {
                denial = { length: value.length, uaLine, line };
            }
        }
        if (denial && denial.length > allowedLength) {
            throw new civ_rpc_1.ResourcePolicyDenyError(`Access to ${url.href} is disallowed by site robots.txt: For ${denial.uaLine}, ${denial.line}`);
        }
        return true;
    }
};
// Export the undecorated class first; it is re-exported after class decoration below.
exports.RobotsTxtService = RobotsTxtService;
// Method decoration: apply Threaded() to RobotsTxtService.prototype.assertAccessAllowed,
// together with the emitted design-time type metadata (url.URL when it is a
// function, otherwise Object, for the first parameter; Object for the second;
// Promise return type). Each __metadata(...) entry is undefined when
// Reflect.metadata is not a function, and __decorate skips falsy entries.
__decorate([
(0, threaded_1.Threaded)(),
__metadata("design:type", Function),
__metadata("design:paramtypes", [typeof (_a = typeof url_1.URL !== "undefined" && url_1.URL) === "function" ? _a : Object, Object]),
__metadata("design:returntype", Promise)
], RobotsTxtService.prototype, "assertAccessAllowed", null);
// Class decoration: apply tsyringe's singleton() and record the constructor
// parameter types (GlobalLogger, FirebaseStorageBucketControl), then
// re-export whatever the decorators return.
exports.RobotsTxtService = RobotsTxtService = __decorate([
(0, tsyringe_1.singleton)(),
__metadata("design:paramtypes", [logger_1.GlobalLogger,
firebase_storage_bucket_1.FirebaseStorageBucketControl])
], RobotsTxtService);
//# sourceMappingURL=robots-text.js.map |