Spaces:
Build error
Build error
File size: 4,400 Bytes
23a3b80 66db317 0da71ca 23a3b80 66db317 23a3b80 66db317 23a3b80 0d6cf2b 23a3b80 0da71ca 23a3b80 0da71ca 23a3b80 0da71ca 23a3b80 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 | import { singleton } from 'tsyringe';
import { URL } from 'url';
import { AssertionFailureError, DownstreamServiceFailureError, ResourcePolicyDenyError } from 'civkit/civ-rpc';
import { AsyncService } from 'civkit/async-service';
import { HashManager } from 'civkit/hash';
import { marshalErrorLike } from 'civkit/lang';
import { GlobalLogger } from './logger';
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
import { Threaded } from '../services/threaded';
export const md5Hasher = new HashManager('md5', 'hex');
@singleton()
export class RobotsTxtService extends AsyncService {
logger = this.globalLogger.child({ service: this.constructor.name });
constructor(
protected globalLogger: GlobalLogger,
protected firebaseStorageBucketControl: FirebaseStorageBucketControl,
) {
super(...arguments);
}
override async init() {
await this.dependencyReady();
this.emit('ready');
}
async getCachedRobotTxt(origin: string) {
const digest = md5Hasher.hash(origin.toLowerCase());
const cacheLoc = `robots-txt/${digest}`;
let buff;
buff = await this.firebaseStorageBucketControl.downloadFile(cacheLoc).catch(() => undefined);
if (buff) {
return buff.toString();
}
const r = await fetch(new URL('robots.txt', origin).href, { signal: AbortSignal.timeout(5000) });
if (!r.ok) {
throw new DownstreamServiceFailureError(`Failed to fetch robots.txt from ${origin}: ${r.status} ${r.statusText}`);
}
buff = Buffer.from(await r.arrayBuffer());
this.firebaseStorageBucketControl.saveFile(cacheLoc, buff, {
contentType: 'text/plain'
}).catch((err) => {
this.logger.warn(`Failed to save robots.txt to cache: ${err}`, { err: marshalErrorLike(err) });
});
return buff.toString();
}
@Threaded()
async assertAccessAllowed(url: URL, inputMyUa = '*') {
let robotTxt: string = '';
try {
robotTxt = await this.getCachedRobotTxt(url.origin);
} catch (err) {
if (err instanceof DownstreamServiceFailureError) {
// Remote server is reachable but cannot provide a robot.txt; this is treated as public access
return true;
}
throw new AssertionFailureError(`Failed to load robots.txt from ${url.origin}: ${err}`);
}
const myUa = inputMyUa.toLowerCase();
const lines = robotTxt.split(/\r?\n/g);
let currentUa = myUa || '*';
let uaLine = 'User-Agent: *';
const pathNormalized = `${url.pathname}?`;
for (const line of lines) {
const trimmed = line.trim();
if (trimmed.startsWith('#') || !trimmed) {
continue;
}
const [k, ...rest] = trimmed.split(':');
const key = k.trim().toLowerCase();
const value = rest.join(':').trim();
if (key === 'user-agent') {
currentUa = value.toLowerCase();
if (value === '*') {
currentUa = myUa;
}
uaLine = line;
continue;
}
if (currentUa !== myUa) {
continue;
}
if (key === 'disallow') {
if (!value) {
return true;
}
if (value.includes('*')) {
const [head, tail] = value.split('*');
if (url.pathname.startsWith(head) && url.pathname.endsWith(tail)) {
throw new ResourcePolicyDenyError(`Access to ${url.href} is disallowed by site robots.txt: For ${uaLine}, ${line}`);
}
} else if (pathNormalized.startsWith(value)) {
throw new ResourcePolicyDenyError(`Access to ${url.href} is disallowed by site robots.txt: For ${uaLine}, ${line}`);
}
continue;
}
if (key === 'allow') {
if (!value) {
return true;
}
if (pathNormalized.startsWith(value)) {
return true;
}
continue;
}
}
return true;
}
}
|