web_reader / build /services /serp /google.js
Mohammad Shahid
Include pre-built files for HF deployment
f316cce
"use strict";
// Decorator helper (has the shape TypeScript emits); an existing `this.__decorate` is reused when present.
// Applies `decorators` either to `target` itself (fewer than 3 arguments) or to the member `key` of `target`,
// and returns the resulting value/descriptor `r`.
var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) {
// c: number of arguments actually passed. r starts as `target` when c < 3; otherwise it is `desc`,
// which is first replaced by the target's own property descriptor for `key` when `desc` is exactly null.
var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d;
// Delegate to Reflect.decorate when the runtime exposes one.
if (typeof Reflect === "object" && typeof Reflect.decorate === "function") r = Reflect.decorate(decorators, target, key, desc);
// Otherwise walk the decorators from last to first, skipping falsy entries. A falsy return keeps the previous `r`.
// Call shape by arity: d(r) when c < 3, d(target, key, r) when c > 3, d(target, key) when c === 3.
else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r;
// With more than 3 arguments and a truthy `r`, define `key` on `target` from `r`; `r` is the return value in every case.
return c > 3 && r && Object.defineProperty(target, key, r), r;
};
// Metadata helper: forwards to Reflect.metadata when the runtime provides it and
// yields undefined otherwise. An existing `this.__metadata` is reused when present.
var __metadata = (this && this.__metadata) || function (k, v) {
    const reflectAvailable = typeof Reflect === "object";
    if (!reflectAvailable || typeof Reflect.metadata !== "function") {
        return undefined;
    }
    return Reflect.metadata(k, v);
};
// Interop helper: a module flagged with `__esModule` is returned untouched; anything
// else is wrapped so that it is reachable under the `default` key.
// An existing `this.__importDefault` is reused when present.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
var _a, _b;
Object.defineProperty(exports, "__esModule", { value: true });
exports.GoogleSERP = void 0;
const tsyringe_1 = require("tsyringe");
const async_service_1 = require("civkit/async-service");
const logger_1 = require("../logger");
const jsdom_1 = require("../jsdom");
const worker_threads_1 = require("worker_threads");
const lodash_1 = __importDefault(require("lodash"));
const puppeteer_1 = require("./puppeteer");
const curl_1 = require("../curl");
const promises_1 = require("fs/promises");
const civ_rpc_1 = require("civkit/civ-rpc");
const errors_1 = require("../errors");
const vectorize_1 = require("civkit/vectorize");
const decorators_1 = require("civkit/decorators");
const proxy_provider_1 = require("../../shared/services/proxy-provider");
/**
 * Google search-results service.
 *
 * Builds search URLs for the configured Google domain, pre-fetches them through
 * the curl control (through an allocated proxy unless told otherwise) and then
 * either scrapes the page via the puppeteer control (web / news) or parses the
 * fetched JSON payload directly (images).
 */
let GoogleSERP = class GoogleSERP extends async_service_1.AsyncService {
    /**
     * @param globalLogger     GlobalLogger; a child logger tagged with this service's name is derived from it.
     * @param puppeteerControl SERPSpecializedPuppeteerControl used by webSearch / newsSearch.
     * @param jsDomControl     JSDomControl (stored on the instance; not used by the methods in this class body).
     * @param curlControl      CurlControl used to side-load search URLs.
     * @param proxyProvider    ProxyProviderService used to allocate a proxy per side-load.
     */
    constructor(globalLogger, puppeteerControl, jsDomControl, curlControl, proxyProvider) {
        // Off the main thread the puppeteer control is left out of the list handed to the
        // base class; it is still stored on the instance below.
        const filteredDeps = worker_threads_1.isMainThread ? arguments : lodash_1.default.without(arguments, puppeteerControl);
        super(...filteredDeps);
        this.globalLogger = globalLogger;
        this.puppeteerControl = puppeteerControl;
        this.jsDomControl = jsDomControl;
        this.curlControl = curlControl;
        this.proxyProvider = proxyProvider;
        this.logger = this.globalLogger.child({ service: this.constructor.name });
        // OVERRIDE_GOOGLE_DOMAIN lets a deployment point at a different Google host.
        this.googleDomain = process.env.OVERRIDE_GOOGLE_DOMAIN || 'www.google.com';
    }
    /** Waits for the dependencies to be ready, then emits 'ready'. */
    async init() {
        await this.dependencyReady();
        this.emit('ready');
    }
    /**
     * Side-loads `url` through the curl control.
     *
     * With `opts.allocProxy === 'none'` the request is made without allocating a proxy
     * and the curl result is returned as-is. Otherwise a proxy is allocated (country from
     * PREFERRED_PROXY_COUNTRY, falling back to 'auto') and used for the request.
     *
     * Side effect: when `opts.allocProxy` is truthy, `opts.proxyUrl` is filled in with the
     * allocated proxy if the caller did not already provide one.
     *
     * @param url  URL to fetch.
     * @param opts Optional scraping options; may be mutated as described above.
     * @returns The curl side-load result, extended with `proxy` on the proxied path.
     * @throws ServiceBadAttemptError when the proxied request answers with HTTP 429.
     */
    async sideLoadWithAllocatedProxy(url, opts) {
        if (opts?.allocProxy === 'none') {
            return this.curlControl.sideLoad(url, opts);
        }
        const proxy = await this.proxyProvider.alloc(process.env.PREFERRED_PROXY_COUNTRY || 'auto');
        this.logger.debug(`Proxy allocated`, { proxy: proxy.href });
        const r = await this.curlControl.sideLoad(url, {
            ...opts,
            proxyUrl: proxy.href,
        });
        if (r.status === 429) {
            throw new errors_1.ServiceBadAttemptError('Google returned a 429 error. This may happen due to various reasons, including rate limiting or other issues.');
        }
        if (opts && opts.allocProxy) {
            opts.proxyUrl ??= proxy.href;
        }
        return { ...r, proxy };
    }
    /**
     * Turns a query object into a `https://<googleDomain>/search` URL.
     *
     * - `page` (1-based) is converted into a `start` offset of `(page - 1) * num`, with `num`
     *   defaulting to 10. The first page (or any page that does not yield a positive, finite
     *   offset) produces no `start` parameter. A `page` that cannot be parsed as an integer is
     *   dropped and any caller-supplied `start` is left alone.
     * - A truthy `location` is removed.
     * - Remaining entries whose value is null or undefined are skipped; all others are
     *   stringified into query parameters.
     *
     * The input object is not modified.
     *
     * @param query Key/value search parameters.
     * @returns The assembled URL.
     */
    digestQuery(query) {
        const url = new URL(`https://${this.googleDomain}/search`);
        const clone = { ...query };
        const num = clone.num || 10;
        if (clone.page !== undefined && clone.page !== null) {
            // Explicit radix; never forward `page` itself to Google.
            const page = Number.parseInt(clone.page, 10);
            delete clone.page;
            if (!Number.isNaN(page)) {
                const start = (page - 1) * num;
                if (Number.isFinite(start) && start > 0) {
                    clone.start = start;
                }
                else {
                    // First page (or a nonsensical offset): let Google use its default.
                    delete clone.start;
                }
            }
        }
        if (clone.location) {
            delete clone.location;
        }
        for (const [k, v] of Object.entries(clone)) {
            if (v === undefined || v === null) {
                continue;
            }
            url.searchParams.set(k, `${v}`);
        }
        return url;
    }
    /**
     * Runs a web search: side-loads the search URL, then scrapes it with the
     * puppeteer control using `getWebSearchResults` as the in-page extractor.
     *
     * Side effect: when `opts` is given and the side-load produced `sideLoadOpts`,
     * they are stored on `opts.sideLoad` before scraping.
     *
     * @param query Search parameters, see digestQuery.
     * @param opts  Optional scraping options; may be mutated.
     * @returns Whatever `controlledScrap` resolves to.
     */
    async webSearch(query, opts) {
        const url = this.digestQuery(query);
        const sideLoaded = await this.sideLoadWithAllocatedProxy(url, opts);
        if (opts && sideLoaded.sideLoadOpts) {
            opts.sideLoad = sideLoaded.sideLoadOpts;
        }
        const snapshot = await this.puppeteerControl.controlledScrap(url, getWebSearchResults, opts);
        return snapshot;
    }
    /**
     * Runs a news search (`tbm=nws`): same flow as webSearch, but with
     * `getNewsSearchResults` as the in-page extractor.
     *
     * @param query Search parameters, see digestQuery.
     * @param opts  Optional scraping options; may be mutated (`opts.sideLoad`).
     * @returns Whatever `controlledScrap` resolves to.
     */
    async newsSearch(query, opts) {
        const url = this.digestQuery(query);
        url.searchParams.set('tbm', 'nws');
        const sideLoaded = await this.sideLoadWithAllocatedProxy(url, opts);
        if (opts && sideLoaded.sideLoadOpts) {
            opts.sideLoad = sideLoaded.sideLoadOpts;
        }
        const snapshot = await this.puppeteerControl.controlledScrap(url, getNewsSearchResults, opts);
        return snapshot;
    }
    /**
     * Runs an image search by requesting Google's JSON image payload and mapping
     * each `ischj.metadata` entry to a flat result object. No browser is involved.
     *
     * @param query Search parameters, see digestQuery.
     * @param opts  Optional scraping options forwarded to the side-load.
     * @returns Array of `{ link, title, snippet, source, imageWidth, imageHeight, imageUrl, variant: 'images' }`;
     *          empty when the payload carries no metadata list.
     * @throws ServiceBadAttemptError when the response is not a 200 with a file, or
     *         when the body does not contain the expected `{"ischj":` payload.
     */
    async imageSearch(query, opts) {
        const url = this.digestQuery(query);
        url.searchParams.set('tbm', 'isch');
        url.searchParams.set('asearch', 'isch');
        // NOTE(review): the `ijn` index is derived from `query.start` only; a caller that
        // paginates via `page` gets `ijn:0` here even though the URL carries `start`. Confirm intent.
        url.searchParams.set('async', `_fmt:json,p:1,ijn:${query.start ? Math.floor(query.start / (query.num || 10)) : 0}`);
        const sideLoaded = await this.sideLoadWithAllocatedProxy(url, opts);
        if (sideLoaded.status !== 200 || !sideLoaded.file) {
            throw new errors_1.ServiceBadAttemptError('Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.');
        }
        const jsonTxt = (await (0, promises_1.readFile)((await sideLoaded.file.filePath))).toString();
        // The JSON object is preceded by other text; locate where it begins.
        const payloadStart = jsonTxt.indexOf('{"ischj":');
        if (payloadStart < 0) {
            // Without the marker, slicing from -1 would hand the parser the last character only.
            throw new errors_1.ServiceBadAttemptError('Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.');
        }
        const rJSON = (0, vectorize_1.parseJSONText)(jsonTxt.slice(payloadStart));
        const metadata = lodash_1.default.get(rJSON, 'ischj.metadata');
        // A payload without a metadata list means "no results", not a crash.
        const entries = Array.isArray(metadata) ? metadata : [];
        return entries.map((x) => {
            return {
                link: lodash_1.default.get(x, 'result.referrer_url'),
                title: lodash_1.default.get(x, 'result.page_title'),
                snippet: lodash_1.default.get(x, 'text_in_grid.snippet'),
                source: lodash_1.default.get(x, 'result.site_title'),
                imageWidth: lodash_1.default.get(x, 'original_image.width'),
                imageHeight: lodash_1.default.get(x, 'original_image.height'),
                imageUrl: lodash_1.default.get(x, 'original_image.url'),
                variant: 'images',
            };
        });
    }
};
// Export the class as declared; the export is reassigned further down once the
// class-level decorators have been applied.
exports.GoogleSERP = GoogleSERP;
// Method-level decoration of GoogleSERP.prototype.sideLoadWithAllocatedProxy.
__decorate([
// retryWith is given a predicate over the thrown error plus the number 3
// (presumably the attempt limit — confirm against civkit/decorators).
// The predicate answers false for ServiceBadApproachError, true for ServiceBadAttemptError,
// false for any other ApplicationError, and undefined for everything else.
(0, decorators_1.retryWith)((err) => {
if (err instanceof errors_1.ServiceBadApproachError) {
return false;
}
if (err instanceof errors_1.ServiceBadAttemptError) {
// Keep trying
return true;
}
if (err instanceof civ_rpc_1.ApplicationError) {
// Quit with this error
return false;
}
return undefined;
}, 3),
__metadata("design:type", Function),
// Parameter types recorded as (URL when available, otherwise Object) and Object.
__metadata("design:paramtypes", [typeof (_b = typeof URL !== "undefined" && URL) === "function" ? _b : Object, Object]),
__metadata("design:returntype", Promise)
], GoogleSERP.prototype, "sideLoadWithAllocatedProxy", null);
// Class-level decoration: tsyringe singleton plus the constructor parameter types.
// Both the local binding and the export are replaced by the decorated result.
exports.GoogleSERP = GoogleSERP = __decorate([
(0, tsyringe_1.singleton)(),
__metadata("design:paramtypes", [logger_1.GlobalLogger,
puppeteer_1.SERPSpecializedPuppeteerControl,
jsdom_1.JSDomControl,
curl_1.CurlControl, typeof (_a = typeof proxy_provider_1.ProxyProviderService !== "undefined" && proxy_provider_1.ProxyProviderService) === "function" ? _a : Object])
], GoogleSERP);
/**
 * In-page extractor for regular Google web results. It reads the `location`,
 * `window` and `document` globals and is handed to `controlledScrap` as a
 * callback, so it is kept fully self-contained.
 *
 * @returns {Promise<Array<object>|undefined>} One `{ link, title, source, date, snippet,
 *          imageUrl, siteLinks, variant: 'web' }` object per usable result, or undefined
 *          when the results container or its echoed query is missing.
 * @throws {Error} When the current path starts with `/sorry` or `/error`.
 */
async function getWebSearchResults() {
    const RESULT_ROOT = 'div[data-async-context^="query"]';
    const { pathname } = location;
    const onBlockedPage = ['/sorry', '/error'].some((prefix) => pathname.startsWith(prefix));
    if (onBlockedPage) {
        throw new Error('Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.');
    }
    // Continue as soon as either the results container or the alternative block shows up.
    // @ts-ignore
    await Promise.race([window.waitForSelector(RESULT_ROOT), window.waitForSelector('#botstuff .mnr-c')]);
    const resultRoot = document.querySelector(RESULT_ROOT);
    if (!resultRoot) {
        return undefined;
    }
    // The container echoes the query as "...query:<encoded>"; an empty echo means nothing to extract.
    const echoedQuery = decodeURIComponent(resultRoot.getAttribute('data-async-context')?.split('query:')[1] || '');
    if (!echoedQuery) {
        return undefined;
    }
    const results = [];
    for (const block of Array.from(resultRoot.querySelectorAll('div[lang],div[data-surl]'))) {
        const anchor = block.querySelector('a:not([href="#"])');
        if (!anchor) {
            continue;
        }
        // Anchors that wrap a heading-role div are skipped (previously sketched as a 'video' variant, never enabled).
        if (anchor.querySelector('div[role="heading"]')) {
            continue;
        }
        const firstTextSpan = Array.from(anchor.querySelectorAll('span')).find((span) => span.textContent);
        const citeText = anchor.querySelector('cite[role=text]')?.textContent;
        const snippetSpans = Array.from(block.querySelectorAll('div[data-sncf*="1"] span'));
        const lastSpan = snippetSpans[snippetSpans.length - 1];
        const penultimateSpan = snippetSpans[snippetSpans.length - 2];
        // Snippet: last span's text, else the legacy container's trimmed text, else null.
        const snippet = lastSpan?.textContent || block.querySelector('div.IsZvec')?.textContent?.trim() || null;
        // Date: the part of the cite after the middle dot, else the span just before the snippet.
        const date = citeText?.split('·')[1]?.trim() ?? penultimateSpan?.textContent?.trim();
        const imageUrl = block.querySelector('div[data-sncf*="1"] img[src]:not(img[src^="data"])')?.getAttribute('src');
        let siteLinks = Array.from(block.querySelectorAll('div[data-sncf*="3"] a[href]'), (a) => ({
            link: a.getAttribute('href'),
            title: a.textContent,
        }));
        if (!siteLinks.length) {
            // Fall back to table-style sub-links found under the nearest enclosing result group.
            const enclosing = block.parentElement?.closest('div[data-hveid]');
            const headings = enclosing ? Array.from(enclosing.querySelectorAll('td h3')) : [];
            if (headings.length) {
                const collected = [];
                for (const heading of headings) {
                    const subLink = heading.querySelector('a');
                    if (!subLink) {
                        continue;
                    }
                    collected.push({
                        link: subLink.getAttribute('href'),
                        title: subLink.textContent,
                        snippet: heading.nextElementSibling?.textContent,
                    });
                }
                siteLinks = collected;
            }
        }
        const isInlineImage = imageUrl?.startsWith('data:');
        results.push({
            link: anchor.getAttribute('href'),
            title: anchor.querySelector('h3')?.textContent,
            source: firstTextSpan?.textContent,
            date,
            snippet: snippet ?? undefined,
            imageUrl: isInlineImage ? undefined : imageUrl,
            siteLinks: siteLinks.length ? siteLinks : undefined,
            variant: 'web',
        });
    }
    return results;
}
/**
 * In-page extractor for Google news results. It reads the `location`, `window`
 * and `document` globals and is handed to `controlledScrap` as a callback, so it
 * is kept fully self-contained.
 *
 * @returns {Promise<Array<object>|undefined>} One `{ link, title, source, date, snippet,
 *          variant: 'news' }` object per usable result, or undefined when the results
 *          container or its echoed query is missing.
 * @throws {Error} When the current path starts with `/sorry` or `/error`.
 */
async function getNewsSearchResults() {
    const RESULT_ROOT = 'div[data-async-context^="query"]';
    const trimmedText = (node) => node?.textContent?.trim();
    const { pathname } = location;
    if (['/sorry', '/error'].some((prefix) => pathname.startsWith(prefix))) {
        throw new Error('Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.');
    }
    // Continue as soon as either the results container or the alternative block shows up.
    // @ts-ignore
    await Promise.race([window.waitForSelector(RESULT_ROOT), window.waitForSelector('#botstuff .mnr-c')]);
    const resultRoot = document.querySelector(RESULT_ROOT);
    if (!resultRoot) {
        return undefined;
    }
    // The container echoes the query as "...query:<encoded>"; an empty echo means nothing to extract.
    const echoedQuery = decodeURIComponent(resultRoot.getAttribute('data-async-context')?.split('query:')[1] || '');
    if (!echoedQuery) {
        return undefined;
    }
    const articles = Array.from(resultRoot.querySelectorAll('div[data-news-doc-id]'));
    return articles.flatMap((article) => {
        const anchor = article.querySelector('a:not([href="#"])');
        const heading = anchor ? anchor.querySelector('div[role="heading"]') : null;
        if (!anchor || !heading) {
            // Entries without a usable link or headline are dropped.
            return [];
        }
        // Source sits right before the headline, the snippet right after it; the date is
        // taken from the last span among the headline's siblings.
        const siblingSpans = Array.from(heading.parentElement?.querySelectorAll('span') || []);
        return [{
                link: anchor.getAttribute('href'),
                title: trimmedText(heading),
                source: trimmedText(heading.previousElementSibling),
                date: trimmedText(siblingSpans[siblingSpans.length - 1]),
                snippet: trimmedText(heading.nextElementSibling),
                variant: 'news',
            }];
    });
}
//# sourceMappingURL=google.js.map