| | import { EventEmitter } from 'events' |
| | import Bottleneck from 'bottleneck' |
| | import { fetchWithRetry } from '@/frame/lib/fetch-utils' |
| | import cheerio from 'cheerio' |
| |
|
| | import type { Permalink } from '@/search/scripts/scrape/types' |
| |
|
| | |
/** Minimal snapshot of the failed response carried by an {@link HTTPError}. */
type HTTPErrorResponse = { ok: boolean; statusCode?: number }

/** Minimal description of the request that produced an {@link HTTPError}. */
type HTTPErrorRequest = { requestUrl?: { pathname?: string } }

/**
 * Error raised for a non-OK HTTP response.
 *
 * Carries a small `response` / `request` pair so consumers can read the
 * status code and the requested path from the error object.
 */
class HTTPError extends Error {
  response: HTTPErrorResponse
  request: HTTPErrorRequest

  /**
   * @param message  Human-readable description of the failure.
   * @param response Outcome of the request (`ok` flag and optional status code).
   * @param request  What was requested (optional URL pathname).
   */
  constructor(message: string, response: HTTPErrorResponse, request: HTTPErrorRequest) {
    super(message)
    // Override the default "Error" name so the error is identifiable by name.
    this.name = 'HTTPError'
    this.response = response
    this.request = request
  }
}
| |
|
| | |
/** Page location fields that get attached to errors before they are emitted. */
type PageLocation = { url?: string; relativePath?: string }

/** An `HTTPError` annotated with the page it was raised for. */
type HTTPErrorWithUrl = HTTPError & PageLocation

/** Any `Error` annotated with the page it was raised for. */
type ErrorWithUrl = Error & PageLocation

/** Options accepted by `domwaiter`. */
interface DomWaiterOptions {
  /**
   * When truthy (and `json` is not set), the fetched body is loaded with
   * cheerio and attached to the emitted page as `$`. Defaults to `true`.
   */
  parseDOM?: boolean
  /**
   * When truthy, the response is parsed as JSON and attached to the emitted
   * page as `json` instead of `body`. Defaults to `false`.
   */
  json?: boolean
  /** Forwarded to the Bottleneck limiter. Defaults to `5`. */
  maxConcurrent?: number
  /** Forwarded to the Bottleneck limiter. Defaults to `500`. */
  minTime?: number
}
| |
|
/**
 * Fetches every page in `pages` through a rate-limited queue and reports
 * progress on the returned emitter.
 *
 * Events emitted on the returned emitter:
 * - `beforePageLoad` (page): just before a page is requested.
 * - `page` (copy of the page with the fetched content attached): on success.
 * - `error` (err): for HTTP, fetch/parse, or limiter failures.
 * - `done`: once the limiter reports it is idle, or on a later tick when
 *   `pages` is empty.
 *
 * @param pages Pages to fetch; each is passed to `getPage`.
 * @param opts  Fetch and rate-limit options; unspecified keys fall back to
 *              `parseDOM: true`, `json: false`, `maxConcurrent: 5`, `minTime: 500`.
 * @returns The emitter on which the events above are fired.
 */
export default function domwaiter(pages: Permalink[], opts: DomWaiterOptions = {}): EventEmitter {
  const emitter = new EventEmitter()

  // An EventEmitter throws when 'error' is emitted and nobody is listening.
  // Registering a no-op listener keeps per-page failures from crashing the
  // process when the caller has not attached an 'error' handler of its own.
  const defaultErrorHandler = () => {}
  emitter.on('error', defaultErrorHandler)

  const defaults = {
    parseDOM: true,
    json: false,
    maxConcurrent: 5,
    minTime: 500,
  }
  // Merge into a fresh object rather than reassigning the `opts` parameter.
  const options = { ...defaults, ...opts }

  const limiter = new Bottleneck(options)

  // Queue one page on the limiter; anything the limiter rejects with is
  // surfaced as an 'error' event instead of an unhandled rejection.
  const schedulePage = async (page: Permalink): Promise<void> => {
    try {
      await limiter.schedule(() => getPage(page, emitter, options))
    } catch (err) {
      emitter.emit('error', err)
    }
  }

  for (const page of pages) {
    // Fire-and-forget: completion is signalled through the 'done' event.
    void schedulePage(page)
  }

  limiter.on('idle', () => {
    emitter.emit('done')
  })

  limiter.on('error', (err) => {
    emitter.emit('error', err)
  })

  if (pages.length === 0) {
    // With nothing scheduled the limiter never becomes busy, so it never
    // reports 'idle' and 'done' would never fire. Emit it on a later tick so
    // the caller has a chance to attach listeners to the returned emitter.
    setImmediate(() => {
      emitter.emit('done')
    })
  }

  return emitter
}
| |
|
/** Builds the HTTPError emitted for a non-OK response, tagged with the page's location. */
function createHttpError(
  response: { ok: boolean; status: number; statusText: string },
  page: Permalink,
): HTTPErrorWithUrl {
  const httpError: HTTPErrorWithUrl = new HTTPError(
    `HTTP ${response.status}: ${response.statusText}`,
    { ok: response.ok, statusCode: response.status },
    { requestUrl: { pathname: page.url } },
  )
  httpError.url = page.url
  httpError.relativePath = page.relativePath
  return httpError
}

/**
 * Returns a copy of `err` tagged with the page's `url` and `relativePath`.
 * Values that are not `Error` instances, or pages without a `url`, are
 * returned unchanged.
 */
function withPageContext(err: unknown, page: Permalink): unknown {
  if (!(err instanceof Error) || !page.url) return err

  const enhancedError: ErrorWithUrl = new Error(err.message, { cause: err.cause })
  // Keep the original identity and trace so the copy reads like the source error.
  enhancedError.name = err.name
  enhancedError.stack = err.stack
  enhancedError.url = page.url
  enhancedError.relativePath = page.relativePath
  return enhancedError
}

/**
 * Fetches a single page and reports the outcome on `emitter`.
 *
 * Emits `beforePageLoad` first, then exactly one of:
 * - `page` with a copy of `page` carrying `json` (when `opts.json` is set), or
 *   `body` plus a cheerio `$` handle (when `opts.parseDOM` is set);
 * - `error` with an `HTTPError` for a non-OK response, or with the
 *   fetch/parse failure annotated with the page's `url` and `relativePath`.
 *
 * Failures are reported as `error` events rather than thrown.
 *
 * @param page    The page to fetch; `page.url` is the request target.
 * @param emitter Receives the `beforePageLoad`, `page`, and `error` events.
 * @param opts    `json` and `parseDOM` select how the response is decoded.
 */
async function getPage(
  page: Permalink,
  emitter: EventEmitter,
  opts: DomWaiterOptions,
): Promise<void> {
  try {
    emitter.emit('beforePageLoad', page)

    try {
      const response = await fetchWithRetry(page.url!, undefined, {
        retries: 3,
        // Non-OK responses are turned into HTTPError events below instead of throws.
        throwHttpErrors: false,
        timeout: 60000,
      })

      if (!response.ok) {
        emitter.emit('error', createHttpError(response, page))
        return
      }

      if (opts.json) {
        const json = await response.json()
        emitter.emit('page', Object.assign({}, page, { json }))
        return
      }

      const body = await response.text()
      const pageCopy = Object.assign({}, page, { body })
      if (opts.parseDOM) {
        ;(pageCopy as Permalink & { $?: ReturnType<typeof cheerio.load> }).$ = cheerio.load(body)
      }
      emitter.emit('page', pageCopy)
    } catch (err) {
      // Fetch, decode, or listener failure: report it with the page's location.
      emitter.emit('error', withPageContext(err, page))
    }
  } catch (err) {
    // Last-resort guard, e.g. a listener that threw while handling an event above.
    console.error('Unexpected error in getPage:', err)
    emitter.emit('error', err)
  }
}
| |
|