import { Document, ScrapeOptions, URLTrace, scrapeOptions } from "../../controllers/v1/types";
import { getScrapeQueue } from "../../services/queue-service";
import { waitForJob, addScrapeJob } from "../../services/queue-jobs";
import { getJobPriority } from "../job-priority";
import type { Logger } from "winston";
import { randomUUID } from "node:crypto";
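
/**
 * Options controlling a single document scrape performed on behalf of an
 * extract job.
 */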
interface ScrapeDocumentOptions {
  url: string;
  teamId: string;
  origin: string;
  timeout: number;
  isSingleUrl?: boolean;
}
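
/**
 * Scrapes a single URL for the extract pipeline and records progress on the
 * matching URL trace. Returns the scraped document, or null if the scrape
 * fails. Single-URL requests get one retry with a doubled timeout.
 *
 * @example
 * // Illustrative values only:
 * const doc = await scrapeDocument(
 *   { url: "https://example.com", teamId: "team_1", origin: "extract", timeout: 30000, isSingleUrl: true },
 *   urlTraces,
 *   logger,
 * );
 */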
export async function scrapeDocument(
  options: ScrapeDocumentOptions,
  urlTraces: URLTrace[],
  logger: Logger,
  internalScrapeOptions: Partial<ScrapeOptions> = { onlyMainContent: false },
): Promise<Document | null> {
  const trace = urlTraces.find((t) => t.url === options.url);
  if (trace) {
    trace.status = "scraped";
    trace.timing.scrapedAt = new Date().toISOString();
  }
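
  // Enqueues a scrape job for the URL, waits for it to complete, and records
  // content stats on the trace. Shared by the initial attempt and the retry.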
  async function attemptScrape(timeout: number) {
    const jobId = randomUUID();
    const jobPriority = await getJobPriority({
      team_id: options.teamId,
      basePriority: 10,
      from_extract: true,
    });
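
    // Queue the job as an extract-originated single-URL scrape;
    // internalOptions.useCache allows cached results to be reused.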
    await addScrapeJob(
      {
        url: options.url,
        mode: "single_urls",
        team_id: options.teamId,
        scrapeOptions: scrapeOptions.parse({ ...internalScrapeOptions }),
        internalOptions: {
          useCache: true,
          teamId: options.teamId,
        },
        origin: options.origin,
        is_scrape: true,
        from_extract: true,
      },
      {},
      jobId,
      jobPriority,
    );
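
    // Block until the worker finishes (or the timeout elapses), then remove
    // the completed job from the queue.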
    const doc = await waitForJob(jobId, timeout);
    await getScrapeQueue().remove(jobId);

    if (trace) {
      trace.timing.completedAt = new Date().toISOString();
      trace.contentStats = {
        rawContentLength: doc.markdown?.length || 0,
        processedContentLength: doc.markdown?.length || 0,
        tokensUsed: 0,
      };
    }

    return doc;
  }
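
  // The first attempt uses the caller-provided timeout; single-URL requests
  // get one retry with double the timeout before the error propagates.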
  try {
    try {
      logger.debug("Attempting scrape...");
      const doc = await attemptScrape(options.timeout);
      logger.debug("Scrape finished!");
      return doc;
    } catch (timeoutError) {
      logger.warn("Scrape failed.", { error: timeoutError });

      if (options.isSingleUrl) {
        // For single URLs, try again with double the timeout
        logger.debug("Retrying scrape with doubled timeout...");
        const doc = await attemptScrape(options.timeout * 2);
        logger.debug("Scrape finished!");
        return doc;
      }

      throw timeoutError;
    }
  } catch (error) {
    logger.error("Error in scrapeDocument", { error });
    if (trace) {
      trace.status = "error";
      trace.error = error instanceof Error ? error.message : String(error);
    }
    return null;
  }
}