// Source path: Steel/api/src/modules/actions/actions.controller.ts
// Hosting-page residue: "supernovagateway's picture"; commit "Upload folder using huggingface_hub" (fb38ec5, verified).
import { FastifyReply } from "fastify";
import { BrowserContext, Page, HTTPResponse } from "puppeteer-core";
import { CDPService } from "../../services/cdp/cdp.service.js";
import { SessionService } from "../../services/session.service.js";
import { ScrapeFormat } from "../../types/index.js";
import { getErrors } from "../../utils/errors.js";
import { updateLog } from "../../utils/logging.js";
import { IProxyServer } from "../../utils/proxy.js";
import {
cleanHtml,
getDefuddleContent,
htmlToMarkdown,
transformHtml,
} from "../../utils/scrape/index.js";
import { normalizeUrl } from "../../utils/url.js";
import { PDFRequest, ScrapeRequest, ScreenshotRequest, SearchRequest } from "./actions.schema.js";
import { DefuddleResponse } from "defuddle";
import pdf2html from "pdf2html";
import {
buildHtmlLikeMetadataFromPdf,
extractLinksFromConvertedHtml,
} from "../../utils/scrape/pdfToHtml.js";
import { safeGoto } from "../../utils/scrape/safeGoTo.js";
/**
 * Scrapes a page and replies with its content, metadata and links.
 *
 * When `url` is provided the page is navigated there first (via `safeGoto`); otherwise the
 * document that is currently loaded in the page is scraped as-is. PDF documents are downloaded
 * with Node's `fetch` (re-using the page's cookies) and converted to HTML with `pdf2html`;
 * everything else is read from the live DOM.
 *
 * Request body fields used:
 * - `url`        optional target; rejected with "Invalid URL" when `normalizeUrl` returns a falsy value
 * - `format`     list of `ScrapeFormat` values; when empty/absent only `content.html` is returned
 * - `screenshot` include a base64 JPEG screenshot (HTML flow only)
 * - `pdf`        include the PDF as base64 (rendered by the browser, or the fetched file for PDFs)
 * - `proxyUrl`   when set, a proxy is started and the scrape runs in a dedicated browser context
 * - `logUrl`     when set, phase timings (and any error) are reported through `updateLog`
 * - `delay`      milliseconds to wait after navigation before extracting
 *
 * @param sessionService - Used only to build the proxy server when `proxyUrl` is set.
 * @param browserService - Browser wrapper; launched on demand if it is not running.
 * @param request - Fastify request carrying the scrape options in `request.body`.
 * @param reply - Fastify reply; receives the scrape result, or a 500 `{ message }` on failure.
 * @returns Whatever `reply.send(...)` returns.
 */
export const handleScrape = async (
  sessionService: SessionService,
  browserService: CDPService,
  request: ScrapeRequest,
  reply: FastifyReply,
) => {
  const startTime = Date.now();
  const times: Record<string, number> = {};
  const { url, format, screenshot, pdf, proxyUrl, logUrl, delay } = request.body;
  // Both are declared outside the try block so the finally clause can release them.
  let proxy: IProxyServer | null = null;
  let context: BrowserContext | null = null;

  try {
    if (proxyUrl) {
      proxy = await sessionService.proxyFactory(proxyUrl);
      await proxy.listen();
    }
    times.proxyTime = Date.now() - startTime;

    if (!browserService.isRunning()) {
      await browserService.launch();
    }

    let page: Page;
    if (proxy) {
      // If a proxy is used, we proceed with browser navigation; implementing proxy-aware Node fetch
      // would require an HTTP agent and is outside current scope.
      context = await browserService.createBrowserContext(proxy.url);
      page = await context.newPage();
      times.proxyPageTime = Date.now() - startTime - times.proxyTime;
    } else {
      page = await browserService.getPrimaryPage();
      times.pageTime = Date.now() - startTime - times.proxyTime;
    }

    let normalizedUrl: string | null = null;
    if (url) {
      normalizedUrl = normalizeUrl(url);
      if (!normalizedUrl) {
        throw new Error(`Invalid URL: ${url}`);
      }
    }

    const navigationStart = Date.now();
    const safeResponse = normalizedUrl
      ? await safeGoto(page, normalizedUrl, {
          timeout: 30000,
          waitUntil: "domcontentloaded",
        })
      : { response: null, isPdf: false, pdfResponse: null };
    if (normalizedUrl) {
      // Recorded so that extractionTime below can actually exclude the navigation.
      times.pageLoadTime = Date.now() - navigationStart;
    }

    // Prefer the regular navigation response; fall back to the PDF response when there is none.
    const response: HTTPResponse | null =
      safeResponse.response !== null ? safeResponse.response : safeResponse.pdfResponse;
    const isPdf = safeResponse.isPdf;

    if (delay) {
      await new Promise((resolve) => setTimeout(resolve, delay));
    }

    const contentType = response?.headers()["content-type"]?.toLowerCase() || "";
    let scrapeResponse: Record<string, any> = {};
    let htmlContent = "";

    if (isPdf || contentType.includes("application/pdf")) {
      // Node fetch using session cookies (same browser auth state).
      // This request is made by Node directly and does not go through `proxy`.
      // NOTE(review): `screenshot` is not honored in this branch.
      const targetUrl = normalizedUrl || url!;
      const cookies = await page.cookies(targetUrl);
      const cookieHeader = cookies.map((c) => `${c.name}=${c.value}`).join("; ");

      const fetchHeaders: Record<string, string> = {};
      if (cookieHeader) fetchHeaders["Cookie"] = cookieHeader;
      fetchHeaders["Referer"] = new URL(targetUrl).origin + "/";

      const nodeRes = await fetch(targetUrl, {
        method: "GET",
        redirect: "follow",
        headers: fetchHeaders,
      });
      const nodeCT = (nodeRes.headers.get("content-type") || "").toLowerCase();
      if (!nodeRes.ok || !nodeCT.includes("application/pdf")) {
        throw new Error(`Expected PDF; got status ${nodeRes.status} content-type ${nodeCT}`);
      }

      const pdfBuffer = Buffer.from(await nodeRes.arrayBuffer());

      const convertStart = Date.now();
      htmlContent = await pdf2html.html(pdfBuffer);
      times.pdfHtmlConvertTime = Date.now() - convertStart;

      const metaStart = Date.now();
      const pdfMeta = await pdf2html.meta(pdfBuffer);
      times.pdfMetaTime = Date.now() - metaStart;

      const htmlMeta = buildHtmlLikeMetadataFromPdf(pdfMeta, {
        urlSource: targetUrl,
        statusCode: nodeRes.status,
        htmlForFallback: htmlContent,
      });
      const htmlLinks = extractLinksFromConvertedHtml(htmlContent);

      scrapeResponse = {
        content: {},
        metadata: {
          ...htmlMeta,
          statusCode: nodeRes.status,
          headers: Object.fromEntries(nodeRes.headers.entries()),
          originalContentType: nodeCT,
          pdfAcquisition: "node-fetch-with-cookies",
        },
        links: htmlLinks,
      };
      if (pdf) {
        scrapeResponse.pdf = pdfBuffer.toString("base64");
      }
    } else {
      // Regular HTML flow.
      // Define an identity `__name` on the page before evaluating the extraction function.
      // Presumably this is for a bundler-injected `__name(...)` helper inside serialized
      // functions; confirm against the build setup.
      await page.evaluate(() => {
        (window as any).__name = (func: Function) => func;
      });

      const [{ html, metadata, links }, base64Screenshot, pdfBuffer] = await Promise.all([
        page.evaluate(() => {
          const getMetaContent = (selector: string) => {
            const element = document.querySelector(selector);
            return element ? element.getAttribute("content") : null;
          };
          const getMetaByName = (name: string) => getMetaContent(`meta[name="${name}"]`);
          const getMetaByProperty = (property: string) =>
            getMetaContent(`meta[property="${property}"]`);

          // Collects every parseable JSON-LD script; unparseable ones are logged and skipped.
          const extractJsonLd = () => {
            const scripts = document.querySelectorAll('script[type="application/ld+json"]');
            const jsonLdData: any[] = [];
            scripts.forEach((script) => {
              try {
                const data = JSON.parse(script.textContent || "");
                jsonLdData.push(data);
              } catch (e) {
                console.error(e);
              }
            });
            return jsonLdData;
          };

          return {
            html: document.documentElement.outerHTML,
            links: [...document.links].map((l) => ({
              url: l.href,
              text: l.textContent?.trim() || "",
            })),
            metadata: {
              title: document.title,
              language: document.documentElement.lang,
              urlSource: window.location.href,
              timestamp: new Date().toISOString(),
              description: getMetaByName("description"),
              keywords: getMetaByName("keywords"),
              author: getMetaByName("author"),
              ogTitle: getMetaByProperty("og:title"),
              ogDescription: getMetaByProperty("og:description"),
              ogImage: getMetaByProperty("og:image"),
              ogUrl: getMetaByProperty("og:url"),
              ogSiteName: getMetaByProperty("og:site_name"),
              articleAuthor: getMetaByProperty("article:author"),
              publishedTime: getMetaByProperty("article:published_time"),
              modifiedTime: getMetaByProperty("article:modified_time"),
              canonical: document.querySelector('link[rel="canonical"]')?.getAttribute("href"),
              favicon: document.querySelector('link[rel="icon"]')?.getAttribute("href"),
              jsonLd: extractJsonLd(),
              // Placeholder; replaced below with the real navigation status when known.
              statusCode: 200,
            },
          };
        }),
        screenshot ? page.screenshot({ encoding: "base64", type: "jpeg", quality: 100 }) : null,
        pdf ? page.pdf() : null,
      ]);

      htmlContent = html;
      times.extractionTime = Date.now() - startTime - (times.pageLoadTime ?? 0);

      // The in-page script cannot see the HTTP status, so take it from the navigation response.
      // Without a navigation (no `url`) there is no response and the 200 placeholder is kept.
      const statusCode = response?.status() ?? metadata.statusCode;
      scrapeResponse = { content: {}, metadata: { ...metadata, statusCode }, links };

      if (base64Screenshot) {
        scrapeResponse.screenshot = base64Screenshot;
      }
      if (pdfBuffer) {
        scrapeResponse.pdf = Buffer.from(pdfBuffer).toString("base64");
      }
    }

    // Format handling (works for both PDF converted HTML and normal HTML)
    if (format && format.length > 0) {
      if (format.includes(ScrapeFormat.HTML)) {
        scrapeResponse.content.html = htmlContent;
      }

      if (format.includes(ScrapeFormat.CLEANED_HTML)) {
        const cleanHtmlStart = Date.now();
        const cleanedHtml = cleanHtml(htmlContent);
        times.cleanedHtmlTime = Date.now() - cleanHtmlStart;
        scrapeResponse.content.cleaned_html = cleanedHtml;
      }

      const wantsReadability = format.includes(ScrapeFormat.READABILITY);
      const wantsMarkdown = format.includes(ScrapeFormat.MARKDOWN);
      // Markdown is generated from the readability output, so either format requires it.
      if (wantsReadability || wantsMarkdown) {
        const readabilityStart = Date.now();
        const readabilityContent: DefuddleResponse = await getDefuddleContent(
          transformHtml(htmlContent, normalizedUrl || url),
        );
        times.readabilityTime = Date.now() - readabilityStart;

        if (wantsReadability) {
          scrapeResponse.content.readability = readabilityContent.content;
        }
        if (wantsMarkdown) {
          const markdownStart = Date.now();
          scrapeResponse.content.markdown = await htmlToMarkdown(readabilityContent.content);
          times.markdownTime = Date.now() - markdownStart;
        }
      }
    } else {
      // No explicit format requested: return the raw HTML.
      scrapeResponse.content.html = htmlContent;
    }

    times.totalInstanceTime = Date.now() - startTime;
    if (logUrl) {
      await updateLog(logUrl, { times });
    }
    return reply.send(scrapeResponse);
  } catch (e: unknown) {
    const error = getErrors(e);
    if (logUrl) {
      await updateLog(logUrl, { times, response: { browserError: error } });
    }
    // A navigation was requested, so the primary page is refreshed before replying.
    if (url) {
      await browserService.refreshPrimaryPage();
    }
    return reply.code(500).send({ message: error });
  } finally {
    // Best-effort cleanup: failures while closing are deliberately ignored.
    if (context) {
      await context.close().catch(() => {});
    }
    if (proxy) {
      await proxy.close(true).catch(() => {});
    }
  }
};
/**
 * Runs a web search on Brave Search and replies with the scraped result list.
 *
 * The page is navigated to `https://search.brave.com/search?q=<query>`, and every
 * `div.snippet` element that is not one of the known non-result widgets is turned into a
 * `{ title, url, description }` entry. Entries without a title or a URL are dropped.
 *
 * Request body fields used:
 * - `query`    search terms (URL-encoded before being placed in the query string)
 * - `proxyUrl` when set, a proxy is started and the search runs in a dedicated browser context
 * - `logUrl`   when set, phase timings (and any error) are reported through `updateLog`
 *
 * @param sessionService - Used only to build the proxy server when `proxyUrl` is set.
 * @param browserService - Browser wrapper; launched on demand if it is not running.
 * @param request - Fastify request carrying the search options in `request.body`.
 * @param reply - Fastify reply; receives `{ results }`, or a 500 `{ message }` on failure.
 * @returns Whatever `reply.send(...)` returns.
 */
export const handleSearch = async (
  sessionService: SessionService,
  browserService: CDPService,
  request: SearchRequest,
  reply: FastifyReply,
) => {
  const startTime = Date.now();
  const times: Record<string, number> = {};
  const { query, proxyUrl, logUrl } = request.body;
  // Both are declared outside the try block so the finally clause can release them.
  let proxy: IProxyServer | null = null;
  let context: BrowserContext | null = null;

  try {
    if (proxyUrl) {
      proxy = await sessionService.proxyFactory(proxyUrl);
      await proxy.listen();
    }
    times.proxyTime = Date.now() - startTime;

    if (!browserService.isRunning()) {
      await browserService.launch();
    }

    let page: Page;
    if (proxy) {
      // With a proxy, the search runs in its own browser context bound to that proxy.
      context = await browserService.createBrowserContext(proxy.url);
      page = await context.newPage();
      times.proxyPageTime = Date.now() - startTime - times.proxyTime;
    } else {
      page = await browserService.getPrimaryPage();
      times.pageTime = Date.now() - startTime - times.proxyTime;
    }

    // Go to Brave
    await page.goto(`https://search.brave.com/search?q=${encodeURIComponent(query)}`, {
      waitUntil: "networkidle2",
    });

    // Wait for results to load
    await page.waitForSelector("#results");

    // Define an identity `__name` on the results page. This must happen after the navigation:
    // globals set on the previous document do not survive loading a new one.
    // Presumably this is for a bundler-injected `__name(...)` helper; confirm against the build setup.
    await page.evaluate(() => {
      (window as any).__name = (func: Function) => func;
    });

    // Scrape results
    const results = await page.evaluate(() => {
      // Snippet ids that are page widgets rather than organic results.
      const ignoredSnippetIds = [
        "llm-snippet",
        "faq",
        "pagination-snippet",
        "search-elsewhere",
        "infoblox-snippet",
        "discussions",
      ];
      const collected: { title: string; url: string; description: string | null }[] = [];

      for (const item of Array.from(document.querySelectorAll("div.snippet"))) {
        if (ignoredSnippetIds.includes(item.id)) {
          continue;
        }

        const title =
          item.querySelector("div.result-content a div.title")?.textContent?.trim() || null;
        const url = item.querySelector("div.result-content a")?.getAttribute("href") || null;
        if (title === null || url === null) {
          continue;
        }

        // Drop whatever precedes the first "-" (presumably a date/source prefix; confirm
        // against Brave's markup) but keep the rest intact, including any later hyphens.
        // When there is no "-" at all the whole text is the description.
        const rawDescription =
          item.querySelector("div.generic-snippet")?.textContent ?? "";
        const separatorIndex = rawDescription.indexOf("-");
        const description = (
          separatorIndex === -1 ? rawDescription : rawDescription.slice(separatorIndex + 1)
        ).trim();

        collected.push({ title, url, description: description || null });
      }

      return collected;
    });

    times.totalInstanceTime = Date.now() - startTime;
    if (logUrl) {
      await updateLog(logUrl, { times });
    }
    return reply.send({ results });
  } catch (e: unknown) {
    const error = getErrors(e);
    if (logUrl) {
      await updateLog(logUrl, { times, response: { browserError: error } });
    }
    return reply.code(500).send({ message: error });
  } finally {
    // Best-effort cleanup: failures while closing are deliberately ignored.
    if (context) {
      await context.close().catch(() => {});
    }
    if (proxy) {
      await proxy.close(true).catch(() => {});
    }
  }
};
/**
 * Captures a JPEG screenshot of a page and replies with the image bytes.
 *
 * When `url` is provided the page is navigated there first; otherwise the document that is
 * currently loaded in the page is captured.
 *
 * Request body fields used:
 * - `url`      optional target; rejected with "Invalid URL" when `normalizeUrl` returns a falsy value
 * - `proxyUrl` when set, a proxy is started and the capture runs in a dedicated browser context
 * - `delay`    milliseconds to wait after navigation before capturing
 * - `fullPage` forwarded to `page.screenshot`
 * - `logUrl`   when set, phase timings (and any error) are reported through `updateLog`
 *
 * Timings are recorded per phase, each measured from the end of the previous one:
 * `proxyTime`, then `proxyPageTime` or `pageTime`, then `pageLoadTime` (only when `url` is
 * given), then `screenshotTime` (which includes `delay`).
 *
 * @param sessionService - Used only to build the proxy server when `proxyUrl` is set.
 * @param browserService - Browser wrapper; launched on demand if it is not running.
 * @param request - Fastify request carrying the screenshot options in `request.body`.
 * @param reply - Fastify reply; receives the screenshot, or a 500 `{ message }` on failure.
 * @returns Whatever `reply.send(...)` returns.
 */
export const handleScreenshot = async (
  sessionService: SessionService,
  browserService: CDPService,
  request: ScreenshotRequest,
  reply: FastifyReply,
) => {
  const startTime = Date.now();
  const times: Record<string, number> = {};
  const { url, logUrl, proxyUrl, delay, fullPage } = request.body;
  // Both are declared outside the try block so the finally clause can release them.
  let proxy: IProxyServer | null = null;
  let context: BrowserContext | null = null;

  // Records the time elapsed since the previous phase ended under `name`. Measuring each phase
  // from its own start keeps every entry a finite number even when an earlier phase was skipped.
  let phaseStart = startTime;
  const recordPhase = (name: string) => {
    const now = Date.now();
    times[name] = now - phaseStart;
    phaseStart = now;
  };

  try {
    if (proxyUrl) {
      proxy = await sessionService.proxyFactory(proxyUrl);
      await proxy.listen();
    }
    recordPhase("proxyTime");

    // Launched inside the try block so a launch failure is reported like any other error.
    if (!browserService.isRunning()) {
      await browserService.launch();
    }

    let page: Page;
    if (proxy) {
      context = await browserService.createBrowserContext(proxy.url);
      page = await context.newPage();
      recordPhase("proxyPageTime");
    } else {
      page = await browserService.getPrimaryPage();
      recordPhase("pageTime");
    }

    if (url) {
      const normalizedUrl = normalizeUrl(url);
      if (!normalizedUrl) {
        throw new Error(`Invalid URL: ${url}`);
      }
      await page.goto(normalizedUrl, { timeout: 30000, waitUntil: "domcontentloaded" });
      recordPhase("pageLoadTime");
    }

    if (delay) {
      await new Promise((resolve) => setTimeout(resolve, delay));
    }

    const screenshot = await page.screenshot({ fullPage, type: "jpeg", quality: 100 });
    recordPhase("screenshotTime");

    if (logUrl) {
      await updateLog(logUrl, { times });
    }
    return reply.send(screenshot);
  } catch (e: unknown) {
    const error = getErrors(e);
    if (logUrl) {
      await updateLog(logUrl, { times, response: { browserError: error } });
    }
    // A navigation was requested, so the primary page is refreshed before replying.
    if (url) {
      await browserService.refreshPrimaryPage();
    }
    return reply.code(500).send({ message: error });
  } finally {
    // Best-effort cleanup: failures while closing are deliberately ignored.
    if (context) {
      await context.close().catch(() => {});
    }
    if (proxy) {
      await proxy.close(true).catch(() => {});
    }
  }
};
/**
 * Renders a page to PDF with the browser and replies with the PDF bytes.
 *
 * When `url` is provided the page is navigated there first; otherwise the document that is
 * currently loaded in the page is rendered.
 *
 * Request body fields used:
 * - `url`      optional target; rejected with "Invalid URL" when `normalizeUrl` returns a falsy value
 * - `proxyUrl` when set, a proxy is started and the render runs in a dedicated browser context
 * - `delay`    milliseconds to wait after navigation before rendering
 * - `logUrl`   when set, phase timings (and any error) are reported through `updateLog`
 *
 * Timings are recorded per phase, each measured from the end of the previous one:
 * `proxyTime`, then `proxyPageTime` or `pageTime`, then `pageLoadTime` (only when `url` is
 * given), then `pdfTime` (which includes `delay`).
 *
 * @param sessionService - Used only to build the proxy server when `proxyUrl` is set.
 * @param browserService - Browser wrapper; launched on demand if it is not running.
 * @param request - Fastify request carrying the PDF options in `request.body`.
 * @param reply - Fastify reply; receives the PDF, or a 500 `{ message }` on failure.
 * @returns Whatever `reply.send(...)` returns.
 */
export const handlePDF = async (
  sessionService: SessionService,
  browserService: CDPService,
  request: PDFRequest,
  reply: FastifyReply,
) => {
  const startTime = Date.now();
  const times: Record<string, number> = {};
  const { url, logUrl, proxyUrl, delay } = request.body;
  // Both are declared outside the try block so the finally clause can release them.
  let proxy: IProxyServer | null = null;
  let context: BrowserContext | null = null;

  // Records the time elapsed since the previous phase ended under `name`. Measuring each phase
  // from its own start keeps every entry a finite number even when an earlier phase was skipped.
  let phaseStart = startTime;
  const recordPhase = (name: string) => {
    const now = Date.now();
    times[name] = now - phaseStart;
    phaseStart = now;
  };

  try {
    if (proxyUrl) {
      proxy = await sessionService.proxyFactory(proxyUrl);
      await proxy.listen();
    }
    recordPhase("proxyTime");

    // Launched inside the try block so a launch failure is reported like any other error.
    if (!browserService.isRunning()) {
      await browserService.launch();
    }

    let page: Page;
    if (proxy) {
      context = await browserService.createBrowserContext(proxy.url);
      page = await context.newPage();
      recordPhase("proxyPageTime");
    } else {
      page = await browserService.getPrimaryPage();
      recordPhase("pageTime");
    }

    if (url) {
      const normalizedUrl = normalizeUrl(url);
      if (!normalizedUrl) {
        throw new Error(`Invalid URL: ${url}`);
      }
      await page.goto(normalizedUrl, { timeout: 30000, waitUntil: "domcontentloaded" });
      recordPhase("pageLoadTime");
    }

    if (delay) {
      await new Promise((resolve) => setTimeout(resolve, delay));
    }

    const pdf = await page.pdf();
    recordPhase("pdfTime");

    if (logUrl) {
      await updateLog(logUrl, { times });
    }
    return reply.send(pdf);
  } catch (e: unknown) {
    const error = getErrors(e);
    if (logUrl) {
      await updateLog(logUrl, { times, response: { browserError: error } });
    }
    // A navigation was requested, so the primary page is refreshed before replying.
    if (url) {
      await browserService.refreshPrimaryPage();
    }
    return reply.code(500).send({ message: error });
  } finally {
    // Best-effort cleanup: failures while closing are deliberately ignored.
    if (context) {
      await context.close().catch(() => {});
    }
    if (proxy) {
      await proxy.close(true).catch(() => {});
    }
  }
};