Spaces:
Paused
Paused
File size: 3,953 Bytes
9f069df | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 | import { extractReadableContent, fetchFirecrawlContent } from "../src/agents/tools/web-tools.js";
// Pages fetched when no URLs are supplied on the command line.
const DEFAULT_URLS = [
  "https://en.wikipedia.org/wiki/Web_scraping",
  "https://news.ycombinator.com/",
  "https://www.apple.com/iphone/",
  "https://www.nytimes.com/",
  "https://www.reddit.com/r/javascript/",
];

// Everything after the first two argv entries is treated as a URL to test.
const [, , ...urls] = process.argv;
const targets = urls.length === 0 ? DEFAULT_URLS : urls;

// Firecrawl settings come from the environment; the base URL falls back to
// the public endpoint when FIRECRAWL_BASE_URL is not defined.
const { FIRECRAWL_API_KEY: apiKey, FIRECRAWL_BASE_URL: baseUrl = "https://api.firecrawl.dev" } =
  process.env;

// User-Agent header value sent with the local fetch.
const userAgent =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36";

// Abort deadline for the local fetch, in milliseconds (30 seconds).
const timeoutMs = 30 * 1000;
/**
 * Shortens `value` to at most `max` UTF-16 code units and appends "…" when
 * anything was cut off. Strings that already fit are returned unchanged, and
 * an empty (or otherwise falsy) value yields "".
 *
 * If the cut would fall between the two halves of a surrogate pair, the
 * dangling high surrogate is dropped as well so the result never ends in a
 * broken character.
 *
 * @param value Text to shorten.
 * @param max Maximum number of code units kept before the ellipsis.
 * @returns The original or shortened text.
 */
function truncate(value: string, max = 180): string {
  if (!value) {
    return "";
  }
  if (value.length <= max) {
    return value;
  }
  let end = max;
  if (end > 0) {
    const last = value.charCodeAt(end - 1);
    const next = value.charCodeAt(end);
    const lastIsHighSurrogate = last >= 0xd800 && last <= 0xdbff;
    const nextIsLowSurrogate = next >= 0xdc00 && next <= 0xdfff;
    // Only back off when a valid pair would be split; pre-existing lone
    // surrogates in the input are left as they are.
    if (lastIsHighSurrogate && nextIsLowSurrogate) {
      end -= 1;
    }
  }
  return `${value.slice(0, end)}…`;
}
/**
 * Performs a GET request for `url` and reads the whole response body as text.
 *
 * The request is aborted if it (including reading the body) has not finished
 * within `timeoutMs`. Non-2xx responses do not throw; they are reported via
 * the `ok` and `status` fields.
 *
 * @param url Address to download.
 * @returns Response status, content type (defaulting to
 *   "application/octet-stream" when the header is absent), the final URL
 *   reported by the response (or `url` when that is empty), and the body text.
 */
async function fetchHtml(url: string): Promise<{
  ok: boolean;
  status: number;
  contentType: string;
  finalUrl: string;
  body: string;
}> {
  const aborter = new AbortController();
  const deadline = setTimeout(() => {
    aborter.abort();
  }, timeoutMs);

  try {
    const requestHeaders = { Accept: "*/*", "User-Agent": userAgent };
    const response = await fetch(url, {
      method: "GET",
      headers: requestHeaders,
      signal: aborter.signal,
    });

    const headerValue = response.headers.get("content-type");
    const text = await response.text();

    return {
      ok: response.ok,
      status: response.status,
      contentType: headerValue ?? "application/octet-stream",
      finalUrl: response.url || url,
      body: text,
    };
  } finally {
    // Always cancel the pending abort, whether the request succeeded or threw.
    clearTimeout(deadline);
  }
}
/** Outcome of the local fetch + readability extraction for a single URL. */
type LocalExtraction = {
  status: string;
  title: string;
  text: string;
  error?: string;
};

/**
 * Downloads `url` with `fetchHtml` and, for successful HTML responses, runs
 * `extractReadableContent` on the body in markdown mode.
 *
 * Never throws: any failure is captured as `status: "error"` with the message
 * in `error`. `title` and `text` are empty unless extraction produced text.
 */
async function extractLocally(url: string): Promise<LocalExtraction> {
  try {
    const res = await fetchHtml(url);
    if (!res.ok) {
      return { status: `http ${res.status}`, title: "", text: "" };
    }
    // Media types are case-insensitive, so compare in lower case; the
    // original header value is still what gets reported.
    if (!res.contentType.toLowerCase().includes("text/html")) {
      return { status: `non-html (${res.contentType})`, title: "", text: "" };
    }
    const readable = await extractReadableContent({
      html: res.body,
      url: res.finalUrl,
      extractMode: "markdown",
    });
    if (!readable?.text) {
      return { status: "readability-empty", title: "", text: "" };
    }
    return { status: "readability", title: readable.title ?? "", text: readable.text };
  } catch (error) {
    return {
      status: "error",
      title: "",
      text: "",
      error: error instanceof Error ? error.message : String(error),
    };
  }
}

/**
 * Fetches `url` through `fetchFirecrawlContent` and logs the text length,
 * title, status, optional warning and a text sample. Failures are logged
 * rather than thrown so one bad URL does not stop the comparison.
 */
async function reportFirecrawl(url: string, key: string): Promise<void> {
  try {
    const firecrawl = await fetchFirecrawlContent({
      url,
      extractMode: "markdown",
      apiKey: key,
      baseUrl,
      onlyMainContent: true,
      // 172_800_000 ms = 48 hours.
      maxAgeMs: 172_800_000,
      proxy: "auto",
      storeInCache: true,
      timeoutSeconds: 60,
    });
    console.log(
      `firecrawl: ok len=${firecrawl.text.length} title=${truncate(
        firecrawl.title ?? "",
        80,
      )} status=${firecrawl.status ?? "n/a"}`,
    );
    if (firecrawl.warning) {
      console.log(`firecrawl warning: ${firecrawl.warning}`);
    }
    if (firecrawl.text) {
      console.log(`firecrawl sample: ${truncate(firecrawl.text)}`);
    }
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    console.log(`firecrawl: error ${message}`);
  }
}

/**
 * Processes every URL in `targets` sequentially: prints the result of the
 * local extraction and, when `apiKey` is set, the Firecrawl result for the
 * same URL. Terminates the process with exit code 0 when done.
 */
async function run() {
  if (!apiKey) {
    console.log("FIRECRAWL_API_KEY not set. Firecrawl comparisons will be skipped.");
  }
  for (const url of targets) {
    console.log(`\n=== ${url}`);

    const local = await extractLocally(url);
    console.log(
      `local: ${local.status} len=${local.text.length} title=${truncate(local.title, 80)}`,
    );
    if (local.error) {
      console.log(`local error: ${local.error}`);
    }
    if (local.text) {
      console.log(`local sample: ${truncate(local.text)}`);
    }

    if (apiKey) {
      await reportFirecrawl(url, apiKey);
    }
  }
  // NOTE(review): explicit exit presumably guards against lingering handles
  // keeping the process alive — confirm before removing.
  process.exit(0);
}
// Script entry point: start the comparison; if run() rejects, print the
// failure to stderr and exit with a non-zero status.
run().catch((failure: unknown) => {
  console.error(failure);
  process.exit(1);
});
|