Spaces:
Sleeping
Sleeping
| import { extractReadableContent, fetchFirecrawlContent } from "../src/agents/tools/web-tools.js"; | |
// Pages compared when no URLs are passed on the command line.
const DEFAULT_URLS = [
  "https://en.wikipedia.org/wiki/Web_scraping",
  "https://news.ycombinator.com/",
  "https://www.apple.com/iphone/",
  "https://www.nytimes.com/",
  "https://www.reddit.com/r/javascript/",
];

// Everything after the script path is treated as a URL to compare.
const urls = process.argv.slice(2);
const targets = urls.length === 0 ? DEFAULT_URLS : urls;

// Firecrawl credentials and endpoint come from the environment; the endpoint
// falls back to the hosted API when FIRECRAWL_BASE_URL is null/undefined.
const apiKey = process.env.FIRECRAWL_API_KEY;
const baseUrl = process.env.FIRECRAWL_BASE_URL ?? "https://api.firecrawl.dev";

// User-Agent header sent by fetchHtml for the local fetch.
const userAgent =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36";

// Milliseconds before fetchHtml aborts its request.
const timeoutMs = 30_000;
/**
 * Shortens `value` for single-line log output.
 *
 * Returns "" for an empty value, the value unchanged when it is at most `max`
 * UTF-16 code units long, and otherwise a prefix followed by "…". When the cut
 * would land between the two halves of a surrogate pair, the cut moves one
 * code unit earlier so the result never ends in a lone surrogate.
 *
 * @param value - Text to shorten.
 * @param max - Maximum number of UTF-16 code units kept before the ellipsis.
 * @returns The original text, or a shortened prefix with a trailing "…".
 */
function truncate(value: string, max = 180): string {
  if (!value) {
    return "";
  }
  if (value.length <= max) {
    return value;
  }

  let end = max;
  if (end > 0) {
    const before = value.charCodeAt(end - 1);
    const after = value.charCodeAt(end);
    const splitsPair =
      before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
    if (splitsPair) {
      // Drop the dangling high surrogate instead of emitting half a character.
      end -= 1;
    }
  }
  return `${value.slice(0, end)}…`;
}
/**
 * Performs a plain GET of `url` with the script's User-Agent and returns the
 * response metadata together with the body decoded as text.
 *
 * The request is aborted once `timeoutMs` elapses; the timer stays armed until
 * the body has been read and is always cleared afterwards.
 *
 * @param url - Address to fetch.
 * @returns `ok`/`status` from the response, the `content-type` header (or
 *   "application/octet-stream" when absent), the response URL (falling back to
 *   `url` when it is empty), and the body text.
 */
async function fetchHtml(url: string): Promise<{
  ok: boolean;
  status: number;
  contentType: string;
  finalUrl: string;
  body: string;
}> {
  const aborter = new AbortController();
  const deadline = setTimeout(() => {
    aborter.abort();
  }, timeoutMs);

  try {
    const requestHeaders = { Accept: "*/*", "User-Agent": userAgent };
    const response = await fetch(url, {
      method: "GET",
      headers: requestHeaders,
      signal: aborter.signal,
    });

    const declaredType = response.headers.get("content-type");
    const text = await response.text();

    return {
      ok: response.ok,
      status: response.status,
      contentType: declaredType ?? "application/octet-stream",
      finalUrl: response.url || url,
      body: text,
    };
  } finally {
    clearTimeout(deadline);
  }
}
/**
 * Compares local extraction against Firecrawl for every entry in `targets`.
 *
 * For each URL it fetches the page with `fetchHtml`, runs
 * `extractReadableContent` on successful HTML responses, and logs the status,
 * text length, title and a sample. When `apiKey` is set it also calls
 * `fetchFirecrawlContent` for the same URL and logs its result. Failures on
 * either side are logged and do not stop the loop. The process exits with
 * code 0 once all targets are done.
 */
async function run() {
  if (!apiKey) {
    console.log("FIRECRAWL_API_KEY not set. Firecrawl comparisons will be skipped.");
  }

  for (const target of targets) {
    console.log(`\n=== ${target}`);

    // Outcome of the local fetch + extraction attempt.
    const local: { status: string; title: string; text: string; error?: string } = {
      status: "skipped",
      title: "",
      text: "",
    };

    try {
      const page = await fetchHtml(target);
      if (page.ok && page.contentType.includes("text/html")) {
        const extracted = await extractReadableContent({
          html: page.body,
          url: page.finalUrl,
          extractMode: "markdown",
        });
        if (extracted?.text) {
          local.status = "readability";
          local.title = extracted.title ?? "";
          local.text = extracted.text;
        } else {
          local.status = "readability-empty";
        }
      } else if (page.ok) {
        local.status = `non-html (${page.contentType})`;
      } else {
        local.status = `http ${page.status}`;
      }
    } catch (err) {
      local.status = "error";
      local.error = err instanceof Error ? err.message : String(err);
    }

    const localTitle = truncate(local.title, 80);
    console.log(`local: ${local.status} len=${local.text.length} title=${localTitle}`);
    if (local.error) {
      console.log(`local error: ${local.error}`);
    }
    if (local.text) {
      console.log(`local sample: ${truncate(local.text)}`);
    }

    // Without an API key there is nothing to compare against.
    if (!apiKey) {
      continue;
    }

    try {
      const remote = await fetchFirecrawlContent({
        url: target,
        extractMode: "markdown",
        apiKey,
        baseUrl,
        onlyMainContent: true,
        maxAgeMs: 172_800_000,
        proxy: "auto",
        storeInCache: true,
        timeoutSeconds: 60,
      });

      const remoteLength = remote.text.length;
      const remoteTitle = truncate(remote.title ?? "", 80);
      const remoteStatus = remote.status ?? "n/a";
      console.log(`firecrawl: ok len=${remoteLength} title=${remoteTitle} status=${remoteStatus}`);

      if (remote.warning) {
        console.log(`firecrawl warning: ${remote.warning}`);
      }
      if (remote.text) {
        console.log(`firecrawl sample: ${truncate(remote.text)}`);
      }
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      console.log(`firecrawl: error ${message}`);
    }
  }

  process.exit(0);
}
// Script entry point: start the comparison; if run() rejects, print the
// failure and exit with status 1.
run().then(undefined, (failure) => {
  console.error(failure);
  process.exit(1);
});