| import { extractReadableContent, fetchFirecrawlContent } from "../src/agents/tools/web-tools.js"; |
|
|
/** Pages probed when no URLs are supplied on the command line. */
const DEFAULT_URLS: string[] = [
  "https://en.wikipedia.org/wiki/Web_scraping",
  "https://news.ycombinator.com/",
  "https://www.apple.com/iphone/",
  "https://www.nytimes.com/",
  "https://www.reddit.com/r/javascript/",
];
|
|
// URLs passed as CLI arguments (everything after the script path).
const urls = process.argv.slice(2);
// Fall back to the built-in list when the caller supplied no URLs.
const targets = urls.length > 0 ? urls : DEFAULT_URLS;
// Firecrawl credentials; when unset/empty the Firecrawl half of the comparison is skipped.
const apiKey = process.env.FIRECRAWL_API_KEY;
// `||` (not `??`) on purpose: an env var that is set but blank (e.g. `FIRECRAWL_BASE_URL=`
// in a .env file) must fall back to the default instead of producing an empty base URL.
const baseUrl = process.env.FIRECRAWL_BASE_URL?.trim() || "https://api.firecrawl.dev";
|
|
/** Browser-like User-Agent header value sent with the direct (non-Firecrawl) fetch. */
const userAgent: string =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36";

/** Milliseconds a direct fetch may run before it is aborted. */
const timeoutMs: number = 30 * 1000;
|
|
/**
 * Shortens `value` for single-line log output.
 *
 * Strings of at most `max` UTF-16 code units are returned unchanged; longer
 * strings are cut to at most `max` code units and suffixed with "…".
 *
 * @param value Text to shorten; an empty/falsy value yields "".
 * @param max Maximum number of code units kept before the ellipsis (negative is treated as 0).
 * @returns The original string, or its truncated prefix followed by "…".
 */
function truncate(value: string, max = 180): string {
  if (!value) return "";
  if (value.length <= max) return value;

  // Clamp so a negative `max` keeps nothing rather than slicing from the end.
  let end = Math.max(0, max);

  // Never cut between the two halves of a surrogate pair: if the last kept unit
  // is a high surrogate whose low surrogate would be dropped, drop both.
  if (end > 0) {
    const last = value.charCodeAt(end - 1);
    const next = value.charCodeAt(end);
    const splitsPair = last >= 0xd800 && last <= 0xdbff && next >= 0xdc00 && next <= 0xdfff;
    if (splitsPair) end -= 1;
  }

  return `${value.slice(0, end)}…`;
}
|
|
/**
 * Performs a direct GET of `url` and returns the response metadata together
 * with the body decoded as text.
 *
 * The request is sent with the module-level `userAgent` and is aborted once
 * `timeoutMs` milliseconds have elapsed (the timer covers reading the body too,
 * since it is only cleared after `res.text()` settles).
 *
 * @param url Address to fetch.
 * @returns `ok`/`status` from the response, the `content-type` header
 *   (defaulting to "application/octet-stream" when absent), the final URL
 *   reported by the response (falling back to `url`), and the body text.
 * @throws Whatever `fetch`/`res.text()` rejects with; on timeout the abort
 *   reason is an `Error` naming the URL and the time budget.
 */
async function fetchHtml(url: string): Promise<{
  ok: boolean;
  status: number;
  contentType: string;
  finalUrl: string;
  body: string;
}> {
  const controller = new AbortController();
  // Abort with an explicit reason so callers logging `error.message` see a
  // timeout rather than a generic "operation was aborted" message.
  const timer = setTimeout(() => {
    controller.abort(new Error(`Timed out after ${timeoutMs}ms fetching ${url}`));
  }, timeoutMs);
  try {
    const res = await fetch(url, {
      method: "GET",
      headers: { Accept: "*/*", "User-Agent": userAgent },
      signal: controller.signal,
    });
    const contentType = res.headers.get("content-type") ?? "application/octet-stream";
    const body = await res.text();
    return {
      ok: res.ok,
      status: res.status,
      contentType,
      // `res.url` can be empty; fall back to the requested URL in that case.
      finalUrl: res.url || url,
      body,
    };
  } finally {
    // Always release the timer, on success and on failure alike.
    clearTimeout(timer);
  }
}
|
|
/**
 * For every target URL, prints a side-by-side report of two extraction paths:
 *
 * 1. "local": fetch the page directly and run `extractReadableContent` on the HTML;
 * 2. "firecrawl": call `fetchFirecrawlContent` (only when `apiKey` is set).
 *
 * Failures in either path are caught and logged per URL so one bad page does
 * not stop the remaining comparisons. The process exits with code 0 once all
 * targets have been processed.
 */
async function run(): Promise<void> {
  if (!apiKey) {
    console.log("FIRECRAWL_API_KEY not set. Firecrawl comparisons will be skipped.");
  }

  for (const url of targets) {
    console.log(`\n=== ${url}`);
    let localStatus = "skipped";
    let localTitle = "";
    let localText = "";
    let localError: string | undefined;

    // --- Local path: direct fetch + readability extraction ---
    try {
      const res = await fetchHtml(url);
      if (!res.ok) {
        localStatus = `http ${res.status}`;
      } else if (!res.contentType.toLowerCase().includes("text/html")) {
        // Media types are case-insensitive, so compare in lower case; the
        // header is still reported exactly as the server sent it.
        localStatus = `non-html (${res.contentType})`;
      } else {
        const readable = await extractReadableContent({
          html: res.body,
          url: res.finalUrl,
          extractMode: "markdown",
        });
        if (readable?.text) {
          localStatus = "readability";
          localTitle = readable.title ?? "";
          localText = readable.text;
        } else {
          localStatus = "readability-empty";
        }
      }
    } catch (error) {
      localStatus = "error";
      localError = error instanceof Error ? error.message : String(error);
    }

    console.log(`local: ${localStatus} len=${localText.length} title=${truncate(localTitle, 80)}`);
    if (localError) console.log(`local error: ${localError}`);
    if (localText) console.log(`local sample: ${truncate(localText)}`);

    // --- Firecrawl path: skipped entirely without an API key ---
    if (apiKey) {
      try {
        const firecrawl = await fetchFirecrawlContent({
          url,
          extractMode: "markdown",
          apiKey,
          baseUrl,
          onlyMainContent: true,
          maxAgeMs: 172_800_000, // 48 hours expressed in milliseconds
          proxy: "auto",
          storeInCache: true,
          timeoutSeconds: 60,
        });
        console.log(
          `firecrawl: ok len=${firecrawl.text.length} title=${truncate(
            firecrawl.title ?? "",
            80,
          )} status=${firecrawl.status ?? "n/a"}`,
        );
        if (firecrawl.warning) console.log(`firecrawl warning: ${firecrawl.warning}`);
        if (firecrawl.text) console.log(`firecrawl sample: ${truncate(firecrawl.text)}`);
      } catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        console.log(`firecrawl: error ${message}`);
      }
    }
  }

  // Explicit exit — presumably so lingering handles (e.g. open sockets) cannot
  // keep the script alive after the report is done; confirm before removing.
  process.exit(0);
}
|
|
// Script entry point: start the comparison; any rejection that escapes `run`
// is printed to stderr and the process exits with status 1.
run().catch((failure: unknown) => {
  console.error(failure);
  process.exit(1);
});
|
|