| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | import fs from "node:fs/promises"; |
| | import type { PdfEntry, PdfBytesChunk } from "./types.js"; |
| | import { MAX_CHUNK_BYTES } from "./types.js"; |
| | import { isFileUrl } from "./pdf-indexer.js"; |
| |
|
| | |
/** In-memory store of PDF bytes keyed by the entry URL they were loaded from. */
const pdfCache: Map<string, Uint8Array> = new Map();
| |
|
| | |
/** pdf.js module namespace; stays unset until the first `getPdfjs()` call completes. */
let pdfjs: typeof import("pdfjs-dist");

/**
 * Returns the pdf.js module, dynamically importing the legacy build the first
 * time it is needed and reusing the stored module on later calls.
 */
async function getPdfjs() {
  if (pdfjs) return pdfjs;

  pdfjs = await import("pdfjs-dist/legacy/build/pdf.mjs");
  return pdfjs;
}
| |
|
| | |
| | |
| | |
| |
|
| | |
/**
 * Returns the complete bytes of the PDF referenced by `entry`.
 *
 * A successful load is stored in `pdfCache` under the entry URL, and later
 * calls for the same URL return that stored array without touching disk or
 * network.
 *
 * @param entry - PDF entry whose `url` is either a file URL (as decided by
 *   `isFileUrl`) or something `fetch` can retrieve.
 * @returns The full document contents.
 * @throws Error when the HTTP response is not OK; file-system and network
 *   errors propagate unchanged.
 */
export async function loadPdfData(entry: PdfEntry): Promise<Uint8Array> {
  const hit = pdfCache.get(entry.url);
  if (hit) return hit;

  console.error(`[loader] Fetching: ${entry.url}`);

  let bytes: Uint8Array;
  if (!isFileUrl(entry.url)) {
    // Remote document: download the whole body in one request.
    const res = await fetch(entry.url);
    if (!res.ok) {
      throw new Error(`Failed to fetch: ${res.status}`);
    }
    bytes = new Uint8Array(await res.arrayBuffer());
  } else {
    // Local document: drop the scheme prefix to obtain the file-system path.
    const localPath = entry.url.replace("file://", "");
    const contents = await fs.readFile(localPath);
    bytes = new Uint8Array(contents);
  }

  pdfCache.set(entry.url, bytes);
  return bytes;
}
| |
|
| | |
/**
 * Requests the inclusive byte range `start`–`end` of `url` using an HTTP
 * `Range` header.
 *
 * The result is only returned when the server answers `206 Partial Content`
 * AND reports a concrete total size in `Content-Range`
 * (`bytes <start>-<end>/<total>`). If the status differs, the header is
 * missing, or the size is unknown (`/*`) or malformed, `null` is returned so
 * the caller can fall back to another strategy instead of working with a
 * bogus total of `0` or `NaN`.
 *
 * @param url - Resource to request.
 * @param start - Offset of the first requested byte (inclusive).
 * @param end - Offset of the last requested byte (inclusive).
 * @returns The received bytes and the total resource size, or `null` when a
 *   usable partial response could not be obtained (including when the request
 *   itself throws).
 */
async function fetchRange(
  url: string,
  start: number,
  end: number,
): Promise<{ data: Uint8Array; total: number } | null> {
  try {
    const res = await fetch(url, {
      headers: { Range: `bytes=${start}-${end}` },
    });

    // The total size is the part after "/" in "bytes 0-99/1234".
    const sizeField = res.headers.get("Content-Range")?.split("/")[1]?.trim();
    const total =
      sizeField !== undefined && /^\d+$/.test(sizeField)
        ? Number.parseInt(sizeField, 10)
        : Number.NaN;

    if (res.status !== 206 || !Number.isSafeInteger(total) || total <= 0) {
      // Release the unused body (possibly the entire file when the server
      // ignored the Range header) instead of leaving it pending.
      await res.body?.cancel();
      return null;
    }

    return { data: new Uint8Array(await res.arrayBuffer()), total };
  } catch {
    // Best effort by design: any failure just means "no ranged result".
    return null;
  }
}
| |
|
| | |
| | |
| | |
| |
|
/**
 * Returns one base64-encoded slice of the PDF referenced by `entry`.
 *
 * When the document is not yet in `pdfCache`, a ranged request via
 * `fetchRange` is tried first; if that yields nothing, the whole document is
 * obtained through `loadPdfData` and sliced locally.
 *
 * @param entry - PDF entry to read from.
 * @param offset - Offset of the first byte to return (defaults to 0).
 * @param byteCount - Maximum number of bytes to return (defaults to
 *   `MAX_CHUNK_BYTES`).
 * @returns The chunk descriptor: encoded bytes, the offset, the number of
 *   bytes actually returned, the total size, and whether more data follows.
 */
export async function loadPdfBytesChunk(
  entry: PdfEntry,
  offset = 0,
  byteCount = MAX_CHUNK_BYTES,
): Promise<PdfBytesChunk> {
  // Builds the result object for a given slice and known total size.
  const describe = (slice: Uint8Array, totalBytes: number): PdfBytesChunk => ({
    url: entry.url,
    bytes: Buffer.from(slice).toString("base64"),
    offset,
    byteCount: slice.length,
    totalBytes,
    hasMore: offset + slice.length < totalBytes,
  });

  // Only try a ranged request while the full document is not cached.
  if (!pdfCache.has(entry.url)) {
    const lastByte = offset + byteCount - 1; // Range end is inclusive.
    const ranged = await fetchRange(entry.url, offset, lastByte);
    if (ranged) return describe(ranged.data, ranged.total);
  }

  // Fallback: work from the complete document.
  const whole = await loadPdfData(entry);
  return describe(whole.slice(offset, offset + byteCount), whole.length);
}
| |
|
| | |
| | |
| | |
| |
|
/**
 * Fills `entry.metadata` with the file size, page count and, when present,
 * the title and author of the PDF referenced by `entry`.
 *
 * This is best effort: any failure is logged with `console.error` and
 * swallowed, leaving whatever fields were already written in place.
 *
 * @param entry - PDF entry whose `metadata` object is mutated in place.
 */
export async function populatePdfMetadata(entry: PdfEntry): Promise<void> {
  try {
    const lib = await getPdfjs();
    const data = await loadPdfData(entry);

    entry.metadata.fileSizeBytes = data.length;

    // pdf.js receives a copy of the bytes rather than the array returned by
    // loadPdfData. NOTE(review): presumably because pdf.js may take ownership
    // of its input buffer and the cached array must stay usable — confirm.
    const loadingTask = lib.getDocument({ data: new Uint8Array(data) });
    try {
      const pdf = await loadingTask.promise;
      entry.metadata.pageCount = pdf.numPages;

      const info = (await pdf.getMetadata()).info as
        | Record<string, unknown>
        | undefined;
      if (info?.Title) entry.metadata.title = String(info.Title);
      if (info?.Author) entry.metadata.author = String(info.Author);
    } finally {
      // Always release the document resources, even when loading or
      // metadata extraction fails part-way.
      await loadingTask.destroy();
    }
  } catch (err) {
    console.error(`[loader] Metadata error: ${err}`);
  }
}
| |
|