/**
* PDF Loader - Loads PDFs and extracts content in chunks
*
* Demonstrates:
* - Chunked data loading with size limits
* - HTTP Range requests for streaming
* - Caching for repeated requests
*/
import fs from "node:fs/promises";
import type { PdfEntry, PdfBytesChunk } from "./types.js";
import { MAX_CHUNK_BYTES } from "./types.js";
import { isFileUrl } from "./pdf-indexer.js";
/**
 * In-memory store of PDF bytes that have already been loaded, keyed by the
 * URL of the entry they were loaded for.
 */
const pdfCache: Map<string, Uint8Array> = new Map();
// pdf.js module handle; stays unset until getPdfjs() has run once.
let pdfjs: typeof import("pdfjs-dist");

/** Import the pdf.js legacy build on first use and hand back the same module afterwards. */
async function getPdfjs() {
  if (pdfjs) return pdfjs;
  pdfjs = await import("pdfjs-dist/legacy/build/pdf.mjs");
  return pdfjs;
}
// ============================================================================
// PDF Data Loading
// ============================================================================
/**
 * Load the complete bytes of a PDF, serving repeat requests from `pdfCache`.
 *
 * URLs accepted by `isFileUrl` are read from disk; every other URL is
 * downloaded with `fetch`. The result is cached under `entry.url`.
 *
 * @param entry - PDF entry whose `url` identifies the document.
 * @returns The full contents of the PDF.
 * @throws Error when the HTTP response status is not OK. Errors raised while
 *   reading the file or performing the request propagate to the caller.
 */
export async function loadPdfData(entry: PdfEntry): Promise<Uint8Array> {
  const cached = pdfCache.get(entry.url);
  if (cached) return cached;

  console.error(`[loader] Fetching: ${entry.url}`);

  let data: Uint8Array;
  if (isFileUrl(entry.url)) {
    data = new Uint8Array(await readFileUrl(entry.url));
  } else {
    const response = await fetch(entry.url);
    if (!response.ok) {
      throw new Error(`Failed to fetch: ${response.status}`);
    }
    data = new Uint8Array(await response.arrayBuffer());
  }

  pdfCache.set(entry.url, data);
  return data;
}

/**
 * Read the file a `file://` URL points at.
 *
 * The URL is first handed to Node as a `URL` object so that percent-escapes
 * (e.g. `%20`) and platform-specific forms (e.g. `file:///C:/...`) are
 * resolved by the runtime rather than by string surgery. If that attempt
 * fails for any reason, the prefix-stripped string is tried as a plain path,
 * which keeps loosely formed inputs such as `file://relative/path.pdf`
 * working; the error from this second attempt is the one that propagates.
 */
async function readFileUrl(url: string): Promise<Uint8Array> {
  try {
    return await fs.readFile(new URL(url));
  } catch {
    return await fs.readFile(url.replace("file://", ""));
  }
}
/**
 * Request bytes `start`..`end` (both inclusive) of `url` via an HTTP Range
 * header.
 *
 * @param url - Resource to request.
 * @param start - First byte position to request.
 * @param end - Last byte position to request.
 * @returns The received bytes plus the total resource size announced in the
 *   `Content-Range` response header, or `null` when the partial response
 *   cannot be used: the status is not 206, the header is missing or carries
 *   no numeric total (e.g. `bytes 0-99/*`), or the request throws.
 */
async function fetchRange(
  url: string,
  start: number,
  end: number,
): Promise<{ data: Uint8Array; total: number } | null> {
  try {
    const res = await fetch(url, {
      headers: { Range: `bytes=${start}-${end}` },
    });

    // The total is the run of digits after the final "/" of Content-Range.
    // Without it the caller could not report totalBytes/hasMore correctly,
    // so such a response is treated the same as "ranges not supported".
    const totalMatch =
      res.status === 206
        ? /\/\s*(\d+)\s*$/.exec(res.headers.get("Content-Range") ?? "")
        : null;

    if (!totalMatch) {
      // Drop the unread body instead of leaving it pending on the connection.
      await res.body?.cancel();
      return null;
    }

    const total = Number.parseInt(totalMatch[1] ?? "", 10);
    return { data: new Uint8Array(await res.arrayBuffer()), total };
  } catch {
    return null;
  }
}
// ============================================================================
// Chunked Binary Loading (demonstrates size-limited responses)
// ============================================================================
/**
 * Return one base64-encoded slice of a PDF.
 *
 * For a non-file URL that is not in `pdfCache` yet, an HTTP Range request is
 * tried first so that only the requested bytes are transferred. When that
 * yields nothing usable, the whole PDF is loaded through `loadPdfData` (which
 * caches it) and the slice is taken from the full data.
 *
 * @param entry - PDF entry whose `url` identifies the document.
 * @param offset - Zero-based byte position at which the chunk starts.
 * @param byteCount - Maximum number of bytes to return; defaults to
 *   `MAX_CHUNK_BYTES`.
 * @returns The chunk, its actual length, the total size of the PDF and
 *   whether bytes remain after this chunk.
 * @throws RangeError when `offset` or `byteCount` is not a non-negative
 *   integer.
 */
export async function loadPdfBytesChunk(
  entry: PdfEntry,
  offset = 0,
  byteCount = MAX_CHUNK_BYTES,
): Promise<PdfBytesChunk> {
  // A negative offset would slice from the END of the data and a negative
  // count would build a malformed Range header; reject both up front.
  if (!Number.isSafeInteger(offset) || offset < 0) {
    throw new RangeError(`offset must be a non-negative integer: ${offset}`);
  }
  if (!Number.isSafeInteger(byteCount) || byteCount < 0) {
    throw new RangeError(
      `byteCount must be a non-negative integer: ${byteCount}`,
    );
  }

  // Try a Range request first (streaming without full download). Skipped for
  // file URLs, for already cached PDFs, and for empty requests, which cannot
  // be expressed as a byte range.
  const canUseRange =
    byteCount > 0 && !isFileUrl(entry.url) && !pdfCache.has(entry.url);
  if (canUseRange) {
    const range = await fetchRange(entry.url, offset, offset + byteCount - 1);
    if (range) {
      return toPdfBytesChunk(entry.url, range.data, offset, range.total);
    }
  }

  // Fallback: load full PDF and slice. subarray() is a view, so the only copy
  // made is the one inside the base64 conversion.
  const data = await loadPdfData(entry);
  const chunk = data.subarray(offset, offset + byteCount);
  return toPdfBytesChunk(entry.url, chunk, offset, data.length);
}

/** Build the chunk result for `bytes` located at `offset` within a PDF of `totalBytes` bytes. */
function toPdfBytesChunk(
  url: string,
  bytes: Uint8Array,
  offset: number,
  totalBytes: number,
): PdfBytesChunk {
  return {
    url,
    bytes: Buffer.from(bytes).toString("base64"),
    offset,
    byteCount: bytes.length,
    totalBytes,
    hasMore: offset + bytes.length < totalBytes,
  };
}
// ============================================================================
// Metadata Extraction
// ============================================================================
/**
 * Fill in `entry.metadata` by loading the PDF and parsing it with pdf.js.
 *
 * Sets `fileSizeBytes` and `pageCount`, plus `title` and `author` when the
 * document's info dictionary has truthy `Title` / `Author` values.
 *
 * Best-effort: any failure is logged to stderr and swallowed, so fields set
 * before the failure remain and the rest stay untouched.
 *
 * @param entry - PDF entry to update in place.
 */
export async function populatePdfMetadata(entry: PdfEntry): Promise<void> {
  try {
    const lib = await getPdfjs();
    const data = await loadPdfData(entry);
    entry.metadata.fileSizeBytes = data.length;

    // pdf.js receives a copy of the bytes.
    // NOTE(review): presumably so pdf.js cannot take over or detach the
    // buffer that stays in the cache — confirm before removing the copy.
    const loadingTask = lib.getDocument({ data: new Uint8Array(data) });
    try {
      const pdf = await loadingTask.promise;
      entry.metadata.pageCount = pdf.numPages;

      const info = (await pdf.getMetadata()).info as
        | Record<string, unknown>
        | undefined;
      if (info?.Title) entry.metadata.title = String(info.Title);
      if (info?.Author) entry.metadata.author = String(info.Author);
    } finally {
      // Always release pdf.js resources, including when loading or metadata
      // extraction throws. Destroying the loading task also covers the
      // document it produced.
      await loadingTask.destroy();
    }
  } catch (err) {
    console.error(`[loader] Metadata error: ${err}`);
  }
}
|