File size: 4,039 Bytes
e1cc3bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
/**
 * PDF Loader - Loads PDFs and extracts content in chunks
 *
 * Demonstrates:
 * - Chunked data loading with size limits
 * - HTTP Range requests for streaming
 * - Caching for repeated requests
 */
import fs from "node:fs/promises";
import { fileURLToPath } from "node:url";
import type { PdfEntry, PdfBytesChunk } from "./types.js";
import { MAX_CHUNK_BYTES } from "./types.js";
import { isFileUrl } from "./pdf-indexer.js";

// In-memory cache of complete PDF contents, keyed by the entry URL.
// loadPdfData() stores the bytes here after a full read/download, and
// loadPdfBytesChunk() consults it to decide whether a Range request is worth
// attempting. Nothing in this part of the file removes entries.
const pdfCache = new Map<string, Uint8Array>();

// pdf.js module handle; stays unset until the first getPdfjs() call
let pdfjs: typeof import("pdfjs-dist");

/** Return the pdf.js legacy build, importing it only the first time it is needed. */
async function getPdfjs() {
  if (pdfjs) return pdfjs;

  pdfjs = await import("pdfjs-dist/legacy/build/pdf.mjs");
  return pdfjs;
}

// ============================================================================
// PDF Data Loading
// ============================================================================

/**
 * Fetch the complete bytes of a PDF, reusing the in-memory cache.
 *
 * `file://` URLs are read from disk; every other URL is downloaded with
 * `fetch`. The result is stored in `pdfCache` under `entry.url`, so a later
 * call for the same URL returns the cached array without any I/O.
 *
 * @param entry - PDF descriptor; only `entry.url` is read here.
 * @returns The full file contents.
 * @throws Error when the HTTP response status is not OK. File-system and
 *   network errors propagate unchanged.
 */
export async function loadPdfData(entry: PdfEntry): Promise<Uint8Array> {
  const cached = pdfCache.get(entry.url);
  if (cached) return cached;

  console.error(`[loader] Fetching: ${entry.url}`);

  let data: Uint8Array;
  if (isFileUrl(entry.url)) {
    data = new Uint8Array(await fs.readFile(localPathFromFileUrl(entry.url)));
  } else {
    const response = await fetch(entry.url);
    if (!response.ok) {
      throw new Error(`Failed to fetch: ${response.status}`);
    }
    data = new Uint8Array(await response.arrayBuffer());
  }

  pdfCache.set(entry.url, data);
  return data;
}

/**
 * Convert a `file://` URL into a file-system path.
 *
 * Uses `fileURLToPath` so percent-escapes (`%20`) and Windows drive letters
 * are handled correctly. If the string is not a well-formed file URL, falls
 * back to simply stripping the `file://` prefix, which is what this loader
 * did before.
 */
function localPathFromFileUrl(url: string): string {
  try {
    return fileURLToPath(url);
  } catch {
    return url.replace("file://", "");
  }
}

/**
 * Try an HTTP Range request for bytes `start`..`end` (both inclusive) of `url`.
 *
 * @param url - Resource to request.
 * @param start - Offset of the first byte wanted.
 * @param end - Offset of the last byte wanted.
 * @returns The received bytes together with the total resource size reported
 *   in the `Content-Range` header, or `null` when the partial response cannot
 *   be used: the request failed, the status is not `206`, the header is
 *   missing or malformed, the total size is unknown (`bytes 0-99/*`), or the
 *   returned range does not begin at `start`.
 */
async function fetchRange(
  url: string,
  start: number,
  end: number,
): Promise<{ data: Uint8Array; total: number } | null> {
  try {
    const res = await fetch(url, {
      headers: { Range: `bytes=${start}-${end}` },
    });

    // Expected shape: "bytes <first>-<last>/<total>"
    const contentRange = res.headers.get("Content-Range")?.trim() ?? "";
    const match = /^bytes\s+(\d+)-\d+\/(\d+)$/i.exec(contentRange);

    const usable =
      res.status === 206 && match !== null && Number(match[1]) === start;
    if (!usable || match === null) {
      // Discard the unread body so the connection is released promptly
      // (a server that ignores Range answers 200 with the whole file).
      await res.body?.cancel();
      return null;
    }

    const total = Number(match[2]);
    return { data: new Uint8Array(await res.arrayBuffer()), total };
  } catch {
    // Best effort only: any failure means "no partial content available".
    return null;
  }
}

// ============================================================================
// Chunked Binary Loading (demonstrates size-limited responses)
// ============================================================================

/**
 * Return one base64-encoded slice of a PDF.
 *
 * For remote URLs that are not cached yet, an HTTP Range request is tried
 * first so the whole file need not be downloaded. If that is not possible,
 * the full PDF is loaded (and cached) via `loadPdfData` and sliced in memory.
 *
 * @param entry - PDF descriptor; only `entry.url` is read here.
 * @param offset - Index of the first byte to return (default 0).
 * @param byteCount - Maximum number of bytes to return (default
 *   `MAX_CHUNK_BYTES`). Fewer bytes come back near the end of the file.
 * @returns The chunk, its actual length, the total size, and whether bytes
 *   remain after it.
 * @throws RangeError when `offset` or `byteCount` is negative or not an integer.
 */
export async function loadPdfBytesChunk(
  entry: PdfEntry,
  offset = 0,
  byteCount = MAX_CHUNK_BYTES,
): Promise<PdfBytesChunk> {
  // A negative or fractional value would yield a malformed Range header, or
  // make `slice` count from the end while still reporting the raw offset.
  if (!Number.isSafeInteger(offset) || offset < 0) {
    throw new RangeError(
      `offset must be a non-negative integer, got ${offset}`,
    );
  }
  if (!Number.isSafeInteger(byteCount) || byteCount < 0) {
    throw new RangeError(
      `byteCount must be a non-negative integer, got ${byteCount}`,
    );
  }

  // Try a Range request first (streaming without full download). Skipped when:
  //  - the bytes are already cached,
  //  - the URL is a local file (Node's fetch does not serve file: URLs, so the
  //    attempt could only fail before falling through),
  //  - nothing was asked for (an empty range cannot be expressed).
  const tryRange =
    byteCount > 0 && !isFileUrl(entry.url) && !pdfCache.has(entry.url);
  if (tryRange) {
    const range = await fetchRange(entry.url, offset, offset + byteCount - 1);
    if (range) {
      return toBytesChunk(entry.url, range.data, offset, range.total);
    }
  }

  // Fallback: load full PDF and slice
  const data = await loadPdfData(entry);
  return toBytesChunk(
    entry.url,
    data.slice(offset, offset + byteCount),
    offset,
    data.length,
  );
}

/** Package raw bytes starting at `offset` of a `totalBytes`-long file as a chunk. */
function toBytesChunk(
  url: string,
  bytes: Uint8Array,
  offset: number,
  totalBytes: number,
): PdfBytesChunk {
  return {
    url,
    bytes: Buffer.from(bytes).toString("base64"),
    offset,
    byteCount: bytes.length,
    totalBytes,
    hasMore: offset + bytes.length < totalBytes,
  };
}

// ============================================================================
// Metadata Extraction
// ============================================================================

/**
 * Fill `entry.metadata` with the file size, page count and, when present, the
 * document title and author.
 *
 * Best effort: any failure (download, parse, metadata read) is logged to
 * stderr and swallowed, leaving whatever fields were already set.
 *
 * @param entry - PDF descriptor whose `metadata` object is mutated in place.
 */
export async function populatePdfMetadata(entry: PdfEntry): Promise<void> {
  try {
    const lib = await getPdfjs();
    const data = await loadPdfData(entry);

    entry.metadata.fileSizeBytes = data.length;

    // pdf.js receives a copy rather than the cached array itself
    // (presumably because it may take ownership of the buffer it is given,
    // which would corrupt the cache; confirm against the pdf.js docs).
    const loadingTask = lib.getDocument({ data: new Uint8Array(data) });
    try {
      const pdf = await loadingTask.promise;
      entry.metadata.pageCount = pdf.numPages;

      const info = (await pdf.getMetadata()).info as
        | Record<string, unknown>
        | undefined;
      if (info?.Title) entry.metadata.title = String(info.Title);
      if (info?.Author) entry.metadata.author = String(info.Author);
    } finally {
      // Release the document even when loading or getMetadata() throws.
      await loadingTask.destroy();
    }
  } catch (err) {
    console.error(`[loader] Metadata error: ${err}`);
  }
}