File size: 2,837 Bytes
9a43362
 
 
 
 
ef4c2aa
9a43362
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ef4c2aa
 
9a43362
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
/**
 * PDF Parser
 * Extracts text from PDF files
 */

import pdfParse from 'pdf-parse';

/**
 * Result of a PDF extraction attempt.
 *
 * `success` tells callers which of the optional fields to expect: the
 * extraction functions in this file populate `text`, `title` and `pageCount`
 * on success, and only `error` on failure.
 */
export interface PDFContent {
  /** True when text was extracted; false when parsing or reading failed. */
  success: boolean;
  /** Cleaned text of the document (extractPDFText caps this at 10000 characters). */
  text?: string;
  /** Title guessed from the extracted text, or the fallback 'PDF Document'. */
  title?: string;
  /** Page count as reported by the PDF parsing library. */
  pageCount?: number;
  /** Human-readable failure reason; set when `success` is false. */
  error?: string;
}

/**
 * Extract text from PDF buffer
 *
 * The raw text returned by `pdfParse` is cleaned (every line trimmed, blank
 * lines dropped, remaining lines separated by an empty line) and then capped
 * at 10000 UTF-16 code units.
 *
 * @param buffer - Raw bytes of the PDF document.
 * @returns On success: `success: true` with the cleaned (possibly truncated)
 *   text, the parser's page count and a title guessed from the untruncated
 *   cleaned text. On failure: `success: false` with an `error` message. This
 *   function does not reject; parser errors are logged and reported through
 *   the returned object.
 */
export async function extractPDFText(buffer: Buffer): Promise<PDFContent> {
  // Upper bound, in UTF-16 code units, on the text returned to the caller.
  const maxTextLength = 10000;

  try {
    const data = await pdfParse(buffer);

    // Clean up text: remove extra whitespace, join lines
    const cleanText = (data.text || '')
      .split('\n')
      .map((line: string) => line.trim())
      .filter((line: string) => line.length > 0)
      .join('\n\n');

    // Check emptiness AFTER cleaning so that a PDF yielding only whitespace
    // is reported as having no text instead of "succeeding" with ''.
    if (cleanText.length === 0) {
      return {
        success: false,
        error: 'No text content found in PDF',
      };
    }

    // If text is too long, truncate it. Back off by one code unit when the
    // cut would land between the two halves of a surrogate pair, so the
    // result never ends in a lone high surrogate.
    let finalText = cleanText;
    if (cleanText.length > maxTextLength) {
      let end = maxTextLength;
      const lastUnit = cleanText.charCodeAt(end - 1);
      if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        end -= 1;
      }
      finalText = cleanText.substring(0, end);
    }

    return {
      success: true,
      text: finalText,
      pageCount: data.numpages,
      title: extractTitle(cleanText),
    };
  } catch (error) {
    console.error('[pdf parser] extraction failed:', error);
    return {
      success: false,
      error: error instanceof Error ? error.message : 'Failed to parse PDF',
    };
  }
}

/**
 * Read a PDF from disk and extract its text (Node.js only).
 *
 * The `fs` module is loaded lazily inside the function. Any failure while
 * loading it or reading the file is logged and returned as a failed
 * `PDFContent` rather than thrown.
 *
 * @param filePath - Path of the PDF file to read.
 * @returns The result of `extractPDFText` for the file's bytes, or a failure
 *   object carrying the read error's message.
 */
export async function extractPDFFromFile(filePath: string): Promise<PDFContent> {
  try {
    const { promises: fsPromises } = await import('fs');
    const pdfBytes = await fsPromises.readFile(filePath);
    return extractPDFText(pdfBytes);
  } catch (cause) {
    const message = cause instanceof Error ? cause.message : 'Failed to read PDF file';
    console.error('[pdf parser] file read failed:', cause);
    return { success: false, error: message };
  }
}

/**
 * Guess a document title from extracted PDF text.
 *
 * Returns the first line that, once trimmed, is longer than 5 and shorter
 * than 200 characters and does not start with 'http'. Falls back to
 * 'PDF Document' when no line qualifies.
 */
function extractTitle(text: string): string {
  const looksLikeTitle = (candidate: string): boolean => {
    if (candidate.startsWith('http')) {
      return false;
    }
    return candidate.length > 5 && candidate.length < 200;
  };

  const firstMatch = text
    .split('\n')
    .map((rawLine) => rawLine.trim())
    .find(looksLikeTitle);

  return firstMatch === undefined ? 'PDF Document' : firstMatch;
}

/**
 * Split text into chunks of whitespace-separated words.
 *
 * Words are accumulated in order; a chunk is emitted as soon as it holds at
 * least `maxWords` words, and any leftover words form a final shorter chunk.
 * Words inside a chunk are joined with a single space.
 *
 * @param text - Text to split; empty or whitespace-only input yields no chunks.
 * @param maxWords - Word count at which a chunk is closed (default 500).
 * @returns The chunks in order, together with how many there are.
 */
export function chunkPDFText(
  text: string,
  maxWords: number = 500
): { chunks: string[]; count: number } {
  const chunks: string[] = [];

  if (!text || text.trim() === '') {
    return { chunks, count: 0 };
  }

  let pending: string[] = [];

  // Emit the words gathered so far as one chunk and start a new one.
  const flush = (): void => {
    chunks.push(pending.join(' '));
    pending = [];
  };

  text
    .split(/\s+/)
    .filter((token) => token !== '')
    .forEach((token) => {
      pending.push(token);
      if (pending.length >= maxWords) {
        flush();
      }
    });

  // Whatever is left over becomes the last (shorter) chunk.
  if (pending.length !== 0) {
    flush();
  }

  return { chunks, count: chunks.length };
}