File size: 1,498 Bytes
9a43362
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
/**
 * Tests for PDF parser
 */

import { extractPDFText, chunkPDFText } from './pdf';

describe('PDF Parser', () => {
  // Exercises chunkPDFText(text, maxWords). As used below, the call yields an
  // object with `chunks` (a list of strings) and `count`.
  describe('chunkPDFText', () => {
    it('should chunk text by word count', () => {
      const words = Array(1000).fill('word');
      const text = words.join(' ');
      const { chunks, count } = chunkPDFText(text, 500);

      // 1000 words with at most 500 words per chunk cannot fit in fewer than
      // 2 chunks. A lower bound (not an exact match) is asserted so the test
      // stays valid if the chunker splits more finely or overlaps chunks.
      expect(count).toBeGreaterThanOrEqual(2);
      expect(chunks.length).toBeGreaterThanOrEqual(2);
    });

    it('should handle small texts', () => {
      // Far fewer words than the limit: everything is expected in one chunk.
      const text = 'a b c d e';
      const { chunks, count } = chunkPDFText(text, 500);

      expect(count).toBe(1);
      expect(chunks[0]).toContain('a b c d e');
    });

    it('should handle empty texts', () => {
      const text = '';
      const { chunks, count } = chunkPDFText(text, 500);

      // No input means no chunks, not a single empty chunk.
      expect(count).toBe(0);
      expect(chunks.length).toBe(0);
    });

    it('should handle whitespace-only texts', () => {
      const text = '   \n\n  ';
      const { chunks, count } = chunkPDFText(text, 500);

      // Whitespace carries no words, so it is expected to behave like ''.
      expect(count).toBe(0);
      expect(chunks.length).toBe(0);
    });

    it('should respect max word count per chunk', () => {
      const maxWords = 20;
      const text = Array(100).fill('word').join(' ');
      const { chunks } = chunkPDFText(text, maxWords);

      // Guard against a vacuous pass: with an empty result the loop below
      // would execute zero assertions and the test would still be green.
      expect(chunks.length).toBeGreaterThan(0);

      chunks.forEach((chunk) => {
        // Count non-empty whitespace-separated tokens so leading/trailing
        // whitespace in a chunk does not inflate the word count.
        const wordCount = chunk.split(/\s+/).filter((w) => w.length > 0).length;
        expect(wordCount).toBeLessThanOrEqual(maxWords);
      });
    });
  });
});