Spaces:
Sleeping
Sleeping
/**
 * Tests for the PDF parser.
 */
| import { extractPDFText, chunkPDFText } from './pdf'; | |
// Test suite for the PDF parser module. Only `chunkPDFText` is exercised here;
// it is called as `chunkPDFText(text, maxWords)` and its result is destructured
// as `{ chunks, count }`.
describe('PDF Parser', () => {
  describe('chunkPDFText', () => {
    /**
     * Builds a string made of `n` copies of "word" separated by single spaces.
     *
     * @param {number} n - Number of words to generate.
     * @returns {string} Space-separated text containing `n` words.
     */
    const makeWords = (n) => Array.from({ length: n }, () => 'word').join(' ');

    it('should chunk text by word count', () => {
      const text = makeWords(1000);

      const { chunks, count } = chunkPDFText(text, 500);

      // 1000 words at 500 per chunk would be 2 chunks under an exact split;
      // only a non-empty result is asserted here.
      expect(count).toBeGreaterThanOrEqual(1);
      expect(chunks.length).toBeGreaterThanOrEqual(1);
    });

    it('should handle small texts', () => {
      const text = 'a b c d e';

      const { chunks, count } = chunkPDFText(text, 500);

      // Input is far below the 500-word limit, so it must stay in one chunk.
      expect(count).toBe(1);
      expect(chunks[0]).toContain('a b c d e');
    });

    it('should handle empty texts', () => {
      const text = '';

      const { chunks, count } = chunkPDFText(text, 500);

      expect(count).toBe(0);
      expect(chunks.length).toBe(0);
    });

    it('should handle whitespace-only texts', () => {
      const text = ' \n\n ';

      const { chunks, count } = chunkPDFText(text, 500);

      // Whitespace contains no words, so no chunks are expected.
      expect(count).toBe(0);
      expect(chunks.length).toBe(0);
    });

    it('should respect max word count per chunk', () => {
      const maxWords = 20;
      const text = makeWords(100);

      const { chunks } = chunkPDFText(text, maxWords);

      // Guard against a vacuous pass: with no chunks the loop below would
      // assert nothing and the test would succeed without checking the limit.
      expect(chunks.length).toBeGreaterThan(0);

      for (const chunk of chunks) {
        // Count non-empty whitespace-separated tokens in this chunk.
        const wordCount = chunk.split(/\s+/).filter((w) => w.length > 0).length;
        expect(wordCount).toBeLessThanOrEqual(maxWords);
      }
    });
  });
});