/**
 * Tests for PDF parser
 */
import { extractPDFText, chunkPDFText } from './pdf';

describe('PDF Parser', () => {
  describe('chunkPDFText', () => {
    /**
     * Builds a string of `n` copies of "word" separated by single spaces.
     *
     * @param {number} n - Number of words to generate.
     * @returns {string} Space-separated text containing exactly `n` words.
     */
    const makeWords = (n) => Array.from({ length: n }, () => 'word').join(' ');

    /**
     * Counts the whitespace-separated, non-empty tokens in a chunk.
     *
     * @param {string} chunk - Chunk text to measure.
     * @returns {number} Number of words in the chunk.
     */
    const countWords = (chunk) =>
      chunk.split(/\s+/).filter((w) => w.length > 0).length;

    it('should chunk text by word count', () => {
      const maxWords = 500;
      const text = makeWords(1000);

      const { chunks, count } = chunkPDFText(text, maxWords);

      // 1000 words cannot fit in a single 500-word chunk, so at least two
      // chunks are required. A lower bound (rather than an exact count) is
      // asserted because the splitting strategy is not pinned down here.
      expect(count).toBeGreaterThanOrEqual(2);
      expect(chunks.length).toBeGreaterThanOrEqual(2);

      // Every produced chunk must honour the requested limit; otherwise a
      // chunker that returns the whole text untouched would still pass.
      chunks.forEach((chunk) => {
        expect(countWords(chunk)).toBeLessThanOrEqual(maxWords);
      });
    });

    it('should handle small texts', () => {
      const text = 'a b c d e';

      const { chunks, count } = chunkPDFText(text, 500);

      expect(count).toBe(1);
      expect(chunks[0]).toContain('a b c d e');
    });

    it('should handle empty texts', () => {
      const text = '';

      const { chunks, count } = chunkPDFText(text, 500);

      expect(count).toBe(0);
      expect(chunks).toHaveLength(0);
    });

    it('should handle whitespace-only texts', () => {
      const text = ' \n\n ';

      const { chunks, count } = chunkPDFText(text, 500);

      expect(count).toBe(0);
      expect(chunks).toHaveLength(0);
    });

    it('should respect max word count per chunk', () => {
      const maxWords = 20;
      const text = makeWords(100);

      const { chunks } = chunkPDFText(text, maxWords);

      // Guard against a vacuous pass: iterating an empty array would run no
      // assertions at all.
      expect(chunks.length).toBeGreaterThan(0);

      chunks.forEach((chunk) => {
        expect(countWords(chunk)).toBeLessThanOrEqual(maxWords);
      });
    });
  });
});