""" Tests for file parser tool Author: @mangubee Date: 2026-01-02 Tests cover: - PDF parsing - Excel parsing - Word document parsing - Text/CSV parsing - Retry logic - Error handling """ import pytest from pathlib import Path from unittest.mock import Mock, patch, MagicMock from src.tools.file_parser import ( parse_pdf, parse_excel, parse_word, parse_text, parse_file, ) # ============================================================================ # Test Fixtures # ============================================================================ FIXTURES_DIR = Path(__file__).parent / "fixtures" @pytest.fixture def sample_text_file(): """Path to sample text file""" return str(FIXTURES_DIR / "sample.txt") @pytest.fixture def sample_csv_file(): """Path to sample CSV file""" return str(FIXTURES_DIR / "sample.csv") @pytest.fixture def sample_excel_file(): """Path to sample Excel file""" return str(FIXTURES_DIR / "sample.xlsx") @pytest.fixture def sample_word_file(): """Path to sample Word file""" return str(FIXTURES_DIR / "sample.docx") @pytest.fixture def mock_pdf_reader(): """Mock PyPDF2 PdfReader""" mock_page_1 = Mock() mock_page_1.extract_text.return_value = "Test PDF page 1 content" mock_page_2 = Mock() mock_page_2.extract_text.return_value = "Test PDF page 2 content" mock_reader = Mock() mock_reader.pages = [mock_page_1, mock_page_2] return mock_reader # ============================================================================ # PDF Parser Tests # ============================================================================ def test_parse_pdf_success(mock_pdf_reader): """Test successful PDF parsing""" with patch('PyPDF2.PdfReader') as mock_reader_class: with patch('src.tools.file_parser.Path') as mock_path_class: # Mock file exists mock_path = Mock() mock_path.exists.return_value = True mock_path_class.return_value = mock_path # Mock PdfReader mock_reader_class.return_value = mock_pdf_reader result = parse_pdf("test.pdf") assert result["file_type"] == "PDF" assert result["pages"] == 2 assert "page 1 content" in result["content"].lower() assert "page 2 content" in result["content"].lower() def test_parse_pdf_file_not_found(): """Test PDF parsing with missing file""" with patch('src.tools.file_parser.Path') as mock_path_class: mock_path = Mock() mock_path.exists.return_value = False mock_path_class.return_value = mock_path with pytest.raises(FileNotFoundError): parse_pdf("nonexistent.pdf") def test_parse_pdf_io_error_retry(): """Test PDF parsing with IO error triggers retry""" with patch('PyPDF2.PdfReader') as mock_reader_class: with patch('src.tools.file_parser.Path') as mock_path_class: # Mock file exists mock_path = Mock() mock_path.exists.return_value = True mock_path_class.return_value = mock_path # Mock IO error mock_reader_class.side_effect = IOError("Disk error") with pytest.raises(IOError): parse_pdf("test.pdf") # Verify retry happened (should be called MAX_RETRIES times) assert mock_reader_class.call_count == 3 # ============================================================================ # Excel Parser Tests # ============================================================================ def test_parse_excel_success(sample_excel_file): """Test successful Excel parsing with real file""" result = parse_excel(sample_excel_file) assert result["file_type"] == "Excel" assert len(result["sheets"]) == 2 assert "Data" in result["sheets"] assert "Summary" in result["sheets"] assert "Apple" in result["content"] assert "Banana" in result["content"] def test_parse_excel_file_not_found(): """Test Excel parsing with missing file""" with pytest.raises(FileNotFoundError): parse_excel("nonexistent.xlsx") def test_parse_excel_io_error_retry(): """Test Excel parsing with IO error triggers retry""" with patch('openpyxl.load_workbook') as mock_load: with patch('src.tools.file_parser.Path') as mock_path_class: # Mock file exists mock_path = Mock() mock_path.exists.return_value = True mock_path_class.return_value = mock_path # Mock IO error mock_load.side_effect = IOError("Disk error") with pytest.raises(IOError): parse_excel("test.xlsx") # Verify retry happened assert mock_load.call_count == 3 # ============================================================================ # Word Document Parser Tests # ============================================================================ def test_parse_word_success(sample_word_file): """Test successful Word document parsing with real file""" result = parse_word(sample_word_file) assert result["file_type"] == "Word" assert result["paragraphs"] > 0 assert "Test Word Document" in result["content"] assert "first paragraph" in result["content"] def test_parse_word_file_not_found(): """Test Word parsing with missing file""" with pytest.raises(FileNotFoundError): parse_word("nonexistent.docx") def test_parse_word_io_error_retry(): """Test Word parsing with IO error triggers retry""" with patch('docx.Document') as mock_doc_class: with patch('src.tools.file_parser.Path') as mock_path_class: # Mock file exists mock_path = Mock() mock_path.exists.return_value = True mock_path_class.return_value = mock_path # Mock IO error mock_doc_class.side_effect = IOError("Disk error") with pytest.raises(IOError): parse_word("test.docx") # Verify retry happened assert mock_doc_class.call_count == 3 # ============================================================================ # Text/CSV Parser Tests # ============================================================================ def test_parse_text_success(sample_text_file): """Test successful text file parsing with real file""" result = parse_text(sample_text_file) assert result["file_type"] == "Text" assert result["lines"] > 0 assert "test text file" in result["content"].lower() def test_parse_csv_success(sample_csv_file): """Test successful CSV file parsing with real file""" result = parse_text(sample_csv_file) assert result["file_type"] == "CSV" assert result["lines"] > 0 assert "Name,Age,City" in result["content"] assert "Alice" in result["content"] def test_parse_text_file_not_found(): """Test text parsing with missing file""" with pytest.raises(FileNotFoundError): parse_text("nonexistent.txt") def test_parse_text_io_error_retry(): """Test text parsing with IO error triggers retry""" with patch('builtins.open') as mock_open: with patch('src.tools.file_parser.Path') as mock_path_class: # Mock file exists mock_path = Mock() mock_path.exists.return_value = True mock_path.suffix = '.txt' mock_path_class.return_value = mock_path # Mock IO error mock_open.side_effect = IOError("Disk error") with pytest.raises(IOError): parse_text("test.txt") # Verify retry happened assert mock_open.call_count == 3 # ============================================================================ # Unified Parser Tests # ============================================================================ def test_parse_file_pdf(): """Test unified parser dispatches to PDF parser""" with patch('src.tools.file_parser.parse_pdf') as mock_parse_pdf: mock_parse_pdf.return_value = {"file_type": "PDF"} result = parse_file("test.pdf") assert result["file_type"] == "PDF" mock_parse_pdf.assert_called_once() def test_parse_file_excel(): """Test unified parser dispatches to Excel parser""" with patch('src.tools.file_parser.parse_excel') as mock_parse_excel: mock_parse_excel.return_value = {"file_type": "Excel"} result = parse_file("test.xlsx") assert result["file_type"] == "Excel" mock_parse_excel.assert_called_once() def test_parse_file_word(): """Test unified parser dispatches to Word parser""" with patch('src.tools.file_parser.parse_word') as mock_parse_word: mock_parse_word.return_value = {"file_type": "Word"} result = parse_file("test.docx") assert result["file_type"] == "Word" mock_parse_word.assert_called_once() def test_parse_file_text(): """Test unified parser dispatches to text parser""" with patch('src.tools.file_parser.parse_text') as mock_parse_text: mock_parse_text.return_value = {"file_type": "Text"} result = parse_file("test.txt") assert result["file_type"] == "Text" mock_parse_text.assert_called_once() def test_parse_file_unsupported_extension(): """Test unified parser rejects unsupported file type""" with pytest.raises(ValueError, match="Unsupported file type"): parse_file("test.mp4") def test_parse_file_xls_extension(): """Test unified parser handles .xls extension""" with patch('src.tools.file_parser.parse_excel') as mock_parse_excel: mock_parse_excel.return_value = {"file_type": "Excel"} result = parse_file("test.xls") assert result["file_type"] == "Excel" mock_parse_excel.assert_called_once()