| | """ |
| | Tests for file parser tool |
| | Author: @mangobee |
| | Date: 2026-01-02 |
| | |
| | Tests cover: |
| | - PDF parsing |
| | - Excel parsing |
| | - Word document parsing |
| | - Text/CSV parsing |
| | - Retry logic |
| | - Error handling |
| | """ |
| |
|
| | import pytest |
| | from pathlib import Path |
| | from unittest.mock import Mock, patch, MagicMock |
| | from src.tools.file_parser import ( |
| | parse_pdf, |
| | parse_excel, |
| | parse_word, |
| | parse_text, |
| | parse_file, |
| | ) |
| |
|
| | |
| | |
| | |
| |
|
# Directory named "fixtures" located next to this test module.
FIXTURES_DIR = Path(__file__).parent.joinpath("fixtures")
| |
|
| |
|
@pytest.fixture
def sample_text_file():
    """Return the location of ``sample.txt`` inside FIXTURES_DIR as a string."""
    fixture_path = FIXTURES_DIR.joinpath("sample.txt")
    return str(fixture_path)
| |
|
| |
|
@pytest.fixture
def sample_csv_file():
    """Return the location of ``sample.csv`` inside FIXTURES_DIR as a string."""
    fixture_path = FIXTURES_DIR.joinpath("sample.csv")
    return str(fixture_path)
| |
|
| |
|
@pytest.fixture
def sample_excel_file():
    """Return the location of ``sample.xlsx`` inside FIXTURES_DIR as a string."""
    fixture_path = FIXTURES_DIR.joinpath("sample.xlsx")
    return str(fixture_path)
| |
|
| |
|
@pytest.fixture
def sample_word_file():
    """Return the location of ``sample.docx`` inside FIXTURES_DIR as a string."""
    fixture_path = FIXTURES_DIR.joinpath("sample.docx")
    return str(fixture_path)
| |
|
| |
|
@pytest.fixture
def mock_pdf_reader():
    """Build a stand-in for a PyPDF2 ``PdfReader`` with two canned pages.

    Each page is a ``Mock`` whose ``extract_text()`` returns a fixed string;
    the reader exposes them, in order, through its ``pages`` attribute.
    """
    page_texts = (
        "Test PDF page 1 content",
        "Test PDF page 2 content",
    )
    fake_pages = [
        Mock(**{"extract_text.return_value": text}) for text in page_texts
    ]

    reader = Mock()
    reader.pages = fake_pages
    return reader
| |
|
| |
|
| | |
| | |
| | |
| |
|
def test_parse_pdf_success(mock_pdf_reader):
    """parse_pdf reports type, page count and text of a (mocked) two-page PDF."""
    # Pretend the file exists so no real PDF is needed on disk.
    existing_path = Mock(**{"exists.return_value": True})

    with patch('PyPDF2.PdfReader', return_value=mock_pdf_reader), \
            patch('src.tools.file_parser.Path', return_value=existing_path):
        parsed = parse_pdf("test.pdf")

        assert parsed["file_type"] == "PDF"
        assert parsed["pages"] == 2

        lowered = parsed["content"].lower()
        assert "page 1 content" in lowered
        assert "page 2 content" in lowered
| |
|
| |
|
def test_parse_pdf_file_not_found():
    """parse_pdf raises FileNotFoundError when the path reports it is absent."""
    # The patched Path always yields an object whose exists() is False.
    absent_path = Mock(**{"exists.return_value": False})

    with patch('src.tools.file_parser.Path', return_value=absent_path):
        with pytest.raises(FileNotFoundError):
            parse_pdf("nonexistent.pdf")
| |
|
| |
|
def test_parse_pdf_io_error_retry():
    """An IOError from the PDF reader is retried; three attempts are expected."""
    existing_path = Mock(**{"exists.return_value": True})
    disk_failure = IOError("Disk error")

    with patch('PyPDF2.PdfReader', side_effect=disk_failure) as reader_ctor, \
            patch('src.tools.file_parser.Path', return_value=existing_path):
        with pytest.raises(IOError):
            parse_pdf("test.pdf")

        # The reader constructor must have been attempted exactly three times.
        assert reader_ctor.call_count == 3
| |
|
| |
|
| | |
| | |
| | |
| |
|
def test_parse_excel_success(sample_excel_file):
    """parse_excel reads the real sample workbook: two sheets and known cells."""
    parsed = parse_excel(sample_excel_file)

    assert parsed["file_type"] == "Excel"

    sheets = parsed["sheets"]
    assert len(sheets) == 2
    for expected_sheet in ("Data", "Summary"):
        assert expected_sheet in sheets

    body = parsed["content"]
    for expected_value in ("Apple", "Banana"):
        assert expected_value in body
| |
|
| |
|
def test_parse_excel_file_not_found():
    """parse_excel raises FileNotFoundError for a path that does not exist."""
    missing_workbook = "nonexistent.xlsx"
    with pytest.raises(FileNotFoundError):
        parse_excel(missing_workbook)
| |
|
| |
|
def test_parse_excel_io_error_retry():
    """An IOError from load_workbook is retried; three attempts are expected."""
    existing_path = Mock(**{"exists.return_value": True})
    disk_failure = IOError("Disk error")

    with patch('openpyxl.load_workbook', side_effect=disk_failure) as loader, \
            patch('src.tools.file_parser.Path', return_value=existing_path):
        with pytest.raises(IOError):
            parse_excel("test.xlsx")

        # The workbook loader must have been attempted exactly three times.
        assert loader.call_count == 3
| |
|
| |
|
| | |
| | |
| | |
| |
|
def test_parse_word_success(sample_word_file):
    """parse_word reads the real sample document and returns its known text."""
    parsed = parse_word(sample_word_file)

    assert parsed["file_type"] == "Word"
    assert parsed["paragraphs"] > 0

    body = parsed["content"]
    assert "Test Word Document" in body
    assert "first paragraph" in body
| |
|
| |
|
def test_parse_word_file_not_found():
    """parse_word raises FileNotFoundError for a path that does not exist."""
    missing_document = "nonexistent.docx"
    with pytest.raises(FileNotFoundError):
        parse_word(missing_document)
| |
|
| |
|
def test_parse_word_io_error_retry():
    """An IOError from docx.Document is retried; three attempts are expected."""
    existing_path = Mock(**{"exists.return_value": True})
    disk_failure = IOError("Disk error")

    with patch('docx.Document', side_effect=disk_failure) as document_ctor, \
            patch('src.tools.file_parser.Path', return_value=existing_path):
        with pytest.raises(IOError):
            parse_word("test.docx")

        # The Document constructor must have been attempted exactly three times.
        assert document_ctor.call_count == 3
| |
|
| |
|
| | |
| | |
| | |
| |
|
def test_parse_text_success(sample_text_file):
    """parse_text reads the real sample .txt file and labels it as Text."""
    parsed = parse_text(sample_text_file)

    assert parsed["file_type"] == "Text"
    assert parsed["lines"] > 0

    lowered = parsed["content"].lower()
    assert "test text file" in lowered
| |
|
| |
|
def test_parse_csv_success(sample_csv_file):
    """parse_text reads the real sample .csv file and labels it as CSV."""
    parsed = parse_text(sample_csv_file)

    assert parsed["file_type"] == "CSV"
    assert parsed["lines"] > 0

    body = parsed["content"]
    assert "Name,Age,City" in body
    assert "Alice" in body
| |
|
| |
|
def test_parse_text_file_not_found():
    """parse_text raises FileNotFoundError for a path that does not exist."""
    missing_file = "nonexistent.txt"
    with pytest.raises(FileNotFoundError):
        parse_text(missing_file)
| |
|
| |
|
def test_parse_text_io_error_retry():
    """An IOError from open() is retried; three attempts are expected."""
    # Fake path: reports that it exists and carries a .txt suffix.
    existing_txt = Mock(suffix='.txt', **{"exists.return_value": True})

    with patch('builtins.open') as open_stub, \
            patch('src.tools.file_parser.Path', return_value=existing_txt):
        # builtins.open is global, so arm the failure only once both
        # patches are active.
        open_stub.side_effect = IOError("Disk error")

        with pytest.raises(IOError):
            parse_text("test.txt")

        # open() must have been attempted exactly three times.
        assert open_stub.call_count == 3
| |
|
| |
|
| | |
| | |
| | |
| |
|
def test_parse_file_pdf():
    """parse_file hands a ``.pdf`` path to parse_pdf and returns its result."""
    canned = {"file_type": "PDF"}

    with patch('src.tools.file_parser.parse_pdf', return_value=canned) as pdf_stub:
        outcome = parse_file("test.pdf")

        assert outcome["file_type"] == "PDF"
        pdf_stub.assert_called_once()
| |
|
| |
|
def test_parse_file_excel():
    """parse_file hands a ``.xlsx`` path to parse_excel and returns its result."""
    canned = {"file_type": "Excel"}

    with patch('src.tools.file_parser.parse_excel', return_value=canned) as excel_stub:
        outcome = parse_file("test.xlsx")

        assert outcome["file_type"] == "Excel"
        excel_stub.assert_called_once()
| |
|
| |
|
def test_parse_file_word():
    """parse_file hands a ``.docx`` path to parse_word and returns its result."""
    canned = {"file_type": "Word"}

    with patch('src.tools.file_parser.parse_word', return_value=canned) as word_stub:
        outcome = parse_file("test.docx")

        assert outcome["file_type"] == "Word"
        word_stub.assert_called_once()
| |
|
| |
|
def test_parse_file_text():
    """parse_file hands a ``.txt`` path to parse_text and returns its result."""
    canned = {"file_type": "Text"}

    with patch('src.tools.file_parser.parse_text', return_value=canned) as text_stub:
        outcome = parse_file("test.txt")

        assert outcome["file_type"] == "Text"
        text_stub.assert_called_once()
| |
|
| |
|
def test_parse_file_unsupported_extension():
    """parse_file raises ValueError mentioning the unsupported type for ``.mp4``."""
    unsupported = "test.mp4"
    with pytest.raises(ValueError, match="Unsupported file type"):
        parse_file(unsupported)
| |
|
| |
|
def test_parse_file_xls_extension():
    """parse_file hands a legacy ``.xls`` path to parse_excel as well."""
    canned = {"file_type": "Excel"}

    with patch('src.tools.file_parser.parse_excel', return_value=canned) as excel_stub:
        outcome = parse_file("test.xls")

        assert outcome["file_type"] == "Excel"
        excel_stub.assert_called_once()
| |
|