"""
Tests for file parser tool
Author: @mangubee
Date: 2026-01-02

Tests cover:
- PDF parsing
- Excel parsing
- Word document parsing
- Text/CSV parsing
- Retry logic
- Error handling
"""
|
|
|
|
|
import pytest |
|
|
from pathlib import Path |
|
|
from unittest.mock import Mock, patch, MagicMock |
|
|
from src.tools.file_parser import ( |
|
|
parse_pdf, |
|
|
parse_excel, |
|
|
parse_word, |
|
|
parse_text, |
|
|
parse_file, |
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Directory next to this test module that holds the sample input files.
FIXTURES_DIR = Path(__file__).parent.joinpath("fixtures")
|
|
|
|
|
|
|
|
@pytest.fixture
def sample_text_file():
    """Return the location of the sample text file as a plain string."""
    fixture_path = FIXTURES_DIR / "sample.txt"
    return str(fixture_path)
|
|
|
|
|
|
|
|
@pytest.fixture
def sample_csv_file():
    """Return the location of the sample CSV file as a plain string."""
    fixture_path = FIXTURES_DIR / "sample.csv"
    return str(fixture_path)
|
|
|
|
|
|
|
|
@pytest.fixture
def sample_excel_file():
    """Return the location of the sample Excel workbook as a plain string."""
    fixture_path = FIXTURES_DIR / "sample.xlsx"
    return str(fixture_path)
|
|
|
|
|
|
|
|
@pytest.fixture
def sample_word_file():
    """Return the location of the sample Word document as a plain string."""
    fixture_path = FIXTURES_DIR / "sample.docx"
    return str(fixture_path)
|
|
|
|
|
|
|
|
@pytest.fixture
def mock_pdf_reader():
    """Build a stand-in PDF reader whose ``pages`` list holds two mock pages.

    Each page's ``extract_text()`` returns a fixed string so tests can
    check that the text of both pages ends up in the parsed content.
    """
    page_texts = ("Test PDF page 1 content", "Test PDF page 2 content")

    fake_pages = []
    for text in page_texts:
        fake_page = Mock()
        fake_page.extract_text.return_value = text
        fake_pages.append(fake_page)

    fake_reader = Mock()
    fake_reader.pages = fake_pages
    return fake_reader
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_parse_pdf_success(mock_pdf_reader):
    """parse_pdf reports the type, page count and page text of a PDF."""
    # A path object that claims the file is present on disk.
    existing_path = Mock()
    existing_path.exists.return_value = True

    with patch('PyPDF2.PdfReader') as reader_cls, \
            patch('src.tools.file_parser.Path') as path_cls:
        path_cls.return_value = existing_path
        # Constructing the reader yields the two-page mock from the fixture.
        reader_cls.return_value = mock_pdf_reader

        parsed = parse_pdf("test.pdf")

        assert parsed["file_type"] == "PDF"
        assert parsed["pages"] == 2

        # Compare case-insensitively, as the page text is matched in lowercase.
        lowered = parsed["content"].lower()
        assert "page 1 content" in lowered
        assert "page 2 content" in lowered
|
|
|
|
|
|
|
|
def test_parse_pdf_file_not_found():
    """parse_pdf raises FileNotFoundError when the path reports it is absent."""
    # A path object that claims the file does not exist.
    missing_path = Mock()
    missing_path.exists.return_value = False

    with patch('src.tools.file_parser.Path') as path_cls:
        path_cls.return_value = missing_path

        with pytest.raises(FileNotFoundError):
            parse_pdf("nonexistent.pdf")
|
|
|
|
|
|
|
|
def test_parse_pdf_io_error_retry():
    """A persistent IOError from the PDF reader surfaces after three attempts."""
    # A path object that claims the file is present on disk.
    existing_path = Mock()
    existing_path.exists.return_value = True

    with patch('PyPDF2.PdfReader') as reader_cls, \
            patch('src.tools.file_parser.Path') as path_cls:
        path_cls.return_value = existing_path
        # Every attempt to construct the reader fails the same way.
        reader_cls.side_effect = IOError("Disk error")

        with pytest.raises(IOError):
            parse_pdf("test.pdf")

        # Checked outside the raises block so the assertion actually runs.
        assert reader_cls.call_count == 3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_parse_excel_success(sample_excel_file):
    """parse_excel on the real sample workbook reports its sheets and cells."""
    parsed = parse_excel(sample_excel_file)

    assert parsed["file_type"] == "Excel"

    sheet_names = parsed["sheets"]
    assert len(sheet_names) == 2
    for expected_sheet in ("Data", "Summary"):
        assert expected_sheet in sheet_names

    for expected_value in ("Apple", "Banana"):
        assert expected_value in parsed["content"]
|
|
|
|
|
|
|
|
def test_parse_excel_file_not_found():
    """parse_excel raises FileNotFoundError for a path that does not exist."""
    missing_file = "nonexistent.xlsx"
    with pytest.raises(FileNotFoundError):
        parse_excel(missing_file)
|
|
|
|
|
|
|
|
def test_parse_excel_io_error_retry():
    """A persistent IOError from load_workbook surfaces after three attempts."""
    # A path object that claims the file is present on disk.
    existing_path = Mock()
    existing_path.exists.return_value = True

    with patch('openpyxl.load_workbook') as workbook_loader, \
            patch('src.tools.file_parser.Path') as path_cls:
        path_cls.return_value = existing_path
        # Every attempt to load the workbook fails the same way.
        workbook_loader.side_effect = IOError("Disk error")

        with pytest.raises(IOError):
            parse_excel("test.xlsx")

        # Checked outside the raises block so the assertion actually runs.
        assert workbook_loader.call_count == 3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_parse_word_success(sample_word_file):
    """parse_word on the real sample document reports paragraphs and text."""
    parsed = parse_word(sample_word_file)

    assert parsed["file_type"] == "Word"
    assert parsed["paragraphs"] > 0

    body = parsed["content"]
    for expected_fragment in ("Test Word Document", "first paragraph"):
        assert expected_fragment in body
|
|
|
|
|
|
|
|
def test_parse_word_file_not_found():
    """parse_word raises FileNotFoundError for a path that does not exist."""
    missing_file = "nonexistent.docx"
    with pytest.raises(FileNotFoundError):
        parse_word(missing_file)
|
|
|
|
|
|
|
|
def test_parse_word_io_error_retry():
    """A persistent IOError from docx.Document surfaces after three attempts."""
    # A path object that claims the file is present on disk.
    existing_path = Mock()
    existing_path.exists.return_value = True

    with patch('docx.Document') as document_cls, \
            patch('src.tools.file_parser.Path') as path_cls:
        path_cls.return_value = existing_path
        # Every attempt to open the document fails the same way.
        document_cls.side_effect = IOError("Disk error")

        with pytest.raises(IOError):
            parse_word("test.docx")

        # Checked outside the raises block so the assertion actually runs.
        assert document_cls.call_count == 3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_parse_text_success(sample_text_file):
    """parse_text on the real sample file reports type, lines and content."""
    parsed = parse_text(sample_text_file)

    assert parsed["file_type"] == "Text"
    assert parsed["lines"] > 0

    # The expected phrase is matched case-insensitively.
    lowered = parsed["content"].lower()
    assert "test text file" in lowered
|
|
|
|
|
|
|
|
def test_parse_csv_success(sample_csv_file):
    """parse_text labels the real sample CSV as CSV and keeps its rows."""
    parsed = parse_text(sample_csv_file)

    assert parsed["file_type"] == "CSV"
    assert parsed["lines"] > 0

    body = parsed["content"]
    for expected_fragment in ("Name,Age,City", "Alice"):
        assert expected_fragment in body
|
|
|
|
|
|
|
|
def test_parse_text_file_not_found():
    """parse_text raises FileNotFoundError for a path that does not exist."""
    missing_file = "nonexistent.txt"
    with pytest.raises(FileNotFoundError):
        parse_text(missing_file)
|
|
|
|
|
|
|
|
def test_parse_text_io_error_retry():
    """A persistent IOError from open() surfaces after three attempts."""
    # A path object that claims the file exists and has a .txt suffix.
    existing_path = Mock()
    existing_path.exists.return_value = True
    existing_path.suffix = '.txt'

    with patch('builtins.open') as open_stub, \
            patch('src.tools.file_parser.Path') as path_cls:
        path_cls.return_value = existing_path
        # Every attempt to open the file fails the same way.
        open_stub.side_effect = IOError("Disk error")

        with pytest.raises(IOError):
            parse_text("test.txt")

        # Checked outside the raises block so the assertion actually runs.
        assert open_stub.call_count == 3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_parse_file_pdf():
    """parse_file routes a ``.pdf`` path to parse_pdf exactly once."""
    stub_result = {"file_type": "PDF"}

    with patch('src.tools.file_parser.parse_pdf') as pdf_parser:
        pdf_parser.return_value = stub_result

        outcome = parse_file("test.pdf")

        assert outcome["file_type"] == "PDF"
        pdf_parser.assert_called_once()
|
|
|
|
|
|
|
|
def test_parse_file_excel():
    """parse_file routes a ``.xlsx`` path to parse_excel exactly once."""
    stub_result = {"file_type": "Excel"}

    with patch('src.tools.file_parser.parse_excel') as excel_parser:
        excel_parser.return_value = stub_result

        outcome = parse_file("test.xlsx")

        assert outcome["file_type"] == "Excel"
        excel_parser.assert_called_once()
|
|
|
|
|
|
|
|
def test_parse_file_word():
    """parse_file routes a ``.docx`` path to parse_word exactly once."""
    stub_result = {"file_type": "Word"}

    with patch('src.tools.file_parser.parse_word') as word_parser:
        word_parser.return_value = stub_result

        outcome = parse_file("test.docx")

        assert outcome["file_type"] == "Word"
        word_parser.assert_called_once()
|
|
|
|
|
|
|
|
def test_parse_file_text():
    """parse_file routes a ``.txt`` path to parse_text exactly once."""
    stub_result = {"file_type": "Text"}

    with patch('src.tools.file_parser.parse_text') as text_parser:
        text_parser.return_value = stub_result

        outcome = parse_file("test.txt")

        assert outcome["file_type"] == "Text"
        text_parser.assert_called_once()
|
|
|
|
|
|
|
|
def test_parse_file_unsupported_extension():
    """parse_file raises ValueError mentioning the unsupported file type."""
    unsupported_file = "test.mp4"
    with pytest.raises(ValueError, match="Unsupported file type"):
        parse_file(unsupported_file)
|
|
|
|
|
|
|
|
def test_parse_file_xls_extension():
    """parse_file routes the legacy ``.xls`` extension to parse_excel as well."""
    stub_result = {"file_type": "Excel"}

    with patch('src.tools.file_parser.parse_excel') as excel_parser:
        excel_parser.return_value = stub_result

        outcome = parse_file("test.xls")

        assert outcome["file_type"] == "Excel"
        excel_parser.assert_called_once()
|
|
|