# yomitalk/tests/unit/test_content_extractor.py
# KyosukeIchikawa's picture
# style: Apply comprehensive ruff formatting across entire codebase
# dd82ad4
"""Unit tests for ContentExtractor class."""
from unittest.mock import MagicMock, patch
from yomitalk.components.content_extractor import ContentExtractor
class TestContentExtractor:
    """Unit tests for the class-level API of ``ContentExtractor``.

    Every test calls ``ContentExtractor`` directly on the class; no instance
    is created. The URL extraction tests that expect a conversion replace the
    module-level ``_markdown_converter`` with a mock via ``patch``.
    """

    def setup_method(self):
        """Set up test fixtures before each test method is run."""
        # No need to create instance since all methods are now classmethods

    def test_initialization(self):
        """Check the extension constants are lists and the combined list covers both."""
        assert isinstance(ContentExtractor.SUPPORTED_TEXT_EXTENSIONS, list)
        assert isinstance(ContentExtractor.SUPPORTED_PDF_EXTENSIONS, list)
        assert isinstance(ContentExtractor.SUPPORTED_EXTENSIONS, list)

        # Collect every offender so a failure names all missing extensions,
        # not just the first one the loop would have hit.
        missing_text = [ext for ext in ContentExtractor.SUPPORTED_TEXT_EXTENSIONS if ext not in ContentExtractor.SUPPORTED_EXTENSIONS]
        assert not missing_text, f"text extensions missing from SUPPORTED_EXTENSIONS: {missing_text}"

        missing_pdf = [ext for ext in ContentExtractor.SUPPORTED_PDF_EXTENSIONS if ext not in ContentExtractor.SUPPORTED_EXTENSIONS]
        assert not missing_pdf, f"PDF extensions missing from SUPPORTED_EXTENSIONS: {missing_pdf}"

    def test_supported_extensions(self):
        """Check that common extensions are registered and appear in the combined list."""
        assert ".txt" in ContentExtractor.SUPPORTED_TEXT_EXTENSIONS
        assert ".md" in ContentExtractor.SUPPORTED_TEXT_EXTENSIONS
        assert ".pdf" in ContentExtractor.SUPPORTED_PDF_EXTENSIONS

        # The combined list must contain everything from both specific lists.
        all_extensions = ContentExtractor.SUPPORTED_TEXT_EXTENSIONS + ContentExtractor.SUPPORTED_PDF_EXTENSIONS
        missing = [ext for ext in all_extensions if ext not in ContentExtractor.SUPPORTED_EXTENSIONS]
        assert not missing, f"extensions missing from SUPPORTED_EXTENSIONS: {missing}"

    def test_extract_file_content(self):
        """Check that extract_file_content returns the extension and raw bytes of a file object."""
        # Stand-in for an uploaded file: a name, readable bytes and a stream position.
        mock_file = MagicMock()
        mock_file.name = "test.txt"
        mock_file.read.return_value = b"This is test content."
        mock_file.tell.return_value = 0

        extension, content = ContentExtractor.extract_file_content(mock_file)

        assert extension == ".txt"
        assert content == b"This is test content."

    def test_extract_text(self):
        """Check that extract_text returns the upload prompt when given None."""
        assert ContentExtractor.extract_text(None) == "Please upload a file."
        # TODO: add cases that pass a mocked valid file object once more
        # comprehensive coverage of extract_text is needed.

    def test_is_url_valid_urls(self):
        """Check that is_url returns True for well-formed http(s) URLs."""
        valid_urls = [
            "https://www.example.com",
            "http://example.com",
            "https://youtube.com/watch?v=dQw4w9WgXcQ",
            "https://en.wikipedia.org/wiki/Test",
            "https://feeds.feedburner.com/example",
            "https://www.bing.com/search?q=test",
        ]

        # ``is not True`` keeps the original strictness: a truthy non-bool
        # return value still counts as a failure.
        rejected = [url for url in valid_urls if ContentExtractor.is_url(url) is not True]
        assert not rejected, f"is_url did not return True for: {rejected}"

    def test_is_url_invalid_urls(self):
        """Check that is_url returns False for malformed or non-http(s) inputs."""
        invalid_urls = [
            "",
            "not a url",
            "example.com",  # Missing scheme
            "file://local/path",  # Local file path
            "ftp://example.com",  # Non-HTTP scheme
            "https://",  # Missing netloc
            "://example.com",  # Missing scheme
        ]

        # ``is not False`` keeps the original strictness: a falsy non-bool
        # return value (e.g. None) still counts as a failure.
        accepted = [url for url in invalid_urls if ContentExtractor.is_url(url) is not False]
        assert not accepted, f"is_url did not return False for: {accepted!r}"

    def test_is_url_edge_cases(self):
        """Check is_url with surrounding whitespace and with None."""
        # Surrounding whitespace is expected to be tolerated.
        assert ContentExtractor.is_url(" https://example.com ") is True

        # None is expected to be rejected rather than raise.
        assert ContentExtractor.is_url(None) is False

    @patch("yomitalk.components.content_extractor._markdown_converter")
    def test_extract_from_url_success(self, mock_converter):
        """Check that extract_from_url returns the converter's text_content."""
        mock_result = MagicMock()
        mock_result.text_content = "Extracted content from URL"
        mock_converter.convert.return_value = mock_result

        url = "https://example.com/article"
        result = ContentExtractor.extract_from_url(url)

        assert result == "Extracted content from URL"
        mock_converter.convert.assert_called_once_with(url)

    @patch("yomitalk.components.content_extractor._markdown_converter")
    def test_extract_from_url_empty_content(self, mock_converter):
        """Check that a None text_content from the converter yields an empty string."""
        mock_result = MagicMock()
        mock_result.text_content = None
        mock_converter.convert.return_value = mock_result

        url = "https://example.com/empty"
        result = ContentExtractor.extract_from_url(url)

        assert result == ""
        mock_converter.convert.assert_called_once_with(url)

    @patch("yomitalk.components.content_extractor._markdown_converter")
    def test_extract_from_url_conversion_error(self, mock_converter):
        """Check that a converter exception is reported in the returned text."""
        mock_converter.convert.side_effect = Exception("Connection error")

        url = "https://example.com/error"
        result = ContentExtractor.extract_from_url(url)

        # The error is returned as a message, not raised to the caller.
        assert "URL conversion error: Connection error" in result
        mock_converter.convert.assert_called_once_with(url)

    def test_extract_from_url_invalid_url(self):
        """Check that a malformed URL yields the invalid-format message."""
        invalid_url = "not a url"
        result = ContentExtractor.extract_from_url(invalid_url)

        assert result == "Invalid URL format."

    @patch("yomitalk.components.content_extractor._markdown_converter")
    def test_extract_from_url_youtube(self, mock_converter):
        """Check extract_from_url with a YouTube URL and a mocked conversion result."""
        mock_result = MagicMock()
        mock_result.text_content = "YouTube video transcript: How to code"
        mock_converter.convert.return_value = mock_result

        youtube_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
        result = ContentExtractor.extract_from_url(youtube_url)

        assert result == "YouTube video transcript: How to code"
        mock_converter.convert.assert_called_once_with(youtube_url)

    @patch("yomitalk.components.content_extractor._markdown_converter")
    def test_extract_from_url_wikipedia(self, mock_converter):
        """Check extract_from_url with a Wikipedia URL and a mocked conversion result."""
        mock_result = MagicMock()
        mock_result.text_content = "Wikipedia article about machine learning..."
        mock_converter.convert.return_value = mock_result

        wikipedia_url = "https://en.wikipedia.org/wiki/Machine_learning"
        result = ContentExtractor.extract_from_url(wikipedia_url)

        assert result == "Wikipedia article about machine learning..."
        mock_converter.convert.assert_called_once_with(wikipedia_url)

    @patch("yomitalk.components.content_extractor._markdown_converter")
    def test_extract_from_url_rss_feed(self, mock_converter):
        """Check extract_from_url with an RSS feed URL and a mocked conversion result."""
        mock_result = MagicMock()
        mock_result.text_content = "RSS feed content: Latest news articles..."
        mock_converter.convert.return_value = mock_result

        rss_url = "https://feeds.feedburner.com/example"
        result = ContentExtractor.extract_from_url(rss_url)

        assert result == "RSS feed content: Latest news articles..."
        mock_converter.convert.assert_called_once_with(rss_url)

    def test_append_text_with_source_no_separator(self):
        """Check that add_separator=False joins the texts with a blank line only."""
        existing_text = "Existing content"
        new_text = "New content"
        source_name = "test.txt"

        result = ContentExtractor.append_text_with_source(existing_text, new_text, source_name, add_separator=False)

        expected = "Existing content\n\nNew content"
        assert result == expected

    def test_append_text_with_source_with_separator(self):
        """Check that add_separator=True inserts a rule and a source header."""
        existing_text = "Existing content"
        new_text = "New content"
        source_name = "test.txt"

        result = ContentExtractor.append_text_with_source(existing_text, new_text, source_name, add_separator=True)

        expected = "Existing content\n\n---\n**Source: test.txt**\n\nNew content"
        assert result == expected

    def test_append_text_with_source_empty_existing(self):
        """Check that appending to empty text emits the source header without a rule."""
        existing_text = ""
        new_text = "New content"
        source_name = "test.txt"

        result = ContentExtractor.append_text_with_source(existing_text, new_text, source_name, add_separator=True)

        expected = "**Source: test.txt**\n\nNew content"
        assert result == expected

    def test_append_text_with_source_empty_new_text(self):
        """Check that appending empty new text leaves the existing text unchanged."""
        existing_text = "Existing content"
        new_text = ""
        source_name = "test.txt"

        result = ContentExtractor.append_text_with_source(existing_text, new_text, source_name, add_separator=True)

        # Should return existing text unchanged when new text is empty
        assert result == existing_text

    def test_get_source_name_from_file(self):
        """Check that the source name is the base name of the file's path."""
        mock_file = MagicMock()
        mock_file.name = "/path/to/document.pdf"

        result = ContentExtractor.get_source_name_from_file(mock_file)

        assert result == "document.pdf"

    def test_get_source_name_from_file_none(self):
        """Check that a None file object yields the 'Unknown File' placeholder."""
        result = ContentExtractor.get_source_name_from_file(None)

        assert result == "Unknown File"

    def test_get_source_name_from_file_no_name(self):
        """Check that a file object lacking ``name`` yields 'Uploaded File'."""
        mock_file = MagicMock()
        # Deleting the attribute makes later access raise AttributeError,
        # instead of MagicMock auto-creating a child mock for ``name``.
        del mock_file.name

        result = ContentExtractor.get_source_name_from_file(mock_file)

        assert result == "Uploaded File"