Spaces:
Sleeping
Sleeping
File size: 5,090 Bytes
import os
import tempfile
import pytest
from src.core.document_processor import DocumentProcessor
@pytest.fixture
def processor():
    """Build the DocumentProcessor used by the tests in this module.

    The processor is created with ``chunk_size=50`` and ``overlap=10``.
    """
    small_chunk_processor = DocumentProcessor(chunk_size=50, overlap=10)
    return small_chunk_processor
def _write_temp_file(content: str, suffix: str) -> str:
    """Write *content* to a new UTF-8 temporary file and return its path.

    The file is created with ``delete=False``, so it remains on disk after
    this function returns; the caller is responsible for removing it.

    Args:
        content: Text to write into the file.
        suffix: Filename suffix (for example ``".txt"``) given to the file.

    Returns:
        The filesystem path of the file that was written.
    """
    # The with-block closes the handle even when write() raises, so a failed
    # write can no longer leak an open file object.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=suffix, delete=False, encoding="utf-8"
    ) as handle:
        handle.write(content)
        path = handle.name
    return path
class TestExtractText:
    """Tests for ``extract_text`` on each file suffix exercised here."""

    def test_txt_extraction(self, processor):
        """Text extracted from a .txt file equals what was written."""
        txt_file = _write_temp_file("Hello world", ".txt")
        try:
            extracted = processor.extract_text(txt_file)
            assert extracted == "Hello world"
        finally:
            os.unlink(txt_file)

    def test_md_extraction(self, processor):
        """Both the heading text and the body text appear in the output."""
        md_file = _write_temp_file("# Title\nSome content", ".md")
        try:
            extracted = processor.extract_text(md_file)
            for expected in ("Title", "Some content"):
                assert expected in extracted
        finally:
            os.unlink(md_file)

    def test_html_extraction(self, processor):
        """The paragraph text of an HTML document appears in the output."""
        html_file = _write_temp_file(
            "<html><body><p>Hello HTML</p></body></html>", ".html"
        )
        try:
            extracted = processor.extract_text(html_file)
            assert "Hello HTML" in extracted
        finally:
            os.unlink(html_file)

    def test_unsupported_format_raises(self, processor):
        """A .csv file is rejected with a ValueError naming the problem."""
        csv_file = _write_temp_file("data", ".csv")
        try:
            expect_rejection = pytest.raises(
                ValueError, match="Unsupported file type"
            )
            with expect_rejection:
                processor.extract_text(csv_file)
        finally:
            os.unlink(csv_file)
class TestCleanText:
    """Tests for ``clean_text`` whitespace handling."""

    def test_collapses_whitespace(self, processor):
        """Mixed space/newline/tab between words becomes a single space."""
        cleaned = processor.clean_text("hello \n\t world")
        assert cleaned == "hello world"

    def test_strips_leading_trailing(self, processor):
        """Whitespace around the text is removed."""
        cleaned = processor.clean_text(" hello ")
        assert cleaned == "hello"

    def test_empty_string(self, processor):
        """An empty input yields an empty output."""
        cleaned = processor.clean_text("")
        assert cleaned == ""
class TestChunkText:
    """Tests for ``chunk_text`` sizing, overlap and non-emptiness."""

    def test_short_text_is_single_chunk(self, processor):
        """A short input comes back as exactly one chunk, unchanged."""
        pieces = processor.chunk_text("short text")
        assert len(pieces) == 1
        assert pieces[0] == "short text"

    def test_long_text_produces_multiple_chunks(self, processor):
        """A long repeated input is split into more than one chunk."""
        pieces = processor.chunk_text("token " * 300)
        assert len(pieces) > 1

    def test_chunk_size_respected(self, processor):
        """No chunk's token count exceeds the processor's chunk_size."""
        pieces = processor.chunk_text("word " * 250)
        for piece in pieces:
            assert processor.count_tokens(piece) <= processor.chunk_size

    def test_overlap_creates_shared_content(self):
        """The last ``overlap`` tokens of chunk 0 start chunk 1."""
        small = DocumentProcessor(chunk_size=20, overlap=5)
        pieces = small.chunk_text("alpha beta gamma delta epsilon " * 40)
        assert len(pieces) >= 2
        # NOTE(review): relies on the processor's private tokenizer attribute.
        head_ids = small._tokenizer.encode(pieces[0])
        next_ids = small._tokenizer.encode(pieces[1])
        tail_of_first = head_ids[-small.overlap:]
        start_of_second = next_ids[:small.overlap]
        assert tail_of_first == start_of_second

    def test_no_empty_chunks(self, processor):
        """Every produced chunk has non-zero length."""
        pieces = processor.chunk_text("hello world " * 20)
        for piece in pieces:
            assert len(piece) > 0
class TestExtractMetadata:
    """Tests for the mapping returned by ``extract_metadata``."""

    def test_txt_metadata_has_required_keys(self, processor):
        """Metadata for a .txt file contains all four expected keys."""
        txt_file = _write_temp_file("content", ".txt")
        try:
            metadata = processor.extract_metadata(txt_file)
            for key in ("title", "author", "date", "file_type"):
                assert key in metadata
        finally:
            os.unlink(txt_file)

    def test_file_type_matches_extension(self, processor):
        """The ``file_type`` entry is the file's suffix, dot included."""
        txt_file = _write_temp_file("content", ".txt")
        try:
            metadata = processor.extract_metadata(txt_file)
            reported_type = metadata["file_type"]
            assert reported_type == ".txt"
        finally:
            os.unlink(txt_file)

    def test_date_is_set_when_missing(self, processor):
        """The ``date`` entry is not None for a plain text file."""
        txt_file = _write_temp_file("content", ".txt")
        try:
            metadata = processor.extract_metadata(txt_file)
            reported_date = metadata["date"]
            assert reported_date is not None
        finally:
            os.unlink(txt_file)
class TestProcessDocument:
    """Tests for ``process_document`` results and duplicate handling."""

    def test_returns_dict_with_expected_keys(self, processor):
        """A processed file yields a result with metadata and chunks."""
        path = _write_temp_file("Hello world content", ".txt")
        try:
            result = processor.process_document(path)
            assert result is not None
            for key in ("metadata", "chunks"):
                assert key in result
        finally:
            os.unlink(path)

    def test_duplicate_returns_none(self, processor):
        """Processing the same file twice returns None the second time."""
        path = _write_temp_file("Identical content", ".txt")
        try:
            first = processor.process_document(path)
            second = processor.process_document(path)
            assert first is not None
            assert second is None
        finally:
            os.unlink(path)

    def test_different_files_not_flagged_as_duplicate(self, processor):
        """Two files with different content are both processed."""
        # Track files as they are created, inside the try, so that a failure
        # while writing the second file still removes the first one.
        created = []
        try:
            for content in ("Content A", "Content B"):
                created.append(_write_temp_file(content, ".txt"))
            for path in created:
                assert processor.process_document(path) is not None
        finally:
            for path in created:
                os.unlink(path)
|