File size: 5,090 Bytes
c01a6e5
 
 
 
 
 
 
 
 
ce71763
c01a6e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ce71763
c01a6e5
 
 
 
ce71763
c01a6e5
ce71763
c01a6e5
 
ce71763
 
c01a6e5
ce71763
 
 
 
c01a6e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import os
import tempfile

import pytest
from src.core.document_processor import DocumentProcessor


@pytest.fixture
def processor():
    """Provide a DocumentProcessor built with chunk_size=50 and overlap=10."""
    small_processor = DocumentProcessor(overlap=10, chunk_size=50)
    return small_processor


def _write_temp_file(content: str, suffix: str) -> str:
    """Write *content* to a new temporary file and return its path.

    The file is created with ``delete=False``, so it is not removed when the
    handle is closed; the caller is responsible for deleting it (for example
    with ``os.unlink``).

    Args:
        content: Text to write; the file is opened with UTF-8 encoding.
        suffix: Filename suffix (such as ``".txt"``) for the temporary file.

    Returns:
        The filesystem path of the written file.
    """
    # The context manager closes the handle even if ``write`` raises, so a
    # failed write no longer leaks an open file object.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=suffix, delete=False, encoding="utf-8"
    ) as tmp:
        tmp.write(content)
    return tmp.name


class TestExtractText:
    """Checks for ``DocumentProcessor.extract_text`` across file suffixes."""

    def test_txt_extraction(self, processor):
        """A .txt file's text is returned exactly as written."""
        txt_file = _write_temp_file("Hello world", ".txt")
        try:
            extracted = processor.extract_text(txt_file)
            assert extracted == "Hello world"
        finally:
            os.unlink(txt_file)

    def test_md_extraction(self, processor):
        """Both the heading text and the body text appear in the .md output."""
        md_file = _write_temp_file("# Title\nSome content", ".md")
        try:
            extracted = processor.extract_text(md_file)
            for fragment in ("Title", "Some content"):
                assert fragment in extracted
        finally:
            os.unlink(md_file)

    def test_html_extraction(self, processor):
        """The paragraph text of an .html file appears in the output."""
        html_file = _write_temp_file(
            "<html><body><p>Hello HTML</p></body></html>", ".html"
        )
        try:
            extracted = processor.extract_text(html_file)
            assert "Hello HTML" in extracted
        finally:
            os.unlink(html_file)

    def test_unsupported_format_raises(self, processor):
        """A .csv file raises ValueError matching 'Unsupported file type'."""
        csv_file = _write_temp_file("data", ".csv")
        try:
            expect_error = pytest.raises(ValueError, match="Unsupported file type")
            with expect_error:
                processor.extract_text(csv_file)
        finally:
            os.unlink(csv_file)


class TestCleanText:
    """Checks for ``DocumentProcessor.clean_text``."""

    def test_collapses_whitespace(self, processor):
        """A mixed run of spaces, newline and tab becomes a single space."""
        cleaned = processor.clean_text("hello   \n\t world")
        assert cleaned == "hello world"

    def test_strips_leading_trailing(self, processor):
        """Surrounding spaces are removed."""
        cleaned = processor.clean_text("  hello  ")
        assert cleaned == "hello"

    def test_empty_string(self, processor):
        """An empty input yields an empty output."""
        cleaned = processor.clean_text("")
        assert cleaned == ""


class TestChunkText:
    """Checks for ``DocumentProcessor.chunk_text``."""

    def test_short_text_is_single_chunk(self, processor):
        """A short input comes back as exactly one, unchanged chunk."""
        pieces = processor.chunk_text("short text")
        assert len(pieces) == 1
        only_piece = pieces[0]
        assert only_piece == "short text"

    def test_long_text_produces_multiple_chunks(self, processor):
        """A 300-word input is split into more than one chunk."""
        long_text = "token " * 300
        pieces = processor.chunk_text(long_text)
        assert len(pieces) > 1

    def test_chunk_size_respected(self, processor):
        """No chunk's token count exceeds the processor's chunk_size."""
        pieces = processor.chunk_text("word " * 250)
        for piece in pieces:
            assert processor.count_tokens(piece) <= processor.chunk_size

    def test_overlap_creates_shared_content(self):
        """The tail tokens of chunk 0 equal the head tokens of chunk 1."""
        chunker = DocumentProcessor(chunk_size=20, overlap=5)
        pieces = chunker.chunk_text("alpha beta gamma delta epsilon " * 40)
        assert len(pieces) >= 2
        # Re-encode the first two chunks with the processor's own tokenizer
        # so the comparison is made on token ids rather than characters.
        head_ids = chunker._tokenizer.encode(pieces[0])
        next_ids = chunker._tokenizer.encode(pieces[1])
        assert head_ids[-chunker.overlap:] == next_ids[:chunker.overlap]

    def test_no_empty_chunks(self, processor):
        """Every produced chunk has non-zero length."""
        pieces = processor.chunk_text("hello world " * 20)
        for piece in pieces:
            assert len(piece) > 0


class TestExtractMetadata:
    """Checks for ``DocumentProcessor.extract_metadata`` on a .txt file."""

    def test_txt_metadata_has_required_keys(self, processor):
        """The metadata contains title, author, date and file_type keys."""
        txt_file = _write_temp_file("content", ".txt")
        try:
            metadata = processor.extract_metadata(txt_file)
            for required_key in ("title", "author", "date", "file_type"):
                assert required_key in metadata
        finally:
            os.unlink(txt_file)

    def test_file_type_matches_extension(self, processor):
        """The file_type entry equals the file's suffix, dot included."""
        txt_file = _write_temp_file("content", ".txt")
        try:
            metadata = processor.extract_metadata(txt_file)
            reported_type = metadata["file_type"]
            assert reported_type == ".txt"
        finally:
            os.unlink(txt_file)

    def test_date_is_set_when_missing(self, processor):
        """The date entry is not None for a plain text file."""
        txt_file = _write_temp_file("content", ".txt")
        try:
            metadata = processor.extract_metadata(txt_file)
            reported_date = metadata["date"]
            assert reported_date is not None
        finally:
            os.unlink(txt_file)


class TestProcessDocument:
    """Checks for ``DocumentProcessor.process_document``."""

    def test_returns_dict_with_expected_keys(self, processor):
        """A processed document has 'metadata' and 'chunks' entries."""
        path = _write_temp_file("Hello world content", ".txt")
        try:
            result = processor.process_document(path)
            assert result is not None
            assert "metadata" in result
            assert "chunks" in result
        finally:
            os.unlink(path)

    def test_duplicate_returns_none(self, processor):
        """Processing the same file twice returns None the second time."""
        path = _write_temp_file("Identical content", ".txt")
        try:
            first = processor.process_document(path)
            second = processor.process_document(path)
            assert first is not None
            assert second is None
        finally:
            os.unlink(path)

    def test_different_files_not_flagged_as_duplicate(self, processor):
        """Two files with different content are both processed."""
        path1 = _write_temp_file("Content A", ".txt")
        # Nested try/finally: each temp file is removed independently, so
        # path1 is cleaned up even if creating path2 fails, and path2 is
        # cleaned up even if removing path1 would raise.
        try:
            path2 = _write_temp_file("Content B", ".txt")
            try:
                assert processor.process_document(path1) is not None
                assert processor.process_document(path2) is not None
            finally:
                os.unlink(path2)
        finally:
            os.unlink(path1)