# Spaces: vampokala / doc-ingestion (Sleeping) — File size: 5,090 Bytes

import os
import tempfile

import pytest
from src.core.document_processor import DocumentProcessor


@pytest.fixture
def processor():
    """Provide a ``DocumentProcessor`` built with chunk_size=50 and overlap=10."""
    small_chunk_processor = DocumentProcessor(chunk_size=50, overlap=10)
    return small_chunk_processor


def _write_temp_file(content: str, suffix: str) -> str:
    """Write *content* to a new UTF-8 temporary file and return its path.

    The file is created with ``delete=False``, so it stays on disk after it
    is closed; the caller is responsible for removing it (e.g. ``os.unlink``).

    Args:
        content: Text to write into the file.
        suffix: File-name suffix (such as ``".txt"``) for the temporary file.

    Returns:
        The path of the closed temporary file.
    """
    # The context manager closes the handle even when write() raises, so no
    # open file object is leaked on the failure path.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=suffix, delete=False, encoding="utf-8"
    ) as handle:
        handle.write(content)
        temp_path = handle.name
    return temp_path


class TestExtractText:
    """Exercise ``extract_text`` against temporary files with different suffixes."""

    def test_txt_extraction(self, processor):
        """A ``.txt`` file yields exactly the text that was written to it."""
        txt_file = _write_temp_file("Hello world", ".txt")
        try:
            extracted = processor.extract_text(txt_file)
            assert extracted == "Hello world"
        finally:
            # Temp files are created with delete=False, so remove them here.
            os.unlink(txt_file)

    def test_md_extraction(self, processor):
        """A ``.md`` file's heading text and body text both appear in the output."""
        md_file = _write_temp_file("# Title\nSome content", ".md")
        try:
            extracted = processor.extract_text(md_file)
            for fragment in ("Title", "Some content"):
                assert fragment in extracted
        finally:
            os.unlink(md_file)

    def test_html_extraction(self, processor):
        """The paragraph text of an ``.html`` file appears in the output."""
        html_file = _write_temp_file("<html><body><p>Hello HTML</p></body></html>", ".html")
        try:
            assert "Hello HTML" in processor.extract_text(html_file)
        finally:
            os.unlink(html_file)

    def test_unsupported_format_raises(self, processor):
        """A ``.csv`` file is rejected with a ``ValueError``."""
        csv_file = _write_temp_file("data", ".csv")
        try:
            with pytest.raises(ValueError, match="Unsupported file type"):
                processor.extract_text(csv_file)
        finally:
            os.unlink(csv_file)


class TestCleanText:
    """Checks of the whitespace handling performed by ``clean_text``."""

    def test_collapses_whitespace(self, processor):
        """A run of spaces, a newline and a tab between words becomes one space."""
        cleaned = processor.clean_text("hello   \n\t world")
        assert cleaned == "hello world"

    def test_strips_leading_trailing(self, processor):
        """Surrounding spaces are removed."""
        cleaned = processor.clean_text("  hello  ")
        assert cleaned == "hello"

    def test_empty_string(self, processor):
        """An empty input comes back as an empty string."""
        cleaned = processor.clean_text("")
        assert cleaned == ""


class TestChunkText:
    """Checks of ``chunk_text``: chunk count, size limit, overlap and emptiness."""

    def test_short_text_is_single_chunk(self, processor):
        """Short text is returned as one chunk equal to the input."""
        chunks = processor.chunk_text("short text")
        assert len(chunks) == 1
        assert chunks[0] == "short text"

    def test_long_text_produces_multiple_chunks(self, processor):
        """Long repeated text is split into more than one chunk."""
        text = "token " * 300
        chunks = processor.chunk_text(text)
        assert len(chunks) > 1

    def test_chunk_size_respected(self, processor):
        """Every chunk's token count is at most ``processor.chunk_size``."""
        text = "word " * 250
        chunks = processor.chunk_text(text)
        # all() is vacuously True for an empty iterable; require at least one
        # chunk so an empty result cannot make this test pass.
        assert chunks
        assert all(processor.count_tokens(c) <= processor.chunk_size for c in chunks)

    def test_overlap_creates_shared_content(self):
        """The tail of the first chunk matches the head of the second chunk.

        Uses its own processor (chunk_size=20, overlap=5), re-encodes the first
        two chunks with ``proc._tokenizer`` and compares ``proc.overlap`` tokens
        at the boundary.
        """
        proc = DocumentProcessor(chunk_size=20, overlap=5)
        text = "alpha beta gamma delta epsilon " * 40
        chunks = proc.chunk_text(text)
        assert len(chunks) >= 2
        first_tokens = proc._tokenizer.encode(chunks[0])
        second_tokens = proc._tokenizer.encode(chunks[1])
        assert first_tokens[-proc.overlap:] == second_tokens[:proc.overlap]

    def test_no_empty_chunks(self, processor):
        """No chunk produced from non-empty text has zero length."""
        chunks = processor.chunk_text("hello world " * 20)
        # Guard against the vacuous case: an empty chunk list would otherwise
        # satisfy the all() check below without testing anything.
        assert chunks
        assert all(len(c) > 0 for c in chunks)


class TestExtractMetadata:
    """Checks of the mapping returned by ``extract_metadata`` for a ``.txt`` file."""

    def test_txt_metadata_has_required_keys(self, processor):
        """The metadata contains the title, author, date and file_type keys."""
        txt_file = _write_temp_file("content", ".txt")
        try:
            metadata = processor.extract_metadata(txt_file)
            # Checked in the same order as listed; the first missing key fails.
            for required_key in ("title", "author", "date", "file_type"):
                assert required_key in metadata
        finally:
            os.unlink(txt_file)

    def test_file_type_matches_extension(self, processor):
        """``file_type`` holds the file's suffix, including the leading dot."""
        txt_file = _write_temp_file("content", ".txt")
        try:
            reported_type = processor.extract_metadata(txt_file)["file_type"]
            assert reported_type == ".txt"
        finally:
            os.unlink(txt_file)

    def test_date_is_set_when_missing(self, processor):
        """``date`` is populated (not ``None``) for a plain text file."""
        txt_file = _write_temp_file("content", ".txt")
        try:
            reported_date = processor.extract_metadata(txt_file)["date"]
            assert reported_date is not None
        finally:
            os.unlink(txt_file)


class TestProcessDocument:
    """Checks of ``process_document``: result shape and duplicate handling."""

    def test_returns_dict_with_expected_keys(self, processor):
        """A processed file yields a result containing ``metadata`` and ``chunks``."""
        path = _write_temp_file("Hello world content", ".txt")
        try:
            result = processor.process_document(path)
            assert result is not None
            assert "metadata" in result
            assert "chunks" in result
        finally:
            os.unlink(path)

    def test_duplicate_returns_none(self, processor):
        """Processing the same file twice returns a result, then ``None``."""
        path = _write_temp_file("Identical content", ".txt")
        try:
            first = processor.process_document(path)
            second = processor.process_document(path)
            assert first is not None
            assert second is None
        finally:
            os.unlink(path)

    def test_different_files_not_flagged_as_duplicate(self, processor):
        """Two files with different content are both processed (neither is ``None``)."""
        path1 = _write_temp_file("Content A", ".txt")
        # Each temp file gets its own try/finally so that a failure while
        # creating or removing one file can never leave the other on disk.
        try:
            path2 = _write_temp_file("Content B", ".txt")
            try:
                assert processor.process_document(path1) is not None
                assert processor.process_document(path2) is not None
            finally:
                os.unlink(path2)
        finally:
            os.unlink(path1)