Spaces:

vampokala
/

doc-ingestion

Running

doc-ingestion / tests /unit /test_document_processor.py

Vamshi Pokala

Redesigned the README and added Citation and truthfulness in responses

ce71763 about 1 month ago

5.09 kB

	import os
	import tempfile

	import pytest
	from src.core.document_processor import DocumentProcessor


	@pytest.fixture
	def processor():
	    """Build the DocumentProcessor shared by the tests in this module.

	    It is configured with ``chunk_size=50`` and ``overlap=10``, small
	    enough that the repeated-word inputs used below split into several
	    chunks.
	    """
	    small_processor = DocumentProcessor(chunk_size=50, overlap=10)
	    return small_processor


	def _write_temp_file(content: str, suffix: str) -> str:
	    """Write *content* to a new UTF-8 temporary file and return its path.

	    The file is created with ``delete=False`` so it still exists after the
	    handle is closed; the caller is responsible for removing it (the tests
	    in this module do so with ``os.unlink``).

	    Args:
	        content: Text to write into the file.
	        suffix: File-name suffix, including the leading dot (e.g. ``".txt"``).

	    Returns:
	        The path of the file that was written.
	    """
	    # The context manager guarantees the handle is closed even when
	    # write() raises, instead of leaking an open file descriptor.
	    with tempfile.NamedTemporaryFile(
	        mode="w", suffix=suffix, delete=False, encoding="utf-8"
	    ) as handle:
	        handle.write(content)
	    return handle.name


	class TestExtractText:
	    """Tests for ``DocumentProcessor.extract_text`` on several file suffixes."""

	    def test_txt_extraction(self, processor):
	        """Text written to a ``.txt`` file is returned unchanged."""
	        txt_file = _write_temp_file("Hello world", ".txt")
	        try:
	            extracted = processor.extract_text(txt_file)
	            assert extracted == "Hello world"
	        finally:
	            os.unlink(txt_file)

	    def test_md_extraction(self, processor):
	        """Heading text and body text of a ``.md`` file both appear in the output."""
	        md_file = _write_temp_file("# Title\nSome content", ".md")
	        try:
	            extracted = processor.extract_text(md_file)
	            for fragment in ("Title", "Some content"):
	                assert fragment in extracted
	        finally:
	            os.unlink(md_file)

	    def test_html_extraction(self, processor):
	        """Paragraph text inside an ``.html`` file appears in the output."""
	        html_file = _write_temp_file(
	            "<html><body><p>Hello HTML</p></body></html>", ".html"
	        )
	        try:
	            extracted = processor.extract_text(html_file)
	            assert "Hello HTML" in extracted
	        finally:
	            os.unlink(html_file)

	    def test_unsupported_format_raises(self, processor):
	        """A ``.csv`` file is rejected with a ``ValueError``."""
	        csv_file = _write_temp_file("data", ".csv")
	        try:
	            expect_rejection = pytest.raises(
	                ValueError, match="Unsupported file type"
	            )
	            with expect_rejection:
	                processor.extract_text(csv_file)
	        finally:
	            os.unlink(csv_file)


	class TestCleanText:
	    """Tests for ``DocumentProcessor.clean_text`` whitespace handling."""

	    def test_collapses_whitespace(self, processor):
	        """A mixed run of space, newline and tab becomes a single space."""
	        cleaned = processor.clean_text("hello \n\t world")
	        assert cleaned == "hello world"

	    def test_strips_leading_trailing(self, processor):
	        """Whitespace at both ends of the input is removed."""
	        cleaned = processor.clean_text(" hello ")
	        assert cleaned == "hello"

	    def test_empty_string(self, processor):
	        """An empty input yields an empty output."""
	        cleaned = processor.clean_text("")
	        assert cleaned == ""


	class TestChunkText:
	    """Tests for ``DocumentProcessor.chunk_text`` chunk sizing and overlap."""

	    def test_short_text_is_single_chunk(self, processor):
	        """Short input comes back as exactly one chunk equal to the input."""
	        pieces = processor.chunk_text("short text")
	        assert len(pieces) == 1
	        only_piece = pieces[0]
	        assert only_piece == "short text"

	    def test_long_text_produces_multiple_chunks(self, processor):
	        """300 repeated words are split into more than one chunk."""
	        long_text = "token " * 300
	        pieces = processor.chunk_text(long_text)
	        assert 1 < len(pieces)

	    def test_chunk_size_respected(self, processor):
	        """No chunk's token count exceeds the processor's ``chunk_size``."""
	        long_text = "word " * 250
	        for piece in processor.chunk_text(long_text):
	            assert processor.count_tokens(piece) <= processor.chunk_size

	    def test_overlap_creates_shared_content(self):
	        """The tail of chunk 0 and the head of chunk 1 encode to the same tokens."""
	        # Uses its own processor (chunk_size=20, overlap=5) rather than the fixture.
	        proc = DocumentProcessor(chunk_size=20, overlap=5)
	        pieces = proc.chunk_text("alpha beta gamma delta epsilon " * 40)
	        assert len(pieces) >= 2
	        encode = proc._tokenizer.encode
	        head_ids = encode(pieces[0])
	        next_ids = encode(pieces[1])
	        shared_tail = head_ids[-proc.overlap:]
	        shared_head = next_ids[:proc.overlap]
	        assert shared_tail == shared_head

	    def test_no_empty_chunks(self, processor):
	        """Every chunk produced has non-zero length."""
	        pieces = processor.chunk_text("hello world " * 20)
	        assert not any(len(piece) == 0 for piece in pieces)


	class TestExtractMetadata:
	    """Tests for ``DocumentProcessor.extract_metadata`` on a ``.txt`` file."""

	    def test_txt_metadata_has_required_keys(self, processor):
	        """The metadata contains title, author, date and file_type keys."""
	        txt_file = _write_temp_file("content", ".txt")
	        try:
	            metadata = processor.extract_metadata(txt_file)
	            for required_key in ("title", "author", "date", "file_type"):
	                assert required_key in metadata
	        finally:
	            os.unlink(txt_file)

	    def test_file_type_matches_extension(self, processor):
	        """``file_type`` is the file's suffix, leading dot included."""
	        txt_file = _write_temp_file("content", ".txt")
	        try:
	            metadata = processor.extract_metadata(txt_file)
	            reported_type = metadata["file_type"]
	            assert reported_type == ".txt"
	        finally:
	            os.unlink(txt_file)

	    def test_date_is_set_when_missing(self, processor):
	        """``date`` is not ``None`` for a plain text file."""
	        txt_file = _write_temp_file("content", ".txt")
	        try:
	            metadata = processor.extract_metadata(txt_file)
	            reported_date = metadata["date"]
	            assert reported_date is not None
	        finally:
	            os.unlink(txt_file)


	class TestProcessDocument:
	    """Tests for ``DocumentProcessor.process_document``, including duplicates."""

	    def test_returns_dict_with_expected_keys(self, processor):
	        """A fresh file yields a result containing "metadata" and "chunks"."""
	        path = _write_temp_file("Hello world content", ".txt")
	        try:
	            result = processor.process_document(path)
	            assert result is not None
	            assert "metadata" in result
	            assert "chunks" in result
	        finally:
	            os.unlink(path)

	    def test_duplicate_returns_none(self, processor):
	        """Processing the same file twice returns ``None`` the second time."""
	        path = _write_temp_file("Identical content", ".txt")
	        try:
	            first = processor.process_document(path)
	            second = processor.process_document(path)
	            assert first is not None
	            assert second is None
	        finally:
	            os.unlink(path)

	    def test_different_files_not_flagged_as_duplicate(self, processor):
	        """Two files with different content are both processed."""
	        # Files are created inside the try block and tracked in a list so
	        # that a failure while creating the second file still removes the
	        # first one instead of leaking it.
	        created_paths = []
	        try:
	            for content in ("Content A", "Content B"):
	                created_paths.append(_write_temp_file(content, ".txt"))
	            for path in created_paths:
	                assert processor.process_document(path) is not None
	        finally:
	            for path in created_paths:
	                os.unlink(path)