File size: 2,170 Bytes
625e9e8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
import sys
import os
# Add the parent directory to the path to find document_processor
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from document_processor import DocumentProcessor
# Path to the document to be used for testing
doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'docs', 'python_lists.txt'))
with open(doc_path, 'r', encoding='utf-8') as f:
sample_text = f.read()
def test_splitting_methods():
"""Tests the text splitting methods from DocumentProcessor."""
processor = DocumentProcessor(chunk_size=30, chunk_overlap=10)
print("--- Input Text ---")
print(sample_text)
print("\n" + "="*20 + "\n")
# Test simple word-based splitting
print("--- Testing split_text (word-based) ---")
word_chunks = processor.split_text(sample_text)
print(f"Found {len(word_chunks)} chunks.\n")
for i, chunk in enumerate(word_chunks):
print(f"Chunk {i+1}:")
print(chunk)
print("-" * 10)
print("\n" + "="*20 + "\n")
# Test sentence-based splitting
print("--- Testing split_by_sentences ---")
sentence_chunks = processor.split_by_sentences(sample_text, sentences_per_chunk=2)
print(f"Found {len(sentence_chunks)} chunks.\n")
for i, chunk in enumerate(sentence_chunks):
print(f"Chunk {i+1}:")
print(chunk)
print("-" * 10)
print("\n" + "="*20 + "\n")
# Test recursive splitting
print("--- Testing recursive_split ---")
recursive_chunks = processor.recursive_split(sample_text, chunk_size=500)
print(f"Found {len(recursive_chunks)} chunks.\n")
for i, chunk in enumerate(recursive_chunks):
print(f"Chunk {i+1}:")
print(chunk)
print("-" * 10)
print("\n" + "="*20 + "\n")
# Test smart splitting
print("--- Testing smart_split ---")
smart_chunks = processor.smart_split(sample_text, chunk_size=500)
print(f"Found {len(smart_chunks)} chunks.\n")
for i, chunk in enumerate(smart_chunks):
print(f"Chunk {i+1}:")
print(chunk)
print("-" * 10)
if __name__ == "__main__":
test_splitting_methods()
|