|
|
|
|
|
import sys |
|
|
import os |
|
|
|
|
|
|
|
|
# Add this file's parent directory to the module search path.
# NOTE(review): presumably needed so the `document_processor` import that
# follows resolves when this script is run from its own folder — confirm.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
|
|
|
|
from document_processor import DocumentProcessor |
|
|
|
|
|
|
|
|
# Absolute path of the sample document: ../docs/python_lists.txt relative to
# the directory that contains this file.
doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'docs', 'python_lists.txt'))

# Read the whole document once at module load; `sample_text` is the input
# that test_splitting_methods() prints and feeds to each splitter.
with open(doc_path, 'r', encoding='utf-8') as f:
    sample_text = f.read()
|
|
|
|
|
def _print_separator():
    """Print the divider line that is emitted between two demo sections."""
    print("\n" + "="*20 + "\n")


def _print_chunks(chunks):
    """Print how many chunks were produced, then each chunk with a 1-based label."""
    print(f"Found {len(chunks)} chunks.\n")
    for number, chunk in enumerate(chunks, start=1):
        print(f"Chunk {number}:")
        print(chunk)
        print("-" * 10)


def test_splitting_methods():
    """Tests the text splitting methods from DocumentProcessor.

    Builds a DocumentProcessor with chunk_size=30 and chunk_overlap=10, prints
    the module-level ``sample_text``, then runs ``split_text``,
    ``split_by_sentences``, ``recursive_split`` and ``smart_split`` on it and
    prints the chunks each one returns. Nothing is asserted and nothing is
    returned; the output is meant to be inspected by eye.
    """
    processor = DocumentProcessor(chunk_size=30, chunk_overlap=10)

    print("--- Input Text ---")
    print(sample_text)
    _print_separator()

    # Each heading is printed before its splitter is called, so the heading
    # is already on screen if the splitter prints anything or raises.
    print("--- Testing split_text (word-based) ---")
    _print_chunks(processor.split_text(sample_text))
    _print_separator()

    print("--- Testing split_by_sentences ---")
    _print_chunks(processor.split_by_sentences(sample_text, sentences_per_chunk=2))
    _print_separator()

    print("--- Testing recursive_split ---")
    _print_chunks(processor.recursive_split(sample_text, chunk_size=500))
    _print_separator()

    # Last section: no trailing separator is printed after it.
    print("--- Testing smart_split ---")
    _print_chunks(processor.smart_split(sample_text, chunk_size=500))
|
|
|
|
|
|
|
|
# Run the demo only when this file is executed directly, not when imported.
if __name__ == "__main__":
    test_splitting_methods()
|
|
|