Spaces:
Sleeping
Sleeping
| import time | |
| from . import chunk_text | |
def benchmark_chunker(text: str, chunk_size: int, overlap: int, method: str) -> None:
    """Run ``chunk_text`` once and print a short timing and size report.

    All four arguments are forwarded unchanged to ``chunk_text``. The result
    must support ``len()`` and ``[0]``, and each chunk is read through its
    ``"text"`` and ``"meta"`` keys.

    Args:
        text: Input to be chunked.
        chunk_size: Chunk size passed through to ``chunk_text``.
        overlap: Overlap passed through to ``chunk_text``.
        method: Chunking method passed through to ``chunk_text``; also shown
            in the report header.

    Returns:
        None. The report is written to stdout.
    """
    print(f"Benchmarking {method} chunker...")

    # perf_counter() is monotonic and high-resolution, so the interval cannot
    # be skewed by wall-clock adjustments the way time.time() can.
    start = time.perf_counter()
    chunks = chunk_text(text, chunk_size, overlap, method)
    elapsed = time.perf_counter() - start

    lens = [len(c["text"]) for c in chunks]
    print(f"Total Chunks: {len(chunks)}")
    if lens:
        print(f"Avg Chunk Size: {sum(lens) / len(lens):.1f}")
        print(f"Min/Max Chunk Size: {min(lens)}/{max(lens)}")
    else:
        # An empty result would otherwise raise ZeroDivisionError on the
        # average (and ValueError on min()/max()); report it instead.
        print("Avg Chunk Size: n/a")
        print("Min/Max Chunk Size: n/a")
    print(f"Time Taken: {elapsed:.4f}s")

    first = chunks[0] if chunks else None
    print("Sample metadata:", first["meta"] if first is not None else None)
    print("--- Sample chunk ---")
    if first is not None:
        # Only a 200-character preview, to keep the report readable.
        print(first["text"][:200])
    print("-" * 40)
if __name__ == "__main__":
    # Synthetic corpus: 100 paragraphs, each made of 20 copies of the same
    # sentence followed by a blank line.
    paragraph = "This is a sample paragraph. " * 20 + "\n\n"
    text = paragraph * 100

    # (method, overlap) pairs to benchmark, in order; chunk size is fixed.
    for method, overlap in (("fixed", 50), ("semantic", 0)):
        benchmark_chunker(text, chunk_size=300, overlap=overlap, method=method)