Spaces:
Runtime error
Runtime error
| """Test text splitters that require an integration.""" | |
| import pytest | |
| from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter | |
def test_huggingface_type_check() -> None:
    """Check that an invalid tokenizer argument raises ``ValueError``."""
    # A plain string stands in for "not a tokenizer object".
    bogus_tokenizer = "foo"
    with pytest.raises(ValueError):
        CharacterTextSplitter.from_huggingface_tokenizer(bogus_tokenizer)
def test_huggingface_tokenizer() -> None:
    """Check splitting when the splitter is built from a HuggingFace tokenizer."""
    # Local import: transformers is only needed by this test.
    from transformers import GPT2TokenizerFast

    # Split on single spaces, one unit per chunk, no overlap between chunks.
    splitter_options = {"separator": " ", "chunk_size": 1, "chunk_overlap": 0}
    gpt2_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    splitter = CharacterTextSplitter.from_huggingface_tokenizer(
        gpt2_tokenizer, **splitter_options
    )

    chunks = splitter.split_text("foo bar")
    assert chunks == ["foo", "bar"]
class TestTokenTextSplitter:
    """Tests for ``TokenTextSplitter``."""

    def test_basic(self) -> None:
        """Split into chunks of five tokens with no overlap."""
        text = "abcdef" * 5  # 10 token string
        chunks = TokenTextSplitter(chunk_size=5, chunk_overlap=0).split_text(text)

        expected_chunks = ["abcdefabcdefabc", "defabcdefabcdef"]
        assert chunks == expected_chunks

    def test_overlap(self) -> None:
        """Split into chunks of five tokens with an overlap of one token."""
        text = "abcdef" * 5  # 10 token string
        chunks = TokenTextSplitter(chunk_size=5, chunk_overlap=1).split_text(text)

        # With overlap, each chunk after the first begins on the previous
        # chunk's final token, so a third, shorter chunk is produced.
        expected_chunks = ["abcdefabcdefabc", "abcdefabcdefabc", "abcdef"]
        assert chunks == expected_chunks