Update app.py
Browse files
app.py
CHANGED
|
@@ -707,6 +707,10 @@ def sentence_splitter_instantiation(
|
|
| 707 |
sentence_splitter_config,
|
| 708 |
):
|
| 709 |
### Chunker/Sentence Splitter
|
|
|
|
|
|
|
|
|
|
|
|
|
| 710 |
if sentence_splitter_config.value is not None:
|
| 711 |
sentence_splitter_config_values = sentence_splitter_config.value
|
| 712 |
validated_chunk_overlap = min(sentence_splitter_config_values.get("chunk_overlap"),
|
|
@@ -719,6 +723,7 @@ def sentence_splitter_instantiation(
|
|
| 719 |
paragraph_separator=sentence_splitter_config_values.get("paragraph_separator"),
|
| 720 |
secondary_chunking_regex=sentence_splitter_config_values.get("secondary_chunking_regex"),
|
| 721 |
include_metadata=sentence_splitter_config_values.get("include_metadata"),
|
|
|
|
| 722 |
)
|
| 723 |
|
| 724 |
else:
|
|
@@ -729,6 +734,7 @@ def sentence_splitter_instantiation(
|
|
| 729 |
paragraph_separator="\n\n\n",
|
| 730 |
secondary_chunking_regex="[^,.;?!]+[,.;?!]?",
|
| 731 |
include_metadata=True,
|
|
|
|
| 732 |
)
|
| 733 |
return (sentence_splitter,)
|
| 734 |
|
|
|
|
| 707 |
sentence_splitter_config,
|
| 708 |
):
|
| 709 |
### Chunker/Sentence Splitter
|
| 710 |
+
def simple_whitespace_tokenizer(text):
|
| 711 |
+
"""Tokenizer that considers each word as one token"""
|
| 712 |
+
return text.split()
|
| 713 |
+
|
| 714 |
if sentence_splitter_config.value is not None:
|
| 715 |
sentence_splitter_config_values = sentence_splitter_config.value
|
| 716 |
validated_chunk_overlap = min(sentence_splitter_config_values.get("chunk_overlap"),
|
|
|
|
| 723 |
paragraph_separator=sentence_splitter_config_values.get("paragraph_separator"),
|
| 724 |
secondary_chunking_regex=sentence_splitter_config_values.get("secondary_chunking_regex"),
|
| 725 |
include_metadata=sentence_splitter_config_values.get("include_metadata"),
|
| 726 |
+
tokenizer=simple_whitespace_tokenizer
|
| 727 |
)
|
| 728 |
|
| 729 |
else:
|
|
|
|
| 734 |
paragraph_separator="\n\n\n",
|
| 735 |
secondary_chunking_regex="[^,.;?!]+[,.;?!]?",
|
| 736 |
include_metadata=True,
|
| 737 |
+
tokenizer=simple_whitespace_tokenizer
|
| 738 |
)
|
| 739 |
return (sentence_splitter,)
|
| 740 |
|