Spaces:
Runtime error
Runtime error
| from __future__ import annotations | |
| import re | |
| from typing import Any, List, Literal, Optional, Union | |
| from langchain_text_splitters.base import Language, TextSplitter | |
class CharacterTextSplitter(TextSplitter):
    """Text splitter that cuts on one separator (literal or regex)."""

    def __init__(
        self, separator: str = "\n\n", is_separator_regex: bool = False, **kwargs: Any
    ) -> None:
        """Record the separator settings.

        Args:
            separator: String to split on. Interpreted as a regular
                expression only when ``is_separator_regex`` is true.
            is_separator_regex: Whether ``separator`` is already a regex
                pattern (true) or a literal string to be escaped (false).
            **kwargs: Forwarded unchanged to ``TextSplitter.__init__``.
        """
        super().__init__(**kwargs)
        self._separator = separator
        self._is_separator_regex = is_separator_regex

    def split_text(self, text: str) -> List[str]:
        """Split ``text`` on the configured separator and merge the pieces.

        Args:
            text: The text to split.

        Returns:
            Whatever ``self._merge_splits`` produces from the raw pieces.
        """
        # Literal separators are escaped so regex metacharacters in them
        # are matched verbatim.
        if self._is_separator_regex:
            pattern = self._separator
        else:
            pattern = re.escape(self._separator)

        pieces = _split_text_with_regex(text, pattern, self._keep_separator)

        # When separators were kept they are already inside the pieces, so
        # nothing is re-inserted while merging.
        if self._keep_separator:
            joiner = ""
        else:
            joiner = self._separator
        return self._merge_splits(pieces, joiner)
| def _split_text_with_regex( | |
| text: str, separator: str, keep_separator: Union[bool, Literal["start", "end"]] | |
| ) -> List[str]: | |
| # Now that we have the separator, split the text | |
| if separator: | |
| if keep_separator: | |
| # The parentheses in the pattern keep the delimiters in the result. | |
| _splits = re.split(f"({separator})", text) | |
| splits = ( | |
| ([_splits[i] + _splits[i + 1] for i in range(0, len(_splits) - 1, 2)]) | |
| if keep_separator == "end" | |
| else ([_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]) | |
| ) | |
| if len(_splits) % 2 == 0: | |
| splits += _splits[-1:] | |
| splits = ( | |
| (splits + [_splits[-1]]) | |
| if keep_separator == "end" | |
| else ([_splits[0]] + splits) | |
| ) | |
| else: | |
| splits = re.split(separator, text) | |
| else: | |
| splits = list(text) | |
| return [s for s in splits if s != ""] | |
class RecursiveCharacterTextSplitter(TextSplitter):
    """Split text by recursively trying a prioritized list of separators.

    The first separator in the list that occurs in the text is used to split
    it. Any resulting piece that is not shorter than the chunk size is split
    again using only the separators that come later in the list.
    """

    def __init__(
        self,
        separators: Optional[List[str]] = None,
        keep_separator: Union[bool, Literal["start", "end"]] = True,
        is_separator_regex: bool = False,
        **kwargs: Any,
    ) -> None:
        """Create a new recursive splitter.

        Args:
            separators: Separators to try, in priority order. Defaults to
                ``["\\n\\n", "\\n", " ", ""]`` when omitted or empty.
            keep_separator: Passed to ``TextSplitter.__init__``; controls
                whether matched separators stay in the pieces (``True`` /
                ``"start"`` / ``"end"``) or are dropped (``False``).
            is_separator_regex: Whether the separators are regex patterns
                (true) or literal strings to be escaped (false).
            **kwargs: Forwarded unchanged to ``TextSplitter.__init__``.
        """
        super().__init__(keep_separator=keep_separator, **kwargs)
        self._separators = separators or ["\n\n", "\n", " ", ""]
        self._is_separator_regex = is_separator_regex

    def _split_text(self, text: str, separators: List[str]) -> List[str]:
        """Split ``text`` using the first applicable entry of ``separators``.

        Args:
            text: The text to split.
            separators: Candidate separators in priority order.

        Returns:
            The final chunks, in order of appearance.
        """
        final_chunks = []
        # Pick the first separator that occurs in the text; fall back to the
        # last one in the list if none does.
        separator = separators[-1]
        new_separators = []
        for i, _s in enumerate(separators):
            _separator = _s if self._is_separator_regex else re.escape(_s)
            if _s == "":
                # The empty separator splits into characters and leaves
                # nothing finer to recurse with.
                separator = _s
                break
            if re.search(_separator, text):
                separator = _s
                new_separators = separators[i + 1 :]
                break

        _separator = separator if self._is_separator_regex else re.escape(separator)
        splits = _split_text_with_regex(text, _separator, self._keep_separator)

        # Merge runs of small pieces; recursively split the long ones.
        _good_splits = []
        # Kept separators are already inside the pieces, so merge with "".
        _separator = "" if self._keep_separator else separator
        for s in splits:
            if self._length_function(s) < self._chunk_size:
                _good_splits.append(s)
            else:
                # Flush the pending small pieces first to preserve order.
                if _good_splits:
                    merged_text = self._merge_splits(_good_splits, _separator)
                    final_chunks.extend(merged_text)
                    _good_splits = []
                if not new_separators:
                    # No finer separator left: emit the long piece as-is.
                    final_chunks.append(s)
                else:
                    other_info = self._split_text(s, new_separators)
                    final_chunks.extend(other_info)
        if _good_splits:
            merged_text = self._merge_splits(_good_splits, _separator)
            final_chunks.extend(merged_text)
        return final_chunks

    def split_text(self, text: str) -> List[str]:
        """Split ``text`` using the separators configured on this instance."""
        return self._split_text(text, self._separators)

    @classmethod
    def from_language(
        cls, language: Language, **kwargs: Any
    ) -> RecursiveCharacterTextSplitter:
        """Build a splitter preconfigured with the separators of ``language``.

        Args:
            language: The language whose separator list should be used.
            **kwargs: Forwarded to the class constructor.

        Returns:
            A splitter whose separators are treated as regex patterns.

        Raises:
            ValueError: If ``language`` has no separator list.
        """
        separators = cls.get_separators_for_language(language)
        return cls(separators=separators, is_separator_regex=True, **kwargs)

    @staticmethod
    def get_separators_for_language(language: Language) -> List[str]:
        """Return the prioritized separator list for ``language``.

        ``from_language`` passes the returned strings on as regular
        expressions, so regex metacharacters that are meant literally are
        escaped in the lists below.

        Args:
            language: The language to look up.

        Returns:
            Separators ordered from coarsest to finest.

        Raises:
            ValueError: If ``language`` is a ``Language`` value without a
                separator list, or is not a ``Language`` value at all.
        """
        if language == Language.CPP:
            return [
                # Split along class definitions
                "\nclass ",
                # Split along function definitions
                "\nvoid ",
                "\nint ",
                "\nfloat ",
                "\ndouble ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nswitch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.GO:
            return [
                # Split along function definitions
                "\nfunc ",
                "\nvar ",
                "\nconst ",
                "\ntype ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nswitch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.JAVA:
            return [
                # Split along class definitions
                "\nclass ",
                # Split along method definitions
                "\npublic ",
                "\nprotected ",
                "\nprivate ",
                "\nstatic ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nswitch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.KOTLIN:
            return [
                # Split along class definitions
                "\nclass ",
                # Split along method definitions
                "\npublic ",
                "\nprotected ",
                "\nprivate ",
                "\ninternal ",
                "\ncompanion ",
                "\nfun ",
                "\nval ",
                "\nvar ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nwhen ",
                "\ncase ",
                "\nelse ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.JS:
            return [
                # Split along function definitions
                "\nfunction ",
                "\nconst ",
                "\nlet ",
                "\nvar ",
                "\nclass ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nswitch ",
                "\ncase ",
                "\ndefault ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.TS:
            return [
                "\nenum ",
                "\ninterface ",
                "\nnamespace ",
                "\ntype ",
                # Split along class definitions
                "\nclass ",
                # Split along function definitions
                "\nfunction ",
                "\nconst ",
                "\nlet ",
                "\nvar ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nswitch ",
                "\ncase ",
                "\ndefault ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.PHP:
            return [
                # Split along function definitions
                "\nfunction ",
                # Split along class definitions
                "\nclass ",
                # Split along control flow statements
                "\nif ",
                "\nforeach ",
                "\nwhile ",
                "\ndo ",
                "\nswitch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.PROTO:
            return [
                # Split along message definitions
                "\nmessage ",
                # Split along service definitions
                "\nservice ",
                # Split along enum definitions
                "\nenum ",
                # Split along option definitions
                "\noption ",
                # Split along import statements
                "\nimport ",
                # Split along syntax declarations
                "\nsyntax ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.PYTHON:
            return [
                # First, try to split along class definitions
                "\nclass ",
                "\ndef ",
                "\n\tdef ",
                # Now split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.RST:
            return [
                # Split along section titles
                "\n=+\n",
                "\n-+\n",
                "\n\\*+\n",
                # Split along directive markers (dots escaped so they match
                # a literal ".." rather than any two characters)
                "\n\n\\.\\. *\n\n",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.RUBY:
            return [
                # Split along method definitions
                "\ndef ",
                "\nclass ",
                # Split along control flow statements
                "\nif ",
                "\nunless ",
                "\nwhile ",
                "\nfor ",
                "\ndo ",
                "\nbegin ",
                "\nrescue ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.RUST:
            return [
                # Split along function definitions
                "\nfn ",
                "\nconst ",
                "\nlet ",
                # Split along control flow statements
                "\nif ",
                "\nwhile ",
                "\nfor ",
                "\nloop ",
                "\nmatch ",
                "\nconst ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.SCALA:
            return [
                # Split along class definitions
                "\nclass ",
                "\nobject ",
                # Split along method definitions
                "\ndef ",
                "\nval ",
                "\nvar ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nmatch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.SWIFT:
            return [
                # Split along function definitions
                "\nfunc ",
                # Split along class definitions
                "\nclass ",
                "\nstruct ",
                "\nenum ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\ndo ",
                "\nswitch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.MARKDOWN:
            return [
                # First, try to split along Markdown headings (levels 1-6)
                "\n#{1,6} ",
                # Note the alternative syntax for headings (below) is not
                # handled here:
                # Heading level 2
                # ---------------
                # Code fence followed by a newline
                "```\n",
                # Horizontal lines: three or more of *, - or _ on their
                # own line
                "\n\\*\\*\\*+\n",
                "\n---+\n",
                "\n___+\n",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.LATEX:
            return [
                # First, try to split along Latex sections
                "\n\\\\chapter{",
                "\n\\\\section{",
                "\n\\\\subsection{",
                "\n\\\\subsubsection{",
                # Now split by environments
                "\n\\\\begin{enumerate}",
                "\n\\\\begin{itemize}",
                "\n\\\\begin{description}",
                "\n\\\\begin{list}",
                "\n\\\\begin{quote}",
                "\n\\\\begin{quotation}",
                "\n\\\\begin{verse}",
                "\n\\\\begin{verbatim}",
                # Now split by math environments. The backslash is doubled
                # twice (once for Python, once for the regex) so the pattern
                # matches a literal "\begin{align}".
                "\n\\\\begin{align}",
                # "$" is a regex anchor, so it is escaped to match literal
                # dollar signs.
                "\\$\\$",
                "\\$",
                # Now split by the normal type of lines
                " ",
                "",
            ]
        elif language == Language.HTML:
            return [
                # First, try to split along HTML tags
                "<body",
                "<div",
                "<p",
                "<br",
                "<li",
                "<h1",
                "<h2",
                "<h3",
                "<h4",
                "<h5",
                "<h6",
                "<span",
                "<table",
                "<tr",
                "<td",
                "<th",
                "<ul",
                "<ol",
                "<header",
                "<footer",
                "<nav",
                # Head
                "<head",
                "<style",
                "<script",
                "<meta",
                "<title",
                "",
            ]
        elif language == Language.CSHARP:
            return [
                "\ninterface ",
                "\nenum ",
                "\nimplements ",
                "\ndelegate ",
                "\nevent ",
                # Split along class definitions
                "\nclass ",
                "\nabstract ",
                # Split along method definitions
                "\npublic ",
                "\nprotected ",
                "\nprivate ",
                "\nstatic ",
                "\nreturn ",
                # Split along control flow statements
                "\nif ",
                "\ncontinue ",
                "\nfor ",
                "\nforeach ",
                "\nwhile ",
                "\nswitch ",
                "\nbreak ",
                "\ncase ",
                "\nelse ",
                # Split by exceptions
                "\ntry ",
                "\nthrow ",
                "\nfinally ",
                "\ncatch ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.SOL:
            return [
                # Split along compiler information definitions
                "\npragma ",
                "\nusing ",
                # Split along contract definitions
                "\ncontract ",
                "\ninterface ",
                "\nlibrary ",
                # Split along method definitions
                "\nconstructor ",
                "\ntype ",
                "\nfunction ",
                "\nevent ",
                "\nmodifier ",
                "\nerror ",
                "\nstruct ",
                "\nenum ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\ndo while ",
                "\nassembly ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.COBOL:
            return [
                # Split along divisions
                "\nIDENTIFICATION DIVISION.",
                "\nENVIRONMENT DIVISION.",
                "\nDATA DIVISION.",
                "\nPROCEDURE DIVISION.",
                # Split along sections within DATA DIVISION
                "\nWORKING-STORAGE SECTION.",
                "\nLINKAGE SECTION.",
                "\nFILE SECTION.",
                # Split along sections within PROCEDURE DIVISION
                "\nINPUT-OUTPUT SECTION.",
                # Split along paragraphs and common statements
                "\nOPEN ",
                "\nCLOSE ",
                "\nREAD ",
                "\nWRITE ",
                "\nIF ",
                "\nELSE ",
                "\nMOVE ",
                "\nPERFORM ",
                "\nUNTIL ",
                "\nVARYING ",
                "\nACCEPT ",
                "\nDISPLAY ",
                "\nSTOP RUN.",
                # Split by the normal type of lines
                "\n",
                " ",
                "",
            ]
        elif language == Language.LUA:
            return [
                # Split along variable and table definitions
                "\nlocal ",
                # Split along function definitions
                "\nfunction ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nrepeat ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.HASKELL:
            return [
                # Split along function definitions
                "\nmain :: ",
                "\nmain = ",
                "\nlet ",
                "\nin ",
                "\ndo ",
                "\nwhere ",
                "\n:: ",
                "\n= ",
                # Split along type declarations
                "\ndata ",
                "\nnewtype ",
                "\ntype ",
                "\n:: ",
                # Split along module declarations
                "\nmodule ",
                # Split along import statements
                "\nimport ",
                "\nqualified ",
                "\nimport qualified ",
                # Split along typeclass declarations
                "\nclass ",
                "\ninstance ",
                # Split along case expressions
                "\ncase ",
                # Split along guards in function definitions
                "\n| ",
                # Split along record field declarations
                "\ndata ",
                "\n= {",
                "\n, ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language in Language._value2member_map_:
            # A known Language value that has no separator list above.
            raise ValueError(f"Language {language} is not implemented yet!")
        else:
            raise ValueError(
                f"Language {language} is not supported! "
                f"Please choose from {list(Language)}"
            )