Spaces:
Runtime error
Runtime error
| import streamlit as st | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter, Language | |
| import tiktoken | |
# Python source snippets, kept as plain text, that describe the two ways of
# measuring chunk length (characters vs. tiktoken tokens). Nothing in this
# block evaluates them.
CHARACTER_LENGTH = "length_function=lambda x: len(x)"
TOKEN_LENGTH = (
    'enc = tiktoken.get_encoding("cl100k_base")\n'
    "length_function = lambda text: len(enc.encode(text))\n"
)

# Keyword-argument lines and closing parenthesis shared by every splitter
# template below. The trailing empty entry produces the final newline once the
# lines are joined.
_SHARED_TEMPLATE_TAIL = [
    "chunk_size={chunk_size},",
    "chunk_overlap={chunk_overlap},",
    "length_function={length_function}",
    ")",
    "",
]

# Text templates showing the constructor call for each splitter variant. The
# brace-delimited fields ({chunk_size}, {chunk_overlap}, {length_function},
# {language}) are written in str.format placeholder syntax.
# NOTE(review): the lines inside each template carry no leading indentation in
# the available source; confirm whether the original text was indented.
CHARACTER = "\n".join(
    ["CharacterTextSplitter(", 'separator="\\n\\n",', *_SHARED_TEMPLATE_TAIL]
)
RECURSIVE_CHARACTER = "\n".join(
    ["RecursiveCharacterTextSplitter(", *_SHARED_TEMPLATE_TAIL]
)
LANGUAGE = "\n".join(
    [
        "RecursiveCharacterTextSplitter.from_language(",
        'language="{language}",',
        *_SHARED_TEMPLATE_TAIL,
    ]
)
# Streamlit UI: page title followed by the widgets that configure the splitter.
st.title("Understand Chunk and Token")

# Target chunk size handed to the text splitter further down in this script.
chunk_size = st.number_input(
    min_value=1,
    label="Chunk Size",
    value=1000,
)

# Overlap between consecutive chunks.
# The lower bound is 0 rather than 1: the default int(chunk_size * 0.2) is 0
# for chunk sizes 1-4, and the upper bound chunk_size - 1 is 0 for chunk size 1.
# With a lower bound of 1 those values would fall below the minimum, which
# st.number_input rejects with an exception. A lower bound of 0 keeps
# min <= default <= max for every chunk_size >= 1 and also allows "no overlap".
chunk_overlap = st.number_input(
    min_value=0,
    max_value=chunk_size - 1,
    label="Chunk Overlap",
    value=int(chunk_size * 0.2),
)

# Unit in which chunk length is measured.
length_function_option = st.selectbox(
    "Length Function",
    ["Characters", "Tokens"],
)

# Which splitter implementation to build when the text is split.
splitter_choice = st.selectbox(
    "Select a Text Splitter",
    ["RecursiveCharacter", "Character"],
)
# Pick the length function that matches the "Length Function" selection.
# Anything other than the two known options is rejected up front.
if length_function_option not in ("Characters", "Tokens"):
    raise ValueError("Ungültige Option für length_function.")

measure_in_tokens = length_function_option == "Tokens"

# Textual form of the chosen length function.
length_function_str = TOKEN_LENGTH if measure_in_tokens else CHARACTER_LENGTH

if measure_in_tokens:
    # Measure in tokens using tiktoken's "cl100k_base" encoding.
    enc = tiktoken.get_encoding("cl100k_base")

    def length_function(text: str) -> int:
        """Return the number of tokens `enc` produces for `text`."""
        return len(enc.encode(text))
else:
    # Measure in characters.
    length_function = len
# Text input holding the document that will be split.
doc = st.text_area("Füge hier deinen Text ein:")

# Everything below runs only when the "Split Text" button reports a click.
if st.button("Split Text"):
    # Arguments that every splitter variant receives unchanged.
    shared_kwargs = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": length_function,
    }

    # Build the splitter object that matches the selection.
    if splitter_choice == "RecursiveCharacter":
        splitter = RecursiveCharacterTextSplitter(**shared_kwargs)
    elif splitter_choice == "Character":
        splitter = CharacterTextSplitter(separator="\n\n", **shared_kwargs)
    elif "Language." in splitter_choice:
        # The part after the first dot, lower-cased, is passed on as the
        # language identifier.
        # NOTE(review): the splitter selectbox visible in this file offers only
        # "RecursiveCharacter" and "Character", so this branch does not appear
        # to be reachable from it — confirm whether language options are
        # meant to be offered.
        lang = splitter_choice.split(".")[1].lower()
        splitter = RecursiveCharacterTextSplitter.from_language(
            language=lang,
            **shared_kwargs,
        )
    else:
        raise ValueError("Ungültige Wahl beim Text Splitter.")

    # Split the document into chunks.
    splits = splitter.split_text(doc)

    # Show every chunk in its own text area, numbered from 1.
    for position, piece in enumerate(splits, start=1):
        st.text_area(f"Teilstück {position}", piece, height=150)