Spaces:
Runtime error
Runtime error
| import streamlit as st | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter, Language | |
| import tiktoken | |
#
# EXAMPLE CODE SNIPPETS
# =====================
# Templates for the example code shown in the UI. Each splitter template is
# filled in with str.format() and takes the placeholders {chunk_size},
# {chunk_overlap} and {length_function} (LANGUAGE additionally takes
# {language}).
#
# {length_function} is replaced by one of the two *_LENGTH snippets below.
# Those snippets are complete statements that define a name called
# `length_function`; the splitter call then refers to that name. This keeps
# the rendered example valid Python for both the one-line character variant
# and the two-line token variant.
#
CHARACTER_LENGTH = "length_function = lambda x: len(x)"

TOKEN_LENGTH = """enc = tiktoken.get_encoding("cl100k_base")
length_function = lambda text: len(enc.encode(text))"""

# The doubled backslashes keep a literal "\n\n" in the rendered snippet.
CHARACTER = """{length_function}
splitter = CharacterTextSplitter(
    separator="\\n\\n",
    chunk_size={chunk_size},
    chunk_overlap={chunk_overlap},
    length_function=length_function,
)
"""

RECURSIVE_CHARACTER = """{length_function}
splitter = RecursiveCharacterTextSplitter(
    chunk_size={chunk_size},
    chunk_overlap={chunk_overlap},
    length_function=length_function,
)
"""

LANGUAGE = """{length_function}
splitter = RecursiveCharacterTextSplitter.from_language(
    language="{language}",
    chunk_size={chunk_size},
    chunk_overlap={chunk_overlap},
    length_function=length_function,
)
"""
# Streamlit UI: page title followed by a short explanation of each setting.
st.title("Text Splitter Playground")
st.info(
    "Splitte einen Text in Teilstücke (Chunks), basierend auf deinen Einstellungen:\n"
    "- **Chunk Size**: Maximalgröße eines Teilstücks (in Zeichen oder Tokens)\n"
    "- **Chunk Overlap**: Überlappung zwischen den Teilstücken\n"
    "- **Length Function**: Gibt an, ob die Teilstück-Größe in Zeichen oder Tokens gemessen werden soll\n"
    "- **Splitter Choice**: Definiert den Text-Splitter (Charakter-basiert, rekursiv oder basierend auf einer Sprache)\n"
)
# Four input columns; the last one is twice as wide as the others.
col1, col2, col3, col4 = st.columns([1, 1, 1, 2])

with col1:
    chunk_size = st.number_input(
        min_value=1,
        label="Chunk Size",
        value=1000,
    )

with col2:
    # The overlap has to stay strictly below the chunk size. An overlap of 0
    # is allowed so that the bounds remain consistent even for a chunk size
    # of 1, and the 20 % default is clamped into the valid range. Without
    # this, chunk sizes below 5 yield a default of 0 that is smaller than the
    # minimum and Streamlit rejects the widget.
    max_overlap = max(chunk_size - 1, 0)
    chunk_overlap = st.number_input(
        min_value=0,
        max_value=max_overlap,
        label="Chunk Overlap",
        value=min(int(chunk_size * 0.2), max_overlap),
    )
    # Defensive check; the widget bounds above should already prevent this.
    if chunk_overlap >= chunk_size:
        st.warning("Achtung: Chunk Overlap sollte kleiner als die Chunk Size sein!")

with col3:
    length_function_option = st.selectbox(
        "Length Function",
        ["Characters", "Tokens"],
    )

# Two generic splitters plus one entry per member of the Language enum.
splitter_choices = ["RecursiveCharacter", "Character"] + [
    f"Language.{v.name}" for v in Language
]

with col4:
    splitter_choice = st.selectbox(
        "Select a Text Splitter",
        splitter_choices,
    )
# Pick the length function that matches the selected unit, together with the
# code snippet that describes it in the displayed example.
if length_function_option not in ("Characters", "Tokens"):
    raise ValueError("Ungültige Option für length_function.")

if length_function_option == "Tokens":
    # Measure in tokens using tiktoken's cl100k_base encoding.
    enc = tiktoken.get_encoding("cl100k_base")

    def length_function(text: str) -> int:
        """Return the number of cl100k_base tokens in *text*."""
        return len(enc.encode(text))

    length_function_str = TOKEN_LENGTH
else:
    # Measure in characters.
    length_function = len
    length_function_str = CHARACTER_LENGTH
# Build the example code that shows the splitter selected by the user.
# These arguments are shared by all three templates.
snippet_args = {
    "chunk_size": chunk_size,
    "chunk_overlap": chunk_overlap,
    "length_function": length_function_str,
}

if splitter_choice == "Character":
    import_text = CHARACTER.format(**snippet_args)
elif splitter_choice == "RecursiveCharacter":
    import_text = RECURSIVE_CHARACTER.format(**snippet_args)
elif splitter_choice.startswith("Language."):
    # "Language.PYTHON" -> "python"
    lang = splitter_choice.split(".")[1].lower()
    import_text = LANGUAGE.format(language=lang, **snippet_args)
else:
    raise ValueError("Ungültige Wahl beim Text Splitter.")

# Show the generated example. st.info renders Markdown, which collapses the
# snippet's line breaks, so the code itself goes into a code block.
st.info("**Beispielcode:**")
st.code(import_text, language="python")
# Text input
doc = st.text_area("Füge hier deinen Text ein:")

# Button that triggers the split
if st.button("Split Text"):
    if not doc.strip():
        # Nothing to split: tell the user instead of silently showing nothing.
        st.warning("Bitte gib zuerst einen Text ein.")
    else:
        # Arguments shared by every splitter variant.
        splitter_kwargs = {
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
            "length_function": length_function,
        }

        # Create the splitter object for the selected choice.
        if splitter_choice == "Character":
            splitter = CharacterTextSplitter(separator="\n\n", **splitter_kwargs)
        elif splitter_choice == "RecursiveCharacter":
            splitter = RecursiveCharacterTextSplitter(**splitter_kwargs)
        elif splitter_choice.startswith("Language."):
            # The choices are built as f"Language.{member.name}", so the part
            # after the dot maps straight back to the enum member.
            language = Language[splitter_choice.split(".", 1)[1]]
            splitter = RecursiveCharacterTextSplitter.from_language(
                language=language,
                **splitter_kwargs,
            )
        else:
            raise ValueError("Ungültige Wahl beim Text Splitter.")

        # Split the text and show every chunk in its own text area.
        splits = splitter.split_text(doc)
        for idx, split in enumerate(splits, start=1):
            st.text_area(f"Teilstück {idx}", split, height=150)