Commit
·
c770ec4
1
Parent(s):
b6b8b4f
clean
Browse files
app.py
CHANGED
|
@@ -2,36 +2,51 @@ import gradio as gr
|
|
| 2 |
import pandas as pd
|
| 3 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
def tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks):
|
|
|
|
|
|
|
|
|
|
| 6 |
num_chunks = int(num_chunks)
|
| 7 |
output = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
if method == "RecursiveCharacterTextSplitter":
|
| 9 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len, is_separator_regex=False)
|
| 10 |
tokenized_texts = text_splitter.split_text(text)[:num_chunks]
|
| 11 |
for i, chunk in enumerate(tokenized_texts):
|
| 12 |
output.append({
|
| 13 |
-
'
|
| 14 |
-
'
|
| 15 |
-
'
|
| 16 |
-
'
|
| 17 |
})
|
|
|
|
| 18 |
df = pd.DataFrame(output)
|
| 19 |
return df
|
| 20 |
|
| 21 |
iface = gr.Interface(
|
| 22 |
fn=tokenize_text,
|
| 23 |
inputs=[
|
| 24 |
-
gr.Dropdown(label="Select Tokenization Method", choices=["RecursiveCharacterTextSplitter"]),
|
| 25 |
gr.Textbox(label="Enter Text", lines=10, placeholder="Type or paste text here."),
|
| 26 |
-
gr.Number(label="Chunk Size", value=
|
| 27 |
-
gr.Number(label="Chunk Overlap", value=
|
| 28 |
-
gr.Number(label="Number of Chunks to Display", value=
|
| 29 |
],
|
| 30 |
outputs=gr.Dataframe(headers=["Chunk #", "Text Chunk", "Character Count", "Token Count"]),
|
| 31 |
title="Text Tokenization Tool",
|
| 32 |
-
description="A tool for tokenizing text using different methods. Enter your text and choose your settings to see the results.",
|
| 33 |
theme="dark",
|
|
|
|
| 34 |
width=800 # Adjust this value as needed
|
| 35 |
)
|
| 36 |
|
| 37 |
-
iface.launch(share=True, inbrowser=True)
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 4 |
|
| 5 |
+
# Constants for default values
# These seed the Gradio Number inputs below; change them here to change the
# UI's initial settings.
DEFAULT_CHUNK_SIZE = 100     # characters per chunk (splitter uses length_function=len)
DEFAULT_CHUNK_OVERLAP = 0    # characters shared between consecutive chunks
DEFAULT_NUM_CHUNKS = 10      # cap on how many chunks are shown in the output table
|
| 10 |
def tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Split the input text into chunks with the selected method and tabulate them.

    Parameters
    ----------
    method : str
        Splitter to use; currently only "RecursiveCharacterTextSplitter"
        is supported. Unknown methods yield an empty table.
    text : str or None
        Text to split. None / empty / whitespace-only input yields an
        empty table.
    chunk_size : int or float
        Maximum characters per chunk (gr.Number delivers floats).
    chunk_overlap : int or float
        Characters of overlap between consecutive chunks.
    num_chunks : int or float
        Maximum number of chunks to include in the result.

    Returns
    -------
    pandas.DataFrame
        Always carries the columns 'Chunk #', 'Text Chunk',
        'Character Count', 'Token Count' — even when empty — so the
        gr.Dataframe output renders consistent headers.
    """
    columns = ['Chunk #', 'Text Chunk', 'Character Count', 'Token Count']

    # gr.Number components deliver floats; the splitter expects integral sizes.
    num_chunks = int(num_chunks)
    chunk_size = int(chunk_size)
    chunk_overlap = int(chunk_overlap)

    # Guard: no usable text (None or whitespace-only) -> empty, correctly-shaped table.
    # The original called text.strip() unconditionally, which raises
    # AttributeError when the Textbox hands over None.
    if not text or not text.strip():
        return pd.DataFrame(columns=columns)

    output = []
    if method == "RecursiveCharacterTextSplitter":
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )
        tokenized_texts = text_splitter.split_text(text)[:num_chunks]
        for i, chunk in enumerate(tokenized_texts):
            output.append({
                'Chunk #': i,
                'Text Chunk': chunk,
                'Character Count': len(chunk),
                # "Tokens" here are whitespace-separated words, not model tokens.
                'Token Count': len(chunk.split()),
            })

    # Passing columns= keeps the header set stable even when output is empty
    # (e.g. an unrecognized method), matching the empty-text path above.
    return pd.DataFrame(output, columns=columns)
|
| 34 |
|
| 35 |
# Build the Gradio UI: five inputs feed tokenize_text, whose DataFrame is
# rendered by the gr.Dataframe output with matching headers.
iface = gr.Interface(
    fn=tokenize_text,
    inputs=[
        # Fix: gradio 3+ removed Dropdown's `default=` kwarg in favor of
        # `value=` (the gr.Number inputs below already use `value=`).
        gr.Dropdown(
            label="Select Tokenization Method",
            choices=["RecursiveCharacterTextSplitter"],
            value="RecursiveCharacterTextSplitter",
        ),
        gr.Textbox(label="Enter Text", lines=10, placeholder="Type or paste text here."),
        gr.Number(label="Chunk Size", value=DEFAULT_CHUNK_SIZE),
        gr.Number(label="Chunk Overlap", value=DEFAULT_CHUNK_OVERLAP),
        gr.Number(label="Number of Chunks to Display", value=DEFAULT_NUM_CHUNKS),
    ],
    outputs=gr.Dataframe(headers=["Chunk #", "Text Chunk", "Character Count", "Token Count"]),
    title="Text Tokenization Tool",
    description="A tool for tokenizing text using different methods. Enter your text and choose your settings to see the results. It splits the text into chunks based on the specified chunk size and overlap.",
    # NOTE(review): on current gradio releases `theme` expects a Theme object
    # or a hub theme name, and `layout`/`width` are no longer gr.Interface
    # kwargs (they raise TypeError) — confirm against the pinned gradio
    # version and drop these three lines if it is >= 3.x.
    theme="dark",
    layout="vertical",
    width=800,  # Adjust this value as needed
)

# share=True exposes a temporary public tunnel URL; inbrowser=True opens a
# local browser tab on launch.
iface.launch(share=True, inbrowser=True)
|