enochsjoseph committed on
Commit
f5e7cf6
·
1 Parent(s): 5028c4e

initial commit

Browse files
Files changed (2) hide show
  1. app.py +38 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+
5
def tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks):
    """Split *text* into chunks with the selected method and tabulate the result.

    Args:
        method: Name of the splitter to use; only
            "RecursiveCharacterTextSplitter" is currently supported.
        text: The raw input text to split.
        chunk_size: Maximum characters per chunk (arrives as a float from
            ``gr.Number``; coerced to int).
        chunk_overlap: Characters of overlap between adjacent chunks
            (also coerced to int).
        num_chunks: Maximum number of chunks to include in the output.

    Returns:
        pandas.DataFrame with columns ``chunk_num``, ``text``, ``tokens``
        (whitespace-separated word count), and ``size`` (character count).
        An empty DataFrame is returned for an unrecognized *method*.
    """
    num_chunks = int(num_chunks)
    output = []
    if method == "RecursiveCharacterTextSplitter":
        # gr.Number yields floats; the splitter expects integer sizes.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=int(chunk_size),
            chunk_overlap=int(chunk_overlap),
            length_function=len,
            is_separator_regex=False,
        )
        tokenized_texts = text_splitter.split_text(text)[:num_chunks]
        for i, chunk in enumerate(tokenized_texts):
            output.append({
                'chunk_num': i,
                'text': chunk,
                'tokens': len(chunk.split()),   # rough word count, not model tokens
                'size': len(chunk),
            })
    # Always return a DataFrame: the original assigned `df` only inside the
    # `if` branch, so an unknown method raised UnboundLocalError at `return df`.
    return pd.DataFrame(output)
20
+
21
# Gradio UI wiring: collects splitter settings, runs tokenize_text, and shows
# the resulting DataFrame. NOTE(review): `theme="dark"`, `layout`, and `width`
# are legacy Gradio 3.x Interface arguments — confirm the pinned gradio
# version accepts them before upgrading.
iface = gr.Interface(
    fn=tokenize_text,
    inputs=[
        gr.Dropdown(label="Select Tokenization Method", choices=["RecursiveCharacterTextSplitter"]),
        gr.Textbox(label="Enter Text", lines=10, placeholder="Type or paste text here."),
        gr.Number(label="Chunk Size", value=100),
        gr.Number(label="Chunk Overlap", value=0),
        gr.Number(label="Number of Chunks to Display", value=10),
    ],
    # Headers follow the DataFrame column order from tokenize_text
    # (chunk_num, text, tokens, size). The original listed "Character Count"
    # before "Token Count", mislabeling the two numeric columns.
    outputs=gr.Dataframe(headers=["Chunk #", "Text Chunk", "Token Count", "Character Count"]),
    title="Text Tokenization Tool",
    description="A tool for tokenizing text using different methods. Enter your text and choose your settings to see the results.",
    theme="dark",
    layout="vertical",
    width=800,  # Adjust this value as needed
)

iface.launch(share=True, inbrowser=True)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ langchain
2
+ gradio
3
+ pandas
4
+ tiktoken
5
+ sentence-transformers