enochsjoseph committed on
Commit
c770ec4
·
1 Parent(s): b6b8b4f
Files changed (1) hide show
  1. app.py +25 -10
app.py CHANGED
@@ -2,36 +2,51 @@ import gradio as gr
2
  import pandas as pd
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
4
 
 
 
 
 
 
5
def tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Split *text* into chunks with the selected splitter and report per-chunk stats.

    Parameters mirror the Gradio inputs: method (str), text (str), and
    chunk_size / chunk_overlap / num_chunks, which come from gr.Number
    components and may arrive as floats.

    Returns a pandas DataFrame with one row per chunk whose columns match
    the gr.Dataframe headers declared in the interface:
    'Chunk #', 'Text Chunk', 'Character Count', 'Token Count'.
    """
    columns = ['Chunk #', 'Text Chunk', 'Character Count', 'Token Count']
    # gr.Number delivers floats; slicing and the splitter expect ints.
    num_chunks = int(num_chunks)
    chunk_size = int(chunk_size)
    chunk_overlap = int(chunk_overlap)
    output = []
    if method == "RecursiveCharacterTextSplitter":
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )
        tokenized_texts = text_splitter.split_text(text)[:num_chunks]
        for i, chunk in enumerate(tokenized_texts):
            # Keys must match the gr.Dataframe headers, otherwise the
            # rendered table loses its column labels.
            output.append({
                'Chunk #': i,
                'Text Chunk': chunk,
                'Character Count': len(chunk),
                'Token Count': len(chunk.split()),
            })
    # Pass explicit columns so an unrecognized method still yields a frame
    # with the expected headers instead of an empty, column-less one.
    return pd.DataFrame(output, columns=columns)
20
 
21
# Gradio UI wiring: a method selector, a text box, and three numeric
# settings feed tokenize_text; the result renders as a table.
method_dropdown = gr.Dropdown(
    label="Select Tokenization Method",
    choices=["RecursiveCharacterTextSplitter"],
)
text_input = gr.Textbox(label="Enter Text", lines=10, placeholder="Type or paste text here.")
size_input = gr.Number(label="Chunk Size", value=100)
overlap_input = gr.Number(label="Chunk Overlap", value=0)
count_input = gr.Number(label="Number of Chunks to Display", value=10)

result_table = gr.Dataframe(headers=["Chunk #", "Text Chunk", "Character Count", "Token Count"])

iface = gr.Interface(
    fn=tokenize_text,
    inputs=[method_dropdown, text_input, size_input, overlap_input, count_input],
    outputs=result_table,
    title="Text Tokenization Tool",
    description="A tool for tokenizing text using different methods. Enter your text and choose your settings to see the results.",
    theme="dark",
    width=800,  # Adjust this value as needed
)

iface.launch(share=True, inbrowser=True)
 
2
  import pandas as pd
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
4
 
5
# Constants for default values (shared with the Gradio inputs below)
DEFAULT_CHUNK_SIZE = 100
DEFAULT_CHUNK_OVERLAP = 0
DEFAULT_NUM_CHUNKS = 10

# Result column names; must match the gr.Dataframe headers in the interface.
RESULT_COLUMNS = ['Chunk #', 'Text Chunk', 'Character Count', 'Token Count']


def tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Tokenize the input text based on the selected method and parameters.

    Parameters
    ----------
    method : str
        Splitter name; only "RecursiveCharacterTextSplitter" is supported.
    text : str or None
        Text to split; blank or missing input yields an empty result table.
    chunk_size, chunk_overlap, num_chunks : number
        Values from gr.Number inputs (may arrive as floats).

    Returns
    -------
    pandas.DataFrame
        One row per chunk with the columns in RESULT_COLUMNS.
    """
    # gr.Number delivers floats; the splitter and slicing expect ints.
    num_chunks = int(num_chunks)
    chunk_size = int(chunk_size)
    chunk_overlap = int(chunk_overlap)

    # Guard against missing input: a cleared Textbox can deliver None,
    # on which .strip() would raise AttributeError.
    if not text or not text.strip():
        return pd.DataFrame(columns=RESULT_COLUMNS)

    output = []
    if method == "RecursiveCharacterTextSplitter":
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )
        for i, chunk in enumerate(text_splitter.split_text(text)[:num_chunks]):
            output.append({
                'Chunk #': i,
                'Text Chunk': chunk,
                'Character Count': len(chunk),
                'Token Count': len(chunk.split()),
            })

    # Pass explicit columns so an unrecognized method still returns a
    # frame with the expected headers instead of an empty, column-less one.
    return pd.DataFrame(output, columns=RESULT_COLUMNS)
34
 
35
# Build and launch the Gradio UI.
iface = gr.Interface(
    fn=tokenize_text,
    inputs=[
        # gr.Dropdown takes `value=` for its initial selection; `default=`
        # is not a valid keyword in Gradio 3+ and breaks component creation.
        gr.Dropdown(
            label="Select Tokenization Method",
            choices=["RecursiveCharacterTextSplitter"],
            value="RecursiveCharacterTextSplitter",
        ),
        gr.Textbox(label="Enter Text", lines=10, placeholder="Type or paste text here."),
        gr.Number(label="Chunk Size", value=DEFAULT_CHUNK_SIZE),
        gr.Number(label="Chunk Overlap", value=DEFAULT_CHUNK_OVERLAP),
        gr.Number(label="Number of Chunks to Display", value=DEFAULT_NUM_CHUNKS),
    ],
    outputs=gr.Dataframe(headers=["Chunk #", "Text Chunk", "Character Count", "Token Count"]),
    title="Text Tokenization Tool",
    description="A tool for tokenizing text using different methods. Enter your text and choose your settings to see the results. It splits the text into chunks based on the specified chunk size and overlap.",
    # NOTE(review): `theme="dark"`, `layout=` and `width=` are legacy
    # Gradio 2.x Interface options; recent Gradio versions deprecate or
    # reject them — confirm against the pinned gradio version.
    theme="dark",
    layout="vertical",
    width=800,  # Adjust this value as needed
)

iface.launch(share=True, inbrowser=True)