Commit
·
f5e7cf6
1
Parent(s):
5028c4e
initial commit
Browse files- app.py +38 -0
- requirements.txt +4 -0
app.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 4 |
+
|
| 5 |
+
def tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks):
    """Split *text* into chunks and return per-chunk statistics as a DataFrame.

    Args:
        method: Name of the splitting method. Only
            "RecursiveCharacterTextSplitter" is supported.
        text: The text to split.
        chunk_size: Maximum chunk length in characters (converted to int).
        chunk_overlap: Overlap between consecutive chunks in characters
            (converted to int).
        num_chunks: Maximum number of chunks to include in the result
            (converted to int; negative values are treated as 0).

    Returns:
        A pandas DataFrame with one row per chunk and the columns
        ``chunk_num`` (0-based index), ``text`` (the chunk itself),
        ``tokens`` (number of whitespace-separated words, not model tokens)
        and ``size`` (number of characters). The columns are present even
        when no chunks are produced.

    Raises:
        ValueError: If *method* is not a supported splitting method.
    """
    columns = ["chunk_num", "text", "tokens", "size"]

    # Fail with a clear message instead of an UnboundLocalError further down
    # when the method is missing or unknown.
    if method != "RecursiveCharacterTextSplitter":
        raise ValueError(
            f"Unsupported tokenization method: {method!r}. "
            "Supported methods: 'RecursiveCharacterTextSplitter'."
        )

    # The numeric UI inputs may arrive as floats; normalise all of them to
    # int, not just num_chunks.
    chunk_size = int(chunk_size)
    chunk_overlap = int(chunk_overlap)
    # A negative limit would silently drop chunks from the end via slicing.
    num_chunks = max(int(num_chunks), 0)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    chunks = text_splitter.split_text(text)[:num_chunks]

    rows = [
        {
            'chunk_num': i,
            'text': chunk,
            'tokens': len(chunk.split()),
            'size': len(chunk),
        }
        for i, chunk in enumerate(chunks)
    ]
    # Passing the columns explicitly keeps the table shape stable for empty
    # results.
    return pd.DataFrame(rows, columns=columns)
|
| 20 |
+
|
| 21 |
+
# Gradio UI: one dropdown for the splitting method, a text box for the input
# text, and three numeric controls that are passed positionally to
# tokenize_text in the order listed below.
iface = gr.Interface(
    fn=tokenize_text,
    inputs=[
        # Default to the only available choice so the first submit does not
        # pass an empty selection to tokenize_text.
        gr.Dropdown(
            label="Select Tokenization Method",
            choices=["RecursiveCharacterTextSplitter"],
            value="RecursiveCharacterTextSplitter",
        ),
        gr.Textbox(label="Enter Text", lines=10, placeholder="Type or paste text here."),
        gr.Number(label="Chunk Size", value=100),
        gr.Number(label="Chunk Overlap", value=0),
        gr.Number(label="Number of Chunks to Display", value=10),
    ],
    # Header order mirrors the DataFrame columns returned by tokenize_text:
    # chunk_num, text, tokens, size.
    outputs=gr.Dataframe(headers=["Chunk #", "Text Chunk", "Token Count", "Character Count"]),
    title="Text Tokenization Tool",
    description="A tool for tokenizing text using different methods. Enter your text and choose your settings to see the results.",
    # NOTE(review): theme="dark", layout and width look like options from an
    # older Gradio API; confirm they are still accepted by the installed
    # Gradio version (requirements.txt does not pin one).
    theme="dark",
    layout="vertical",
    width=800  # Adjust this value as needed
)
|
| 37 |
+
|
| 38 |
+
# Start the Gradio server at import/run time. Per Gradio's launch() options,
# share=True requests a public share link and inbrowser=True opens the UI in
# a browser tab.
iface.launch(share=True, inbrowser=True)
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
langchain
|
| 2 |
+
gradio
|
| 3 |
+
tiktoken
|
| 4 |
+
sentence-transformers
|