File size: 4,080 Bytes
1d70738
 
2711a06
1d70738
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2711a06
1d70738
2711a06
1d70738
2711a06
 
1d70738
2711a06
 
 
1d70738
2711a06
 
1d70738
2711a06
 
1d70738
2711a06
1d70738
2711a06
1d70738
 
 
 
 
2711a06
 
 
 
 
 
 
 
1d70738
2711a06
1d70738
 
 
 
 
 
2711a06
1d70738
8f70b28
1d70738
2711a06
1d70738
 
2711a06
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import gradio as gr
import pandas as pd
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Constants for default values
DEFAULT_CHUNK_SIZE = 100
DEFAULT_CHUNK_OVERLAP = 0
DEFAULT_NUM_CHUNKS = 10

# Initialize the sentence transformer model for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

def tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Tokenizes the input text based on the selected method and provided parameters.
    """
    num_chunks = int(num_chunks)
    output = []

    # Ensure text is provided
    if not text.strip():
        return pd.DataFrame(columns=['Chunk #', 'Text Chunk', 'Character Count', 'Token Count'])

    if method == "RecursiveCharacterTextSplitter":
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len, is_separator_regex=False)
        tokenized_texts = text_splitter.split_text(text)[:num_chunks]
        for i, chunk in enumerate(tokenized_texts):
            output.append({
                'Chunk #': i,
                'Text Chunk': chunk,
                'Character Count': len(chunk),
                'Token Count': len(chunk.split())
            })

    df = pd.DataFrame(output)
    return df

def calculate_embeddings(df):
    """
    Calculates embeddings for each text chunk in the dataframe.
    """
    if df.empty:
        return df

    chunks = df['Text Chunk'].tolist()
    embeddings = model.encode(chunks)
    df['Embeddings'] = embeddings.tolist()
    return df

def search_similar_chunks(query, df_with_embeddings):
    """
    Search for chunks similar to the query embedding.
    """
    # Compute the query embedding
    query_embedding = model.encode([query])[0]
    
    # Calculate similarity scores
    chunk_embeddings = np.vstack(df_with_embeddings['Embeddings'])
    similarity_scores = cosine_similarity([query_embedding], chunk_embeddings)[0]
    
    # Insert similarity scores into the dataframe after 'Chunk #'
    df_with_embeddings.insert(1, 'Similarity', similarity_scores)
    
    # Return the dataframe sorted by similarity scores in descending order
    return df_with_embeddings.sort_values(by='Similarity', ascending=False)

def process_and_embed(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Tokenizes the text and calculates embeddings.
    """
    df = tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks)
    df_with_embeddings = calculate_embeddings(df)
    return df_with_embeddings

def update_output(method, text, chunk_size, chunk_overlap, num_chunks, query):
    df_with_embeddings = process_and_embed(method, text, chunk_size, chunk_overlap, num_chunks)
    if query:
        df_with_embeddings = search_similar_chunks(query, df_with_embeddings)
        # Update the headers to reflect the new column order after similarity search
        return df_with_embeddings[['Chunk #', 'Similarity', 'Text Chunk', 'Character Count', 'Token Count', 'Embeddings']]
    return df_with_embeddings[['Chunk #', 'Text Chunk', 'Character Count', 'Token Count', 'Embeddings']]

iface = gr.Interface(
    fn=update_output,
    inputs=[
        gr.Dropdown(label="Select Tokenization Method", choices=["RecursiveCharacterTextSplitter"]),
        gr.Textbox(label="Enter Text", lines=10, placeholder="Type or paste text here."),
        gr.Number(label="Chunk Size", value=DEFAULT_CHUNK_SIZE),
        gr.Number(label="Chunk Overlap", value=DEFAULT_CHUNK_OVERLAP),
        gr.Number(label="Number of Chunks to Display", value=DEFAULT_NUM_CHUNKS),
        gr.Textbox(label="Enter Query for Similarity Search", lines=2, placeholder="Type your query here.")
    ],
    outputs=gr.Dataframe(height=900),
    title="Text Tokenization and Embedding Tool",
    description="A tool for tokenizing text and calculating embeddings. Now with similarity search feature."
)

if __name__ == "__main__":
    iface.launch()