import gradio as gr
import pandas as pd
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Constants for default values
DEFAULT_CHUNK_SIZE = 100  # default characters per chunk
DEFAULT_CHUNK_OVERLAP = 0  # default characters shared between adjacent chunks
DEFAULT_NUM_CHUNKS = 10  # default maximum number of chunks displayed
# Initialize the sentence transformer model for embeddings.
# Loaded once at import time so every request reuses the same instance.
model = SentenceTransformer('all-MiniLM-L6-v2')
def tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Split the input text into chunks using the selected method.

    Parameters:
        method: Tokenization method name; currently only
            "RecursiveCharacterTextSplitter" is supported.
        text: The raw input text to split.
        chunk_size: Maximum characters per chunk (coerced to int).
        chunk_overlap: Characters shared between adjacent chunks (coerced to int).
        num_chunks: Maximum number of chunks to keep (coerced to int).

    Returns:
        pd.DataFrame with columns 'Chunk #', 'Text Chunk', 'Character Count',
        'Token Count'. Always carries these columns, even when empty, so
        downstream column selection never raises KeyError.
    """
    columns = ['Chunk #', 'Text Chunk', 'Character Count', 'Token Count']
    # gr.Number components deliver floats; the splitter and slicing need ints.
    num_chunks = int(num_chunks)
    chunk_size = int(chunk_size)
    chunk_overlap = int(chunk_overlap)
    # Ensure text is provided
    if not text.strip():
        return pd.DataFrame(columns=columns)
    output = []
    if method == "RecursiveCharacterTextSplitter":
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )
        tokenized_texts = text_splitter.split_text(text)[:num_chunks]
        for i, chunk in enumerate(tokenized_texts):
            output.append({
                'Chunk #': i,
                'Text Chunk': chunk,
                'Character Count': len(chunk),
                # "Token Count" here is a simple whitespace-word count,
                # not a model-tokenizer count.
                'Token Count': len(chunk.split()),
            })
    # Fix: previously an unrecognized method produced a column-less frame,
    # which made update_output's column selection raise KeyError.
    return pd.DataFrame(output, columns=columns)
def calculate_embeddings(df):
    """
    Attach an 'Embeddings' column to the frame, one sentence-embedding
    vector (plain list of floats) per text chunk.

    The frame is modified in place and returned; an empty frame is
    returned untouched.
    """
    if df.empty:
        return df
    vectors = model.encode(df['Text Chunk'].tolist())
    df['Embeddings'] = vectors.tolist()
    return df
def search_similar_chunks(query, df_with_embeddings):
    """
    Rank text chunks by cosine similarity to the query.

    Parameters:
        query: Free-text search string, embedded with the same model as
            the chunks.
        df_with_embeddings: Frame containing an 'Embeddings' column of
            equal-length vectors (as produced by calculate_embeddings).

    Returns:
        A copy of the frame with a 'Similarity' column inserted after
        'Chunk #', sorted by descending similarity. The input frame is
        left unmodified (fix: the original inserted the column in place,
        which mutated the caller's frame and raised ValueError if the
        same frame was ranked twice).
    """
    # Compute the query embedding
    query_embedding = model.encode([query])[0]
    # Stack the per-row vectors into a 2-D array for cosine_similarity.
    chunk_embeddings = np.vstack(df_with_embeddings['Embeddings'])
    similarity_scores = cosine_similarity([query_embedding], chunk_embeddings)[0]
    # Rank a copy so callers keep their original frame intact.
    ranked = df_with_embeddings.copy()
    ranked.insert(1, 'Similarity', similarity_scores)
    # Most similar chunks first.
    return ranked.sort_values(by='Similarity', ascending=False)
def process_and_embed(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Convenience pipeline: split the text into chunks, then attach an
    embedding vector to each chunk.
    """
    return calculate_embeddings(
        tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks)
    )
def update_output(method, text, chunk_size, chunk_overlap, num_chunks, query):
    """
    Gradio callback: tokenize and embed the text, optionally rank the
    chunks against the query, and return the columns in display order.
    """
    base_cols = ['Chunk #', 'Text Chunk', 'Character Count', 'Token Count', 'Embeddings']
    df = process_and_embed(method, text, chunk_size, chunk_overlap, num_chunks)
    if not query:
        return df[base_cols]
    ranked = search_similar_chunks(query, df)
    # After a search, 'Similarity' sits right after 'Chunk #'.
    return ranked[['Chunk #', 'Similarity'] + base_cols[1:]]
# Build the Gradio UI: inputs mirror the parameters of `update_output`
# (method, text, chunk size/overlap, max chunk count, optional query).
iface = gr.Interface(
    fn=update_output,
    inputs=[
        gr.Dropdown(label="Select Tokenization Method", choices=["RecursiveCharacterTextSplitter"]),
        gr.Textbox(label="Enter Text", lines=10, placeholder="Type or paste text here."),
        gr.Number(label="Chunk Size", value=DEFAULT_CHUNK_SIZE),
        gr.Number(label="Chunk Overlap", value=DEFAULT_CHUNK_OVERLAP),
        gr.Number(label="Number of Chunks to Display", value=DEFAULT_NUM_CHUNKS),
        gr.Textbox(label="Enter Query for Similarity Search", lines=2, placeholder="Type your query here.")
    ],
    outputs=gr.Dataframe(height=900),
    title="Text Tokenization and Embedding Tool",
    description="A tool for tokenizing text and calculating embeddings. Now with similarity search feature."
)
# Launch the web server only when run as a script, not on import.
if __name__ == "__main__":
    iface.launch()