enochsjoseph committed on
Commit
1d70738
·
1 Parent(s): 48c0b62

initial commit

Browse files
Files changed (2) hide show
  1. app.py +100 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from sentence_transformers import SentenceTransformer
6
+ from sklearn.metrics.pairwise import cosine_similarity
7
+
8
# Default values pre-filled into the numeric inputs of the Gradio form below.
DEFAULT_CHUNK_SIZE = 100
DEFAULT_CHUNK_OVERLAP = 0
DEFAULT_NUM_CHUNKS = 10

# Sentence-transformer model used for both chunk and query embeddings.
# Created once at import time and shared by every function in this module.
model = SentenceTransformer('all-MiniLM-L6-v2')
15
+
16
def tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Split ``text`` into chunks and describe each chunk in a DataFrame.

    Args:
        method: Name of the splitting strategy. Only
            "RecursiveCharacterTextSplitter" is implemented; any other value
            (including None) produces an empty result.
        text: Text to split. None or whitespace-only text produces an empty
            result.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between adjacent chunks, measured with ``len``.
        num_chunks: Maximum number of chunks to keep, counted from the start.
            Negative values are treated as zero.

    Returns:
        A DataFrame with the columns 'Chunk #', 'Text Chunk',
        'Character Count' and 'Token Count' (count of whitespace-separated
        words), one row per kept chunk. The columns are present even when
        there are no rows.
    """
    columns = ['Chunk #', 'Text Chunk', 'Character Count', 'Token Count']

    # Nothing to split: return the same (empty) shape as every other path.
    if not text or not text.strip():
        return pd.DataFrame(columns=columns)

    rows = []
    if method == "RecursiveCharacterTextSplitter":
        # Numeric form fields may deliver floats; normalise all three the
        # same way so the splitter and the slice below always get ints.
        chunk_size = int(chunk_size)
        chunk_overlap = int(chunk_overlap)
        # A negative limit would slice chunks off the end instead of
        # limiting how many are shown.
        limit = max(int(num_chunks), 0)

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )
        rows = [
            {
                'Chunk #': i,
                'Text Chunk': chunk,
                'Character Count': len(chunk),
                'Token Count': len(chunk.split()),
            }
            for i, chunk in enumerate(text_splitter.split_text(text)[:limit])
        ]

    # Passing ``columns`` keeps the schema stable when ``rows`` is empty
    # (e.g. unknown method or a zero chunk limit).
    return pd.DataFrame(rows, columns=columns)
40
+
41
def calculate_embeddings(df):
    """
    Attach an embedding to every row of ``df``.

    The texts in the 'Text Chunk' column are encoded with the module-level
    ``model`` and the resulting vectors are stored, as plain lists, in a new
    'Embeddings' column. The frame is modified in place and also returned.
    An empty frame is returned untouched.
    """
    if not df.empty:
        vectors = model.encode(df['Text Chunk'].tolist())
        df['Embeddings'] = vectors.tolist()
    return df
52
+
53
def search_similar_chunks(query, df_with_embeddings):
    """
    Rank the chunks in ``df_with_embeddings`` by similarity to ``query``.

    Args:
        query: Text to compare against every chunk.
        df_with_embeddings: DataFrame with one embedding vector per row in
            an 'Embeddings' column.

    Returns:
        The rows sorted by cosine similarity to the query, highest first,
        with the score in a 'Similarity' column (the column is also added
        to the frame that was passed in). An empty frame is returned
        unchanged.
    """
    # Nothing to rank. Without this guard an empty frame fails below: it has
    # no 'Embeddings' column, and np.vstack rejects an empty sequence.
    if df_with_embeddings.empty:
        return df_with_embeddings

    # Compute the query embedding
    query_embedding = model.encode([query])[0]

    # Stack the per-row vectors into one 2-D array and score each row
    chunk_embeddings = np.vstack(df_with_embeddings['Embeddings'])
    similarity_scores = cosine_similarity([query_embedding], chunk_embeddings)[0]

    # Add similarity scores to the dataframe
    df_with_embeddings['Similarity'] = similarity_scores

    # Return the dataframe sorted by similarity scores in descending order
    return df_with_embeddings.sort_values(by='Similarity', ascending=False)
69
+
70
def process_and_embed(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Chunk the text, then embed every chunk.

    Passes all arguments straight to ``tokenize_text`` and hands the
    resulting DataFrame to ``calculate_embeddings``, returning its result.
    """
    return calculate_embeddings(
        tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks)
    )
77
+
78
def update_output(method, text, chunk_size, chunk_overlap, num_chunks, query):
    """
    Callback for the Gradio interface.

    Chunks and embeds ``text``; when ``query`` is non-empty the chunks are
    additionally ranked by similarity to it, otherwise they are returned in
    their original order.
    """
    embedded = process_and_embed(method, text, chunk_size, chunk_overlap, num_chunks)
    return search_similar_chunks(query, embedded) if query else embedded
83
+
84
# Gradio form: the six inputs map positionally onto update_output's
# parameters, and the returned DataFrame is rendered as a table.
iface = gr.Interface(
    fn=update_output,
    inputs=[
        gr.Dropdown(
            label="Select Tokenization Method",
            choices=["RecursiveCharacterTextSplitter"],
            # Preselect the only available method; with no selection the
            # callback receives None and produces an empty table.
            value="RecursiveCharacterTextSplitter",
        ),
        gr.Textbox(label="Enter Text", lines=10, placeholder="Type or paste text here."),
        # precision=0 makes these fields deliver whole numbers, which is
        # what chunk sizes and counts are.
        gr.Number(label="Chunk Size", value=DEFAULT_CHUNK_SIZE, precision=0),
        gr.Number(label="Chunk Overlap", value=DEFAULT_CHUNK_OVERLAP, precision=0),
        gr.Number(label="Number of Chunks to Display", value=DEFAULT_NUM_CHUNKS, precision=0),
        gr.Textbox(label="Enter Query for Similarity Search", lines=2, placeholder="Type your query here."),
    ],
    outputs=gr.Dataframe(headers=["Chunk #", "Text Chunk", "Character Count", "Token Count", "Embeddings", "Similarity"]),
    title="Text Tokenization and Embedding Tool",
    description="A tool for tokenizing text and calculating embeddings. Now with similarity search feature.",
)

if __name__ == "__main__":
    iface.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio
2
+ pandas
3
+ sentence-transformers
4
+ scikit-learn
5
+ numpy