Sahar7888 committed on
Commit
6a3d850
·
verified ·
1 Parent(s): 159183d

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -0
app.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Install necessary libraries
2
+ # !pip install gradio langchain chromadb sentence-transformers
3
+
4
+ import re
5
+ import gradio as gr
6
+ from langchain.text_splitter import TextSplitter, RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
7
+ import chromadb
8
+ from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
9
+ from uuid import uuid4
10
+
11
# Define the text processing and querying functions
def max_token_length(txt_list):
    """Return a display string with the largest word count found in txt_list.

    "Tokens" here are regex word matches (``\\w+``), not model tokenizer
    tokens — this is a rough size indicator for the UI, not an exact count.
    Returns "Max Token Length: 0 tokens" for an empty list.
    """
    # max() with default=0 replaces the manual running-maximum loop and
    # handles an empty input list without a special case.
    max_length = max((len(re.findall(r'\w+', txt)) for txt in txt_list), default=0)
    return f"Max Token Length: {max_length} tokens"
19
+
20
def process_text(text):
    """Split *text* into model-sized chunks, embed them, and index them in Chroma.

    Returns a tuple ``(chroma_collection, max_len)`` where ``max_len`` is the
    display string produced by :func:`max_token_length`.
    """
    # Cap per-chunk size at the embedding model's context window.
    model_max_chunk_length = 256
    token_splitter = SentenceTransformersTokenTextSplitter(
        tokens_per_chunk=model_max_chunk_length,
        model_name="all-MiniLM-L6-v2",
        chunk_overlap=0
    )

    # First pass: coarse character-level split so the token splitter never
    # receives an arbitrarily long input.
    character_splitter = RecursiveCharacterTextSplitter(
        separators=['\n \n', '\n\n', '\n', '. '],
        chunk_size=1000,
        chunk_overlap=0,
    )

    # Second pass: re-split each coarse chunk on model-token boundaries.
    text_splitted = character_splitter.split_text(text)
    text_tokens = []
    for t in text_splitted:
        text_tokens.extend(token_splitter.split_text(t))

    max_len = max_token_length(text_tokens)

    embedding_fn = SentenceTransformerEmbeddingFunction()
    chroma_db = chromadb.Client()
    # Drop any pre-existing "texts" collection first: if the Chroma client
    # shares in-memory state across calls, create_collection() raises on the
    # second button click and stale chunks from an earlier text would pollute
    # query results.
    try:
        chroma_db.delete_collection("texts")
    except Exception:
        # Collection did not exist yet (first call) — nothing to clean up.
        # NOTE(review): Chroma's "missing collection" exception type varies
        # across versions, hence the broad catch; confirm against the pinned
        # chromadb release.
        pass
    chroma_collection = chroma_db.create_collection("texts", embedding_function=embedding_fn)
    # One random, unique id per chunk.
    ids = [str(uuid4()) for _ in range(len(text_tokens))]
    chroma_collection.add(documents=text_tokens, ids=ids)

    return chroma_collection, max_len
48
+
49
def query_text(chroma_collection, query):
    """Return the document texts of the 10 chunks nearest to *query*."""
    # Chroma returns a dict of parallel lists; only the raw documents are
    # surfaced to the caller.
    return chroma_collection.query(query_texts=[query], n_results=10)['documents']
52
+
53
# Define Gradio interface
def gradio_interface(text, query):
    """UI glue: index *text* into Chroma, then answer *query* against it.

    Returns (max-token-length message, list of matched documents), matching
    the two output widgets wired up below.
    """
    collection, token_info = process_text(text)
    return token_info, query_text(collection, query)
58
+
59
# Create Gradio blocks interface
with gr.Blocks() as demo:
    gr.Markdown("## Text Processing and Querying Interface")
    # Inputs: the corpus to index and the question to ask of it.
    text_input = gr.Textbox(lines=10, placeholder="Enter the text to process here...")
    query_input = gr.Textbox(lines=1, placeholder="Enter the query here...")
    # Outputs: token statistic message and the retrieved document chunks.
    max_len_output = gr.Textbox(lines=1, placeholder="Max token length will be displayed here...")
    result_output = gr.Textbox(lines=10, placeholder="Query results will be displayed here...")

    btn = gr.Button("Process and Query")
    # Each click re-indexes the full text and runs the query end to end
    # (see gradio_interface above); outputs map positionally to its returns.
    btn.click(gradio_interface, inputs=[text_input, query_input], outputs=[max_len_output, result_output])

# Launch the Gradio interface
demo.launch()