Spaces:

Sahar7888
/

Data_Query

Sleeping

App Files Files Community

Sahar7888 committed on May 30, 2024

Commit

6a3d850

verified ·

1 Parent(s): 159183d

Upload app.py

Browse files

Files changed (1) hide show

app.py +71 -0

app.py ADDED Viewed

	@@ -0,0 +1,71 @@

+# Install necessary libraries
+# !pip install gradio langchain chromadb sentence-transformers
+import re
+import gradio as gr
+from langchain.text_splitter import TextSplitter, RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
+import chromadb
+from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
+from uuid import uuid4
+# Define the text processing and querying functions
+def max_token_length(txt_list):
+    max_length = 0
+    for txt in txt_list:
+        token_count = len(re.findall(r'\w+', txt))
+        if token_count > max_length:
+            max_length = token_count
+    return f"Max Token Length: {max_length} tokens"
+def process_text(text):
+    model_max_chunk_length = 256
+    token_splitter = SentenceTransformersTokenTextSplitter(
+        tokens_per_chunk=model_max_chunk_length,
+        model_name="all-MiniLM-L6-v2",
+        chunk_overlap=0
+    )
+    character_splitter = RecursiveCharacterTextSplitter(
+        separators=['\n    \n', '\n\n', '\n', '. '],
+        chunk_size=1000,
+        chunk_overlap=0,
+    )
+    text_splitted = character_splitter.split_text(text)
+    text_tokens = []
+    for t in text_splitted:
+        text_tokens.extend(token_splitter.split_text(t))
+    max_len = max_token_length(text_tokens)
+    embedding_fn = SentenceTransformerEmbeddingFunction()
+    chroma_db = chromadb.Client()
+    chroma_collection = chroma_db.create_collection("texts", embedding_function=embedding_fn)
+    ids = [str(uuid4()) for _ in range(len(text_tokens))]
+    chroma_collection.add(documents=text_tokens, ids=ids)
+    return chroma_collection, max_len
+def query_text(chroma_collection, query):
+    res = chroma_collection.query(query_texts=[query], n_results=10)
+    return res['documents']
+# Define Gradio interface
+def gradio_interface(text, query):
+    chroma_collection, max_len = process_text(text)
+    result = query_text(chroma_collection, query)
+    return max_len, result
+# Create Gradio blocks interface
+# Components are created in the order they should appear on the page:
+# the two inputs, then the two outputs, then the trigger button.
+with gr.Blocks() as demo:
+    gr.Markdown("## Text Processing and Querying Interface")
+    # Inputs: the text to index and the query to run against it.
+    text_input = gr.Textbox(lines=10, placeholder="Enter the text to process here...")
+    query_input = gr.Textbox(lines=1, placeholder="Enter the query here...")
+    # Outputs: filled from the (max_len, result) tuple returned by gradio_interface.
+    max_len_output = gr.Textbox(lines=1, placeholder="Max token length will be displayed here...")
+    result_output = gr.Textbox(lines=10, placeholder="Query results will be displayed here...")
+    btn = gr.Button("Process and Query")
+    # Each click passes both inputs to gradio_interface; its two return
+    # values are mapped positionally onto the two output textboxes.
+    btn.click(gradio_interface, inputs=[text_input, query_input], outputs=[max_len_output, result_output])
+# Launch the Gradio interface (runs at module level, i.e. whenever this file is executed or imported)
+demo.launch()