# Install necessary libraries
# !pip install gradio langchain chromadb sentence-transformers
import re
import gradio as gr
from langchain.text_splitter import TextSplitter, RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from uuid import uuid4
# Define the text processing and querying functions
def max_token_length(txt_list):
    """Return a display string with the largest word-token count found in *txt_list*.

    Token counts are approximated with a ``\\w+`` regex (whitespace/punctuation
    delimited words), not the embedding model's tokenizer.
    """
    longest = max((len(re.findall(r'\w+', chunk)) for chunk in txt_list), default=0)
    return f"Max Token Length: {longest} tokens"
def process_text(text):
    """Split *text* into model-sized chunks, embed them, and index them in Chroma.

    The text is first split on paragraph/sentence boundaries (character level),
    then each piece is re-split with the sentence-transformers tokenizer so no
    chunk exceeds the embedding model's input window.

    Parameters
    ----------
    text : str
        Raw document text to index.

    Returns
    -------
    tuple
        ``(chroma_collection, max_len)`` where ``chroma_collection`` is the
        populated Chroma collection and ``max_len`` is the display string
        produced by :func:`max_token_length`.
    """
    model_max_chunk_length = 256  # input window of all-MiniLM-L6-v2

    token_splitter = SentenceTransformersTokenTextSplitter(
        tokens_per_chunk=model_max_chunk_length,
        model_name="all-MiniLM-L6-v2",
        chunk_overlap=0,
    )
    character_splitter = RecursiveCharacterTextSplitter(
        separators=['\n \n', '\n\n', '\n', '. '],
        chunk_size=1000,
        chunk_overlap=0,
    )

    # Coarse character split first, then token-accurate split of each piece.
    text_tokens = []
    for piece in character_splitter.split_text(text):
        text_tokens.extend(token_splitter.split_text(piece))

    max_len = max_token_length(text_tokens)

    embedding_fn = SentenceTransformerEmbeddingFunction()
    chroma_db = chromadb.Client()
    # Use a unique collection name per call: create_collection("texts") raises
    # when the client is shared and the collection already exists, and reusing
    # one collection would let chunks from earlier calls pollute later queries.
    chroma_collection = chroma_db.create_collection(
        f"texts-{uuid4().hex}", embedding_function=embedding_fn
    )

    # Chroma rejects an empty add(); skip it for empty/whitespace-only input.
    if text_tokens:
        ids = [str(uuid4()) for _ in range(len(text_tokens))]
        chroma_collection.add(documents=text_tokens, ids=ids)
    return chroma_collection, max_len
def query_text(chroma_collection, query):
    """Run *query* against *chroma_collection* and return the matched documents.

    Asks Chroma for the 10 nearest chunks and returns only the ``documents``
    field of the response.
    """
    response = chroma_collection.query(query_texts=[query], n_results=10)
    return response['documents']
# Define Gradio interface
def gradio_interface(text, query):
    """Gradio callback: index *text*, then answer *query* against it.

    Returns the max-token-length message and the query results, in the order
    the output widgets expect them.
    """
    collection, token_info = process_text(text)
    return token_info, query_text(collection, query)
# Build the Gradio UI: two text inputs (document + query), two outputs
# (token-length summary + query matches), wired to gradio_interface.
with gr.Blocks() as demo:
    gr.Markdown("## Text Processing and Querying Interface")
    gr.Markdown("This interface allows you to process a large text document, split it into manageable chunks, and query it using a specified text query. The results will display the max token length and the top 10 document matches for the query.")
    text_input = gr.Textbox(lines=10, placeholder="Enter the text to process here...")
    query_input = gr.Textbox(lines=1, placeholder="Enter the query here...")
    max_len_output = gr.Textbox(lines=1, placeholder="Max token length will be displayed here...")
    result_output = gr.Textbox(lines=10, placeholder="Query results will be displayed here...")
    btn = gr.Button("Process and Query")
    # Output order must match gradio_interface's return order.
    btn.click(
        gradio_interface,
        inputs=[text_input, query_input],
        outputs=[max_len_output, result_output],
    )

# Start the web server (blocks until interrupted).
demo.launch()