# HuggingFace Spaces demo: chunk a text document, index it in ChromaDB,
# and query it through a Gradio interface.
# Install necessary libraries
# !pip install gradio langchain chromadb sentence-transformers
import re
from uuid import uuid4

import chromadb
import gradio as gr
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from langchain.text_splitter import TextSplitter, RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
| # Define the text processing and querying functions | |
def max_token_length(txt_list):
    """Report the largest word-token count found in any single string.

    Tokens are counted as regex ``\\w+`` matches (an approximation of the
    embedding model's tokenizer).

    Args:
        txt_list: iterable of strings to scan.

    Returns:
        str: ``"Max Token Length: N tokens"`` where N is the highest count
        in any one string, or 0 for an empty iterable.
    """
    # max() with default=0 replaces the manual tracking loop and keeps
    # the empty-input case returning 0, as before.
    max_length = max((len(re.findall(r'\w+', txt)) for txt in txt_list), default=0)
    return f"Max Token Length: {max_length} tokens"
def process_text(text):
    """Split *text* into model-sized chunks and index them in a Chroma collection.

    The text is first split coarsely on paragraph/sentence boundaries, then
    each piece is re-split so no chunk exceeds the embedding model's token
    budget.

    Args:
        text: raw document text to index.

    Returns:
        tuple: ``(chroma_collection, max_len)`` — the populated collection and
        the max-token-length report string from :func:`max_token_length`.
    """
    model_max_chunk_length = 256  # sequence limit of all-MiniLM-L6-v2
    token_splitter = SentenceTransformersTokenTextSplitter(
        tokens_per_chunk=model_max_chunk_length,
        model_name="all-MiniLM-L6-v2",
        chunk_overlap=0,
    )
    character_splitter = RecursiveCharacterTextSplitter(
        separators=['\n \n', '\n\n', '\n', '. '],
        chunk_size=1000,
        chunk_overlap=0,
    )

    # Coarse character split first, then enforce the token budget per chunk.
    text_tokens = []
    for chunk in character_splitter.split_text(text):
        text_tokens.extend(token_splitter.split_text(chunk))

    max_len = max_token_length(text_tokens)

    embedding_fn = SentenceTransformerEmbeddingFunction()
    chroma_db = chromadb.Client()
    # BUG FIX: create_collection("texts") raises if the collection already
    # exists, which happens on every button press after the first.  Drop any
    # stale collection so each call indexes only the current document.
    try:
        chroma_db.delete_collection("texts")
    except Exception:
        pass  # best-effort cleanup: collection didn't exist yet
    chroma_collection = chroma_db.create_collection("texts", embedding_function=embedding_fn)

    # collection.add() errors on empty lists, so only add when there is data.
    if text_tokens:
        ids = [str(uuid4()) for _ in range(len(text_tokens))]
        chroma_collection.add(documents=text_tokens, ids=ids)
    return chroma_collection, max_len
def query_text(chroma_collection, query):
    """Run a similarity search against an indexed collection.

    Args:
        chroma_collection: collection returned by :func:`process_text`.
        query: free-text query string.

    Returns:
        The ``'documents'`` field of the query result — one list of up to
        10 matching chunks per query string.
    """
    results = chroma_collection.query(query_texts=[query], n_results=10)
    return results['documents']
| # Define Gradio interface | |
def gradio_interface(text, query):
    """Gradio callback: index *text*, run *query*, and format results for display.

    Args:
        text: document text from the first textbox.
        query: query string from the second textbox.

    Returns:
        tuple[str, str]: the max-token-length report and the matching chunks
        joined with blank lines for readable Textbox output.
    """
    chroma_collection, max_len = process_text(text)
    documents = query_text(chroma_collection, query)
    # query_text returns one list of matches per query string; we issue a
    # single query, so flatten before rendering.  The raw nested list would
    # otherwise display as "[['...']]" in the results Textbox.
    matches = documents[0] if documents else []
    return max_len, "\n\n".join(matches)
| # Create Gradio blocks interface | |
with gr.Blocks() as demo:
    # Page header and usage notes.
    gr.Markdown("## Text Processing and Querying Interface")
    gr.Markdown("This interface allows you to process a large text document, split it into manageable chunks, and query it using a specified text query. The results will display the max token length and the top 10 document matches for the query.")

    # Inputs: the document to index and the query to run against it.
    doc_box = gr.Textbox(lines=10, placeholder="Enter the text to process here...")
    query_box = gr.Textbox(lines=1, placeholder="Enter the query here...")

    # Outputs: the token-length report and the matching chunks.
    max_len_box = gr.Textbox(lines=1, placeholder="Max token length will be displayed here...")
    results_box = gr.Textbox(lines=10, placeholder="Query results will be displayed here...")

    # Wire the button to the processing callback.
    run_btn = gr.Button("Process and Query")
    run_btn.click(gradio_interface, inputs=[doc_box, query_box], outputs=[max_len_box, results_box])

# Launch the Gradio interface
demo.launch()