# Install necessary libraries
# !pip install gradio langchain chromadb sentence-transformers
import re
from functools import lru_cache
from uuid import uuid4

import chromadb
import gradio as gr
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    SentenceTransformersTokenTextSplitter,
    TextSplitter,
)


# Define the text processing and querying functions
@lru_cache(maxsize=1)
def _chroma_client():
    """Return the in-memory Chroma client, creating it on first use."""
    return chromadb.Client()


def max_token_length(txt_list):
    """Return a label with the largest word count found among the chunks.

    The count is the number of ``\\w+`` regex matches in each chunk, i.e. an
    approximation based on words, not the embedding model's tokenizer.

    Args:
        txt_list: Iterable of text chunks.

    Returns:
        A string of the form ``"Max Token Length: <n> tokens"``; ``n`` is 0
        when ``txt_list`` is empty.
    """
    max_length = max(
        (len(re.findall(r'\w+', txt)) for txt in txt_list),
        default=0,
    )
    return f"Max Token Length: {max_length} tokens"


def process_text(text):
    """Split ``text`` into chunks and index them in a new Chroma collection.

    The text is first split on paragraph/sentence separators into pieces of
    at most 1000 characters, then each piece is split again so that it fits
    within 256 tokens for the ``all-MiniLM-L6-v2`` splitter.

    Args:
        text: The raw document text.

    Returns:
        A ``(chroma_collection, max_len)`` tuple: the collection holding the
        chunks and the label produced by ``max_token_length``.
    """
    model_max_chunk_length = 256
    token_splitter = SentenceTransformersTokenTextSplitter(
        tokens_per_chunk=model_max_chunk_length,
        model_name="all-MiniLM-L6-v2",
        chunk_overlap=0,
    )
    character_splitter = RecursiveCharacterTextSplitter(
        separators=['\n \n', '\n\n', '\n', '. '],
        chunk_size=1000,
        chunk_overlap=0,
    )

    text_splitted = character_splitter.split_text(text)
    text_tokens = []
    for t in text_splitted:
        text_tokens.extend(token_splitter.split_text(t))

    max_len = max_token_length(text_tokens)

    embedding_fn = SentenceTransformerEmbeddingFunction()
    # A fixed collection name would collide with the collection left behind by
    # a previous call (or a concurrent one), so every call gets its own name.
    chroma_collection = _chroma_client().create_collection(
        f"texts-{uuid4().hex}",
        embedding_function=embedding_fn,
    )

    # Skip the insert for empty input instead of passing empty lists to add().
    if text_tokens:
        ids = [str(uuid4()) for _ in text_tokens]
        chroma_collection.add(documents=text_tokens, ids=ids)

    return chroma_collection, max_len


def query_text(chroma_collection, query):
    """Return the stored chunks that best match ``query``.

    Args:
        chroma_collection: Collection produced by ``process_text``.
        query: The query text.

    Returns:
        The ``'documents'`` entry of the Chroma query result: a list with one
        inner list of up to 10 matching chunks. ``[[]]`` is returned when the
        collection holds no chunks.
    """
    available = chroma_collection.count()
    if available == 0:
        return [[]]

    # Never request more results than there are stored chunks.
    res = chroma_collection.query(
        query_texts=[query],
        n_results=min(10, available),
    )
    return res['documents']


# Define Gradio interface
def gradio_interface(text, query):
    """Index ``text``, run ``query`` against it and return both outputs.

    Args:
        text: Document text entered in the UI.
        query: Query text entered in the UI.

    Returns:
        A ``(max_len, result)`` tuple for the two output textboxes.
    """
    chroma_collection, max_len = process_text(text)
    try:
        result = query_text(chroma_collection, query)
    finally:
        # The collection is only needed for this request; drop it so that
        # repeated clicks do not pile up collections in memory.
        _chroma_client().delete_collection(chroma_collection.name)
    return max_len, result


# Create Gradio blocks interface
with gr.Blocks() as demo:
    gr.Markdown("## Text Processing and Querying Interface")
    gr.Markdown(
        "This interface allows you to process a large text document, "
        "split it into manageable chunks, and query it using a specified "
        "text query. The results will display the max token length and the "
        "top 10 document matches for the query."
    )

    text_input = gr.Textbox(lines=10, placeholder="Enter the text to process here...")
    query_input = gr.Textbox(lines=1, placeholder="Enter the query here...")
    max_len_output = gr.Textbox(lines=1, placeholder="Max token length will be displayed here...")
    result_output = gr.Textbox(lines=10, placeholder="Query results will be displayed here...")

    btn = gr.Button("Process and Query")
    btn.click(
        gradio_interface,
        inputs=[text_input, query_input],
        outputs=[max_len_output, result_output],
    )

# Launch the Gradio interface
demo.launch()