# HuggingFace Spaces demo: chunk a text document, index it in ChromaDB,
# and query it through a Gradio interface.
# Install necessary libraries
# !pip install gradio langchain chromadb sentence-transformers
import re
from uuid import uuid4

import chromadb
import gradio as gr
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from langchain.text_splitter import TextSplitter, RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
| # Define the text processing and querying functions | |
def max_token_length(txt_list):
    """Report the largest word-token count found in any single string.

    Tokens are counted as regex ``\\w+`` matches (an approximation of the
    embedding model's tokenizer).

    Args:
        txt_list: iterable of strings to scan.

    Returns:
        str: ``"Max Token Length: N tokens"`` where N is the highest count
        in any one string, or 0 for an empty iterable.
    """
    # max() with default=0 replaces the manual tracking loop and keeps
    # the empty-input case returning 0, as before.
    max_length = max((len(re.findall(r'\w+', txt)) for txt in txt_list), default=0)
    return f"Max Token Length: {max_length} tokens"
def process_text(text):
    """Split *text* into model-sized chunks and index them in a Chroma collection.

    The text is first split coarsely on paragraph/sentence boundaries, then
    each piece is re-split so no chunk exceeds the embedding model's token
    budget.

    Args:
        text: raw document text to index.

    Returns:
        tuple: ``(chroma_collection, max_len)`` — the populated collection and
        the max-token-length report string from :func:`max_token_length`.
    """
    model_max_chunk_length = 256  # sequence limit of all-MiniLM-L6-v2
    token_splitter = SentenceTransformersTokenTextSplitter(
        tokens_per_chunk=model_max_chunk_length,
        model_name="all-MiniLM-L6-v2",
        chunk_overlap=0,
    )
    character_splitter = RecursiveCharacterTextSplitter(
        separators=['\n \n', '\n\n', '\n', '. '],
        chunk_size=1000,
        chunk_overlap=0,
    )

    # Coarse character split first, then enforce the token budget per chunk.
    text_tokens = []
    for chunk in character_splitter.split_text(text):
        text_tokens.extend(token_splitter.split_text(chunk))

    max_len = max_token_length(text_tokens)

    embedding_fn = SentenceTransformerEmbeddingFunction()
    chroma_db = chromadb.Client()
    # BUG FIX: create_collection("texts") raises if the collection already
    # exists, which happens on every button press after the first.  Drop any
    # stale collection so each call indexes only the current document.
    try:
        chroma_db.delete_collection("texts")
    except Exception:
        pass  # best-effort cleanup: collection didn't exist yet
    chroma_collection = chroma_db.create_collection("texts", embedding_function=embedding_fn)

    # collection.add() errors on empty lists, so only add when there is data.
    if text_tokens:
        ids = [str(uuid4()) for _ in range(len(text_tokens))]
        chroma_collection.add(documents=text_tokens, ids=ids)
    return chroma_collection, max_len
def query_text(chroma_collection, query):
    """Run a similarity search against an indexed collection.

    Args:
        chroma_collection: collection returned by :func:`process_text`.
        query: free-text query string.

    Returns:
        The ``'documents'`` field of the query result — one list of up to
        10 matching chunks per query string.
    """
    results = chroma_collection.query(query_texts=[query], n_results=10)
    return results['documents']
| # Define Gradio interface | |
def gradio_interface(text, query):
    """Gradio callback: index *text*, run *query*, and format results for display.

    Args:
        text: document text from the first textbox.
        query: query string from the second textbox.

    Returns:
        tuple[str, str]: the max-token-length report and the matching chunks
        joined with blank lines for readable Textbox output.
    """
    chroma_collection, max_len = process_text(text)
    documents = query_text(chroma_collection, query)
    # query_text returns one list of matches per query string; we issue a
    # single query, so flatten before rendering.  The raw nested list would
    # otherwise display as "[['...']]" in the results Textbox.
    matches = documents[0] if documents else []
    return max_len, "\n\n".join(matches)
| # Create Gradio blocks interface | |
with gr.Blocks() as demo:
    # Page header and usage notes.
    gr.Markdown("## Text Processing and Querying Interface")
    gr.Markdown("This interface allows you to process a large text document, split it into manageable chunks, and query it using a specified text query. The results will display the max token length and the top 10 document matches for the query.")

    # Inputs: the document to index and the query to run against it.
    doc_box = gr.Textbox(lines=10, placeholder="Enter the text to process here...")
    query_box = gr.Textbox(lines=1, placeholder="Enter the query here...")

    # Outputs: the token-length report and the matching chunks.
    max_len_box = gr.Textbox(lines=1, placeholder="Max token length will be displayed here...")
    results_box = gr.Textbox(lines=10, placeholder="Query results will be displayed here...")

    # Wire the button to the processing callback.
    run_btn = gr.Button("Process and Query")
    run_btn.click(gradio_interface, inputs=[doc_box, query_box], outputs=[max_len_box, results_box])

# Launch the Gradio interface
demo.launch()