File size: 2,913 Bytes
6a3d850
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a174630
 
6a3d850
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# Install necessary libraries
# !pip install gradio langchain chromadb sentence-transformers

import re
import gradio as gr
from langchain.text_splitter import TextSplitter, RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from uuid import uuid4

# Define the text processing and querying functions
def max_token_length(txt_list):
    """Return a display string reporting the largest token count found in *txt_list*.

    Tokens are approximated as ``\\w+`` regex word matches, which is a rough
    proxy for the model tokenizer's count.
    """
    longest = max((len(re.findall(r'\w+', txt)) for txt in txt_list), default=0)
    return f"Max Token Length: {longest} tokens"

def process_text(text):
    """Split *text* into model-sized chunks, index them in a Chroma collection,
    and return ``(collection, max_token_length_string)``.

    The text is first split on structural separators into ~1000-char pieces,
    then re-split so no chunk exceeds the embedding model's 256-token window.
    """
    model_max_chunk_length = 256
    token_splitter = SentenceTransformersTokenTextSplitter(
        tokens_per_chunk=model_max_chunk_length,
        model_name="all-MiniLM-L6-v2",
        chunk_overlap=0
    )

    character_splitter = RecursiveCharacterTextSplitter(
        separators=['\n    \n', '\n\n', '\n', '. '],
        chunk_size=1000,
        chunk_overlap=0,
    )

    text_splitted = character_splitter.split_text(text)
    text_tokens = []
    for t in text_splitted:
        text_tokens.extend(token_splitter.split_text(t))

    max_len = max_token_length(text_tokens)

    embedding_fn = SentenceTransformerEmbeddingFunction()
    chroma_db = chromadb.Client()
    # BUG FIX: chromadb.Client() shares in-process state, so on a second button
    # click create_collection("texts") raises because the collection already
    # exists. Drop any stale collection first so each run indexes only the
    # current text.
    try:
        chroma_db.delete_collection("texts")
    except Exception:
        pass  # collection did not exist yet — nothing to clean up
    chroma_collection = chroma_db.create_collection("texts", embedding_function=embedding_fn)
    # Guard: Chroma rejects an add() with empty id/document lists.
    if text_tokens:
        ids = [str(uuid4()) for _ in range(len(text_tokens))]
        chroma_collection.add(documents=text_tokens, ids=ids)

    return chroma_collection, max_len

def query_text(chroma_collection, query):
    """Query *chroma_collection* with *query* and return the matched documents.

    Returns the ``'documents'`` field of the Chroma response for the top-10
    nearest chunks (a list of lists, one inner list per query text).
    """
    response = chroma_collection.query(query_texts=[query], n_results=10)
    return response['documents']

# Define Gradio interface
def gradio_interface(text, query):
    """Gradio callback: index *text*, then run *query* against the index.

    Returns ``(max_token_length_string, matched_documents)`` for display.
    """
    collection, max_len = process_text(text)
    matches = query_text(collection, query)
    return max_len, matches

# Create Gradio blocks interface
# Build the Gradio Blocks UI: two text inputs (document + query), two outputs
# (max token length + query results), wired to gradio_interface via a button.
with gr.Blocks() as demo:
    gr.Markdown("## Text Processing and Querying Interface")
     
    gr.Markdown("This interface allows you to process a large text document, split it into manageable chunks, and query it using a specified text query. The results will display the max token length and the top 10 document matches for the query.")
    document_box = gr.Textbox(lines=10, placeholder="Enter the text to process here...")
    query_box = gr.Textbox(lines=1, placeholder="Enter the query here...")
    token_length_box = gr.Textbox(lines=1, placeholder="Max token length will be displayed here...")
    results_box = gr.Textbox(lines=10, placeholder="Query results will be displayed here...")

    run_button = gr.Button("Process and Query")
    run_button.click(
        gradio_interface,
        inputs=[document_box, query_box],
        outputs=[token_length_box, results_box],
    )

# Start the local web server for the interface.
demo.launch()