Spaces:
Runtime error
Runtime error
| #https://medium.com/@csakash03/hybrid-search-is-a-method-to-optimize-rag-implementation-98d9d0911341 | |
| #https://medium.com/etoai/hybrid-search-combining-bm25-and-semantic-search-for-better-results-with-lan-1358038fe7e6 | |
| import gradio as gr | |
| import zipfile | |
| import os | |
| import re | |
| from pathlib import Path | |
| import chromadb | |
| from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint | |
| from langchain_chroma import Chroma | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| import hashlib | |
| import nltk | |
| from rank_bm25 import BM25Okapi | |
| import numpy as np | |
| from langchain.schema import Document | |
| from dotenv import load_dotenv | |
# Download the required NLTK data
nltk.download('punkt')

# Define embeddings using Hugging Face models
embeddings = HuggingFaceEmbeddings()

# Load variables from a local .env file (if any) before reading the token.
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

# Initialize Chroma vector store.
# The client is opened on the same directory the vector store persists to and
# is handed to the vector store, so `client`, `collection` and `vector_store`
# all refer to one on-disk database instead of two unrelated ones.
persist_directory = "./chroma_langchain_db"
client = chromadb.PersistentClient(path=persist_directory)
collection = client.get_or_create_collection("whatsapp_collection")
vector_store = Chroma(
    client=client,
    collection_name="whatsapp_collection",
    embedding_function=embeddings,
    persist_directory=persist_directory,
)

# Define global variables
bm25 = None  # BM25 index over `all_texts`; stays None until an archive is processed
all_texts = []  # chunk texts backing the BM25 index
processed_files = {}  # Dictionary to store hashes of processed files

llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    # os.getenv returns None when HF_TOKEN is unset; guard so the module does
    # not die on `None.strip()` at import time.
    # NOTE(review): presumably the endpoint then falls back to its own
    # credential lookup -- confirm against the langchain_huggingface docs.
    huggingfacehub_api_token=HF_TOKEN.strip() if HF_TOKEN else None,
    temperature=0.1,
    max_new_tokens=200,
)
| # Function to remove emojis and clean the text | |
# Common emoji / pictographic ranges plus the invisible code points that
# travel with them. Deliberately narrower than "everything non-ASCII" so that
# accented letters and non-Latin scripts survive cleaning.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F000-\U0001FAFF"  # emoticons, pictographs, transport, flags, skin tones
    "\u2300-\u23FF"          # miscellaneous technical (watch, hourglass, ...)
    "\u2600-\u27BF"          # miscellaneous symbols and dingbats
    "\u2B00-\u2BFF"          # miscellaneous symbols and arrows
    "\u20E3"                 # combining enclosing keycap
    "\u200D"                 # zero-width joiner used inside emoji sequences
    "\u200E\u200F"           # left-to-right / right-to-left marks
    "\uFE00-\uFE0F"          # variation selectors
    "\uFEFF"                 # byte-order mark / zero-width no-break space
    "\U000E0020-\U000E007F"  # tag characters used in flag sequences
    "]+"
)


def clean_text(text):
    """Remove emoji from *text* and normalise its whitespace.

    Only code points in common emoji ranges (and the invisible joiners,
    selectors and direction marks listed in ``_EMOJI_PATTERN``) are dropped;
    all other characters, including non-ASCII letters, are kept.

    Args:
        text: The raw string to clean.

    Returns:
        The text with emoji removed, every run of whitespace collapsed to a
        single space, and leading/trailing whitespace stripped.
    """
    # Remove emojis
    text = _EMOJI_PATTERN.sub('', text)
    # Collapse whitespace last so gaps left by removed emoji are merged too.
    return re.sub(r'\s+', ' ', text).strip()
| # Function to compute a file hash for identifying duplicates | |
def compute_file_hash(file_path):
    """Return the hexadecimal MD5 digest of the file at *file_path*.

    The digest is used only to recognise files that were already processed,
    not for any security purpose.

    Args:
        file_path: Path of the file to hash; opened in binary mode.

    Returns:
        The MD5 digest of the file's bytes as a lowercase hex string.
    """
    hasher = hashlib.md5()
    with open(file_path, 'rb') as f:
        # Feed the hash in fixed-size chunks so a large archive is never held
        # in memory in one piece; the resulting digest is unchanged.
        for chunk in iter(lambda: f.read(65536), b''):
            hasher.update(chunk)
    return hasher.hexdigest()
| # Function to process and upload the zip file to Chroma | |
def process_and_upload_zip(zip_file):
    """Index the chat transcripts contained in an uploaded zip archive.

    Every top-level ``*.txt`` file in the archive is cleaned with
    ``clean_text``, split into 2500-character chunks (200 overlap), and the
    chunks are (a) used to rebuild the module-level BM25 index and
    (b) added to the Chroma vector store. The archive's hash is remembered
    in ``processed_files`` so the same file is not indexed twice.

    Args:
        zip_file: The uploaded archive, either an object exposing the path
            as ``.name`` or a plain path string. ``None`` is tolerated.

    Returns:
        A human-readable status message.
    """
    # Local import: the file's import header does not bring in tempfile.
    import tempfile

    global bm25, all_texts, processed_files

    if zip_file is None:
        return "No zip file provided."

    # Accept an object carrying the path in `.name` (what the original code
    # expected) as well as a bare path string.
    zip_path = getattr(zip_file, "name", zip_file)

    # Compute hash to check if file has been processed
    zip_file_hash = compute_file_hash(zip_path)
    if zip_file_hash in processed_files:
        return f"File '{zip_path}' already processed. Using existing Chroma storage."

    # The splitter is configuration-only, so build it once rather than per file.
    chunk_splitter = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=200)

    metadata = []
    # Extract into a fresh directory that is removed afterwards; a shared,
    # never-cleaned folder would leak files from earlier uploads into this run.
    with tempfile.TemporaryDirectory() as extract_dir:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)

        # Sorted so chunk order does not depend on directory listing order.
        for chat_file in sorted(Path(extract_dir).glob("*.txt")):
            with open(chat_file, 'r', encoding='utf-8') as file:
                page_content = file.read()
            clean_content = clean_text(page_content)
            for chunk_index, chunk in enumerate(chunk_splitter.split_text(clean_content)):
                metadata.append({
                    "context": chunk,
                    "document_id": chat_file.stem,
                    "chunk_index": chunk_index
                })

    if not metadata:
        # Nothing to index. Bail out before building BM25 on an empty corpus
        # and leave any previously built index untouched.
        return "No .txt chat files with content were found in the uploaded archive."

    all_texts = [m["context"] for m in metadata]

    # Initialize BM25 for sparse retrieval
    bm25 = BM25Okapi([doc.split() for doc in all_texts])

    # Store in Chroma. The vector store embeds the documents itself, so no
    # separate embedding pass is made here.
    ids = [f"{m['document_id']}_chunk_{m['chunk_index']}" for m in metadata]
    documents = [Document(page_content=m["context"], metadata=m) for m in metadata]
    vector_store.add_documents(documents=documents, ids=ids)

    # Store the hash of the processed file to avoid reprocessing
    processed_files[zip_file_hash] = zip_path
    return "Data uploaded and stored in Chroma successfully."
def hybrid_search(query):
    """Retrieve passages for *query* with BM25 and Chroma and merge them.

    Up to five chunks are taken from the module-level BM25 index (ranked by
    score, best first) and up to five from the vector store's similarity
    search. The two lists are concatenated sparse-first, and passages that
    appear in both are kept only once.

    Args:
        query: The user's search string; split on whitespace for BM25.

    Returns:
        A string starting with ``"Hybrid Search Results:"`` followed by the
        merged passages, each terminated by a blank line.
    """
    top_k = 5

    # BM25 Sparse Retrieval. The index is None until an archive has been
    # processed; in that case fall back to dense retrieval alone.
    sparse_results = []
    if bm25 is not None and all_texts:
        bm25_scores = bm25.get_scores(query.split())
        best_indices = np.argsort(bm25_scores)[::-1][:top_k]
        sparse_results = [all_texts[i] for i in best_indices]

    # Dense Retrieval using Chroma
    dense_results = vector_store.similarity_search(query, k=top_k)

    # Merge, dropping passages returned by both retrievers while keeping the
    # first-seen order (dict preserves insertion order).
    candidates = sparse_results + [result.page_content for result in dense_results]
    combined_results = list(dict.fromkeys(candidates))

    response = "".join(f"{result}\n\n" for result in combined_results)
    return f"Hybrid Search Results:\n\n{response}"
| # Gradio Interface for uploading and querying | |
def query_interface(zip_file, query):
    """Gradio callback: index the upload, search it, and ask the LLM.

    Args:
        zip_file: The uploaded archive, forwarded to ``process_and_upload_zip``.
        query: The user's question, used both for retrieval and in the prompt.

    Returns:
        A 2-tuple: the upload status followed by the retrieved passages, and
        the value returned by ``llm.invoke`` for the assembled prompt.
    """
    status_message = process_and_upload_zip(zip_file)
    retrieved = hybrid_search(query)

    # Assemble the prompt piece by piece; the pieces are joined without any
    # separator, so each one carries its own trailing space or newlines.
    prompt_parts = [
        f"Here is a summary of WhatsApp chat contents based on the search for the query: '{query}'. ",
        "The chat content includes important messages:\n\n",
        f"{retrieved}\n\n",
        "Now, based on this chat content, answer the following question as an expert. ",
        "Please provide a complete and precise answer in **100 words**.\n\n",
        f"Question: {query}",
    ]

    # Generate answer using the LLM
    answer = llm.invoke("".join(prompt_parts))

    return f"{status_message}\n\n{retrieved}", answer
# Gradio UI: a zip upload and a free-text query go in; two text panes come out.
interface = gr.Interface(
    title="WhatsApp Chat Upload and Hybrid Search",
    description="Upload a zip file containing WhatsApp chat data. This app processes the data and performs hybrid search with BM25 + Chroma.",
    fn=query_interface,
    inputs=[
        gr.File(label="Upload WhatsApp Chat Zip File"),
        gr.Textbox(label="Enter your query"),
    ],
    outputs=[
        # First value returned by query_interface: upload status + passages.
        gr.Textbox(label="Chat Content"),
        # Second value returned by query_interface: the LLM's answer.
        gr.Textbox(label="Generated Answer"),
    ],
)

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    interface.launch()