Spaces:

gourisankar85
/

realtime-rag-pipeline

Sleeping

App Files Files Community

Gourisankar Padihary commited on Jan 28

Commit

d93e32b

1 Parent(s): 5485d7c

Support for all data set

Browse files

Files changed (3) hide show

config.py +1 -1
retriever/chunk_documents.py +13 -0
retriever/embed_documents.py +78 -1

config.py CHANGED Viewed

@@ -1,7 +1,7 @@
 class ConfigConstants:
     # Constants related to datasets and models
-    DATA_SET_NAMES = ['covidqa', 'techqa', 'cuad']
     EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-MiniLM-L3-v2"
     RE_RANKER_MODEL_NAME = 'cross-encoder/ms-marco-electra-base'
     DEFAULT_CHUNK_SIZE = 1000

 class ConfigConstants:
     # Constants related to datasets and models
+    DATA_SET_NAMES = ['covidqa', 'cuad', 'delucionqa', 'emanual', 'expertqa', 'finqa', 'hagrid', 'hotpotqa', 'msmarco', 'pubmedqa', 'tatqa', 'techqa']
     EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-MiniLM-L3-v2"
     RE_RANKER_MODEL_NAME = 'cross-encoder/ms-marco-electra-base'
     DEFAULT_CHUNK_SIZE = 1000

retriever/chunk_documents.py CHANGED Viewed

@@ -1,12 +1,25 @@
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 def chunk_documents(dataset, chunk_size=1000, chunk_overlap=200):
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
     documents = []
     for data in dataset:
         text_list = data['documents']
         for text in text_list:
             chunks = text_splitter.split_text(text)
             for i, chunk in enumerate(chunks):
                 documents.append({'text': chunk, 'source': f"{data['question']}_chunk_{i}"})
     return documents

 from langchain.text_splitter import RecursiveCharacterTextSplitter
+import hashlib
 def chunk_documents(dataset, chunk_size=1000, chunk_overlap=200):
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
     documents = []
+    seen_hashes = set()  # Track hashes of chunks to avoid duplicates
     for data in dataset:
         text_list = data['documents']
         for text in text_list:
             chunks = text_splitter.split_text(text)
             for i, chunk in enumerate(chunks):
+                # Generate a unique hash for the chunk
+                chunk_hash = hashlib.sha256(chunk.encode()).hexdigest()
+                # Skip if the chunk is a duplicate
+                if chunk_hash in seen_hashes:
+                    continue
+                # Add the chunk to the documents list and track its hash
                 documents.append({'text': chunk, 'source': f"{data['question']}_chunk_{i}"})
+                seen_hashes.add(chunk_hash)
     return documents

retriever/embed_documents.py CHANGED Viewed

@@ -7,7 +7,7 @@ from config import ConfigConstants
 def embed_documents(documents, embedding_path="embeddings.faiss"):
     embedding_model = HuggingFaceEmbeddings(model_name=ConfigConstants.EMBEDDING_MODEL_NAME)
     if os.path.exists(embedding_path):
         logging.info("Loading embeddings from local file")
         vector_store = FAISS.load_local(embedding_path, embedding_model, allow_dangerous_deserialization=True)
@@ -17,3 +17,80 @@ def embed_documents(documents, embedding_path="embeddings.faiss"):
         vector_store.save_local(embedding_path)
     return vector_store

 def embed_documents(documents, embedding_path="embeddings.faiss"):
     embedding_model = HuggingFaceEmbeddings(model_name=ConfigConstants.EMBEDDING_MODEL_NAME)
     if os.path.exists(embedding_path):
         logging.info("Loading embeddings from local file")
         vector_store = FAISS.load_local(embedding_path, embedding_model, allow_dangerous_deserialization=True)
         vector_store.save_local(embedding_path)
     return vector_store
+'''import os
+import logging
+import hashlib
+from typing import List, Dict
+from concurrent.futures import ThreadPoolExecutor
+from tqdm import tqdm
+from langchain_community.vectorstores import FAISS
+from langchain_huggingface import HuggingFaceEmbeddings
+from config import ConfigConstants
+def embed_documents(documents: List[Dict], embedding_path: str = "embeddings.faiss", metadata_path: str = "metadata.json") -> FAISS:
+    logging.info(f"Total documents got :{len(documents)}")
+    embedding_model = HuggingFaceEmbeddings(model_name=ConfigConstants.EMBEDDING_MODEL_NAME)
+    if os.path.exists(embedding_path) and os.path.exists(metadata_path):
+        logging.info("Loading embeddings and metadata from local files")
+        vector_store = FAISS.load_local(embedding_path, embedding_model, allow_dangerous_deserialization=True)
+        existing_metadata = _load_metadata(metadata_path)
+    else:
+        # Initialize FAISS with at least one document to avoid the IndexError
+        if documents:
+            vector_store = FAISS.from_texts([documents[0]['text']], embedding_model)
+        else:
+            # If no documents are provided, initialize an empty FAISS index with a dummy document
+            vector_store = FAISS.from_texts(["dummy document"], embedding_model)
+        existing_metadata = {}
+    # Identify new or modified documents
+    new_documents = []
+    for doc in documents:
+        doc_hash = _generate_document_hash(doc['text'])
+        if doc_hash not in existing_metadata:
+            new_documents.append(doc)
+            existing_metadata[doc_hash] = True  # Mark as processed
+    if new_documents:
+        logging.info(f"Generating embeddings for {len(new_documents)} new documents")
+        with ThreadPoolExecutor() as executor:
+            futures = []
+            for doc in new_documents:
+                futures.append(executor.submit(_embed_single_document, doc, embedding_model))
+            for future in tqdm(futures, desc="Generating embeddings", unit="doc"):
+                vector_store.add_texts([future.result()])
+        # Save updated embeddings and metadata
+        vector_store.save_local(embedding_path)
+        _save_metadata(metadata_path, existing_metadata)
+    else:
+        logging.info("No new documents to process. Using existing embeddings.")
+    return vector_store
+def _embed_single_document(doc: Dict, embedding_model: HuggingFaceEmbeddings) -> str:
+    return doc['text']
+def _generate_document_hash(text: str) -> str:
+    """Generate a unique hash for a document based on its text."""
+    return hashlib.sha256(text.encode()).hexdigest()
+def _load_metadata(metadata_path: str) -> Dict[str, bool]:
+    """Load metadata from a file."""
+    import json
+    if os.path.exists(metadata_path):
+        with open(metadata_path, "r") as f:
+            return json.load(f)
+    return {}
+def _save_metadata(metadata_path: str, metadata: Dict[str, bool]):
+    """Save metadata to a file."""
+    import json
+    with open(metadata_path, "w") as f:
+        json.dump(metadata, f)'''