Spaces:

Kshitijk20
/

ClariDoc

Sleeping

App Files Files Community

Kshitijk20 committed on Sep 16, 2025

Commit

46bf894

1 Parent(s): e2f7b9c

added hybrid retriever

Browse files

Files changed (5) hide show

app/embedding/vectore_store.py +20 -9
app/retrieval/retriever.py +27 -13
app/schemas/metadata_schema.py +3 -3
app/services/RAG_service.py +26 -19
app/utils/metadata_utils.py +37 -25

app/embedding/vectore_store.py CHANGED Viewed

@@ -5,12 +5,12 @@ from pinecone import ServerlessSpec
 from langchain_pinecone import PineconeVectorStore
 from datetime import datetime
 from uuid import uuid4
 class VectorStore:
     def __init__(self, text_chunks, embedding_model):
         self.text_chunks = text_chunks
         self.current_time = datetime.now()
         self.embedding_model = embedding_model
     def create_vectorestore(self):
         load_dotenv()
@@ -18,23 +18,34 @@ class VectorStore:
         pc = Pinecone(api_key=pinecone_key)
         # pc._vector_api.api_client.pool_threads = 1
         time_string = self.current_time.strftime("%Y-%m-%d-%H-%M")
-        index_name = f"rag-project"
         if not pc.has_index(index_name):
             pc.create_index(
-                name = index_name,
-                dimension=1536,
                 metric="cosine",
-                spec = ServerlessSpec(cloud="aws", region="us-east-1")
             )
         index = pc.Index(index_name)
         # model_loader = ModelLoader(model_provider="openai")
         # embedding_model = model_loader.load_llm()
         uuids = [str(uuid4()) for _ in range(len(self.text_chunks)) ]
-        vector_store = PineconeVectorStore(index = index, embedding=self.embedding_model)
-        name_space = f"hackrx-index{time_string}"
-        vector_store.add_documents(documents=self.text_chunks, ids = uuids,namespace = name_space )
-        return index, name_space

 from langchain_pinecone import PineconeVectorStore
 from datetime import datetime
 from uuid import uuid4
 class VectorStore:
     def __init__(self, text_chunks, embedding_model):
         self.text_chunks = text_chunks
         self.current_time = datetime.now()
         self.embedding_model = embedding_model
+        # self.index, self.namespace, self.retriever = self.create_vectorestore()
     def create_vectorestore(self):
         load_dotenv()
         pc = Pinecone(api_key=pinecone_key)
         # pc._vector_api.api_client.pool_threads = 1
         time_string = self.current_time.strftime("%Y-%m-%d-%H-%M")
+        index_name = "rag-project"
+        namespace = f"rag-project{time_string}"
         if not pc.has_index(index_name):
             pc.create_index(
+                name=index_name,
+                dimension=1024,
                 metric="cosine",
+                spec=ServerlessSpec(cloud="aws", region="us-east-1")
             )
         index = pc.Index(index_name)
         # model_loader = ModelLoader(model_provider="openai")
         # embedding_model = model_loader.load_llm()
         uuids = [str(uuid4()) for _ in range(len(self.text_chunks)) ]
+        # vector_store = PineconeVectorStore.from_documents(index = index, embedding=self.embedding_model)
+        # name_space = f"hackrx-index{time_string}"
+        # vector_store.add_documents(documents=self.text_chunks, ids = uuids,namespace = name_space )
+        # retriever = vector_store.as_retriever(
+        #     search_type="similarity",
+        #     search_kwargs={"k": 5},
+        # )
+        vector_store = PineconeVectorStore.from_documents(documents=self.text_chunks,index_name=index_name, embedding=self.embedding_model, namespace = namespace)
+        # vector_store.add_documents(documents=docs, ids=uuids)
+        # retriever = vector_store.as_retriever(
+        #             search_type="similarity",
+        #             search_kwargs={"k": 5,"namespace": namespace}
+        #         )
+        return index, namespace, vector_store

app/retrieval/retriever.py CHANGED Viewed

@@ -1,14 +1,26 @@
-# from app.schemas.request_models import ClauseHit
 class Retriever:
-    def __init__(self, pinecone_index, query = None, metadata = None, namespace=None):
         self.pinecone_index = pinecone_index
         self.query = query
         self.metadata = metadata
         self.namespace = namespace
-    def retrieval_from_pinecone_vectoreStore(self, top_k= 3):
         """
         Retrieve the top matching chunks from Pinecone.
@@ -21,14 +33,14 @@ class Retriever:
         Returns:
             List of ClauseHit objects (lightweight container for chunk info).
         """
-        res = self.pinecone_index.query(
-            vector= self.query,
-            top_k =top_k ,
-            include_metadata = True,
-            include_values = False,
-            filter = self.metadata,
-            namespace = self.namespace
-            )
         # Process the results into the expected format
         # class ClauseHit:
@@ -51,5 +63,7 @@ class Retriever:
         #         score=match['score']
         #     ))
         # return hits
-        return res

+from langchain.retrievers import EnsembleRetriever
 class Retriever:
+    def __init__(self, pinecone_index, query = None, metadata = None, namespace=None, vectore_store = None,sparse_retriever = None, llm = None):
         self.pinecone_index = pinecone_index
         self.query = query
         self.metadata = metadata
         self.namespace = namespace
+        self.vector_store = vectore_store
+        self.sparse_retriever = sparse_retriever
+        self.llm = llm
+        self.dense_retriever = self.vector_store.as_retriever(
+            search_type="similarity",
+            search_kwargs={"k": 5,"namespace": self.namespace, "filter": self.metadata}
+        )
+        self.hybrid_retriever = EnsembleRetriever(
+            retrievers=[self.dense_retriever, sparse_retriever],  # Use .retriever attribute
+            weights=[0.7, 0.3]  # Fix: 'weights' not 'weight'
+        )
+    def retrieval_from_pinecone_vectoreStore(self):
         """
         Retrieve the top matching chunks from Pinecone.
         Returns:
             List of ClauseHit objects (lightweight container for chunk info).
         """
+        # res = self.pinecone_index.query(
+        #     vector= self.query,
+        #     top_k =top_k ,
+        #     include_metadata = True,
+        #     include_values = False,
+        #     filter = self.metadata,
+        #     namespace = self.namespace
+        #     )
         # Process the results into the expected format
         # class ClauseHit:
         #         score=match['score']
         #     ))
         # return hits
+        results = self.hybrid_retriever.invoke(self.query)
+        for doc in results:
+            print(f"printing Doc content : {doc.page_content}")
+        return results

app/schemas/metadata_schema.py CHANGED Viewed

@@ -3,7 +3,7 @@ from typing import List, Dict, Any, Optional, Union, Literal
 class CommonMetaData(BaseModel):
     # --- Common metadata (across all domains) ---
-    doc_id: Optional[List[str]] = Field(None, description="Unique document identifier")
     doc_category: Optional[List[str]] = Field(None, description="General pool/category e.g. Insurance, HR, Legal")
     doc_type: Optional[List[str]] = Field(None, description="Specific type e.g. Policy doc, Contract, Handbook")
     jurisdiction: Optional[List[str]] = Field(
@@ -17,7 +17,7 @@ class CommonMetaData(BaseModel):
     #     description="List of short, normalized obligation keywords (2–5 words each, no full sentences)"
     # )
     penalties: Optional[List[str]] = Field(None, description="Penalties/non-compliance consequences")
-    notes: Optional[List[str]] = Field(None, description="Freeform additional metadata")
     # added_new_keyword: bool = False
     added_new_keyword: bool = True
 class InsuranceMetadata(CommonMetaData):
@@ -28,7 +28,7 @@ class InsuranceMetadata(CommonMetaData):
     default=None,
     description="Type(s) of coverage. Short keywords (1–3 words each)."
     )
-    premium_amount: Optional[List[str]] = None
     exclusions: Optional[List[str]] = Field(
         description="List of normalized keywords representing exclusions (short, 2-5 words each, not full paragraphs).", default=None
     )

 class CommonMetaData(BaseModel):
     # --- Common metadata (across all domains) ---
+    # doc_id: Optional[List[str]] = Field(None, description="Unique document identifier")
     doc_category: Optional[List[str]] = Field(None, description="General pool/category e.g. Insurance, HR, Legal")
     doc_type: Optional[List[str]] = Field(None, description="Specific type e.g. Policy doc, Contract, Handbook")
     jurisdiction: Optional[List[str]] = Field(
     #     description="List of short, normalized obligation keywords (2–5 words each, no full sentences)"
     # )
     penalties: Optional[List[str]] = Field(None, description="Penalties/non-compliance consequences")
+    # notes: Optional[List[str]] = Field(None, description="Freeform additional metadata")
     # added_new_keyword: bool = False
     added_new_keyword: bool = True
 class InsuranceMetadata(CommonMetaData):
     default=None,
     description="Type(s) of coverage. Short keywords (1–3 words each)."
     )
+    # premium_amount: Optional[List[str]] = None
     exclusions: Optional[List[str]] = Field(
         description="List of normalized keywords representing exclusions (short, 2-5 words each, not full paragraphs).", default=None
     )

app/services/RAG_service.py CHANGED Viewed

@@ -1,4 +1,3 @@
-from typing import List
 from app.utils.model_loader import ModelLoader
 from app.ingestion.file_loader import FileLoader
 from app.ingestion.text_splitter import splitting_text
@@ -7,11 +6,10 @@ from app.embedding.embeder import QueryEmbedding
 from app.embedding.vectore_store import VectorStore
 from app.metadata_extraction.metadata_ext import MetadataExtractor
 from app.utils.metadata_utils import MetadataService
-# from app.utils.document_op import DocumentOperation
 from langchain_core.documents import Document
 import json
-from typing import List, Optional
-# ...existing imports...
 # Global model instances (loaded once)
 _embedding_model = None
@@ -118,29 +116,38 @@ class RAGService:
     def create_vector_store(self):
         print("[RAGService] Creating vector store...")
-        self.vector_store = VectorStore(self.chunks, self.embedding_model)
-        self.index, self.namespace = self.vector_store.create_vectorestore()
         print(f"[RAGService] Vector store created. Index: {self.index}, Namespace: {self.namespace}")
-    def retrive_documents(self):
         print("[RAGService] Retrieving documents from vector store...")
-        self.retriever = Retriever(self.index,self.query_embedding,self.query_metadata, self.namespace)
         self.result = self.retriever.retrieval_from_pinecone_vectoreStore()
-        print(f"[RAGService] Retrieval result: {self.result}")
     def answer_query(self, raw_query:str) -> str:
         """Answer user query using retrieved documents and LLM"""
         print(f"[RAGService] Answering query: {raw_query}")
-        top_clause = self.result['matches']
-        top_clause_dicts = [r.to_dict() for r in top_clause]
-        self.top_clauses = top_clause_dicts
-        keys_to_remove = {"file_path", "source", "producer", "keywords", "subject", "added_new_keyword", "author", "chunk_id"}
-        for r in top_clause_dicts:
-            meta = r.get("metadata", {})
-            for k in keys_to_remove:
-                meta.pop(k, None)
-        context_clauses = json.dumps(top_clause_dicts, separators=(",", ":"))
         print(f"context_clauses: {context_clauses}")

 from app.utils.model_loader import ModelLoader
 from app.ingestion.file_loader import FileLoader
 from app.ingestion.text_splitter import splitting_text
 from app.embedding.vectore_store import VectorStore
 from app.metadata_extraction.metadata_ext import MetadataExtractor
 from app.utils.metadata_utils import MetadataService
 from langchain_core.documents import Document
 import json
+from langchain_community.retrievers import BM25Retriever
+from langchain.schema import Document
 # Global model instances (loaded once)
 _embedding_model = None
     def create_vector_store(self):
         print("[RAGService] Creating vector store...")
+        self.vector_store_class_instance = VectorStore(self.chunks, self.embedding_model)
+        self.index, self.namespace, self.vector_store = self.vector_store_class_instance.create_vectorestore()
         print(f"[RAGService] Vector store created. Index: {self.index}, Namespace: {self.namespace}")
+        ### Sparse Retriever(BM25)
+        self.sparse_retriever=BM25Retriever.from_documents(self.chunks)
+        self.sparse_retriever.k=3 ##top- k documents to retriever
+    def retrive_documents(self, raw_query: str):
         print("[RAGService] Retrieving documents from vector store...")
+        self.create_query_embedding(raw_query)
+        self.retriever = Retriever(self.index,raw_query,self.query_metadata, self.namespace, self.vector_store,sparse_retriever = self.sparse_retriever,llm = self.llm)
         self.result = self.retriever.retrieval_from_pinecone_vectoreStore()
+        # self.result = self.retriever.invoke(raw_query)
+        # print(f"[RAGService] Retrieval result: {self.result}")
     def answer_query(self, raw_query:str) -> str:
         """Answer user query using retrieved documents and LLM"""
         print(f"[RAGService] Answering query: {raw_query}")
+        # top_clause = self.result['matches']
+        # top_clause_dicts = [r.to_dict() for r in top_clause]
+        # self.top_clauses = top_clause_dicts
+        # keys_to_remove = {"file_path", "source", "producer", "keywords", "subject", "added_new_keyword", "author", "chunk_id"}
+        # for r in top_clause_dicts:
+        #     meta = r.get("metadata", {})
+        #     for k in keys_to_remove:
+        #         meta.pop(k, None)
+        # context_clauses = json.dumps(top_clause_dicts, separators=(",", ":"))
+        context_clauses = [doc.page_content for doc in self.result]
         print(f"context_clauses: {context_clauses}")

app/utils/metadata_utils.py CHANGED Viewed

@@ -57,39 +57,51 @@ class MetadataService:
         return normalized
     @staticmethod
-    def cosine_similarity(text1, text2, embedding_model) -> float:
-        vector1 = embedding_model.embed_query(text1)
-        vector2 = embedding_model.embed_query(text2)
-        cosine_similarity = np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))
-        return cosine_similarity
     @staticmethod
     def keyword_sementic_check(result, data, embedding_model):
-        # result = result.model_dump()
-        # data = json.load(open(data, 'r'))
-        # Compare all keys present in both result and data, and check if any value in result[key] is present in data[key]
         for key in result.keys():
-            print(f"Comparing key: {key}",flush=True)
-            # Only check if both result[key] and data[key] are not None and are lists
             if result[key] is not None and data.get(key) is not None:
-                print(f"result[{key}]: {result[key]}",flush=True)
-                print(f"data[{key}]: {data[key]}",flush=True)
-                # Ensure both are lists (skip if not)
                 if isinstance(result[key], list) and isinstance(data[key], list):
-                    for idx,val in enumerate(result[key]):
-                        print(f"Comparing value: {val}",flush=True)
-                        if val in data[key]:
-                            print(f"'{val}' found in data['{key}']")
                         else:
-                            print(f"'{val}' NOT found in data['{key}']")
-                            for data_val in data[key]:
-                                similarity = MetadataService.cosine_similarity(val, data_val,embedding_model)
-                                print(f"Cosine similarity between '{val}' and '{data_val}': {similarity}")
                                 if similarity > 0.90:
-                                    print(f"'{val}' is similar to '{data_val}' with similarity {similarity}",flush=True)
-                                    ## if similarity is greater than 0.90, then consider it as matched and replace the value in result with data value
                                     result[key][idx] = data_val
                                 else:
-                                    print(f"'{val}' is NOT similar to '{data_val}' with similarity {similarity}",flush=True)
         return result

         return normalized
     @staticmethod
+    def cosine_similarity(vec1, vec2) -> float:
+        return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
     @staticmethod
     def keyword_sementic_check(result, data, embedding_model):
         for key in result.keys():
+            print(f"Comparing key: {key}", flush=True)
             if result[key] is not None and data.get(key) is not None:
+                print(f"result[{key}]: {result[key]}", flush=True)
+                print(f"data[{key}]: {data[key]}", flush=True)
                 if isinstance(result[key], list) and isinstance(data[key], list):
+                    # Filter to only strings
+                    data_list = [v for v in data[key] if isinstance(v, str)]
+                    val_list = [v for v in result[key] if isinstance(v, str)]
+                    data_set = set(data_list)
+                    if not data_list or not val_list:
+                        print(f"Skipping key '{key}' due to empty valid strings.")
+                        continue
+                    # Precompute embeddings for data_list
+                    data_embeddings = {val: embedding_model.embed_query(val) for val in data_list}
+                    # Precompute embeddings for val_list
+                    val_embeddings_list = embedding_model.embed_documents(val_list)
+                    for idx, val in enumerate(val_list):
+                        print(f"Comparing value: {val}", flush=True)
+                        if val in data_set:
+                            print(f"'{val}' found in data['{key}']", flush=True)
                         else:
+                            print(f"'{val}' NOT found in data['{key}']", flush=True)
+                            val_vector = val_embeddings_list[idx]
+                            for data_val, data_vector in data_embeddings.items():
+                                similarity = MetadataService.cosine_similarity(val_vector, data_vector)
+                                print(f"Cosine similarity between '{val}' and '{data_val}': {similarity}", flush=True)
                                 if similarity > 0.90:
+                                    print(f"'{val}' is similar to '{data_val}' with similarity {similarity}", flush=True)
                                     result[key][idx] = data_val
+                                    break
                                 else:
+                                    print(f"'{val}' is NOT similar to '{data_val}' with similarity {similarity}", flush=True)
         return result