Spaces:
Sleeping
Sleeping
Commit ·
e2f7b9c
1
Parent(s): e9b9c34
Removed few-shot metadata extraction and added a semantic check for matching keywords
Browse files
app/ingestion/text_splitter.py
CHANGED
|
@@ -10,12 +10,13 @@ from pydantic import BaseModel
|
|
| 10 |
from typing import Type
|
| 11 |
from app.utils.metadata_utils import MetadataService
|
| 12 |
class splitting_text:
|
| 13 |
-
def __init__(self, documentTypeSchema:Type[BaseModel], llm=None):
|
| 14 |
self.llm = llm
|
| 15 |
self.metadata_extractor = MetadataExtractor(llm = self.llm)
|
| 16 |
self.metadata_services = MetadataService()
|
| 17 |
self.documentTypeSchema = documentTypeSchema
|
| 18 |
self.Keywordsfile_path = None
|
|
|
|
| 19 |
|
| 20 |
def _clean_text(self, text:str)-> str:
|
| 21 |
"""Clean extracted page content"""
|
|
@@ -41,6 +42,7 @@ class splitting_text:
|
|
| 41 |
|
| 42 |
|
| 43 |
if i == 0:
|
|
|
|
| 44 |
output_folder = "app/data/"
|
| 45 |
filename = page.metadata['source'].replace(".","").replace("\\","")+ ".json"
|
| 46 |
output_path = os.path.join(output_folder, filename)
|
|
@@ -66,6 +68,9 @@ class splitting_text:
|
|
| 66 |
new_data = self.metadata_services.normalize_dict_to_lists(
|
| 67 |
Document_metadata.model_dump(exclude_none= True)
|
| 68 |
)
|
|
|
|
|
|
|
|
|
|
| 69 |
for key,vals in new_data.items():
|
| 70 |
if isinstance(vals,list):
|
| 71 |
known_keywords[key] = list(set(known_keywords.get(key,[]) + vals)) #get the existing key and add vals and convert into set then list and update the file.
|
|
|
|
| 10 |
from typing import Type
|
| 11 |
from app.utils.metadata_utils import MetadataService
|
| 12 |
class splitting_text:
|
| 13 |
+
def __init__(self, documentTypeSchema: Type[BaseModel], llm=None, embedding_model=None):
    """Set up the text splitter with its schema and optional models.

    Args:
        documentTypeSchema: Pydantic model class describing the document's
            metadata schema.
        llm: Optional language model, forwarded to the MetadataExtractor.
        embedding_model: Optional embedding model used later for the
            semantic keyword check.
    """
    self.llm = llm
    self.metadata_extractor = MetadataExtractor(llm=self.llm)
    self.metadata_services = MetadataService()
    self.documentTypeSchema = documentTypeSchema
    # Path of the per-document keywords JSON; assigned once the source
    # file is known (presumably during text_splitting) — TODO confirm.
    self.Keywordsfile_path = None
    self.embedding_model = embedding_model
|
| 20 |
|
| 21 |
def _clean_text(self, text:str)-> str:
|
| 22 |
"""Clean extracted page content"""
|
|
|
|
| 42 |
|
| 43 |
|
| 44 |
if i == 0:
|
| 45 |
+
print(f"Processing first page, setting up metadata extraction...")
|
| 46 |
output_folder = "app/data/"
|
| 47 |
filename = page.metadata['source'].replace(".","").replace("\\","")+ ".json"
|
| 48 |
output_path = os.path.join(output_folder, filename)
|
|
|
|
| 68 |
new_data = self.metadata_services.normalize_dict_to_lists(
|
| 69 |
Document_metadata.model_dump(exclude_none= True)
|
| 70 |
)
|
| 71 |
+
print(f"processing keywords update for page {i}")
|
| 72 |
+
new_data = MetadataService.keyword_sementic_check(new_data,known_keywords,embedding_model = self.embedding_model)
|
| 73 |
+
|
| 74 |
for key,vals in new_data.items():
|
| 75 |
if isinstance(vals,list):
|
| 76 |
known_keywords[key] = list(set(known_keywords.get(key,[]) + vals)) #get the existing key and add vals and convert into set then list and update the file.
|
app/metadata_extraction/metadata_ext.py
CHANGED
|
@@ -61,11 +61,11 @@ class MetadataExtractor:
|
|
| 61 |
print(f"⚠️ Parser failed on doc {document.metadata.get('source')} | error: {e}")
|
| 62 |
return metadata_class(added_new_keyword=False)
|
| 63 |
|
| 64 |
-
def extractMetadata(self, metadata_class : Type[BaseModel], document: Document, known_keywords: dict) -> BaseModel:
|
| 65 |
parser = PydanticOutputParser(pydantic_object=metadata_class)
|
| 66 |
|
| 67 |
schema_str = json.dumps(metadata_class.model_json_schema(), indent=2)
|
| 68 |
-
keywords_str = json.dumps(known_keywords, indent=2)
|
| 69 |
|
| 70 |
prompt = ChatPromptTemplate.from_messages([
|
| 71 |
("system", """You are an information extraction system.
|
|
@@ -80,35 +80,34 @@ class MetadataExtractor:
|
|
| 80 |
⚠️ Content Rules:
|
| 81 |
- For exclusions and obligations, DO NOT copy full sentences.
|
| 82 |
- Instead, extract only concise normalized keywords (2–5 words max each).
|
| 83 |
-
- Use existing keywords if they already exist in the provided list.
|
| 84 |
-
- Prefer to reuse existing keywords if they are semantically the same.
|
| 85 |
-
- If you find a new keyword that is a **sub-type** or **more specific variant** of an existing one, keep both:
|
| 86 |
-
*reuse the closest match from existing keywords*, and also add the new one.
|
| 87 |
-
- In that case, set `added_new_keyword=true`.
|
| 88 |
- Do not include raw paragraphs in the output.
|
|
|
|
| 89 |
|
| 90 |
Schema you must follow:
|
| 91 |
{schema}
|
| 92 |
|
| 93 |
-
|
| 94 |
-
{keywords}
|
| 95 |
"""),
|
| 96 |
("human", "Text:\n{document_content}")
|
| 97 |
])
|
| 98 |
-
# -
|
| 99 |
-
# -
|
| 100 |
-
# -
|
| 101 |
-
# -
|
| 102 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
chain = prompt | self.llm | parser
|
| 104 |
|
| 105 |
try:
|
| 106 |
result = chain.invoke({
|
| 107 |
"schema": schema_str,
|
| 108 |
-
"keywords": keywords_str,
|
| 109 |
"document_content": document.page_content
|
| 110 |
})
|
| 111 |
return result
|
| 112 |
except OutputParserException as e:
|
| 113 |
print(f"⚠️ Parser failed on doc {document.metadata.get('source')} | error: {e}")
|
| 114 |
-
return metadata_class(added_new_keyword=
|
|
|
|
| 61 |
print(f"⚠️ Parser failed on doc {document.metadata.get('source')} | error: {e}")
|
| 62 |
return metadata_class(added_new_keyword=False)
|
| 63 |
|
| 64 |
+
def extractMetadata(self, metadata_class : Type[BaseModel], document: Document, known_keywords: dict = None) -> BaseModel:
|
| 65 |
parser = PydanticOutputParser(pydantic_object=metadata_class)
|
| 66 |
|
| 67 |
schema_str = json.dumps(metadata_class.model_json_schema(), indent=2)
|
| 68 |
+
# keywords_str = json.dumps(known_keywords, indent=2)
|
| 69 |
|
| 70 |
prompt = ChatPromptTemplate.from_messages([
|
| 71 |
("system", """You are an information extraction system.
|
|
|
|
| 80 |
⚠️ Content Rules:
|
| 81 |
- For exclusions and obligations, DO NOT copy full sentences.
|
| 82 |
- Instead, extract only concise normalized keywords (2–5 words max each).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
- Do not include raw paragraphs in the output.
|
| 84 |
+
- always keep added_new_keyword as True.
|
| 85 |
|
| 86 |
Schema you must follow:
|
| 87 |
{schema}
|
| 88 |
|
| 89 |
+
|
|
|
|
| 90 |
"""),
|
| 91 |
("human", "Text:\n{document_content}")
|
| 92 |
])
|
| 93 |
+
# - Instead, extract only concise normalized keywords (2–5 words max each).
|
| 94 |
+
# - Use existing keywords if they already exist in the provided list.
|
| 95 |
+
# - Prefer to reuse existing keywords if they are semantically the same.
|
| 96 |
+
# - If you find a new keyword that is a **sub-type** or **more specific variant** of an existing one, keep both:
|
| 97 |
+
# *reuse the closest match from existing keywords*, and also add the new one.
|
| 98 |
+
# - In that case, set `added_new_keyword=true`.
|
| 99 |
+
# Existing Keywords:
|
| 100 |
+
# {keywords}
|
| 101 |
+
|
| 102 |
chain = prompt | self.llm | parser
|
| 103 |
|
| 104 |
try:
|
| 105 |
result = chain.invoke({
|
| 106 |
"schema": schema_str,
|
| 107 |
+
# "keywords": keywords_str,
|
| 108 |
"document_content": document.page_content
|
| 109 |
})
|
| 110 |
return result
|
| 111 |
except OutputParserException as e:
|
| 112 |
print(f"⚠️ Parser failed on doc {document.metadata.get('source')} | error: {e}")
|
| 113 |
+
return metadata_class(added_new_keyword=True) # instantiate fallback
|
app/schemas/metadata_schema.py
CHANGED
|
@@ -18,7 +18,8 @@ class CommonMetaData(BaseModel):
|
|
| 18 |
# )
|
| 19 |
penalties: Optional[List[str]] = Field(None, description="Penalties/non-compliance consequences")
|
| 20 |
notes: Optional[List[str]] = Field(None, description="Freeform additional metadata")
|
| 21 |
-
added_new_keyword: bool = False
|
|
|
|
| 22 |
class InsuranceMetadata(CommonMetaData):
|
| 23 |
|
| 24 |
# --- Insurance ---
|
|
|
|
| 18 |
# )
|
| 19 |
penalties: Optional[List[str]] = Field(None, description="Penalties/non-compliance consequences")
|
| 20 |
notes: Optional[List[str]] = Field(None, description="Freeform additional metadata")
|
| 21 |
+
# added_new_keyword: bool = False
|
| 22 |
+
added_new_keyword: bool = True
|
| 23 |
class InsuranceMetadata(CommonMetaData):
|
| 24 |
|
| 25 |
# --- Insurance ---
|
app/services/RAG_service.py
CHANGED
|
@@ -86,7 +86,7 @@ class RAGService:
|
|
| 86 |
print(f"[RAGService] Document type scheme detected: {self.DocumentTypeScheme}")
|
| 87 |
self.Document_Type = self.metadataservice.Return_document_model(self.DocumentTypeScheme)
|
| 88 |
print(f"[RAGService] Document type model: {self.Document_Type}")
|
| 89 |
-
self.splitter = splitting_text(documentTypeSchema=self.Document_Type, llm=self.llm)
|
| 90 |
print("[RAGService] Splitting document into chunks...")
|
| 91 |
self.chunks = self.splitter.text_splitting(doc)
|
| 92 |
print(f"[RAGService] Total chunks created: {len(self.chunks)}")
|
|
|
|
| 86 |
print(f"[RAGService] Document type scheme detected: {self.DocumentTypeScheme}")
|
| 87 |
self.Document_Type = self.metadataservice.Return_document_model(self.DocumentTypeScheme)
|
| 88 |
print(f"[RAGService] Document type model: {self.Document_Type}")
|
| 89 |
+
self.splitter = splitting_text(documentTypeSchema=self.Document_Type, llm=self.llm, embedding_model=self.embedding_model)
|
| 90 |
print("[RAGService] Splitting document into chunks...")
|
| 91 |
self.chunks = self.splitter.text_splitting(doc)
|
| 92 |
print(f"[RAGService] Total chunks created: {len(self.chunks)}")
|
app/utils/metadata_utils.py
CHANGED
|
@@ -1,14 +1,20 @@
|
|
| 1 |
-
from app.schemas.metadata_schema import InsuranceMetadata, CommonMetaData
|
| 2 |
from app.schemas.request_models import DocumentTypeSchema
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
class MetadataService:
|
| 4 |
def __init__(self):
|
| 5 |
self.metadata_models = {
|
| 6 |
"Insurance": InsuranceMetadata,
|
| 7 |
-
"HR/Employment":
|
| 8 |
-
"Legal/Compliance":
|
| 9 |
-
"Financial/Regulatory":
|
| 10 |
"Government/Public Policy": CommonMetaData,
|
| 11 |
-
"Technical/IT Policies": CommonMetaData
|
|
|
|
|
|
|
| 12 |
}
|
| 13 |
@staticmethod
|
| 14 |
def format_metadata_for_pinecone(metadata: dict) -> dict:
|
|
@@ -48,4 +54,42 @@ class MetadataService:
|
|
| 48 |
normalized[key] = value
|
| 49 |
else:
|
| 50 |
normalized[key] = [value]
|
| 51 |
-
return normalized
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.schemas.metadata_schema import InsuranceMetadata, CommonMetaData, HealthcareMetadata, HRMetadata, LegalMetadata,FinancialMetadata,ProcurementMetadata
|
| 2 |
from app.schemas.request_models import DocumentTypeSchema
|
| 3 |
+
import numpy as np
|
| 4 |
+
import os
|
| 5 |
+
import json
|
| 6 |
+
|
| 7 |
class MetadataService:
|
| 8 |
def __init__(self):
    """Build the registry mapping document-type labels to metadata models."""
    # Types without a dedicated schema fall back to the generic CommonMetaData.
    self.metadata_models = {
        "Insurance": InsuranceMetadata,
        "HR/Employment": HRMetadata,
        "Legal/Compliance": LegalMetadata,
        "Financial/Regulatory": FinancialMetadata,
        "Government/Public Policy": CommonMetaData,
        "Technical/IT Policies": CommonMetaData,
        "Healthcare/Pharma": HealthcareMetadata,
        "Procurement/Vendor Management": ProcurementMetadata,
    }
|
| 19 |
@staticmethod
|
| 20 |
def format_metadata_for_pinecone(metadata: dict) -> dict:
|
|
|
|
| 54 |
normalized[key] = value
|
| 55 |
else:
|
| 56 |
normalized[key] = [value]
|
| 57 |
+
return normalized
|
| 58 |
+
|
| 59 |
+
@staticmethod
|
| 60 |
+
def cosine_similarity(text1, text2, embedding_model) -> float:
|
| 61 |
+
vector1 = embedding_model.embed_query(text1)
|
| 62 |
+
vector2 = embedding_model.embed_query(text2)
|
| 63 |
+
cosine_similarity = np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))
|
| 64 |
+
return cosine_similarity
|
| 65 |
+
|
| 66 |
+
@staticmethod
|
| 67 |
+
def keyword_sementic_check(result, data, embedding_model):
|
| 68 |
+
|
| 69 |
+
# result = result.model_dump()
|
| 70 |
+
# data = json.load(open(data, 'r'))
|
| 71 |
+
# Compare all keys present in both result and data, and check if any value in result[key] is present in data[key]
|
| 72 |
+
for key in result.keys():
|
| 73 |
+
print(f"Comparing key: {key}",flush=True)
|
| 74 |
+
# Only check if both result[key] and data[key] are not None and are lists
|
| 75 |
+
if result[key] is not None and data.get(key) is not None:
|
| 76 |
+
print(f"result[{key}]: {result[key]}",flush=True)
|
| 77 |
+
print(f"data[{key}]: {data[key]}",flush=True)
|
| 78 |
+
# Ensure both are lists (skip if not)
|
| 79 |
+
if isinstance(result[key], list) and isinstance(data[key], list):
|
| 80 |
+
for idx,val in enumerate(result[key]):
|
| 81 |
+
print(f"Comparing value: {val}",flush=True)
|
| 82 |
+
if val in data[key]:
|
| 83 |
+
print(f"'{val}' found in data['{key}']")
|
| 84 |
+
else:
|
| 85 |
+
print(f"'{val}' NOT found in data['{key}']")
|
| 86 |
+
for data_val in data[key]:
|
| 87 |
+
similarity = MetadataService.cosine_similarity(val, data_val,embedding_model)
|
| 88 |
+
print(f"Cosine similarity between '{val}' and '{data_val}': {similarity}")
|
| 89 |
+
if similarity > 0.90:
|
| 90 |
+
print(f"'{val}' is similar to '{data_val}' with similarity {similarity}",flush=True)
|
| 91 |
+
## if similarity is greater than 0.90, then consider it as matched and replace the value in result with data value
|
| 92 |
+
result[key][idx] = data_val
|
| 93 |
+
else:
|
| 94 |
+
print(f"'{val}' is NOT similar to '{data_val}' with similarity {similarity}",flush=True)
|
| 95 |
+
return result
|