Kshitijk20 commited on
Commit
e2f7b9c
·
1 Parent(s): e9b9c34

removed few-shot metadata extraction and added a semantic check for duplicate keywords

Browse files
app/ingestion/text_splitter.py CHANGED
@@ -10,12 +10,13 @@ from pydantic import BaseModel
10
  from typing import Type
11
  from app.utils.metadata_utils import MetadataService
12
  class splitting_text:
13
- def __init__(self, documentTypeSchema:Type[BaseModel], llm=None):
14
  self.llm = llm
15
  self.metadata_extractor = MetadataExtractor(llm = self.llm)
16
  self.metadata_services = MetadataService()
17
  self.documentTypeSchema = documentTypeSchema
18
  self.Keywordsfile_path = None
 
19
 
20
  def _clean_text(self, text:str)-> str:
21
  """Clean extracted page content"""
@@ -41,6 +42,7 @@ class splitting_text:
41
 
42
 
43
  if i == 0:
 
44
  output_folder = "app/data/"
45
  filename = page.metadata['source'].replace(".","").replace("\\","")+ ".json"
46
  output_path = os.path.join(output_folder, filename)
@@ -66,6 +68,9 @@ class splitting_text:
66
  new_data = self.metadata_services.normalize_dict_to_lists(
67
  Document_metadata.model_dump(exclude_none= True)
68
  )
 
 
 
69
  for key,vals in new_data.items():
70
  if isinstance(vals,list):
71
  known_keywords[key] = list(set(known_keywords.get(key,[]) + vals)) #get the existing key and add vals and convert into set then list and update the file.
 
10
  from typing import Type
11
  from app.utils.metadata_utils import MetadataService
12
  class splitting_text:
13
+ def __init__(self, documentTypeSchema:Type[BaseModel], llm=None, embedding_model=None):
14
  self.llm = llm
15
  self.metadata_extractor = MetadataExtractor(llm = self.llm)
16
  self.metadata_services = MetadataService()
17
  self.documentTypeSchema = documentTypeSchema
18
  self.Keywordsfile_path = None
19
+ self.embedding_model = embedding_model
20
 
21
  def _clean_text(self, text:str)-> str:
22
  """Clean extracted page content"""
 
42
 
43
 
44
  if i == 0:
45
+ print(f"Processing first page, setting up metadata extraction...")
46
  output_folder = "app/data/"
47
  filename = page.metadata['source'].replace(".","").replace("\\","")+ ".json"
48
  output_path = os.path.join(output_folder, filename)
 
68
  new_data = self.metadata_services.normalize_dict_to_lists(
69
  Document_metadata.model_dump(exclude_none= True)
70
  )
71
+ print(f"processing keywords update for page {i}")
72
+ new_data = MetadataService.keyword_sementic_check(new_data,known_keywords,embedding_model = self.embedding_model)
73
+
74
  for key,vals in new_data.items():
75
  if isinstance(vals,list):
76
  known_keywords[key] = list(set(known_keywords.get(key,[]) + vals)) #get the existing key and add vals and convert into set then list and update the file.
app/metadata_extraction/metadata_ext.py CHANGED
@@ -61,11 +61,11 @@ class MetadataExtractor:
61
  print(f"⚠️ Parser failed on doc {document.metadata.get('source')} | error: {e}")
62
  return metadata_class(added_new_keyword=False)
63
 
64
- def extractMetadata(self, metadata_class : Type[BaseModel], document: Document, known_keywords: dict) -> BaseModel:
65
  parser = PydanticOutputParser(pydantic_object=metadata_class)
66
 
67
  schema_str = json.dumps(metadata_class.model_json_schema(), indent=2)
68
- keywords_str = json.dumps(known_keywords, indent=2)
69
 
70
  prompt = ChatPromptTemplate.from_messages([
71
  ("system", """You are an information extraction system.
@@ -80,35 +80,34 @@ class MetadataExtractor:
80
  ⚠️ Content Rules:
81
  - For exclusions and obligations, DO NOT copy full sentences.
82
  - Instead, extract only concise normalized keywords (2–5 words max each).
83
- - Use existing keywords if they already exist in the provided list.
84
- - Prefer to reuse existing keywords if they are semantically the same.
85
- - If you find a new keyword that is a **sub-type** or **more specific variant** of an existing one, keep both:
86
- *reuse the closest match from existing keywords*, and also add the new one.
87
- - In that case, set `added_new_keyword=true`.
88
  - Do not include raw paragraphs in the output.
 
89
 
90
  Schema you must follow:
91
  {schema}
92
 
93
- Existing Keywords:
94
- {keywords}
95
  """),
96
  ("human", "Text:\n{document_content}")
97
  ])
98
- # - Use existing keywords if they already exist in the provided list.
99
- # - Only create a new keyword if absolutely necessary, and set `added_new_keyword=true`.
100
- # - New keywords must be short (1–3 words).
101
- # - Do NOT invent different variations (e.g., if "Medical" already exists, do not output "Mediclaim Plus").
102
- # - For list fields (like exclusions), reuse existing keywords where possible.
 
 
 
 
103
  chain = prompt | self.llm | parser
104
 
105
  try:
106
  result = chain.invoke({
107
  "schema": schema_str,
108
- "keywords": keywords_str,
109
  "document_content": document.page_content
110
  })
111
  return result
112
  except OutputParserException as e:
113
  print(f"⚠️ Parser failed on doc {document.metadata.get('source')} | error: {e}")
114
- return metadata_class(added_new_keyword=False) # instantiate fallback
 
61
  print(f"⚠️ Parser failed on doc {document.metadata.get('source')} | error: {e}")
62
  return metadata_class(added_new_keyword=False)
63
 
64
+ def extractMetadata(self, metadata_class : Type[BaseModel], document: Document, known_keywords: dict = None) -> BaseModel:
65
  parser = PydanticOutputParser(pydantic_object=metadata_class)
66
 
67
  schema_str = json.dumps(metadata_class.model_json_schema(), indent=2)
68
+ # keywords_str = json.dumps(known_keywords, indent=2)
69
 
70
  prompt = ChatPromptTemplate.from_messages([
71
  ("system", """You are an information extraction system.
 
80
  ⚠️ Content Rules:
81
  - For exclusions and obligations, DO NOT copy full sentences.
82
  - Instead, extract only concise normalized keywords (2–5 words max each).
 
 
 
 
 
83
  - Do not include raw paragraphs in the output.
84
+ - always keep added_new_keyword as True.
85
 
86
  Schema you must follow:
87
  {schema}
88
 
89
+
 
90
  """),
91
  ("human", "Text:\n{document_content}")
92
  ])
93
+ # - Instead, extract only concise normalized keywords (2–5 words max each).
94
+ # - Use existing keywords if they already exist in the provided list.
95
+ # - Prefer to reuse existing keywords if they are semantically the same.
96
+ # - If you find a new keyword that is a **sub-type** or **more specific variant** of an existing one, keep both:
97
+ # *reuse the closest match from existing keywords*, and also add the new one.
98
+ # - In that case, set `added_new_keyword=true`.
99
+ # Existing Keywords:
100
+ # {keywords}
101
+
102
  chain = prompt | self.llm | parser
103
 
104
  try:
105
  result = chain.invoke({
106
  "schema": schema_str,
107
+ # "keywords": keywords_str,
108
  "document_content": document.page_content
109
  })
110
  return result
111
  except OutputParserException as e:
112
  print(f"⚠️ Parser failed on doc {document.metadata.get('source')} | error: {e}")
113
+ return metadata_class(added_new_keyword=True) # instantiate fallback
app/schemas/metadata_schema.py CHANGED
@@ -18,7 +18,8 @@ class CommonMetaData(BaseModel):
18
  # )
19
  penalties: Optional[List[str]] = Field(None, description="Penalties/non-compliance consequences")
20
  notes: Optional[List[str]] = Field(None, description="Freeform additional metadata")
21
- added_new_keyword: bool = False
 
22
  class InsuranceMetadata(CommonMetaData):
23
 
24
  # --- Insurance ---
 
18
  # )
19
  penalties: Optional[List[str]] = Field(None, description="Penalties/non-compliance consequences")
20
  notes: Optional[List[str]] = Field(None, description="Freeform additional metadata")
21
+ # added_new_keyword: bool = False
22
+ added_new_keyword: bool = True
23
  class InsuranceMetadata(CommonMetaData):
24
 
25
  # --- Insurance ---
app/services/RAG_service.py CHANGED
@@ -86,7 +86,7 @@ class RAGService:
86
  print(f"[RAGService] Document type scheme detected: {self.DocumentTypeScheme}")
87
  self.Document_Type = self.metadataservice.Return_document_model(self.DocumentTypeScheme)
88
  print(f"[RAGService] Document type model: {self.Document_Type}")
89
- self.splitter = splitting_text(documentTypeSchema=self.Document_Type, llm=self.llm)
90
  print("[RAGService] Splitting document into chunks...")
91
  self.chunks = self.splitter.text_splitting(doc)
92
  print(f"[RAGService] Total chunks created: {len(self.chunks)}")
 
86
  print(f"[RAGService] Document type scheme detected: {self.DocumentTypeScheme}")
87
  self.Document_Type = self.metadataservice.Return_document_model(self.DocumentTypeScheme)
88
  print(f"[RAGService] Document type model: {self.Document_Type}")
89
+ self.splitter = splitting_text(documentTypeSchema=self.Document_Type, llm=self.llm, embedding_model=self.embedding_model)
90
  print("[RAGService] Splitting document into chunks...")
91
  self.chunks = self.splitter.text_splitting(doc)
92
  print(f"[RAGService] Total chunks created: {len(self.chunks)}")
app/utils/metadata_utils.py CHANGED
@@ -1,14 +1,20 @@
1
- from app.schemas.metadata_schema import InsuranceMetadata, CommonMetaData
2
  from app.schemas.request_models import DocumentTypeSchema
 
 
 
 
3
  class MetadataService:
4
  def __init__(self):
5
  self.metadata_models = {
6
  "Insurance": InsuranceMetadata,
7
- "HR/Employment": CommonMetaData,
8
- "Legal/Compliance": CommonMetaData,
9
- "Financial/Regulatory": CommonMetaData,
10
  "Government/Public Policy": CommonMetaData,
11
- "Technical/IT Policies": CommonMetaData
 
 
12
  }
13
  @staticmethod
14
  def format_metadata_for_pinecone(metadata: dict) -> dict:
@@ -48,4 +54,42 @@ class MetadataService:
48
  normalized[key] = value
49
  else:
50
  normalized[key] = [value]
51
- return normalized
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.schemas.metadata_schema import InsuranceMetadata, CommonMetaData, HealthcareMetadata, HRMetadata, LegalMetadata,FinancialMetadata,ProcurementMetadata
2
  from app.schemas.request_models import DocumentTypeSchema
3
+ import numpy as np
4
+ import os
5
+ import json
6
+
7
  class MetadataService:
8
  def __init__(self):
9
  self.metadata_models = {
10
  "Insurance": InsuranceMetadata,
11
+ "HR/Employment": HRMetadata,
12
+ "Legal/Compliance": LegalMetadata,
13
+ "Financial/Regulatory": FinancialMetadata,
14
  "Government/Public Policy": CommonMetaData,
15
+ "Technical/IT Policies": CommonMetaData,
16
+ "Healthcare/Pharma": HealthcareMetadata,
17
+ "Procurement/Vendor Management": ProcurementMetadata
18
  }
19
  @staticmethod
20
  def format_metadata_for_pinecone(metadata: dict) -> dict:
 
54
  normalized[key] = value
55
  else:
56
  normalized[key] = [value]
57
+ return normalized
58
+
59
def cosine_similarity(text1, text2, embedding_model):
    """Return the cosine similarity between the embeddings of two texts.

    NOTE(review): in the real file this is a ``@staticmethod`` on
    ``MetadataService``; the enclosing class is only visible here as a diff
    fragment, so the unit is reproduced standalone.

    Args:
        text1: First text to embed.
        text2: Second text to embed.
        embedding_model: Object exposing ``embed_query(text) -> sequence of
            floats`` (presumably a LangChain embeddings instance — TODO
            confirm against the caller in text_splitter.py).

    Returns:
        float similarity in [-1.0, 1.0]; 0.0 when either embedding has zero
        norm (the original divided by zero in that case).
    """
    vec_a = np.asarray(embedding_model.embed_query(text1), dtype=float)
    vec_b = np.asarray(embedding_model.embed_query(text2), dtype=float)
    # Fix: the original bound the result to a local named `cosine_similarity`,
    # shadowing the function itself, and had no zero-norm guard.
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if norm_product == 0.0:
        return 0.0
    return float(np.dot(vec_a, vec_b) / norm_product)
65
+
66
def keyword_sementic_check(result, data, embedding_model, threshold=0.90):
    """Canonicalize newly extracted keywords against the known-keyword store.

    For every list-valued key present in both dicts, each new keyword that is
    not an exact member of the known list is compared (by embedding cosine
    similarity) against every known keyword for that key; if the best match
    exceeds ``threshold``, the new keyword is replaced in place by that known
    keyword so near-duplicate phrasings collapse onto one canonical form.

    NOTE(review): in the real file this is a ``@staticmethod`` on
    ``MetadataService``; reproduced standalone because the class is only
    visible as a diff fragment. The name keeps the historical "sementic"
    spelling because callers (text_splitter.py) reference it.

    Fixes over the original:
    - picks the BEST-scoring known keyword instead of whichever match above
      threshold happened to come last in iteration order;
    - ``threshold`` is now a parameter (default 0.90, as before);
    - per-comparison debug prints and dead commented-out code removed.

    Args:
        result: dict of freshly extracted metadata lists, e.g.
            ``{"exclusions": ["pre existing disease", ...]}``.
        data: dict of already-known keywords with the same shape.
        embedding_model: object exposing ``embed_query(text) -> vector``.
        threshold: minimum cosine similarity (strictly greater-than, matching
            the original ``> 0.90`` test) to treat two keywords as the same.

    Returns:
        ``result``, mutated in place and returned (matching the original).
    """
    def _cos(a, b):
        # Cosine similarity between the embeddings of two strings,
        # guarded against zero-norm embeddings.
        va = np.asarray(embedding_model.embed_query(a), dtype=float)
        vb = np.asarray(embedding_model.embed_query(b), dtype=float)
        denom = np.linalg.norm(va) * np.linalg.norm(vb)
        return float(np.dot(va, vb) / denom) if denom else 0.0

    for key, new_vals in result.items():
        known_vals = data.get(key)
        # Only reconcile keys that are list-valued on both sides.
        if new_vals is None or known_vals is None:
            continue
        if not (isinstance(new_vals, list) and isinstance(known_vals, list)):
            continue
        for idx, val in enumerate(new_vals):
            if val in known_vals:
                continue  # exact match — already canonical
            best_match = None
            best_score = threshold
            for known in known_vals:
                score = _cos(val, known)
                if score > best_score:
                    best_match, best_score = known, score
            if best_match is not None:
                # Near-duplicate: collapse onto the known canonical keyword.
                result[key][idx] = best_match
    return result