Spaces:
Sleeping
Sleeping
Commit ·
e2f7b9c
1
Parent(s): e9b9c34
Removed few-shot metadata extraction and added a semantic check for matching keywords
Browse files
app/ingestion/text_splitter.py
CHANGED
|
@@ -10,12 +10,13 @@ from pydantic import BaseModel
|
|
| 10 |
from typing import Type
|
| 11 |
from app.utils.metadata_utils import MetadataService
|
| 12 |
class splitting_text:
|
| 13 |
-
def __init__(self, documentTypeSchema:Type[BaseModel], llm=None):
|
| 14 |
self.llm = llm
|
| 15 |
self.metadata_extractor = MetadataExtractor(llm = self.llm)
|
| 16 |
self.metadata_services = MetadataService()
|
| 17 |
self.documentTypeSchema = documentTypeSchema
|
| 18 |
self.Keywordsfile_path = None
|
|
|
|
| 19 |
|
| 20 |
def _clean_text(self, text:str)-> str:
|
| 21 |
"""Clean extracted page content"""
|
|
@@ -41,6 +42,7 @@ class splitting_text:
|
|
| 41 |
|
| 42 |
|
| 43 |
if i == 0:
|
|
|
|
| 44 |
output_folder = "app/data/"
|
| 45 |
filename = page.metadata['source'].replace(".","").replace("\\","")+ ".json"
|
| 46 |
output_path = os.path.join(output_folder, filename)
|
|
@@ -66,6 +68,9 @@ class splitting_text:
|
|
| 66 |
new_data = self.metadata_services.normalize_dict_to_lists(
|
| 67 |
Document_metadata.model_dump(exclude_none= True)
|
| 68 |
)
|
|
|
|
|
|
|
|
|
|
| 69 |
for key,vals in new_data.items():
|
| 70 |
if isinstance(vals,list):
|
| 71 |
known_keywords[key] = list(set(known_keywords.get(key,[]) + vals)) #get the existing key and add vals and convert into set then list and update the file.
|
|
|
|
| 10 |
from typing import Type
|
| 11 |
from app.utils.metadata_utils import MetadataService
|
| 12 |
class splitting_text:
|
| 13 |
+
def __init__(self, documentTypeSchema: Type[BaseModel], llm=None, embedding_model=None):
    """Set up the text splitter with its schema and optional models.

    Args:
        documentTypeSchema: Pydantic model class describing the document's
            metadata schema.
        llm: Optional language model, forwarded to the MetadataExtractor.
        embedding_model: Optional embedding model used later for the
            semantic keyword check.
    """
    self.llm = llm
    self.metadata_extractor = MetadataExtractor(llm=self.llm)
    self.metadata_services = MetadataService()
    self.documentTypeSchema = documentTypeSchema
    # Path of the per-document keywords JSON; assigned once the source
    # file is known (presumably during text_splitting) — TODO confirm.
    self.Keywordsfile_path = None
    self.embedding_model = embedding_model
|
| 20 |
|
| 21 |
def _clean_text(self, text:str)-> str:
|
| 22 |
"""Clean extracted page content"""
|
|
|
|
| 42 |
|
| 43 |
|
| 44 |
if i == 0:
|
| 45 |
+
print(f"Processing first page, setting up metadata extraction...")
|
| 46 |
output_folder = "app/data/"
|
| 47 |
filename = page.metadata['source'].replace(".","").replace("\\","")+ ".json"
|
| 48 |
output_path = os.path.join(output_folder, filename)
|
|
|
|
| 68 |
new_data = self.metadata_services.normalize_dict_to_lists(
|
| 69 |
Document_metadata.model_dump(exclude_none= True)
|
| 70 |
)
|
| 71 |
+
print(f"processing keywords update for page {i}")
|
| 72 |
+
new_data = MetadataService.keyword_sementic_check(new_data,known_keywords,embedding_model = self.embedding_model)
|
| 73 |
+
|
| 74 |
for key,vals in new_data.items():
|
| 75 |
if isinstance(vals,list):
|
| 76 |
known_keywords[key] = list(set(known_keywords.get(key,[]) + vals)) #get the existing key and add vals and convert into set then list and update the file.
|
app/metadata_extraction/metadata_ext.py
CHANGED
|
@@ -61,11 +61,11 @@ class MetadataExtractor:
|
|
| 61 |
print(f"⚠️ Parser failed on doc {document.metadata.get('source')} | error: {e}")
|
| 62 |
return metadata_class(added_new_keyword=False)
|
| 63 |
|
| 64 |
-
def extractMetadata(self, metadata_class : Type[BaseModel], document: Document, known_keywords: dict) -> BaseModel:
|
| 65 |
parser = PydanticOutputParser(pydantic_object=metadata_class)
|
| 66 |
|
| 67 |
schema_str = json.dumps(metadata_class.model_json_schema(), indent=2)
|
| 68 |
-
keywords_str = json.dumps(known_keywords, indent=2)
|
| 69 |
|
| 70 |
prompt = ChatPromptTemplate.from_messages([
|
| 71 |
("system", """You are an information extraction system.
|
|
@@ -80,35 +80,34 @@ class MetadataExtractor:
|
|
| 80 |
⚠️ Content Rules:
|
| 81 |
- For exclusions and obligations, DO NOT copy full sentences.
|
| 82 |
- Instead, extract only concise normalized keywords (2–5 words max each).
|
| 83 |
-
- Use existing keywords if they already exist in the provided list.
|
| 84 |
-
- Prefer to reuse existing keywords if they are semantically the same.
|
| 85 |
-
- If you find a new keyword that is a **sub-type** or **more specific variant** of an existing one, keep both:
|
| 86 |
-
*reuse the closest match from existing keywords*, and also add the new one.
|
| 87 |
-
- In that case, set `added_new_keyword=true`.
|
| 88 |
- Do not include raw paragraphs in the output.
|
|
|
|
| 89 |
|
| 90 |
Schema you must follow:
|
| 91 |
{schema}
|
| 92 |
|
| 93 |
-
|
| 94 |
-
{keywords}
|
| 95 |
"""),
|
| 96 |
("human", "Text:\n{document_content}")
|
| 97 |
])
|
| 98 |
-
# -
|
| 99 |
-
# -
|
| 100 |
-
# -
|
| 101 |
-
# -
|
| 102 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
chain = prompt | self.llm | parser
|
| 104 |
|
| 105 |
try:
|
| 106 |
result = chain.invoke({
|
| 107 |
"schema": schema_str,
|
| 108 |
-
"keywords": keywords_str,
|
| 109 |
"document_content": document.page_content
|
| 110 |
})
|
| 111 |
return result
|
| 112 |
except OutputParserException as e:
|
| 113 |
print(f"⚠️ Parser failed on doc {document.metadata.get('source')} | error: {e}")
|
| 114 |
-
return metadata_class(added_new_keyword=
|
|
|
|
| 61 |
print(f"⚠️ Parser failed on doc {document.metadata.get('source')} | error: {e}")
|
| 62 |
return metadata_class(added_new_keyword=False)
|
| 63 |
|
| 64 |
+
def extractMetadata(self, metadata_class : Type[BaseModel], document: Document, known_keywords: dict = None) -> BaseModel:
|
| 65 |
parser = PydanticOutputParser(pydantic_object=metadata_class)
|
| 66 |
|
| 67 |
schema_str = json.dumps(metadata_class.model_json_schema(), indent=2)
|
| 68 |
+
# keywords_str = json.dumps(known_keywords, indent=2)
|
| 69 |
|
| 70 |
prompt = ChatPromptTemplate.from_messages([
|
| 71 |
("system", """You are an information extraction system.
|
|
|
|
| 80 |
⚠️ Content Rules:
|
| 81 |
- For exclusions and obligations, DO NOT copy full sentences.
|
| 82 |
- Instead, extract only concise normalized keywords (2–5 words max each).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
- Do not include raw paragraphs in the output.
|
| 84 |
+
- always keep added_new_keyword as True.
|
| 85 |
|
| 86 |
Schema you must follow:
|
| 87 |
{schema}
|
| 88 |
|
| 89 |
+
|
|
|
|
| 90 |
"""),
|
| 91 |
("human", "Text:\n{document_content}")
|
| 92 |
])
|
| 93 |
+
# - Instead, extract only concise normalized keywords (2–5 words max each).
|
| 94 |
+
# - Use existing keywords if they already exist in the provided list.
|
| 95 |
+
# - Prefer to reuse existing keywords if they are semantically the same.
|
| 96 |
+
# - If you find a new keyword that is a **sub-type** or **more specific variant** of an existing one, keep both:
|
| 97 |
+
# *reuse the closest match from existing keywords*, and also add the new one.
|
| 98 |
+
# - In that case, set `added_new_keyword=true`.
|
| 99 |
+
# Existing Keywords:
|
| 100 |
+
# {keywords}
|
| 101 |
+
|
| 102 |
chain = prompt | self.llm | parser
|
| 103 |
|
| 104 |
try:
|
| 105 |
result = chain.invoke({
|
| 106 |
"schema": schema_str,
|
| 107 |
+
# "keywords": keywords_str,
|
| 108 |
"document_content": document.page_content
|
| 109 |
})
|
| 110 |
return result
|
| 111 |
except OutputParserException as e:
|
| 112 |
print(f"⚠️ Parser failed on doc {document.metadata.get('source')} | error: {e}")
|
| 113 |
+
return metadata_class(added_new_keyword=True) # instantiate fallback
|
app/schemas/metadata_schema.py
CHANGED
|
@@ -18,7 +18,8 @@ class CommonMetaData(BaseModel):
|
|
| 18 |
# )
|
| 19 |
penalties: Optional[List[str]] = Field(None, description="Penalties/non-compliance consequences")
|
| 20 |
notes: Optional[List[str]] = Field(None, description="Freeform additional metadata")
|
| 21 |
-
added_new_keyword: bool = False
|
|
|
|
| 22 |
class InsuranceMetadata(CommonMetaData):
|
| 23 |
|
| 24 |
# --- Insurance ---
|
|
|
|
| 18 |
# )
|
| 19 |
penalties: Optional[List[str]] = Field(None, description="Penalties/non-compliance consequences")
|
| 20 |
notes: Optional[List[str]] = Field(None, description="Freeform additional metadata")
|
| 21 |
+
# added_new_keyword: bool = False
|
| 22 |
+
added_new_keyword: bool = True
|
| 23 |
class InsuranceMetadata(CommonMetaData):
|
| 24 |
|
| 25 |
# --- Insurance ---
|
app/services/RAG_service.py
CHANGED
|
@@ -86,7 +86,7 @@ class RAGService:
|
|
| 86 |
print(f"[RAGService] Document type scheme detected: {self.DocumentTypeScheme}")
|
| 87 |
self.Document_Type = self.metadataservice.Return_document_model(self.DocumentTypeScheme)
|
| 88 |
print(f"[RAGService] Document type model: {self.Document_Type}")
|
| 89 |
-
self.splitter = splitting_text(documentTypeSchema=self.Document_Type, llm=self.llm)
|
| 90 |
print("[RAGService] Splitting document into chunks...")
|
| 91 |
self.chunks = self.splitter.text_splitting(doc)
|
| 92 |
print(f"[RAGService] Total chunks created: {len(self.chunks)}")
|
|
|
|
| 86 |
print(f"[RAGService] Document type scheme detected: {self.DocumentTypeScheme}")
|
| 87 |
self.Document_Type = self.metadataservice.Return_document_model(self.DocumentTypeScheme)
|
| 88 |
print(f"[RAGService] Document type model: {self.Document_Type}")
|
| 89 |
+
self.splitter = splitting_text(documentTypeSchema=self.Document_Type, llm=self.llm, embedding_model=self.embedding_model)
|
| 90 |
print("[RAGService] Splitting document into chunks...")
|
| 91 |
self.chunks = self.splitter.text_splitting(doc)
|
| 92 |
print(f"[RAGService] Total chunks created: {len(self.chunks)}")
|
app/utils/metadata_utils.py
CHANGED
|
@@ -1,14 +1,20 @@
|
|
| 1 |
-
from app.schemas.metadata_schema import InsuranceMetadata, CommonMetaData
|
| 2 |
from app.schemas.request_models import DocumentTypeSchema
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
class MetadataService:
|
| 4 |
def __init__(self):
|
| 5 |
self.metadata_models = {
|
| 6 |
"Insurance": InsuranceMetadata,
|
| 7 |
-
"HR/Employment":
|
| 8 |
-
"Legal/Compliance":
|
| 9 |
-
"Financial/Regulatory":
|
| 10 |
"Government/Public Policy": CommonMetaData,
|
| 11 |
-
"Technical/IT Policies": CommonMetaData
|
|
|
|
|
|
|
| 12 |
}
|
| 13 |
@staticmethod
|
| 14 |
def format_metadata_for_pinecone(metadata: dict) -> dict:
|
|
@@ -48,4 +54,42 @@ class MetadataService:
|
|
| 48 |
normalized[key] = value
|
| 49 |
else:
|
| 50 |
normalized[key] = [value]
|
| 51 |
-
return normalized
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.schemas.metadata_schema import InsuranceMetadata, CommonMetaData, HealthcareMetadata, HRMetadata, LegalMetadata,FinancialMetadata,ProcurementMetadata
|
| 2 |
from app.schemas.request_models import DocumentTypeSchema
|
| 3 |
+
import numpy as np
|
| 4 |
+
import os
|
| 5 |
+
import json
|
| 6 |
+
|
| 7 |
class MetadataService:
|
| 8 |
def __init__(self):
    """Build the registry mapping document-type labels to metadata models."""
    # Types without a dedicated schema fall back to the generic CommonMetaData.
    self.metadata_models = {
        "Insurance": InsuranceMetadata,
        "HR/Employment": HRMetadata,
        "Legal/Compliance": LegalMetadata,
        "Financial/Regulatory": FinancialMetadata,
        "Government/Public Policy": CommonMetaData,
        "Technical/IT Policies": CommonMetaData,
        "Healthcare/Pharma": HealthcareMetadata,
        "Procurement/Vendor Management": ProcurementMetadata,
    }
|
| 19 |
@staticmethod
|
| 20 |
def format_metadata_for_pinecone(metadata: dict) -> dict:
|
|
|
|
| 54 |
normalized[key] = value
|
| 55 |
else:
|
| 56 |
normalized[key] = [value]
|
| 57 |
+
return normalized
|
| 58 |
+
|
| 59 |
+
@staticmethod
|
| 60 |
+
def cosine_similarity(text1, text2, embedding_model) -> float:
|
| 61 |
+
vector1 = embedding_model.embed_query(text1)
|
| 62 |
+
vector2 = embedding_model.embed_query(text2)
|
| 63 |
+
cosine_similarity = np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))
|
| 64 |
+
return cosine_similarity
|
| 65 |
+
|
| 66 |
+
@staticmethod
|
| 67 |
+
def keyword_sementic_check(result, data, embedding_model):
|
| 68 |
+
|
| 69 |
+
# result = result.model_dump()
|
| 70 |
+
# data = json.load(open(data, 'r'))
|
| 71 |
+
# Compare all keys present in both result and data, and check if any value in result[key] is present in data[key]
|
| 72 |
+
for key in result.keys():
|
| 73 |
+
print(f"Comparing key: {key}",flush=True)
|
| 74 |
+
# Only check if both result[key] and data[key] are not None and are lists
|
| 75 |
+
if result[key] is not None and data.get(key) is not None:
|
| 76 |
+
print(f"result[{key}]: {result[key]}",flush=True)
|
| 77 |
+
print(f"data[{key}]: {data[key]}",flush=True)
|
| 78 |
+
# Ensure both are lists (skip if not)
|
| 79 |
+
if isinstance(result[key], list) and isinstance(data[key], list):
|
| 80 |
+
for idx,val in enumerate(result[key]):
|
| 81 |
+
print(f"Comparing value: {val}",flush=True)
|
| 82 |
+
if val in data[key]:
|
| 83 |
+
print(f"'{val}' found in data['{key}']")
|
| 84 |
+
else:
|
| 85 |
+
print(f"'{val}' NOT found in data['{key}']")
|
| 86 |
+
for data_val in data[key]:
|
| 87 |
+
similarity = MetadataService.cosine_similarity(val, data_val,embedding_model)
|
| 88 |
+
print(f"Cosine similarity between '{val}' and '{data_val}': {similarity}")
|
| 89 |
+
if similarity > 0.90:
|
| 90 |
+
print(f"'{val}' is similar to '{data_val}' with similarity {similarity}",flush=True)
|
| 91 |
+
## if similarity is greater than 0.90, then consider it as matched and replace the value in result with data value
|
| 92 |
+
result[key][idx] = data_val
|
| 93 |
+
else:
|
| 94 |
+
print(f"'{val}' is NOT similar to '{data_val}' with similarity {similarity}",flush=True)
|
| 95 |
+
return result
|