File size: 8,437 Bytes
cff1a2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
import json
import re
from pathlib import Path
from typing import Dict, List

# Section titles recognised when labelling chunks semantically.
# Order matters to consumers that break ties by list position.
SECTION_HEADERS = [
    "Overview",
    "About",
    "Introduction",
    "Key Features",
    "Features",
    "Eligibility",
    "Who Can Buy",
    "Entry Age",
    "Benefits",
    "Death Benefit",
    "Maturity Benefit",
    "Survival Benefit",
    "Riders",
    "Add-on",
    "Premium",
    "Charges",
    "Policy Charges",
    "Fund Management",
    "Policy Term",
    "Term",
    "Exclusions",
    "What is Not Covered",
    "Tax Benefits",
    "Tax Benefit",
    "Claim Process",
    "How to Claim",
    "Claims",
    "Disclaimers",
    "Disclaimer",
    "Sum Assured",
    "Cover Amount",
    "Waiting Period",
    "Free Look",
    "Grace Period",
    "Surrender",
    "Loan",
    "Revival",
    "Settlement",
    "Nomination",
]

# One case-insensitive, whole-word matcher per header, in the same order
# as SECTION_HEADERS.
SECTION_PATTERNS = [
    re.compile(r'\b' + title + r'\b', flags=re.IGNORECASE)
    for title in SECTION_HEADERS
]


class MetadataExtractor:
    """
    Extracts metadata from file paths and enriches them with an external JSON config.
    docs/Insurer/InsuranceType/Product.pdf + plan_metadata.json

    The insurer and insurance type are read from the folder names below the
    base directory; the product name and document type are read from the
    file name.
    """

    # Misspellings corrected in file names, applied in this order.
    _NAME_FIXES = (
        (re.compile(r'\bEdelwiess\b', re.IGNORECASE), 'Edelweiss'),
        (re.compile(r'Edelweis(?!s)', re.IGNORECASE), 'Edelweiss'),
        (re.compile(r'Smapoorna', re.IGNORECASE), 'Sampoorna'),
        (re.compile(r'Fotune', re.IGNORECASE), 'Fortune'),
        (re.compile(r'^(Tata\s*AIA|TATA_AIA)\b', re.IGNORECASE), 'TATA AIA'),
    )

    # Document-type markers, checked in this order (CIS wins over brochure).
    # Each pattern matches the marker only as a standalone token (not flanked
    # by letters/digits) together with any separators in front of it, so a
    # name such as "Decision" is not mistaken for a CIS document.
    _DOC_TYPE_TOKENS = (
        ("cis", re.compile(r'[\s_-]*(?<![A-Za-z0-9])cis(?![A-Za-z0-9])', re.IGNORECASE)),
        ("brochure", re.compile(r'[\s_-]*(?<![A-Za-z0-9])brochure(?![A-Za-z0-9])', re.IGNORECASE)),
    )

    # Trailing document-type marker, used when building plan ids.
    _DOC_TYPE_SUFFIX = re.compile(
        r'[\s_-]*(?<![A-Za-z0-9])(?:brochure|cis)$', re.IGNORECASE
    )

    # Folder name -> standardized insurance type. Unlisted names pass through.
    _CATEGORY_ALIASES = {
        "Term Insurance Plans": "Term Insurance",
        "Term Plans": "Term Insurance",
        "Term Plan": "Term Insurance",
        "ULIP Plans": "Unit Linked Insurance Plan",
        "ULIP Plan": "Unit Linked Insurance Plan",
        "Wealth Creation Plans": "Unit Linked Insurance Plan",
        "Savings Plan": "Savings Plan",
        "Savings Plans": "Savings Plan",
        "Guaranteed Income Plans": "Savings Plan",
        "Retirement Plans": "Retirement and Pension",
        "Retirement and Pension Plan": "Retirement and Pension",
        "Pension Plan": "Retirement and Pension",
        "Health Plan": "Health Insurance",
        "Health Plans": "Health Insurance",
        "Group Solutions": "Group Plan",
        "Micro Plans": "Micro Insurance",
        "Combo": "Combo Plan",
    }

    def __init__(self, base_path: str, config_path: str = "configs/plan_metadata.json"):
        """
        Args:
            base_path: Root directory that holds the Insurer/Category/Product tree.
            config_path: JSON file with extra per-product metadata, keyed by
                product name. A missing file is treated as "no extra metadata".
        """
        self.base_path = Path(base_path).resolve()
        self.config_path = Path(config_path)
        self.external_metadata = self._load_external_metadata()

    def _load_external_metadata(self) -> Dict[str, Dict]:
        """
        Load the optional JSON config.

        Returns:
            The parsed JSON content, or an empty dict when the file does not exist.

        Raises:
            json.JSONDecodeError: If the file exists but is not valid JSON.
        """
        try:
            # Explicit encoding: the platform default is not UTF-8 everywhere.
            with open(self.config_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            return {}

    def _generate_plan_id(self, provider: str, insurance_type: str, product_name: str) -> str:
        """
        Generate deterministic plan_id for linking brochure and CIS.
        Format: provider_insurancetype_productname (lowercase, underscores)

        A trailing "brochure"/"cis" marker is dropped from the product name so
        both documents of one plan get the same id. The marker is removed only
        when it is a standalone token; a name that merely ends in those
        letters (e.g. "Francis") is left intact.
        """
        clean_product = self._DOC_TYPE_SUFFIX.sub('', product_name).strip()
        normalized = f"{provider}_{insurance_type}_{clean_product}"
        # Collapse every run of non-alphanumerics into a single underscore.
        return re.sub(r'[^a-z0-9]+', '_', normalized.lower()).strip('_')

    def extract_from_path(self, file_path: str) -> Dict[str, str]:
        """
        Parses the file path to extract insurer and insurance type (category) from folders.
        Enhanced with plan_id and document_type for RAG compliance.

        Args:
            file_path: Path to a document located under the base directory.
                Expected structure: <base>/Insurer/Category/Product.ext

        Returns:
            A dict with the keys "source", "filename", "product_name",
            "document_type" ("brochure" or "cis"), "insurer", "insurance_type"
            and "plan_id", plus any extra keys from the external config whose
            entry matches the product name exactly.

        Raises:
            ValueError: If file_path does not lie inside the base directory.
        """
        path = Path(file_path).resolve()

        # relative_to compares whole path components, so a sibling such as
        # "docs_old" is not accepted for base "docs" the way a plain string
        # prefix test would accept it.
        try:
            parts = path.relative_to(self.base_path).parts
        except ValueError:
            raise ValueError(
                f"File {file_path} is not inside base directory {self.base_path}"
            ) from None

        # Normalize common typos in the product name.
        product_name = path.stem
        for pattern, replacement in self._NAME_FIXES:
            product_name = pattern.sub(replacement, product_name)

        # Detect document type (CIS vs Brochure); brochure is the default.
        doc_type = "brochure"
        for candidate, token in self._DOC_TYPE_TOKENS:
            if token.search(product_name):
                doc_type = candidate
                # Drop the marker and any separators left dangling at the ends.
                product_name = token.sub('', product_name).strip().strip(' \t_-')
                break

        # Extract insurer and category from the folder structure. A file that
        # sits directly in the base directory keeps both defaults; a file one
        # level down has an insurer but no category folder.
        insurer = "Other"
        category = "General"
        if len(parts) >= 2:
            insurer = parts[0]
        if len(parts) >= 3:
            category = parts[1]

        standard_category = self._CATEGORY_ALIASES.get(category, category)

        # Generate plan_id for linking brochure and CIS
        plan_id = self._generate_plan_id(insurer, standard_category, product_name)

        metadata = {
            "source": str(file_path),
            "filename": path.name,
            "product_name": product_name,
            "document_type": doc_type,  # "brochure" or "cis"
            "insurer": insurer,
            "insurance_type": standard_category,
            "plan_id": plan_id,  # Links brochure and CIS together
        }

        # Optional: merge additional static info if the product matches exactly.
        ext_data = self.external_metadata.get(product_name) \
            if isinstance(self.external_metadata, dict) else None
        if isinstance(ext_data, dict):
            for k, v in ext_data.items():
                if k == "category":
                    # The folder-derived insurance type takes precedence.
                    continue
                # Lists are flattened into a comma-separated string.
                metadata[k] = ", ".join(v) if isinstance(v, list) else v

        return metadata

    @staticmethod
    def detect_section(text: str) -> str:
        """
        Detect the most likely section based on content headers.
        Returns the section name or 'General' if no match.

        Only the first 500 characters are inspected. Headers are matched as
        whole words via SECTION_PATTERNS, so "Term" does not fire on
        "determine". When several headers occur, the one appearing earliest
        wins; ties go to the header listed first in SECTION_HEADERS.
        """
        if not text:
            return "General"

        sample = text[:500].lower()

        best_start = None
        best_header = None
        for header, pattern in zip(SECTION_HEADERS, SECTION_PATTERNS):
            found = pattern.search(sample)
            if found and (best_start is None or found.start() < best_start):
                best_start, best_header = found.start(), header

        if best_header is not None:
            return best_header

        # Keyword-based fallback detection; the first section (in this order)
        # with any keyword present wins.
        # NOTE(review): these are plain substring checks, so e.g. "age" also
        # hits "coverage" and "war" hits "toward". Left unchanged here because
        # tightening them would alter existing chunk labels - confirm intent.
        keyword_map = {
            "Eligibility": ["age", "entry age", "minimum age", "maximum age", "who can"],
            "Benefits": ["death benefit", "maturity benefit", "survival benefit", "sum assured"],
            "Exclusions": ["not covered", "excluded", "suicide", "war", "pre-existing"],
            "Charges": ["premium", "fund management", "mortality", "allocation", "admin"],
            "Tax Benefits": ["80c", "80d", "10(10d)", "income tax", "tax benefit"],
            "Riders": ["rider", "accidental", "critical illness", "waiver"],
            "Claim Process": ["claim", "intimation", "documents required", "settlement"]
        }

        for section, keywords in keyword_map.items():
            if any(kw in sample for kw in keywords):
                return section

        return "General"


# Manual smoke check: run this module directly to print sample results.
if __name__ == "__main__":
    demo = MetadataExtractor("docs")

    # One brochure and one CIS path, each printed under its own label.
    labelled_paths = (
        ("Brochure:", "docs/TATA AIA/Term Plans/TATA AIA Smart Value Income Plan Brochure.pdf"),
        ("CIS:", "docs/TATA AIA/Term Plans/TATA AIA Maha Raksha Supreme Select CIS.docx"),
    )
    for label, doc_path in labelled_paths:
        print(label, demo.extract_from_path(doc_path))

    # Section detection on a short eligibility snippet.
    snippet = "Eligibility Criteria: The minimum entry age is 18 years and maximum is 65 years."
    print("Section detected:", MetadataExtractor.detect_section(snippet))