datasciencesage committed on
Commit
95ff1e1
·
verified ·
1 Parent(s): 363db8d

Upload 8 files

Browse files
core/document_parser.py ADDED
@@ -0,0 +1,343 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ import pdfplumber
4
+ import re
5
+ from pathlib import Path
6
+ from typing import Dict, Any, Optional, List, Tuple
7
+ from dataclasses import dataclass
8
+ from loguru import logger
9
+
10
+
11
@dataclass
class DocumentChunk:
    """A contiguous span of text carved out of a parsed document.

    Produced by DocumentParser._create_chunks; consumed downstream for
    embedding / RAG retrieval.
    """
    chunk_id: str             # stable identifier, e.g. "chunk_0"
    text: str                 # the chunk's text content (stripped)
    page_num: int             # source page; 0 when page tracking is disabled
    start_char: int           # offset of the chunk start in the full document text
    end_char: int             # offset just past the chunk end
    metadata: Dict[str, Any]  # extra info, e.g. source_file and chunk_length
20
+
21
+
22
@dataclass
class ParsedDocument:
    """Full result of DocumentParser.parse_pdf for one PDF file."""
    file_name: str                # base name of the parsed file
    total_pages: int              # page count reported by pdfplumber
    text_content: str             # all page texts joined with blank lines
    pages: List[Dict[str, Any]]   # per-page info (page_num, text, sizes, counts)
    tables: List[Dict[str, Any]]  # extracted tables (headers, rows, raw_data)
    chunks: List[DocumentChunk]   # overlapping text chunks for embedding
    metadata: Dict[str, Any]      # file-level stats (path, counts, text length)
32
+
33
+
34
class DocumentParser:
    """PDF parser with overlap chunking for RAG.

    Uses pdfplumber to pull text and tables out of a PDF, splits the full
    text into overlapping chunks for embedding, and offers targeted
    extractors for CRIF bureau scores and GSTR-3B sales figures.
    """

    def __init__(self, chunk_size=1000, chunk_overlap=200):
        """chunk_size: max characters per chunk.
        chunk_overlap: characters repeated from the tail of one chunk at
        the head of the next, to preserve context across chunk borders."""
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        logger.info(f"Parser initialized - chunk_size={chunk_size}, overlap={chunk_overlap}")

    def parse_pdf(self, pdf_path):
        """
        Parse a PDF and return a ParsedDocument, or None on failure.

        Pages that fail to parse are skipped so one bad page does not
        abort the whole document.
        """
        logger.info(f"Parsing: {Path(pdf_path).name}")

        try:
            with pdfplumber.open(pdf_path) as pdf:
                all_text = []
                pages_data = []
                tables_data = []

                # go through each page; per-page failures are logged and skipped
                for page_num, page in enumerate(pdf.pages, start=1):
                    try:
                        page_result = self._parse_page(page, page_num)

                        all_text.append(page_result["text"])
                        pages_data.append(page_result["page_data"])
                        tables_data.extend(page_result["tables"])

                        logger.debug(f"Page {page_num}: {len(page_result['text'])} chars, {len(page_result['tables'])} tables")

                    except Exception as e:
                        logger.error(f"Error on page {page_num}: {str(e)}")
                        continue  # skip problematic pages

                full_text = "\n\n".join(all_text)

                # create chunks for embeddings
                chunks = self._create_chunks(full_text, Path(pdf_path).name)

                metadata = {
                    "file_path": pdf_path,
                    "file_name": Path(pdf_path).name,
                    "total_pages": len(pdf.pages),
                    "total_tables": len(tables_data),
                    "total_chunks": len(chunks),
                    "text_length": len(full_text)
                }

                parsed_doc = ParsedDocument(
                    file_name=Path(pdf_path).name,
                    total_pages=len(pdf.pages),
                    text_content=full_text,
                    pages=pages_data,
                    tables=tables_data,
                    chunks=chunks,
                    metadata=metadata
                )

                logger.success(f"Parsed {len(pdf.pages)} pages, {len(tables_data)} tables, {len(chunks)} chunks")

                return parsed_doc

        except FileNotFoundError:
            logger.error(f"File not found: {pdf_path}")
            return None
        except Exception as e:
            logger.error(f"Failed to parse {pdf_path}: {str(e)}")
            return None

    def _parse_page(self, page, page_num):
        """Parse a single pdfplumber page into text, tables and page_data.

        Never raises: on error it returns an empty-but-schema-complete
        result so callers can index the same keys on every page.
        """
        try:
            # grab text (extract_text may return None on image-only pages)
            page_text = page.extract_text()
            if page_text is None:
                page_text = ""

            # extract tables; table[0] is treated as the header row
            tables = []
            raw_tables = page.extract_tables()

            for table_idx, table in enumerate(raw_tables):
                if table and len(table) > 0:
                    try:
                        table_data = {
                            "page": page_num,
                            "table_id": f"p{page_num}_t{table_idx + 1}",
                            "headers": table[0] if table else [],
                            "rows": table[1:] if len(table) > 1 else [],
                            "raw_data": table
                        }
                        tables.append(table_data)
                    except Exception as e:
                        logger.warning(f"Table {table_idx} error on page {page_num}: {str(e)}")

            page_data = {
                "page_num": page_num,
                "text": page_text,
                "text_length": len(page_text),
                "tables_count": len(tables),
                "width": page.width,
                "height": page.height
            }

            return {
                "text": page_text,
                "tables": tables,
                "page_data": page_data
            }

        except Exception as e:
            logger.error(f"_parse_page error for page {page_num}: {str(e)}")
            # fallback keeps the same page_data schema as the success path
            # (width/height were previously missing here, which could
            # KeyError downstream consumers)
            return {
                "text": "",
                "tables": [],
                "page_data": {
                    "page_num": page_num,
                    "text": "",
                    "text_length": 0,
                    "tables_count": 0,
                    "width": 0,
                    "height": 0
                }
            }

    def _create_chunks(self, text, file_name):
        """
        Break text into chunks of roughly chunk_size chars with
        chunk_overlap chars of carry-over between consecutive chunks.
        Splits on blank-line paragraph boundaries.
        TODO: maybe improve the chunking logic later
        """
        try:
            chunks = []

            if not text:
                logger.warning("Empty text for chunking")
                return chunks

            # split by paragraphs
            paragraphs = text.split('\n\n')

            current_chunk = ""
            current_start = 0
            chunk_id = 0

            for para in paragraphs:
                para = para.strip()
                if not para:
                    continue

                # check if adding para exceeds size
                if len(current_chunk) + len(para) > self.chunk_size and current_chunk:
                    # save chunk
                    chunk = DocumentChunk(
                        chunk_id=f"chunk_{chunk_id}",
                        text=current_chunk.strip(),
                        page_num=0,  # not tracking page num for now
                        start_char=current_start,
                        end_char=current_start + len(current_chunk),
                        metadata={
                            "source_file": file_name,
                            "chunk_length": len(current_chunk)
                        }
                    )
                    chunks.append(chunk)
                    chunk_id += 1

                    # start new chunk seeded with the tail of the previous one
                    if len(current_chunk) > self.chunk_overlap:
                        overlap_text = current_chunk[-self.chunk_overlap:]
                    else:
                        overlap_text = current_chunk
                    # start offsets are approximate (strip/joins shift them)
                    current_start = current_start + len(current_chunk) - len(overlap_text)
                    current_chunk = overlap_text + "\n\n" + para
                else:
                    # add to current chunk
                    if current_chunk:
                        current_chunk += "\n\n" + para
                    else:
                        current_chunk = para

            # add final chunk
            if current_chunk:
                chunk = DocumentChunk(
                    chunk_id=f"chunk_{chunk_id}",
                    text=current_chunk.strip(),
                    page_num=0,
                    start_char=current_start,
                    end_char=current_start + len(current_chunk),
                    metadata={
                        "source_file": file_name,
                        "chunk_length": len(current_chunk)
                    }
                )
                chunks.append(chunk)

            logger.info(f"Created {len(chunks)} chunks")
            return chunks

        except Exception as e:
            logger.error(f"Chunking error: {str(e)}")
            return []

    def extract_bureau_score(self, parsed_doc):
        """
        Grab the bureau (CIBIL) score from a CRIF report.
        Primary pattern: "PERFORM CONSUMER 2.2 300-900 627" — the score is
        the 3-digit number following the 300-900 range marker.
        Returns {"value": int, "source": str} or None.
        """
        try:
            text = parsed_doc.text_content

            # main pattern - score after range
            pattern = r'PERFORM\s+CONSUMER.*?300-900\s+(\d{3})'
            match = re.search(pattern, text, re.IGNORECASE)

            if match:
                score = int(match.group(1))
                if 300 <= score <= 900:
                    logger.info(f"Found bureau score: {score}")
                    return {
                        "value": score,
                        "source": "CRIF Report – Score Section"
                    }

            # fallback - scan the first couple of pages for a 3-digit number
            # near score-related keywords
            for page in parsed_doc.pages[:2]:
                page_text = page["text"]

                # use finditer so the context window is taken around the
                # actual match position (str.find could return an earlier,
                # unrelated occurrence of the same digits and the keyword
                # check would then look at the wrong context)
                for m in re.finditer(r'\b(\d{3})\b', page_text):
                    num = int(m.group(1))
                    if 300 <= num <= 900:
                        # check if it's actually a score via nearby keywords
                        context = page_text[max(0, m.start() - 100):m.end() + 100]

                        keywords = ['score', 'cibil', 'credit', 'bureau']
                        if any(kw in context.lower() for kw in keywords):
                            logger.info(f"Found score (fallback): {num}")
                            return {
                                "value": num,
                                "source": f"CRIF Report – Page {page['page_num']}"
                            }

            logger.warning("Bureau score not found")
            return None

        except Exception as e:
            logger.error(f"Error extracting bureau score: {str(e)}")
            return None

    def extract_gst_sales(self, parsed_doc):
        """Extract taxable sales from the GSTR-3B Table 3.1 row (a).

        Returns {"month": str, "sales": float, "source": str} or None.
        """
        try:
            text = parsed_doc.text_content
            filename = parsed_doc.file_name

            # get month from the document body ("Period <Month>")
            month_match = re.search(r'Period\s+(\w+)', text)
            month_name = month_match.group(1) if month_match else "Unknown"

            # extract year from filename (GSTR3B_..._012025.pdf format: MMYYYY)
            filename_year_match = re.search(r'_(\d{2})(\d{4})\.pdf', filename)
            if filename_year_match:
                year = filename_year_match.group(2)
            else:
                # fallback: "Year YYYY" in the document text, else assume 2025
                year_match = re.search(r'Year\s+(\d{4})', text)
                year = year_match.group(1) if year_match else "2025"

            formatted_month = f"{month_name} {year}"

            # search extracted tables for the outward-supplies row
            for table in parsed_doc.tables:
                rows = table.get("rows", [])

                for row in rows:
                    if row and len(row) > 1:
                        first_cell = str(row[0]).replace('\n', ' ')

                        # find row (a) with outward supplies
                        if "(a)" in first_cell and "Outward taxable supplies" in first_cell:
                            if len(row) > 1 and row[1]:
                                value_str = str(row[1])
                                # strip currency symbols/commas, keep digits and dot
                                clean_value = re.sub(r'[^\d.]', '', value_str)

                                if clean_value:
                                    try:
                                        sales = float(clean_value)
                                        logger.info(f"GST sales: {sales} for {formatted_month}")
                                        return {
                                            "month": formatted_month,
                                            "sales": sales,
                                            "source": "GSTR-3B Table 3.1(a)"
                                        }
                                    except ValueError as e:
                                        logger.warning(f"Couldn't parse sales value '{clean_value}': {str(e)}")

            logger.warning(f"Sales data not found for {formatted_month}")
            return None

        except Exception as e:
            logger.error(f"Error extracting GST sales: {str(e)}")
            return None

    def get_chunks_text(self, chunks):
        """Return the plain text of each chunk (for embedding)."""
        try:
            return [chunk.text for chunk in chunks]
        except Exception as e:
            logger.error(f"Error getting chunks text: {str(e)}")
            return []
core/domain_knowledge.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
# Hardcoded domain knowledge snippets for the credit/GST document pipeline.
# Each entry is one self-contained definition; DomainRAG embeds every
# snippet at startup and retrieves the most similar ones per query.

# Credit Bureau Terminology
BUREAU_TERMINOLOGY = [
    "Bureau credit score: Numerical representation of creditworthiness ranging from 300 (poor) to 900 (excellent). Higher scores indicate better credit history and lower risk.",
    "DPD (Days Past Due): Number of days a payment is overdue beyond the due date. Common thresholds monitored are 30+, 60+, and 90+ DPD.",
    "30+ DPD: Count of accounts with payments overdue by 30 or more days in the specified monitoring period. Indicates early stage delinquency.",
    "60+ DPD: Count of accounts with payments overdue by 60 or more days in the specified monitoring period. Indicates moderate delinquency.",
    "90+ DPD: Count of accounts with payments overdue by 90 or more days in the specified monitoring period. Indicates serious delinquency.",
    "Settlement: Debt resolved by borrower paying less than the full amount owed, typically after negotiation with creditor. Marked negatively on credit report.",
    "Write-off: Debt declared unrecoverable by lender and removed from active accounts. Severely impacts credit score and indicates non-payment.",
    "NTC (No-Track-Case): Credit applicants with insufficient credit history or no previous credit accounts in bureau database. Also called 'New to Credit'.",
    "Suit Filed: Legal action initiated by creditor for debt recovery through courts. Indicates serious delinquency and unwillingness to pay.",
    "Wilful Default: Deliberate non-payment of debt despite having the financial ability to pay. Considered fraudulent behavior and severely impacts creditworthiness.",
    "Live PL/BL: Active Personal Loan or Business Loan currently being serviced by the borrower with regular payments.",
    "Overdue amount: Total unpaid amount across all accounts that is past the due date. Sum of all overdue balances.",
    "Credit inquiry: Request made by lender to check credit report when applicant applies for credit. Too many inquiries indicate credit hunger.",
    "Active loans: Loans currently being serviced by borrower, not yet closed or settled. Indicates current credit obligations.",
    "Loan exposure: Total outstanding amount across all loans. Also called total debt or credit exposure.",
]

# GST and GSTR-3B Terminology
GST_TERMINOLOGY = [
    "GSTR-3B: Monthly return filing summarizing outward supplies, input tax credit claimed, and net tax liability for the tax period.",
    "Table 3.1(a): Section in GSTR-3B reporting outward taxable supplies (other than zero rated, nil rated and exempted). This is the main sales figure.",
    "Outward supplies: Goods or services provided by the registered GST taxpayer to customers. This is the sales/revenue of the business.",
    "Taxable supplies: Supplies on which GST is levied at applicable rates (5%, 12%, 18%, or 28%). Excludes exempted and nil-rated supplies.",
    "Taxable value: The base value on which GST is calculated, excluding the GST amount itself. This is the pre-tax revenue.",
    "Outward taxable supplies: Sales of goods/services on which GST is applicable. Found in GSTR-3B Table 3.1, row (a).",
    "GSTR-3B structure: Contains multiple tables - Table 3.1 for outward supplies, Table 3.2 for inter-state supplies, Table 4 for input tax credit.",
    "Tax period: The month and year for which the GST return is filed. Format is usually 'Month YYYY' (e.g., January 2025).",
    "GSTIN: GST Identification Number, unique 15-digit alphanumeric code assigned to each registered taxpayer.",
]

# Validation and Business Rules
VALIDATION_RULES = [
    "Valid bureau credit scores: Must be between 300 and 900 inclusive. Scores outside this range are invalid.",
    "Credit score interpretation: 300-579 is Poor, 580-669 is Fair, 670-739 is Good, 740-799 is Very Good, 800-900 is Excellent.",
    "DPD hierarchy rule: 90+ DPD count ≤ 60+ DPD count ≤ 30+ DPD count. If this is violated, data may be incorrect.",
    "GST sales validation: Taxable value should be non-negative numbers. Negative sales indicate data entry error.",
    "Suspicious GST amounts: Values over 10 crore (100,000,000 rupees) should be flagged for verification as potentially incorrect.",
    "Written-off debt amount: Should be non-negative. Negative values indicate error in extraction or data.",
    "Loan counts validation: Max loans and max active loans should be non-negative integers. Cannot have negative loan counts.",
    "Overdue threshold: Maximum allowable overdue amount, typically ranging from 0 to several lakhs. Depends on risk appetite.",
    "Credit inquiry limits: Excessive inquiries (>5 in 6 months) indicate credit hunger and should be flagged.",
    "Zero values interpretation: Zero or null values may indicate either absence of the attribute or that the parameter is not applicable.",
]

# Extraction Hints and Location Guidance
EXTRACTION_HINTS = [
    "Bureau credit score location: Typically appears near terms like 'PERFORM', 'CONSUMER', 'Score', 'CIBIL', or in a dedicated score section on first page.",
    "Credit score format: Usually displayed as a 3-digit number between 300-900, sometimes with a gauge or range indicator.",
    "DPD information location: Often found in payment history tables, delinquency sections, or account performance summary.",
    "Settlement and write-off status: Usually marked explicitly in account status columns with keywords 'Settled', 'Written Off', or status codes.",
    "Live loan indicators: Marked with 'Active', 'Current', 'Live', or similar status in account listings.",
    "GSTR-3B sales extraction: Sales figures are in Table 3.1, row labeled '(a) Outward taxable supplies', second column shows taxable value.",
    "GSTR-3B month extraction: Month information appears as 'Period' followed by month name (January, February, etc.).",
    "GSTR-3B year extraction: Year appears in 'Year' field in format 'YYYY-YY' (e.g., 2024-25) or in filename as MMYYYY (e.g., 012025).",
    "Table structure in PDFs: Tables may span multiple pages. Look for continuation rows and merged cells.",
    "Multiple bureau reports: When processing multiple reports, extract parameters separately for each person/entity.",
    "NTC acceptance: Check for explicit mentions of 'No Track Case', 'NTC', 'New to Credit' status in summary or remarks.",
    "Suit filed indicators: Look for keywords 'Suit Filed', 'Legal Action', 'Court Case' in account remarks or status.",
]

# Common Patterns and Formats
COMMON_PATTERNS = [
    "Date formats in bureau reports: DD-MM-YYYY, DD/MM/YYYY, or MMM-YYYY for month-year format.",
    "Currency representation: Indian Rupees shown as '₹', 'Rs.', 'INR', or just numbers with commas (e.g., 1,50,000).",
    "Percentage formats: Shown with '%' symbol or as decimals (0.15 = 15%).",
    "Boolean values: Yes/No, True/False, Y/N, 1/0, or Present/Absent for presence/absence of attributes.",
    "Account types: PL (Personal Loan), BL (Business Loan), CC (Credit Card), HL (Home Loan), AL (Auto Loan).",
    "Status codes in bureau: STD (Standard), SMA (Special Mention Account), SUB (Sub-standard), DBT (Doubtful), LSS (Loss).",
]

# All knowledge combined for easy iteration (this is what DomainRAG embeds)
ALL_KNOWLEDGE = (
    BUREAU_TERMINOLOGY +
    GST_TERMINOLOGY +
    VALIDATION_RULES +
    EXTRACTION_HINTS +
    COMMON_PATTERNS
)

# Category mapping for retrieval filtering
KNOWLEDGE_CATEGORIES = {
    "bureau_terminology": BUREAU_TERMINOLOGY,
    "gst_terminology": GST_TERMINOLOGY,
    "validation_rules": VALIDATION_RULES,
    "extraction_hints": EXTRACTION_HINTS,
    "common_patterns": COMMON_PATTERNS,
}
core/domain_rag.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict, Any, Optional
2
+ from dataclasses import dataclass
3
+ import numpy as np
4
+ from loguru import logger
5
+
6
+ from .domain_knowledge import ALL_KNOWLEDGE, KNOWLEDGE_CATEGORIES
7
+ from .embeddings import EmbeddingService
8
+
9
+
10
@dataclass
class DomainSnippet:
    """A domain-knowledge snippet together with its embedding."""
    text: str              # the snippet sentence(s)
    embedding: np.ndarray  # embedding vector as returned by the embedding service
    category: str          # heuristic label from DomainRAG._categorize_snippet
    index: int             # position of the snippet within ALL_KNOWLEDGE
17
+
18
+
19
class DomainRAG:
    """
    RAG over the hardcoded domain knowledge notes.

    Every snippet in ALL_KNOWLEDGE is embedded once at construction time;
    retrieval is then a cosine-similarity search over those embeddings.
    """

    def __init__(self, embedding_service):
        """embedding_service: EmbeddingService used to embed snippets and queries."""
        try:
            self.embedding_service = embedding_service
            self.snippets = []  # list of DomainSnippet; stays empty if init fails
            self._initialize_knowledge()

        except Exception as e:
            logger.error(f"Failed to init DomainRAG: {str(e)}")
            raise

    def _count_categories(self):
        """Return {category: snippet count} for the loaded snippets.

        Shared by _initialize_knowledge and get_statistics (the counting
        loop was previously duplicated in both).
        """
        counts = {}
        for snippet in self.snippets:
            counts[snippet.category] = counts.get(snippet.category, 0) + 1
        return counts

    def _initialize_knowledge(self):
        """Embed all domain knowledge on startup (best effort)."""
        try:
            logger.info("Initializing domain knowledge RAG...")
            logger.info(f"Total snippets to embed: {len(ALL_KNOWLEDGE)}")

            # batch embed all snippets
            embeddings = self.embedding_service.create_embeddings_batch(
                texts=ALL_KNOWLEDGE
            )

            if not embeddings or len(embeddings) != len(ALL_KNOWLEDGE):
                logger.error("Failed to create embeddings for domain knowledge")
                return

            # store snippets with their embeddings
            for i, text in enumerate(ALL_KNOWLEDGE):
                category = self._categorize_snippet(text)
                self.snippets.append(DomainSnippet(
                    text=text,
                    embedding=embeddings[i],
                    category=category,
                    index=i
                ))

            logger.success(f"Domain RAG ready: {len(self.snippets)} snippets embedded")

            # log category breakdown
            for category, count in self._count_categories().items():
                logger.info(f"  - {category}: {count} snippets")

        except Exception as e:
            logger.error(f"Error initializing domain knowledge: {str(e)}")
            # don't crash - system can work without domain knowledge
            self.snippets = []

    def _categorize_snippet(self, text):
        """Heuristically assign a snippet to a category by keyword match.

        NOTE(review): checks run in a fixed order (bureau → gst →
        validation → hint → pattern), so a snippet matching several
        keyword lists gets the first matching category.
        """
        try:
            text_lower = text.lower()

            # bureau stuff
            bureau_keywords = ['bureau', 'credit', 'dpd', 'score', 'cibil', 'loan',
                               'settlement', 'write-off', 'ntc', 'suit']
            if any(kw in text_lower for kw in bureau_keywords):
                return "bureau"

            # gst related
            gst_keywords = ['gst', 'gstr', 'table', 'supply', 'taxable', 'outward',
                            'gstin', 'tax period']
            if any(kw in text_lower for kw in gst_keywords):
                return "gst"

            # validation rules
            validation_keywords = ['valid', 'should', 'must', 'rule', 'between', 'range',
                                   'validation', 'suspicious', 'negative']
            if any(kw in text_lower for kw in validation_keywords):
                return "validation"

            # extraction hints
            hint_keywords = ['location', 'found', 'appears', 'extraction', 'look for',
                             'typically', 'usually', 'marked']
            if any(kw in text_lower for kw in hint_keywords):
                return "extraction_hint"

            # pattern stuff
            pattern_keywords = ['format', 'pattern', 'representation', 'shown as',
                                'display', 'code', 'type']
            if any(kw in text_lower for kw in pattern_keywords):
                return "common_pattern"

            return "general"

        except Exception as e:
            logger.error(f"Error categorizing snippet: {str(e)}")
            return "general"

    def retrieve(self, query, top_k=3, min_similarity=0.3, category_filter=None):
        """
        Return the texts of the most relevant domain snippets for query.

        category_filter: optional category name (as produced by
        _categorize_snippet) to restrict the candidate pool.
        """
        try:
            if not self.snippets:
                logger.warning("No domain knowledge available")
                return []

            logger.info(f"Retrieving domain knowledge for: '{query[:100]}...'")

            # embed the query
            query_embedding = self.embedding_service.create_embedding(query)

            if query_embedding is None:
                logger.error("Failed to create query embedding")
                return []

            # filter by category if needed
            filtered_snippets = self.snippets
            if category_filter:
                filtered_snippets = [s for s in self.snippets if s.category == category_filter]
                logger.info(f"Filtered to {len(filtered_snippets)} snippets in '{category_filter}'")

            if not filtered_snippets:
                logger.warning(f"No snippets for category: {category_filter}")
                return []

            # prepare parallel lists for the similarity search
            snippet_embeddings = [s.embedding for s in filtered_snippets]
            snippet_texts = [s.text for s in filtered_snippets]
            snippet_metadata = [{"category": s.category, "index": s.index} for s in filtered_snippets]

            # find similar snippets
            results = self.embedding_service.find_most_similar(
                query_embedding=query_embedding,
                candidate_embeddings=snippet_embeddings,
                candidate_texts=snippet_texts,
                candidate_metadata=snippet_metadata,
                top_k=top_k,
                min_similarity=min_similarity
            )

            if results:
                logger.success(f"Retrieved {len(results)} snippets (top: {results[0].similarity:.3f})")
                return [r.text for r in results]
            else:
                logger.warning(f"No snippets above threshold {min_similarity}")
                return []

        except Exception as e:
            logger.error(f"Error retrieving domain knowledge: {str(e)}")
            return []

    def retrieve_by_category(self, query, categories, snippets_per_category=2):
        """Retrieve snippets for each requested category; returns
        {category: [snippet text, ...]} with empty categories omitted."""
        try:
            results = {}

            for category in categories:
                snippets = self.retrieve(
                    query=query,
                    top_k=snippets_per_category,
                    category_filter=category
                )
                if snippets:
                    results[category] = snippets

            return results

        except Exception as e:
            logger.error(f"Error retrieving by category: {str(e)}")
            return {}

    def get_all_snippets(self, category=None):
        """Return all snippet texts, optionally restricted to one category."""
        try:
            if category:
                return [s.text for s in self.snippets if s.category == category]
            else:
                return [s.text for s in self.snippets]

        except Exception as e:
            logger.error(f"Error getting snippets: {str(e)}")
            return []

    def get_statistics(self):
        """Summary stats: snippet total, per-category counts, embedding dim."""
        try:
            category_counts = self._count_categories()

            embedding_dim = 0
            if self.snippets:
                embedding_dim = len(self.snippets[0].embedding)

            return {
                "total_snippets": len(self.snippets),
                "categories": category_counts,
                "embedding_dimension": embedding_dim
            }

        except Exception as e:
            logger.error(f"Error getting stats: {str(e)}")
            return {"total_snippets": 0, "categories": {}, "embedding_dimension": 0}
core/embeddings.py ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from openai import OpenAI
2
+ import numpy as np
3
+ from typing import List, Dict, Any, Optional, Tuple
4
+ from loguru import logger
5
+ from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
6
+ from dataclasses import dataclass
7
+ import time
8
+
9
+
10
@dataclass
class SimilarityResult:
    """One hit from a cosine-similarity search."""
    index: int                # position of the candidate in the input list
    similarity: float         # cosine similarity to the query
    text: str                 # candidate text ("" when no texts were supplied)
    metadata: Dict[str, Any]  # candidate metadata ({} when none was supplied)
17
+
18
+
19
class EmbeddingService:
    """
    OpenAI embeddings service.

    Handles single and batched embedding creation (with retry/backoff)
    plus cosine-similarity search over candidate embeddings.
    """

    def __init__(self, api_key, model="text-embedding-3-large"):
        """api_key: OpenAI API key; model: embedding model name."""
        try:
            self.client = OpenAI(api_key=api_key)
            self.model = model
            # output dimensions per the OpenAI embeddings documentation:
            # text-embedding-3-large -> 3072, text-embedding-3-small -> 1536.
            # (The previous values 1536/1024 were wrong for both models.)
            if "large" in model:
                self.embedding_dim = 3072
            else:
                self.embedding_dim = 1536
            self.max_tokens = 8191  # per-input token limit for these models

            logger.info(f"EmbeddingService ready (model={model}, dims={self.embedding_dim})")

        except Exception as e:
            logger.error(f"Failed to init EmbeddingService: {str(e)}")
            raise

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=2, max=10),
        retry=retry_if_exception_type(Exception),
        reraise=True
    )
    def create_embedding(self, text):
        """Create an embedding for a single text (with retry).

        Returns the embedding as a list of floats, or None for empty input.
        Raises on API failure after retries are exhausted.
        """
        try:
            # check if text is valid
            if not text or not text.strip():
                logger.warning("Empty text for embedding")
                return None

            # truncate long text (rough estimate: 4 chars per token)
            if len(text) > 30000:
                text = text[:30000]
                logger.warning("Text truncated to 30k chars")

            start_time = time.time()
            response = self.client.embeddings.create(
                model=self.model,
                input=text,
                encoding_format="float"
            )

            embedding = response.data[0].embedding
            elapsed = time.time() - start_time

            logger.debug(f"Created embedding in {elapsed:.2f}s ({len(text)} chars → {len(embedding)} dims)")

            return embedding

        except Exception as e:
            logger.error(f"Error creating embedding: {str(e)}")
            raise

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=2, max=10),
        retry=retry_if_exception_type(Exception),
        reraise=True
    )
    def create_embeddings_batch(self, texts, batch_size=100):
        """Create embeddings for multiple texts in batches.

        Returns a list aligned with texts; failed batches contribute None
        entries. Returns [] for empty input and None on a top-level error.
        """
        try:
            if not texts:
                logger.warning("Empty text list for batch embedding")
                return []

            logger.info(f"Creating embeddings for {len(texts)} texts (batch_size={batch_size})")

            all_embeddings = []

            # process in chunks
            for i in range(0, len(texts), batch_size):
                batch = texts[i:i + batch_size]

                # clean up batch: truncate long texts, replace empties with
                # a single space so positions stay aligned with the input
                processed_batch = []
                for text in batch:
                    if text and text.strip():
                        if len(text) > 30000:
                            processed_batch.append(text[:30000])
                        else:
                            processed_batch.append(text)
                    else:
                        processed_batch.append(" ")  # fallback for empty

                try:
                    start_time = time.time()
                    response = self.client.embeddings.create(
                        model=self.model,
                        input=processed_batch,
                        encoding_format="float"
                    )

                    batch_embeddings = [data.embedding for data in response.data]
                    all_embeddings.extend(batch_embeddings)

                    elapsed = time.time() - start_time
                    batch_num = i//batch_size + 1
                    logger.debug(f"Batch {batch_num}: {len(batch)} texts in {elapsed:.2f}s")

                except Exception as e:
                    logger.error(f"Error in batch {i//batch_size + 1}: {str(e)}")
                    # keep alignment: add None for every text of the failed batch
                    for _ in range(len(batch)):
                        all_embeddings.append(None)

            # count successful embeddings
            successful = sum(1 for e in all_embeddings if e is not None)

            success_rate = (successful / len(texts)) * 100
            logger.success(f"Created {successful}/{len(texts)} embeddings ({success_rate:.1f}% success)")

            return all_embeddings

        except Exception as e:
            logger.error(f"Error in batch embedding: {str(e)}")
            return None

    def cosine_similarity(self, vec1, vec2):
        """Return the cosine similarity of two vectors (0.0 on error)."""
        try:
            v1 = np.array(vec1)
            v2 = np.array(vec2)

            dot_product = np.dot(v1, v2)
            norm1 = np.linalg.norm(v1)
            norm2 = np.linalg.norm(v2)

            # handle zero vectors
            if norm1 == 0 or norm2 == 0:
                logger.warning("Zero vector in cosine similarity")
                return 0.0

            similarity = dot_product / (norm1 * norm2)

            # clip to the mathematically valid range [-1, 1]; numerical error
            # can push the ratio slightly outside it. (Previously clipped to
            # [0, 1], which silently mapped anti-correlated vectors to 0.)
            similarity = float(np.clip(similarity, -1.0, 1.0))

            return similarity

        except Exception as e:
            logger.error(f"Error calculating cosine similarity: {str(e)}")
            return 0.0

    def find_most_similar(self, query_embedding, candidate_embeddings,
                          candidate_texts=None, candidate_metadata=None,
                          top_k=5, min_similarity=0.0):
        """
        Find the candidates most similar to query_embedding.

        Returns up to top_k SimilarityResult objects with similarity >=
        min_similarity, sorted descending; None candidates are skipped.
        """
        try:
            if not query_embedding or not candidate_embeddings:
                logger.warning("Empty embeddings for similarity search")
                return []

            logger.info(f"Finding top {top_k} from {len(candidate_embeddings)} candidates (min={min_similarity})")

            similarities = []

            for idx, candidate in enumerate(candidate_embeddings):
                # skip None embeddings (failed batch entries)
                if candidate is None:
                    continue

                try:
                    similarity = self.cosine_similarity(query_embedding, candidate)

                    # filter by threshold
                    if similarity >= min_similarity:
                        text = ""
                        if candidate_texts:
                            text = candidate_texts[idx]

                        metadata = {}
                        if candidate_metadata:
                            metadata = candidate_metadata[idx]

                        result = SimilarityResult(
                            index=idx,
                            similarity=similarity,
                            text=text,
                            metadata=metadata
                        )
                        similarities.append(result)

                except Exception as e:
                    logger.warning(f"Error computing similarity for idx {idx}: {str(e)}")
                    continue

            # sort by similarity descending and keep the top k
            similarities.sort(key=lambda x: x.similarity, reverse=True)
            top_results = similarities[:top_k]

            if top_results:
                logger.success(f"Found {len(top_results)} results (top: {top_results[0].similarity:.3f})")
            else:
                logger.warning("No results above threshold")

            return top_results

        except Exception as e:
            logger.error(f"Error in find_most_similar: {str(e)}")
            return []

    def embed_documents(self, texts, metadata=None):
        """Embed multiple documents and annotate metadata with embedding info.

        Returns (embeddings, metadata); both empty lists on failure.
        """
        try:
            logger.info(f"Embedding {len(texts)} documents")

            embeddings = self.create_embeddings_batch(texts)

            if embeddings is None:
                logger.error("Batch embedding failed")
                return [], []

            # create metadata if missing
            if metadata is None:
                metadata = [{} for _ in texts]

            # add embedding info to each metadata dict
            for i in range(len(embeddings)):
                embedding = embeddings[i]
                meta = metadata[i]

                if embedding is not None:
                    meta["embedding_dim"] = len(embedding)
                    meta["has_embedding"] = True
                else:
                    meta["has_embedding"] = False

            return embeddings, metadata

        except Exception as e:
            logger.error(f"Error embedding documents: {str(e)}")
            return [], []

    def get_embedding_stats(self, embeddings):
        """Return counts, success rate and norm statistics for embeddings."""
        try:
            valid_embeddings = [e for e in embeddings if e is not None]

            if not valid_embeddings:
                return {
                    "total": len(embeddings),
                    "valid": 0,
                    "invalid": len(embeddings),
                    "success_rate": 0.0
                }

            # calculate vector norms for all valid embeddings
            norms = [np.linalg.norm(e) for e in valid_embeddings]

            stats = {
                "total": len(embeddings),
                "valid": len(valid_embeddings),
                "invalid": len(embeddings) - len(valid_embeddings),
                "success_rate": len(valid_embeddings) / len(embeddings),
                "dimensions": len(valid_embeddings[0]) if valid_embeddings else 0,
                "mean_norm": float(np.mean(norms)),
                "std_norm": float(np.std(norms))
            }

            logger.info(f"Embedding stats: {stats['valid']}/{stats['total']} valid ({stats['success_rate']*100:.1f}%)")

            return stats

        except Exception as e:
            logger.error(f"Error calculating stats: {str(e)}")
            return {}
core/extractor.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from openai import OpenAI
2
+ import json
3
+ from typing import Dict, Any, List
4
+
5
+
6
class ExtractionService:
    """Extracts document parameters via GPT-4 chat completions."""

    def __init__(self, api_key):
        # Dedicated client per service instance; model pinned for determinism.
        self.client = OpenAI(api_key=api_key)
        self.model = "gpt-4-turbo-preview"

    def extract_parameter(self, parameter_name, parameter_description, context, source_location):
        """Ask the model for a single parameter value from free-text context.

        Returns a dict {"value": ..., "source": ...}; on API failure the
        value is None and the source carries the error message.
        """
        prompt = f"""Extract the following parameter from the document:

PARAMETER: {parameter_name}
DESCRIPTION: {parameter_description}

DOCUMENT CONTEXT:
{context}

INSTRUCTIONS:
1. Extract the exact value (no formatting - remove commas, currency symbols)
2. If not found, return null
3. Be precise - no approximations

Return ONLY a JSON object:
{{
    "value": <number or string or null>,
    "source": "{source_location}"
}}"""

        messages = [
            {
                "role": "system",
                "content": "You are an expert at extracting financial data from documents. Extract exact values without any formatting."
            },
            {
                "role": "user",
                "content": prompt
            }
        ]

        try:
            completion = self.client.chat.completions.create(
                model=self.model,
                messages=messages,
                temperature=0.0,
                response_format={"type": "json_object"}
            )
            return json.loads(completion.choices[0].message.content)

        except Exception as e:
            return {
                "value": None,
                "source": f"Error: {str(e)}"
            }

    def extract_from_table(self, parameter_name, tables):
        """Ask the model for a parameter value using extracted tables.

        Only the first 5 tables and the first 20 rows of each are rendered
        (pipe-separated) to keep the prompt bounded.
        TODO: maybe optimize this for large tables
        """
        # Render at most 5 tables as pipe-separated text.
        pieces = []
        for idx, table in enumerate(tables[:5]):
            pieces.append(f"\n[Table {idx+1} - Page {table['page']}]\n")

            headers = table.get("headers", [])
            if headers:
                pieces.append("".join(str(h) + " | " for h in headers) + "\n")

            # Cap each table at 20 rows; falsy cells render as empty columns.
            for row in table.get("rows", [])[:20]:
                pieces.append("".join((str(c) + " | ") if c else " | " for c in row) + "\n")

        tables_text = "".join(pieces)

        prompt = f"""Find '{parameter_name}' in these tables:

{tables_text}

Return JSON:
{{
    "value": <extracted value without formatting>,
    "source": "<table number and location>"
}}"""

        try:
            completion = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "system",
                        "content": "Extract data from tables. Return exact numbers without formatting."
                    },
                    {
                        "role": "user",
                        "content": prompt
                    }
                ],
                temperature=0.0,
                response_format={"type": "json_object"}
            )
            return json.loads(completion.choices[0].message.content)

        except Exception as e:
            return {
                "value": None,
                "source": f"Error: {str(e)}"
            }
core/rag_pipeline.py ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict, Any, Optional, Tuple
2
+ from dataclasses import dataclass
3
+ import numpy as np
4
+ from loguru import logger
5
+ from openai import OpenAI
6
+
7
+ from core.document_parser import ParsedDocument
8
+ from core.embeddings import EmbeddingService
9
+ from core.vision_parser import VisionDocumentParser, VisionExtractionResult
10
+
11
+
12
@dataclass
class ExtractionResult:
    """Result of extracting one parameter from a parsed document."""
    parameter_id: str         # stable identifier of the parameter
    parameter_name: str       # human-readable parameter name
    value: Any                # extracted value; None when not found
    source: str               # where in the document the value was found
    confidence: float         # extraction confidence, 0.0-1.0
    context_used: str         # snippet of the context given to the model
    metadata: Dict[str, Any]  # extraction details (e.g. method "vision"/"rag")
22
+
23
+
24
class EnhancedRAGPipeline:
    """
    RAG pipeline with GPT-4 Vision support.

    For each parameter the pipeline tries vision-based extraction first
    (when enabled and a PDF path is available) and falls back to
    embedding-based RAG; when both succeed, the higher-confidence result
    wins.
    """

    def __init__(self, embedding_service, openai_client, domain_rag=None,
                 top_k=5, similarity_threshold=0.3, model="gpt-4o-mini",
                 vision_model="gpt-4o", temperature=0.0, use_vision=True):
        """
        Args:
            embedding_service: Service used to embed queries and chunks.
            openai_client: Configured OpenAI client.
            domain_rag: Optional domain-knowledge retriever (may be None).
            top_k: Number of chunks retrieved per query.
            similarity_threshold: Minimum similarity for a chunk to be used.
            model: Chat model for text (RAG) extraction.
            vision_model: Model used by the vision parser.
            temperature: Sampling temperature for extraction calls.
            use_vision: Whether the vision path is attempted at all.
        """
        self.embedding_service = embedding_service
        self.client = openai_client
        self.domain_rag = domain_rag
        self.top_k = top_k
        self.similarity_threshold = similarity_threshold
        self.model = model
        self.temperature = temperature
        self.use_vision = use_vision

        # Vision parser is only constructed when requested.
        if use_vision:
            self.vision_parser = VisionDocumentParser(
                openai_client=openai_client,
                model=vision_model
            )
            logger.success("Vision parser ready")
        else:
            self.vision_parser = None
            logger.info("Vision extraction disabled")

    def extract_parameter_with_vision(self, pdf_path, parameter_id,
                                      parameter_name, parameter_description):
        """
        Extract one parameter with GPT-4 Vision (most accurate method).

        Returns an ExtractionResult, or None when vision is disabled,
        the parameter is not found, or an error occurs.
        """
        if not self.use_vision or not self.vision_parser:
            return None

        try:
            logger.info(f"[VISION] Extracting {parameter_name}...")

            # Coarse type hint helps the vision prompt ask for the right shape.
            param_type = self._infer_parameter_type(parameter_id, parameter_description)

            vision_result = self.vision_parser.extract_parameter_from_pdf(
                pdf_path=pdf_path,
                parameter_name=parameter_name,
                parameter_description=parameter_description,
                parameter_type=param_type,
                search_all_pages=True
            )

            if vision_result:
                # Convert the vision parser's result into our common format.
                result = ExtractionResult(
                    parameter_id=parameter_id,
                    parameter_name=parameter_name,
                    value=vision_result.value,
                    source=vision_result.source,
                    confidence=vision_result.confidence,
                    context_used=vision_result.context,
                    metadata={
                        "method": "vision",
                        "page_number": vision_result.page_number,
                        "model": self.vision_parser.model
                    }
                )

                logger.success(f"[VISION] Found: {result.value} (conf: {result.confidence:.2f})")
                return result
            else:
                logger.warning(f"[VISION] Not found: {parameter_name}")
                return None

        except Exception as e:
            logger.error(f"[VISION] Error: {str(e)}")
            return None

    def _infer_parameter_type(self, parameter_id, description):
        """Guess a coarse type ("boolean"/"number"/"date"/"text") from
        keywords in the parameter id and description."""
        param_lower = parameter_id.lower()
        desc_lower = description.lower()

        # Boolean flags are checked first so e.g. "settlement_flag" is not
        # misclassified by a later numeric keyword.
        boolean_keywords = ["accepted", "flag", "status", "yes/no", "true/false",
                            "settlement", "writeoff", "suit", "default"]
        if any(k in param_lower or k in desc_lower for k in boolean_keywords):
            return "boolean"

        numeric_keywords = ["amount", "count", "number", "dpd", "loans",
                            "threshold", "score", "inquiries"]
        if any(k in param_lower or k in desc_lower for k in numeric_keywords):
            return "number"

        if "date" in param_lower or "date" in desc_lower:
            return "date"

        return "text"

    def prepare_document(self, parsed_doc):
        """
        Embed a parsed document's chunks for traditional RAG (fallback).

        Returns (embeddings ndarray, chunk_texts, chunk_metadata), or
        (None, None, None) on failure.
        """
        try:
            chunk_texts = []
            chunk_metadata = []

            for chunk in parsed_doc.chunks:
                chunk_texts.append(chunk.text)
                chunk_metadata.append({
                    "chunk_id": chunk.chunk_id,
                    "page_num": chunk.page_num,
                    "start_char": chunk.start_char,
                    "end_char": chunk.end_char
                })

            embeddings_list = self.embedding_service.create_embeddings_batch(chunk_texts)

            if embeddings_list is None:
                logger.error("Failed to create embeddings")
                return None, None, None

            # numpy array enables the vectorized dot-product retrieval below.
            embeddings = np.array(embeddings_list)

            return embeddings, chunk_texts, chunk_metadata

        except Exception as e:
            logger.error(f"Error preparing document: {str(e)}")
            return None, None, None

    def extract_parameter_full_pipeline(self, parameter_id, parameter_name,
                                        parameter_description, parsed_doc,
                                        chunk_embeddings, chunk_texts,
                                        chunk_metadata, pdf_path=None):
        """
        Full extraction pipeline: vision first, then RAG as fallback.

        Returns the best ExtractionResult, or None when neither method
        finds the parameter.
        """
        try:
            # BUG FIX: vision_result must be bound even when the vision
            # branch is skipped (no pdf_path / vision disabled); previously
            # the comparison below raised NameError in that case.
            vision_result = None

            # Try vision first (best accuracy).
            if pdf_path and self.use_vision:
                vision_result = self.extract_parameter_with_vision(
                    pdf_path=pdf_path,
                    parameter_id=parameter_id,
                    parameter_name=parameter_name,
                    parameter_description=parameter_description
                )

                # A confident vision hit short-circuits the RAG fallback.
                if vision_result and vision_result.confidence >= 0.7:
                    logger.success(f"[PIPELINE] Using VISION result (conf: {vision_result.confidence:.2f})")
                    return vision_result

            # Traditional RAG fallback.
            logger.info(f"[PIPELINE] Trying traditional RAG for {parameter_name}...")
            rag_result = self._extract_with_rag(
                parameter_id=parameter_id,
                parameter_name=parameter_name,
                parameter_description=parameter_description,
                chunk_embeddings=chunk_embeddings,
                chunk_texts=chunk_texts,
                chunk_metadata=chunk_metadata,
                parsed_doc=parsed_doc
            )

            # If both methods produced a result, keep the more confident one.
            if vision_result and rag_result:
                if vision_result.confidence > rag_result.confidence:
                    logger.info(f"[PIPELINE] Vision wins: {vision_result.confidence:.2f} > {rag_result.confidence:.2f}")
                    return vision_result
                else:
                    logger.info(f"[PIPELINE] RAG wins: {rag_result.confidence:.2f} > {vision_result.confidence:.2f}")
                    return rag_result

            # Otherwise return whichever one worked (possibly None).
            if vision_result:
                return vision_result
            return rag_result

        except Exception as e:
            logger.error(f"Error in extraction pipeline: {str(e)}")
            return None

    def _extract_with_rag(self, parameter_id, parameter_name, parameter_description,
                          chunk_embeddings, chunk_texts, chunk_metadata, parsed_doc):
        """Embedding-retrieval + chat-completion extraction (fallback path).

        Returns an ExtractionResult or None (not found / no relevant
        chunks / any error).
        """
        import json

        try:
            # Embed the query built from name + description.
            query = f"{parameter_name}: {parameter_description}"
            query_embedding = self.embedding_service.create_embedding(query)

            if query_embedding is None:
                return None

            # Rank chunks by dot-product similarity; take the top_k.
            similarities = np.dot(chunk_embeddings, query_embedding)
            top_indices = np.argsort(similarities)[::-1][:self.top_k]

            # Keep only chunks above the similarity threshold.
            relevant_chunks = []
            for idx in top_indices:
                if similarities[idx] >= self.similarity_threshold:
                    relevant_chunks.append({
                        "text": chunk_texts[idx],
                        "similarity": float(similarities[idx]),
                        "metadata": chunk_metadata[idx]
                    })

            if not relevant_chunks:
                return None

            # Optionally augment with domain-knowledge snippets.
            domain_context = ""
            if self.domain_rag:
                domain_snippets = self.domain_rag.retrieve(query, top_k=3)
                if domain_snippets:
                    formatted_snippets = []
                    for s in domain_snippets:
                        # Snippets may be plain strings or objects with .text.
                        if isinstance(s, str):
                            formatted_snippets.append(f"- {s}")
                        elif hasattr(s, 'text'):
                            formatted_snippets.append(f"- {s.text}")
                        else:
                            formatted_snippets.append(f"- {str(s)}")
                    domain_context = "\n".join(formatted_snippets)

            # Assemble the retrieved chunks into a single context string.
            context_parts = []
            for i, c in enumerate(relevant_chunks):
                chunk_text = f"[Chunk {i+1}, Page {c['metadata']['page_num']}, Similarity: {c['similarity']:.2f}]\n{c['text']}"
                context_parts.append(chunk_text)
            context_text = "\n\n".join(context_parts)

            prompt = f"""Extract the following parameter from the document context.

Parameter: {parameter_name}
Description: {parameter_description}

"""
            if domain_context:
                prompt += f"Domain Knowledge:\n{domain_context}\n\n"

            prompt += f"""Document Context:
{context_text}

Extract the value and provide the specific source location (e.g., "Account Summary Table, Row 3" not just the filename).

Return JSON:
{{
    "value": <extracted value or null>,
    "source": "<specific section/table/location>",
    "confidence": <0.0-1.0>
}}"""

            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                temperature=self.temperature,
                max_tokens=300
            )

            result_text = response.choices[0].message.content

            # Strip markdown code fences if the model added them.
            json_text = result_text.strip()
            if "```json" in json_text:
                json_text = json_text.split("```json")[1].split("```")[0].strip()
            elif "```" in json_text:
                json_text = json_text.split("```")[1].split("```")[0].strip()

            data = json.loads(json_text)

            if data.get("value") is not None:
                return ExtractionResult(
                    parameter_id=parameter_id,
                    parameter_name=parameter_name,
                    value=data["value"],
                    source=data.get("source", f"Page {relevant_chunks[0]['metadata']['page_num']}"),
                    confidence=float(data.get("confidence", 0.5)),
                    context_used=context_text[:200],
                    metadata={"method": "rag", "chunks_used": len(relevant_chunks)}
                )

            return None

        except Exception as e:
            logger.error(f"RAG extraction error: {str(e)}")
            return None

    def calculate_overall_confidence(self, results):
        """Overall score: 70% average confidence of successful extractions
        plus 30% success rate; 0.0 when nothing was extracted."""
        if not results:
            return 0.0

        successful = [r for r in results if r.value is not None]
        if not successful:
            return 0.0

        avg_confidence = sum(r.confidence for r in successful) / len(successful)
        success_rate = len(successful) / len(results)

        overall = (avg_confidence * 0.7) + (success_rate * 0.3)
        return round(overall, 2)
core/vision_parser.py ADDED
@@ -0,0 +1,620 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import base64
3
+ import io
4
+ import json # For parsing Vision API responses
5
+ from pathlib import Path
6
+ from typing import Dict, Any, List, Optional, Tuple
7
+ from dataclasses import dataclass
8
+ from loguru import logger
9
+ from PIL import Image
10
+ import fitz # PyMuPDF - Pure Python, no poppler needed!
11
+ from openai import OpenAI
12
+
13
+
14
@dataclass
class VisionExtractionResult:
    """Result from vision-based extraction of a single parameter."""
    parameter_id: str
    parameter_name: str
    value: Any          # extracted value as reported by the model
    source: str         # Specific section/location
    page_number: int    # 1-based page the value was found on
    confidence: float   # model-reported confidence, 0.0-1.0
    context: str        # Surrounding text/context
24
+
25
+
26
+ class VisionDocumentParser:
27
+
28
+ def __init__(self, openai_client: OpenAI, model: str = "gpt-4o"):
29
+
30
+ self.client = openai_client
31
+ self.model = model
32
+ self._image_cache = {} # Cache converted images by PDF path
33
+ logger.info(f"VisionDocumentParser initialized with model: {model}")
34
+
35
+
36
+ def pdf_to_images(self, pdf_path: str, dpi: int = 200) -> List[Image.Image]:
37
+
38
+ try:
39
+ # Check cache first - ONLY OPTIMIZATION!
40
+ cache_key = f"{pdf_path}_{dpi}"
41
+ if cache_key in self._image_cache:
42
+ logger.info(f"✅ Using CACHED images for: {Path(pdf_path).name} (skipping conversion)")
43
+ return self._image_cache[cache_key]
44
+
45
+ logger.info(f"Converting PDF to images: {Path(pdf_path).name} (DPI: {dpi})")
46
+
47
+ # Open PDF with PyMuPDF
48
+ doc = fitz.open(pdf_path)
49
+ images = []
50
+
51
+ # Convert each page to image
52
+ for page_num in range(len(doc)):
53
+ page = doc[page_num]
54
+
55
+ # Calculate zoom factor for DPI
56
+ # 72 DPI is default, so zoom = target_dpi / 72
57
+ zoom = dpi / 72
58
+ mat = fitz.Matrix(zoom, zoom)
59
+
60
+ # Render page to pixmap
61
+ pix = page.get_pixmap(matrix=mat)
62
+
63
+ # Convert pixmap to PIL Image
64
+ img_data = pix.tobytes("png")
65
+ img = Image.open(io.BytesIO(img_data))
66
+
67
+ images.append(img)
68
+
69
+ doc.close()
70
+
71
+ # Cache for reuse - ONLY OPTIMIZATION!
72
+ self._image_cache[cache_key] = images
73
+
74
+ logger.success(f"Converted {len(images)} pages to images (PyMuPDF) - CACHED for reuse ✅")
75
+ return images
76
+
77
+ except Exception as e:
78
+ logger.error(f"Error converting PDF to images: {str(e)}")
79
+ return []
80
+
81
+
82
+ def image_to_base64(self, image: Image.Image) -> str:
83
+
84
+ try:
85
+ buffered = io.BytesIO()
86
+ image.save(buffered, format="PNG")
87
+ img_str = base64.b64encode(buffered.getvalue()).decode()
88
+ return img_str
89
+
90
+ except Exception as e:
91
+ logger.error(f"Error encoding image: {str(e)}")
92
+ return ""
93
+
94
+
95
    def extract_all_parameters_from_page(
        self,
        image: Image.Image,
        page_num: int,
        parameters: List[Dict[str, str]]
    ) -> Dict[str, VisionExtractionResult]:
        """Extract every listed parameter from one page image in a single
        GPT-4 Vision call.

        Args:
            image: Rendered page image.
            page_num: 1-based page number (used for logging and fallback source).
            parameters: Dicts with "id", "name", "description" and optional
                "type" ("boolean" | "number" | "date" | "text").

        Returns:
            Mapping of parameter id -> VisionExtractionResult for the
            parameters the model reported as found; {} on any error.
        """
        try:
            # Build comprehensive prompt for ALL parameters
            param_descriptions = []
            for i, param in enumerate(parameters, 1):
                param_type = param.get('type', 'text')
                # Human-readable hint steering the model toward the right data type.
                type_hint = {
                    'boolean': '(true/false)',
                    'number': '(numeric value)',
                    'date': '(date format)',
                    'text': '(text value)'
                }.get(param_type, '')

                param_descriptions.append(
                    f"{i}. **{param['name']}** {type_hint}: {param['description']}"
                )

            params_text = "\n".join(param_descriptions)

            prompt = f"""Analyze this document page and extract ALL of the following parameters that you can find:

{params_text}

IMPORTANT INSTRUCTIONS:
1. Return a JSON object with ONLY the parameters you found on this page
2. For each parameter found, provide:
   - "value": The actual value (use correct data type: number, boolean, string, or null)
   - "source": SPECIFIC location (e.g., "Account Summary Table - Settlement column, Row 2")
   - "confidence": Your confidence level (0.0 to 1.0)
   - "context": Brief surrounding text for verification

3. Skip parameters not visible on this page (don't include them in response)
4. Be precise with sources - include table names, section headers, row/column identifiers
5. For booleans, return true/false, NOT "yes"/"no" or 1/0

Return ONLY valid JSON, no markdown formatting:
{{
    "parameter_id_1": {{
        "found": true,
        "value": <actual_value>,
        "source": "Specific location with details",
        "confidence": 0.95,
        "context": "Surrounding text..."
    }},
    "parameter_id_2": {{
        "found": true,
        "value": <actual_value>,
        "source": "Another specific location",
        "confidence": 0.90,
        "context": "More context..."
    }}
}}

Parameter IDs to use: {', '.join([p['id'] for p in parameters])}"""

            # Convert image to base64
            buffered = io.BytesIO()
            image.save(buffered, format="PNG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode()

            # Single API call for ALL parameters!
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{img_base64}"
                                }
                            },
                            {
                                "type": "text",
                                "text": prompt
                            }
                        ]
                    }
                ],
                max_tokens=2000,
                temperature=0.0
            )

            # Parse response
            content = response.choices[0].message.content.strip()

            # Remove markdown if present (the model sometimes wraps JSON in fences)
            if content.startswith("```json"):
                content = content[7:]
            if content.startswith("```"):
                content = content[3:]
            if content.endswith("```"):
                content = content[:-3]
            content = content.strip()

            # Parse JSON
            results_dict = json.loads(content)

            # Create mapping of param_id to param_name for lookup
            param_name_map = {p['id']: p['name'] for p in parameters}

            # Convert to VisionExtractionResult objects; only entries the
            # model marked "found" are kept.
            extraction_results = {}
            for param_id, result_data in results_dict.items():
                if result_data.get('found', False):
                    extraction_results[param_id] = VisionExtractionResult(
                        parameter_id=param_id,
                        parameter_name=param_name_map.get(param_id, param_id),  # Get name from map
                        value=result_data.get('value'),
                        source=result_data.get('source', f'Page {page_num}'),
                        page_number=page_num,
                        confidence=result_data.get('confidence', 0.7),
                        context=result_data.get('context', '')
                    )

            logger.success(
                f"Page {page_num}: Found {len(extraction_results)}/{len(parameters)} parameters "
                f"in ONE call ⚡"
            )

            return extraction_results

        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse JSON from page {page_num}: {str(e)}")
            return {}
        except Exception as e:
            logger.error(f"Error extracting from page {page_num}: {str(e)}")
            return {}
230
+
231
+ def extract_all_parameters_batch(
232
+ self,
233
+ pdf_path: str,
234
+ parameters: List[Dict[str, str]]
235
+ ) -> Dict[str, VisionExtractionResult]:
236
+
237
+ try:
238
+ logger.info(
239
+ f"⚡ BATCH EXTRACTION: Processing {len(parameters)} parameters "
240
+ f"from {Path(pdf_path).name}"
241
+ )
242
+
243
+ # Convert PDF to images (uses cache!)
244
+ images = self.pdf_to_images(pdf_path, dpi=200)
245
+ if not images:
246
+ logger.error("Failed to convert PDF to images")
247
+ return {}
248
+
249
+ # Store best result for each parameter
250
+ best_results = {}
251
+
252
+ # Process each page once, extracting ALL parameters
253
+ for page_num, image in enumerate(images, start=1):
254
+ logger.info(f"⚡ Page {page_num}/{len(images)}: Extracting ALL parameters...")
255
+
256
+ # Extract all parameters from this page in ONE call!
257
+ page_results = self.extract_all_parameters_from_page(
258
+ image=image,
259
+ page_num=page_num,
260
+ parameters=parameters
261
+ )
262
+
263
+ # Update best results (keep highest confidence for each parameter)
264
+ for param_id, result in page_results.items():
265
+ if param_id not in best_results:
266
+ best_results[param_id] = result
267
+ logger.info(f" ✓ {param_id}: {result.value} (conf: {result.confidence})")
268
+ elif result.confidence > best_results[param_id].confidence:
269
+ logger.info(
270
+ f" ↑ {param_id}: {result.value} (conf: {result.confidence}) "
271
+ f"[better than {best_results[param_id].confidence}]"
272
+ )
273
+ best_results[param_id] = result
274
+
275
+ found_count = len(best_results)
276
+ logger.success(
277
+ f"⚡ BATCH COMPLETE: Found {found_count}/{len(parameters)} parameters "
278
+ f"in {len(images)} API calls (vs {len(parameters) * len(images)} with old method!)"
279
+ )
280
+
281
+ return best_results
282
+
283
+ except Exception as e:
284
+ logger.error(f"Error in batch extraction: {str(e)}")
285
+ return {}
286
+
287
+ def extract_parameter_from_page(
288
+ self,
289
+ image: Image.Image,
290
+ page_num: int,
291
+ parameter_name: str,
292
+ parameter_description: str,
293
+ parameter_type: str = "text"
294
+ ) -> Optional[VisionExtractionResult]:
295
+
296
+ try:
297
+ # Convert image to base64
298
+ img_base64 = self.image_to_base64(image)
299
+ if not img_base64:
300
+ return None
301
+
302
+ # Build prompt based on parameter type
303
+ prompt = self._build_extraction_prompt(
304
+ parameter_name,
305
+ parameter_description,
306
+ parameter_type
307
+ )
308
+
309
+ # Call GPT-4 Vision
310
+ response = self.client.chat.completions.create(
311
+ model=self.model,
312
+ messages=[
313
+ {
314
+ "role": "user",
315
+ "content": [
316
+ {
317
+ "type": "text",
318
+ "text": prompt
319
+ },
320
+ {
321
+ "type": "image_url",
322
+ "image_url": {
323
+ "url": f"data:image/png;base64,{img_base64}",
324
+ "detail": "high"
325
+ }
326
+ }
327
+ ]
328
+ }
329
+ ],
330
+ max_tokens=500,
331
+ temperature=0.0 # Deterministic for data extraction
332
+ )
333
+
334
+ # Parse response
335
+ result_text = response.choices[0].message.content
336
+
337
+ # Parse structured response
338
+ return self._parse_vision_response(
339
+ result_text,
340
+ parameter_name,
341
+ page_num
342
+ )
343
+
344
+ except Exception as e:
345
+ logger.error(f"Error extracting {parameter_name} from page {page_num}: {str(e)}")
346
+ return None
347
+
348
+
349
    def _build_extraction_prompt(
        self,
        parameter_name: str,
        parameter_description: str,
        parameter_type: str
    ) -> str:
        """Build the single-parameter extraction prompt for GPT-4 Vision.

        Args:
            parameter_name: Display name of the parameter to find.
            parameter_description: What the parameter means / where it may appear.
            parameter_type: Expected type hint ("boolean"/"number"/"date"/"text").

        Returns:
            Prompt text instructing the model to reply with a single JSON
            object ({found, value, source, confidence, context}).
        """

        prompt = f"""You are analyzing a financial document (Bureau Credit Report or GST Return).

**TASK:** Extract the following parameter from this document page.

**Parameter Name:** {parameter_name}
**Description:** {parameter_description}
**Expected Type:** {parameter_type}

**INSTRUCTIONS:**
1. Look for this parameter in the document
2. If found, extract the exact value
3. Note the specific section/location where you found it (e.g., "Account Summary Table, Row 3" or "DPD History Section")
4. Provide surrounding context (nearby text)

**OUTPUT FORMAT (JSON):**
{{
    "found": true/false,
    "value": <extracted value or null>,
    "source": "<specific section/table/location>",
    "confidence": <0.0-1.0>,
    "context": "<surrounding text for verification>"
}}

**EXAMPLES:**

For "DPD 30 Days" in a credit report:
{{
    "found": true,
    "value": 2,
    "source": "Payment History Table - DPD 30 Days column",
    "confidence": 0.95,
    "context": "DPD History: 0-30 days: 2 occurrences"
}}

For "Settlement/Write-off" flag:
{{
    "found": true,
    "value": false,
    "source": "Account Status Summary - Settlement Status field",
    "confidence": 0.90,
    "context": "Settlement Status: Not Applicable, Write-off Status: No"
}}

If parameter not found on this page:
{{
    "found": false,
    "value": null,
    "source": "Not found on this page",
    "confidence": 0.0,
    "context": ""
}}

**CRITICAL RULES:**
- Be precise with locations (section names, table names, row/column)
- Extract EXACT values, don't interpret
- For boolean parameters, return true/false
- For numeric parameters, return numbers (not strings)
- If unsure, set confidence < 0.7
- Return ONLY valid JSON, no other text

Now analyze the document image and extract the parameter:"""

        return prompt
420
+
421
+
422
+ def _parse_vision_response(
423
+ self,
424
+ response_text: str,
425
+ parameter_id: str,
426
+ page_num: int
427
+ ) -> Optional[VisionExtractionResult]:
428
+ """Parse GPT-4 Vision response into structured result"""
429
+ try:
430
+ import json
431
+
432
+ # Extract JSON from response (handle markdown code blocks)
433
+ json_text = response_text.strip()
434
+ if "```json" in json_text:
435
+ json_text = json_text.split("```json")[1].split("```")[0].strip()
436
+ elif "```" in json_text:
437
+ json_text = json_text.split("```")[1].split("```")[0].strip()
438
+
439
+ # Parse JSON
440
+ data = json.loads(json_text)
441
+
442
+ # Check if found
443
+ if not data.get("found", False):
444
+ return None
445
+
446
+ # Build result
447
+ result = VisionExtractionResult(
448
+ parameter_id=parameter_id,
449
+ parameter_name=parameter_id.replace("_", " ").title(),
450
+ value=data.get("value"),
451
+ source=data.get("source", "Unknown location"),
452
+ page_number=page_num,
453
+ confidence=float(data.get("confidence", 0.5)),
454
+ context=data.get("context", "")
455
+ )
456
+
457
+ return result
458
+
459
+ except Exception as e:
460
+ logger.error(f"Error parsing vision response: {str(e)}")
461
+ logger.debug(f"Response text: {response_text}")
462
+ return None
463
+
464
+
465
+ def extract_parameter_from_pdf(
466
+ self,
467
+ pdf_path: str,
468
+ parameter_name: str,
469
+ parameter_description: str,
470
+ parameter_type: str = "text",
471
+ search_all_pages: bool = True # Search all pages for best accuracy
472
+ ) -> Optional[VisionExtractionResult]:
473
+
474
+ try:
475
+ logger.info(f"Extracting '{parameter_name}' from {Path(pdf_path).name}")
476
+
477
+ # Convert PDF to images (uses cache if already converted! - ONLY OPTIMIZATION)
478
+ images = self.pdf_to_images(pdf_path, dpi=200)
479
+ if not images:
480
+ logger.error("Failed to convert PDF to images")
481
+ return None
482
+
483
+ # Search pages
484
+ results = []
485
+
486
+ for page_num, image in enumerate(images, start=1):
487
+ logger.info(f"Searching page {page_num}/{len(images)}...")
488
+
489
+ result = self.extract_parameter_from_page(
490
+ image=image,
491
+ page_num=page_num,
492
+ parameter_name=parameter_name,
493
+ parameter_description=parameter_description,
494
+ parameter_type=parameter_type
495
+ )
496
+
497
+ if result and result.value is not None:
498
+ logger.success(f"Found on page {page_num}: {result.value} (confidence: {result.confidence})")
499
+ results.append(result)
500
+
501
+ # Stop if we found a good match and not searching all pages
502
+ if not search_all_pages and result.confidence > 0.7:
503
+ break
504
+
505
+ # Return best result
506
+ if results:
507
+ best_result = max(results, key=lambda r: r.confidence)
508
+ logger.success(f"Best match: page {best_result.page_number}, confidence {best_result.confidence}")
509
+ return best_result
510
+ else:
511
+ logger.warning(f"Parameter '{parameter_name}' not found in document")
512
+ return None
513
+
514
+ except Exception as e:
515
+ logger.error(f"Error extracting parameter from PDF: {str(e)}")
516
+ return None
517
+
518
+
519
+ def extract_gst_sales_with_vision(
520
+ self,
521
+ pdf_path: str
522
+ ) -> Optional[Dict[str, Any]]:
523
+
524
+ try:
525
+ logger.info(f"Extracting GST sales from {Path(pdf_path).name}")
526
+
527
+ # Convert PDF to images
528
+ images = self.pdf_to_images(pdf_path)
529
+ if not images:
530
+ return None
531
+
532
+ # Prompt for GST sales
533
+ prompt = """You are analyzing a GSTR-3B (GST Return) document.
534
+
535
+ **TASK:** Extract the total taxable sales value from Table 3.1(a).
536
+
537
+ **WHAT TO LOOK FOR:**
538
+ - Table 3.1(a): "Details of Outward Supplies and inward supplies liable to reverse charge"
539
+ - Look for "Taxable value" or "Total Taxable value"
540
+ - This is usually in the first row of Table 3.1
541
+
542
+ **OUTPUT FORMAT (JSON):**
543
+ {{
544
+ "found": true/false,
545
+ "month": "<month and year, e.g., January 2025>",
546
+ "sales": <numeric value>,
547
+ "source": "GSTR-3B Table 3.1(a)",
548
+ "confidence": <0.0-1.0>
549
+ }}
550
+
551
+ **EXAMPLE:**
552
+ {{
553
+ "found": true,
554
+ "month": "January 2025",
555
+ "sales": 951381,
556
+ "source": "GSTR-3B Table 3.1(a) - Taxable outward supplies",
557
+ "confidence": 0.95
558
+ }}
559
+
560
+ Return ONLY valid JSON, no other text."""
561
+
562
+ # Try each page
563
+ for page_num, image in enumerate(images, start=1):
564
+ try:
565
+ img_base64 = self.image_to_base64(image)
566
+
567
+ response = self.client.chat.completions.create(
568
+ model=self.model,
569
+ messages=[
570
+ {
571
+ "role": "user",
572
+ "content": [
573
+ {"type": "text", "text": prompt},
574
+ {
575
+ "type": "image_url",
576
+ "image_url": {
577
+ "url": f"data:image/png;base64,{img_base64}",
578
+ "detail": "high"
579
+ }
580
+ }
581
+ ]
582
+ }
583
+ ],
584
+ max_tokens=300,
585
+ temperature=0.0
586
+ )
587
+
588
+ result_text = response.choices[0].message.content
589
+
590
+ # Parse JSON
591
+ import json
592
+ json_text = result_text.strip()
593
+ if "```json" in json_text:
594
+ json_text = json_text.split("```json")[1].split("```")[0].strip()
595
+ elif "```" in json_text:
596
+ json_text = json_text.split("```")[1].split("```")[0].strip()
597
+
598
+ data = json.loads(json_text)
599
+
600
+ if data.get("found") and data.get("sales"):
601
+ logger.success(f"Found GST sales on page {page_num}: {data['sales']}")
602
+ return {
603
+ "month": data.get("month", "Unknown"),
604
+ "sales": data["sales"],
605
+ "source": data.get("source", "GSTR-3B Table 3.1(a)")
606
+ }
607
+
608
+ except Exception as e:
609
+ logger.debug(f"Page {page_num} - no sales data: {str(e)}")
610
+ continue
611
+
612
+ logger.warning("GST sales not found in document")
613
+ return None
614
+
615
+ except Exception as e:
616
+ logger.error(f"Error extracting GST sales: {str(e)}")
617
+ return None
618
+
619
+
620
+