Upload 8 files
- core/document_parser.py +343 -0
- core/domain_knowledge.py +89 -0
- core/domain_rag.py +227 -0
- core/embeddings.py +311 -0
- core/extractor.py +141 -0
- core/rag_pipeline.py +348 -0
- core/vision_parser.py +620 -0
core/document_parser.py
ADDED
@@ -0,0 +1,343 @@
import pdfplumber
import re
from pathlib import Path
from typing import Dict, Any, Optional, List, Tuple
from dataclasses import dataclass
from loguru import logger


@dataclass
class DocumentChunk:
    """chunk of text from document"""
    chunk_id: str
    text: str
    page_num: int
    start_char: int
    end_char: int
    metadata: Dict[str, Any]


@dataclass
class ParsedDocument:
    """parsed document data"""
    file_name: str
    total_pages: int
    text_content: str
    pages: List[Dict[str, Any]]
    tables: List[Dict[str, Any]]
    chunks: List[DocumentChunk]
    metadata: Dict[str, Any]


class DocumentParser:
    """PDF parser with chunking for RAG"""

    def __init__(self, chunk_size=1000, chunk_overlap=200):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        logger.info(f"Parser initialized - chunk_size={chunk_size}, overlap={chunk_overlap}")

    def parse_pdf(self, pdf_path):
        """
        parse PDF and extract content
        """
        logger.info(f"Parsing: {Path(pdf_path).name}")

        try:
            with pdfplumber.open(pdf_path) as pdf:
                all_text = []
                pages_data = []
                tables_data = []

                # go through each page
                for page_num, page in enumerate(pdf.pages, start=1):
                    try:
                        page_result = self._parse_page(page, page_num)

                        all_text.append(page_result["text"])
                        pages_data.append(page_result["page_data"])
                        tables_data.extend(page_result["tables"])

                        logger.debug(f"Page {page_num}: {len(page_result['text'])} chars, {len(page_result['tables'])} tables")

                    except Exception as e:
                        logger.error(f"Error on page {page_num}: {str(e)}")
                        continue  # skip problematic pages

                full_text = "\n\n".join(all_text)

                # create chunks for embeddings
                chunks = self._create_chunks(full_text, Path(pdf_path).name)

                metadata = {
                    "file_path": pdf_path,
                    "file_name": Path(pdf_path).name,
                    "total_pages": len(pdf.pages),
                    "total_tables": len(tables_data),
                    "total_chunks": len(chunks),
                    "text_length": len(full_text)
                }

                parsed_doc = ParsedDocument(
                    file_name=Path(pdf_path).name,
                    total_pages=len(pdf.pages),
                    text_content=full_text,
                    pages=pages_data,
                    tables=tables_data,
                    chunks=chunks,
                    metadata=metadata
                )

                logger.success(f"Parsed {len(pdf.pages)} pages, {len(tables_data)} tables, {len(chunks)} chunks")

                return parsed_doc

        except FileNotFoundError:
            logger.error(f"File not found: {pdf_path}")
            return None
        except Exception as e:
            logger.error(f"Failed to parse {pdf_path}: {str(e)}")
            return None

    def _parse_page(self, page, page_num):
        """parse single page"""
        try:
            # grab text
            page_text = page.extract_text()
            if page_text is None:
                page_text = ""

            # extract tables
            tables = []
            raw_tables = page.extract_tables()

            for table_idx, table in enumerate(raw_tables):
                if table and len(table) > 0:
                    try:
                        table_data = {
                            "page": page_num,
                            "table_id": f"p{page_num}_t{table_idx + 1}",
                            "headers": table[0] if table else [],
                            "rows": table[1:] if len(table) > 1 else [],
                            "raw_data": table
                        }
                        tables.append(table_data)
                    except Exception as e:
                        logger.warning(f"Table {table_idx} error on page {page_num}: {str(e)}")

            page_data = {
                "page_num": page_num,
                "text": page_text,
                "text_length": len(page_text),
                "tables_count": len(tables),
                "width": page.width,
                "height": page.height
            }

            return {
                "text": page_text,
                "tables": tables,
                "page_data": page_data
            }

        except Exception as e:
            logger.error(f"_parse_page error for page {page_num}: {str(e)}")
            return {
                "text": "",
                "tables": [],
                "page_data": {
                    "page_num": page_num,
                    "text": "",
                    "text_length": 0,
                    "tables_count": 0
                }
            }

    def _create_chunks(self, text, file_name):
        """
        break text into chunks with overlap
        TODO: maybe improve the chunking logic later
        """
        try:
            chunks = []

            if not text:
                logger.warning("Empty text for chunking")
                return chunks

            # split by paragraphs
            paragraphs = text.split('\n\n')

            current_chunk = ""
            current_start = 0
            chunk_id = 0

            for para in paragraphs:
                para = para.strip()
                if not para:
                    continue

                # check if adding para exceeds size
                if len(current_chunk) + len(para) > self.chunk_size and current_chunk:
                    # save chunk
                    chunk = DocumentChunk(
                        chunk_id=f"chunk_{chunk_id}",
                        text=current_chunk.strip(),
                        page_num=0,  # not tracking page num for now
                        start_char=current_start,
                        end_char=current_start + len(current_chunk),
                        metadata={
                            "source_file": file_name,
                            "chunk_length": len(current_chunk)
                        }
                    )
                    chunks.append(chunk)
                    chunk_id += 1

                    # start new chunk with overlap
                    if len(current_chunk) > self.chunk_overlap:
                        overlap_text = current_chunk[-self.chunk_overlap:]
                    else:
                        overlap_text = current_chunk
                    current_start = current_start + len(current_chunk) - len(overlap_text)
                    current_chunk = overlap_text + "\n\n" + para
                else:
                    # add to current chunk
                    if current_chunk:
                        current_chunk += "\n\n" + para
                    else:
                        current_chunk = para

            # add final chunk
            if current_chunk:
                chunk = DocumentChunk(
                    chunk_id=f"chunk_{chunk_id}",
                    text=current_chunk.strip(),
                    page_num=0,
                    start_char=current_start,
                    end_char=current_start + len(current_chunk),
                    metadata={
                        "source_file": file_name,
                        "chunk_length": len(current_chunk)
                    }
                )
                chunks.append(chunk)

            logger.info(f"Created {len(chunks)} chunks")
            return chunks

        except Exception as e:
            logger.error(f"Chunking error: {str(e)}")
            return []

    def extract_bureau_score(self, parsed_doc):
        """
        grab CIBIL score from CRIF report
        looks for pattern like "PERFORM CONSUMER 2.2 300-900 627"
        """
        try:
            text = parsed_doc.text_content

            # main pattern - score after range
            pattern = r'PERFORM\s+CONSUMER.*?300-900\s+(\d{3})'
            match = re.search(pattern, text, re.IGNORECASE)

            if match:
                score = int(match.group(1))
                if 300 <= score <= 900:
                    logger.info(f"Found bureau score: {score}")
                    return {
                        "value": score,
                        "source": "CRIF Report – Score Section"
                    }

            # fallback - check first couple pages
            for page in parsed_doc.pages[:2]:
                page_text = page["text"]
                numbers = re.findall(r'\b(\d{3})\b', page_text)

                for num_str in numbers:
                    num = int(num_str)
                    if 300 <= num <= 900:
                        # check if it's actually a score
                        idx = page_text.find(num_str)
                        context = page_text[max(0, idx-100):idx+100]

                        keywords = ['score', 'cibil', 'credit', 'bureau']
                        if any(kw in context.lower() for kw in keywords):
                            logger.info(f"Found score (fallback): {num}")
                            return {
                                "value": num,
                                "source": f"CRIF Report – Page {page['page_num']}"
                            }

            logger.warning("Bureau score not found")
            return None

        except Exception as e:
            logger.error(f"Error extracting bureau score: {str(e)}")
            return None

    def extract_gst_sales(self, parsed_doc):
        """extract sales from GSTR-3B table"""
        try:
            text = parsed_doc.text_content
            filename = parsed_doc.file_name

            # get month from document
            month_match = re.search(r'Period\s+(\w+)', text)
            month_name = month_match.group(1) if month_match else "Unknown"

            # extract year from filename (GSTR3B_..._012025.pdf format)
            filename_year_match = re.search(r'_(\d{2})(\d{4})\.pdf', filename)
            if filename_year_match:
                year = filename_year_match.group(2)
            else:
                # fallback
                year_match = re.search(r'Year\s+(\d{4})', text)
                year = year_match.group(1) if year_match else "2025"

            formatted_month = f"{month_name} {year}"

            # search tables for sales
            for table in parsed_doc.tables:
                rows = table.get("rows", [])

                for row in rows:
                    if row and len(row) > 1:
                        first_cell = str(row[0]).replace('\n', ' ')

                        # find row (a) with outward supplies
                        if "(a)" in first_cell and "Outward taxable supplies" in first_cell:
                            if len(row) > 1 and row[1]:
                                value_str = str(row[1])
                                clean_value = re.sub(r'[^\d.]', '', value_str)

                                if clean_value:
                                    try:
                                        sales = float(clean_value)
                                        logger.info(f"GST sales: {sales} for {formatted_month}")
                                        return {
                                            "month": formatted_month,
                                            "sales": sales,
                                            "source": "GSTR-3B Table 3.1(a)"
                                        }
                                    except ValueError as e:
                                        logger.warning(f"Couldn't parse sales value '{clean_value}': {str(e)}")

            logger.warning(f"Sales data not found for {formatted_month}")
            return None

        except Exception as e:
            logger.error(f"Error extracting GST sales: {str(e)}")
            return None

    def get_chunks_text(self, chunks):
        """get text from chunks for embedding"""
        try:
            return [chunk.text for chunk in chunks]
        except Exception as e:
            logger.error(f"Error getting chunks text: {str(e)}")
            return []
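
Below is a minimal usage sketch of the parser above; the PDF path is a hypothetical example, and the calls mirror the methods defined in this file.

# a minimal sketch, assuming a CRIF or GSTR-3B PDF at this illustrative path
from core.document_parser import DocumentParser

parser = DocumentParser(chunk_size=1000, chunk_overlap=200)
doc = parser.parse_pdf("data/sample_report.pdf")  # hypothetical file
if doc is not None:
    print(doc.metadata)                          # page/table/chunk counts
    texts = parser.get_chunks_text(doc.chunks)   # chunk texts ready for embedding
    score = parser.extract_bureau_score(doc)     # None unless a CRIF score pattern matches
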
core/domain_knowledge.py
ADDED
@@ -0,0 +1,89 @@
BUREAU_TERMINOLOGY = [
    "Bureau credit score: Numerical representation of creditworthiness ranging from 300 (poor) to 900 (excellent). Higher scores indicate better credit history and lower risk.",
    "DPD (Days Past Due): Number of days a payment is overdue beyond the due date. Common thresholds monitored are 30+, 60+, and 90+ DPD.",
    "30+ DPD: Count of accounts with payments overdue by 30 or more days in the specified monitoring period. Indicates early stage delinquency.",
    "60+ DPD: Count of accounts with payments overdue by 60 or more days in the specified monitoring period. Indicates moderate delinquency.",
    "90+ DPD: Count of accounts with payments overdue by 90 or more days in the specified monitoring period. Indicates serious delinquency.",
    "Settlement: Debt resolved by borrower paying less than the full amount owed, typically after negotiation with creditor. Marked negatively on credit report.",
    "Write-off: Debt declared unrecoverable by lender and removed from active accounts. Severely impacts credit score and indicates non-payment.",
    "NTC (No-Track-Case): Credit applicants with insufficient credit history or no previous credit accounts in bureau database. Also called 'New to Credit'.",
    "Suit Filed: Legal action initiated by creditor for debt recovery through courts. Indicates serious delinquency and unwillingness to pay.",
    "Wilful Default: Deliberate non-payment of debt despite having the financial ability to pay. Considered fraudulent behavior and severely impacts creditworthiness.",
    "Live PL/BL: Active Personal Loan or Business Loan currently being serviced by the borrower with regular payments.",
    "Overdue amount: Total unpaid amount across all accounts that is past the due date. Sum of all overdue balances.",
    "Credit inquiry: Request made by lender to check credit report when applicant applies for credit. Too many inquiries indicate credit hunger.",
    "Active loans: Loans currently being serviced by borrower, not yet closed or settled. Indicates current credit obligations.",
    "Loan exposure: Total outstanding amount across all loans. Also called total debt or credit exposure.",
]

# GST and GSTR-3B Terminology
GST_TERMINOLOGY = [
    "GSTR-3B: Monthly return filing summarizing outward supplies, input tax credit claimed, and net tax liability for the tax period.",
    "Table 3.1(a): Section in GSTR-3B reporting outward taxable supplies (other than zero rated, nil rated and exempted). This is the main sales figure.",
    "Outward supplies: Goods or services provided by the registered GST taxpayer to customers. This is the sales/revenue of the business.",
    "Taxable supplies: Supplies on which GST is levied at applicable rates (5%, 12%, 18%, or 28%). Excludes exempted and nil-rated supplies.",
    "Taxable value: The base value on which GST is calculated, excluding the GST amount itself. This is the pre-tax revenue.",
    "Outward taxable supplies: Sales of goods/services on which GST is applicable. Found in GSTR-3B Table 3.1, row (a).",
    "GSTR-3B structure: Contains multiple tables - Table 3.1 for outward supplies, Table 3.2 for inter-state supplies, Table 4 for input tax credit.",
    "Tax period: The month and year for which the GST return is filed. Format is usually 'Month YYYY' (e.g., January 2025).",
    "GSTIN: GST Identification Number, unique 15-digit alphanumeric code assigned to each registered taxpayer.",
]

# Validation and Business Rules
VALIDATION_RULES = [
    "Valid bureau credit scores: Must be between 300 and 900 inclusive. Scores outside this range are invalid.",
    "Credit score interpretation: 300-579 is Poor, 580-669 is Fair, 670-739 is Good, 740-799 is Very Good, 800-900 is Excellent.",
    "DPD hierarchy rule: 90+ DPD count ≤ 60+ DPD count ≤ 30+ DPD count. If this is violated, data may be incorrect.",
    "GST sales validation: Taxable value should be non-negative numbers. Negative sales indicate data entry error.",
    "Suspicious GST amounts: Values over 10 crore (100,000,000 rupees) should be flagged for verification as potentially incorrect.",
    "Written-off debt amount: Should be non-negative. Negative values indicate error in extraction or data.",
    "Loan counts validation: Max loans and max active loans should be non-negative integers. Cannot have negative loan counts.",
    "Overdue threshold: Maximum allowable overdue amount, typically ranging from 0 to several lakhs. Depends on risk appetite.",
    "Credit inquiry limits: Excessive inquiries (>5 in 6 months) indicate credit hunger and should be flagged.",
    "Zero values interpretation: Zero or null values may indicate either absence of the attribute or that the parameter is not applicable.",
]

# Extraction Hints and Location Guidance
EXTRACTION_HINTS = [
    "Bureau credit score location: Typically appears near terms like 'PERFORM', 'CONSUMER', 'Score', 'CIBIL', or in a dedicated score section on first page.",
    "Credit score format: Usually displayed as a 3-digit number between 300-900, sometimes with a gauge or range indicator.",
    "DPD information location: Often found in payment history tables, delinquency sections, or account performance summary.",
    "Settlement and write-off status: Usually marked explicitly in account status columns with keywords 'Settled', 'Written Off', or status codes.",
    "Live loan indicators: Marked with 'Active', 'Current', 'Live', or similar status in account listings.",
    "GSTR-3B sales extraction: Sales figures are in Table 3.1, row labeled '(a) Outward taxable supplies', second column shows taxable value.",
    "GSTR-3B month extraction: Month information appears as 'Period' followed by month name (January, February, etc.).",
    "GSTR-3B year extraction: Year appears in 'Year' field in format 'YYYY-YY' (e.g., 2024-25) or in filename as MMYYYY (e.g., 012025).",
    "Table structure in PDFs: Tables may span multiple pages. Look for continuation rows and merged cells.",
    "Multiple bureau reports: When processing multiple reports, extract parameters separately for each person/entity.",
    "NTC acceptance: Check for explicit mentions of 'No Track Case', 'NTC', 'New to Credit' status in summary or remarks.",
    "Suit filed indicators: Look for keywords 'Suit Filed', 'Legal Action', 'Court Case' in account remarks or status.",
]

# Common Patterns and Formats
COMMON_PATTERNS = [
    "Date formats in bureau reports: DD-MM-YYYY, DD/MM/YYYY, or MMM-YYYY for month-year format.",
    "Currency representation: Indian Rupees shown as '₹', 'Rs.', 'INR', or just numbers with commas (e.g., 1,50,000).",
    "Percentage formats: Shown with '%' symbol or as decimals (0.15 = 15%).",
    "Boolean values: Yes/No, True/False, Y/N, 1/0, or Present/Absent for presence/absence of attributes.",
    "Account types: PL (Personal Loan), BL (Business Loan), CC (Credit Card), HL (Home Loan), AL (Auto Loan).",
    "Status codes in bureau: STD (Standard), SMA (Special Mention Account), SUB (Sub-standard), DBT (Doubtful), LSS (Loss).",
]

# All knowledge combined for easy iteration
ALL_KNOWLEDGE = (
    BUREAU_TERMINOLOGY +
    GST_TERMINOLOGY +
    VALIDATION_RULES +
    EXTRACTION_HINTS +
    COMMON_PATTERNS
)

# Category mapping for retrieval filtering
KNOWLEDGE_CATEGORIES = {
    "bureau_terminology": BUREAU_TERMINOLOGY,
    "gst_terminology": GST_TERMINOLOGY,
    "validation_rules": VALIDATION_RULES,
    "extraction_hints": EXTRACTION_HINTS,
    "common_patterns": COMMON_PATTERNS,
}
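
As a quick sanity check on the structure above, ALL_KNOWLEDGE is simply the concatenation of the five category lists, and KNOWLEDGE_CATEGORIES exposes the same lists by name:

from core.domain_knowledge import ALL_KNOWLEDGE, KNOWLEDGE_CATEGORIES

# the flattened list and the category map cover the same snippets
assert len(ALL_KNOWLEDGE) == sum(len(v) for v in KNOWLEDGE_CATEGORIES.values())

# category-level access, e.g. only the validation rules
for rule in KNOWLEDGE_CATEGORIES["validation_rules"]:
    print(rule)
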
core/domain_rag.py
ADDED
@@ -0,0 +1,227 @@
from typing import List, Dict, Any, Optional
from dataclasses import dataclass
import numpy as np
from loguru import logger

from .domain_knowledge import ALL_KNOWLEDGE, KNOWLEDGE_CATEGORIES
from .embeddings import EmbeddingService


@dataclass
class DomainSnippet:
    """domain knowledge snippet with its embedding"""
    text: str
    embedding: np.ndarray
    category: str
    index: int


class DomainRAG:
    """
    RAG for domain knowledge
    embeds hardcoded domain notes on startup
    """

    def __init__(self, embedding_service):
        try:
            self.embedding_service = embedding_service
            self.snippets = []
            self._initialize_knowledge()

        except Exception as e:
            logger.error(f"Failed to init DomainRAG: {str(e)}")
            raise

    def _initialize_knowledge(self):
        """embed all domain knowledge on startup"""
        try:
            logger.info("Initializing domain knowledge RAG...")
            logger.info(f"Total snippets to embed: {len(ALL_KNOWLEDGE)}")

            # batch embed all snippets
            embeddings = self.embedding_service.create_embeddings_batch(
                texts=ALL_KNOWLEDGE
            )

            if not embeddings or len(embeddings) != len(ALL_KNOWLEDGE):
                logger.error("Failed to create embeddings for domain knowledge")
                return

            # store snippets with their embeddings
            for i, text in enumerate(ALL_KNOWLEDGE):
                category = self._categorize_snippet(text)
                self.snippets.append(DomainSnippet(
                    text=text,
                    embedding=embeddings[i],
                    category=category,
                    index=i
                ))

            logger.success(f"Domain RAG ready: {len(self.snippets)} snippets embedded")

            # log category breakdown
            category_counts = {}
            for snippet in self.snippets:
                if snippet.category in category_counts:
                    category_counts[snippet.category] += 1
                else:
                    category_counts[snippet.category] = 1

            for category, count in category_counts.items():
                logger.info(f"  - {category}: {count} snippets")

        except Exception as e:
            logger.error(f"Error initializing domain knowledge: {str(e)}")
            # don't crash - system can work without it
            self.snippets = []

    def _categorize_snippet(self, text):
        """figure out what category this snippet belongs to"""
        try:
            text_lower = text.lower()

            # bureau stuff
            bureau_keywords = ['bureau', 'credit', 'dpd', 'score', 'cibil', 'loan',
                               'settlement', 'write-off', 'ntc', 'suit']
            if any(kw in text_lower for kw in bureau_keywords):
                return "bureau"

            # gst related
            gst_keywords = ['gst', 'gstr', 'table', 'supply', 'taxable', 'outward',
                            'gstin', 'tax period']
            if any(kw in text_lower for kw in gst_keywords):
                return "gst"

            # validation rules
            validation_keywords = ['valid', 'should', 'must', 'rule', 'between', 'range',
                                   'validation', 'suspicious', 'negative']
            if any(kw in text_lower for kw in validation_keywords):
                return "validation"

            # extraction hints
            hint_keywords = ['location', 'found', 'appears', 'extraction', 'look for',
                             'typically', 'usually', 'marked']
            if any(kw in text_lower for kw in hint_keywords):
                return "extraction_hint"

            # pattern stuff
            pattern_keywords = ['format', 'pattern', 'representation', 'shown as',
                                'display', 'code', 'type']
            if any(kw in text_lower for kw in pattern_keywords):
                return "common_pattern"

            return "general"

        except Exception as e:
            logger.error(f"Error categorizing snippet: {str(e)}")
            return "general"

    def retrieve(self, query, top_k=3, min_similarity=0.3, category_filter=None):
        """
        get most relevant domain snippets for query
        """
        try:
            if not self.snippets:
                logger.warning("No domain knowledge available")
                return []

            logger.info(f"Retrieving domain knowledge for: '{query[:100]}...'")

            # embed the query
            query_embedding = self.embedding_service.create_embedding(query)

            if query_embedding is None:
                logger.error("Failed to create query embedding")
                return []

            # filter by category if needed
            filtered_snippets = self.snippets
            if category_filter:
                filtered_snippets = [s for s in self.snippets if s.category == category_filter]
                logger.info(f"Filtered to {len(filtered_snippets)} snippets in '{category_filter}'")

            if not filtered_snippets:
                logger.warning(f"No snippets for category: {category_filter}")
                return []

            # prepare data for similarity search
            snippet_embeddings = [s.embedding for s in filtered_snippets]
            snippet_texts = [s.text for s in filtered_snippets]
            snippet_metadata = [{"category": s.category, "index": s.index} for s in filtered_snippets]

            # find similar snippets
            results = self.embedding_service.find_most_similar(
                query_embedding=query_embedding,
                candidate_embeddings=snippet_embeddings,
                candidate_texts=snippet_texts,
                candidate_metadata=snippet_metadata,
                top_k=top_k,
                min_similarity=min_similarity
            )

            if results:
                logger.success(f"Retrieved {len(results)} snippets (top: {results[0].similarity:.3f})")
                return [r.text for r in results]
            else:
                logger.warning(f"No snippets above threshold {min_similarity}")
                return []

        except Exception as e:
            logger.error(f"Error retrieving domain knowledge: {str(e)}")
            return []

    def retrieve_by_category(self, query, categories, snippets_per_category=2):
        """get snippets grouped by category"""
        try:
            results = {}

            for category in categories:
                snippets = self.retrieve(
                    query=query,
                    top_k=snippets_per_category,
                    category_filter=category
                )
                if snippets:
                    results[category] = snippets

            return results

        except Exception as e:
            logger.error(f"Error retrieving by category: {str(e)}")
            return {}

    def get_all_snippets(self, category=None):
        """get all snippets, optionally filtered"""
        try:
            if category:
                return [s.text for s in self.snippets if s.category == category]
            else:
                return [s.text for s in self.snippets]

        except Exception as e:
            logger.error(f"Error getting snippets: {str(e)}")
            return []

    def get_statistics(self):
        """stats about domain knowledge"""
        try:
            category_counts = {}
            for snippet in self.snippets:
                if snippet.category in category_counts:
                    category_counts[snippet.category] += 1
                else:
                    category_counts[snippet.category] = 1

            embedding_dim = 0
            if self.snippets and len(self.snippets) > 0:
                embedding_dim = len(self.snippets[0].embedding)

            return {
                "total_snippets": len(self.snippets),
                "categories": category_counts,
                "embedding_dimension": embedding_dim
            }

        except Exception as e:
            logger.error(f"Error getting stats: {str(e)}")
            return {"total_snippets": 0, "categories": {}, "embedding_dimension": 0}
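
A minimal sketch of wiring DomainRAG to the embedding service (the API key is a placeholder). Note that category_filter values come from _categorize_snippet ("bureau", "gst", "validation", "extraction_hint", "common_pattern", "general"), not from the KNOWLEDGE_CATEGORIES keys:

from core.embeddings import EmbeddingService
from core.domain_rag import DomainRAG

service = EmbeddingService(api_key="sk-...")  # placeholder key
rag = DomainRAG(embedding_service=service)    # embeds ALL_KNOWLEDGE once at startup

# free-text retrieval across all categories
hits = rag.retrieve("Where does the CIBIL score appear in a CRIF report?", top_k=3)

# grouped retrieval, using the categories _categorize_snippet can return
grouped = rag.retrieve_by_category("GSTR-3B sales figure", ["gst", "extraction_hint"])
print(rag.get_statistics())
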
core/embeddings.py
ADDED
@@ -0,0 +1,311 @@
from openai import OpenAI
import numpy as np
from typing import List, Dict, Any, Optional, Tuple
from loguru import logger
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
from dataclasses import dataclass
import time


@dataclass
class SimilarityResult:
    """similarity search result"""
    index: int
    similarity: float
    text: str
    metadata: Dict[str, Any]


class EmbeddingService:
    """
    OpenAI embeddings service
    handles embedding creation and similarity search
    """

    def __init__(self, api_key, model="text-embedding-3-large"):
        try:
            self.client = OpenAI(api_key=api_key)
            self.model = model
            # default dimensions per model: text-embedding-3-large returns 3072,
            # text-embedding-3-small returns 1536
            if "large" in model:
                self.embedding_dim = 3072
            else:
                self.embedding_dim = 1536
            self.max_tokens = 8191

            logger.info(f"EmbeddingService ready (model={model}, dims={self.embedding_dim})")

        except Exception as e:
            logger.error(f"Failed to init EmbeddingService: {str(e)}")
            raise

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=2, max=10),
        retry=retry_if_exception_type(Exception),
        reraise=True
    )
    def create_embedding(self, text):
        """create embedding for single text with retry"""
        try:
            # check if text is valid
            if not text or not text.strip():
                logger.warning("Empty text for embedding")
                return None

            # truncate long text (rough estimate: 4 chars per token)
            if len(text) > 30000:
                text = text[:30000]
                logger.warning("Text truncated to 30k chars")

            start_time = time.time()
            response = self.client.embeddings.create(
                model=self.model,
                input=text,
                encoding_format="float"
            )

            embedding = response.data[0].embedding
            elapsed = time.time() - start_time

            logger.debug(f"Created embedding in {elapsed:.2f}s ({len(text)} chars → {len(embedding)} dims)")

            return embedding

        except Exception as e:
            logger.error(f"Error creating embedding: {str(e)}")
            raise

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=2, max=10),
        retry=retry_if_exception_type(Exception),
        reraise=True
    )
    def create_embeddings_batch(self, texts, batch_size=100):
        """create embeddings for multiple texts in batches"""
        try:
            if not texts:
                logger.warning("Empty text list for batch embedding")
                return []

            logger.info(f"Creating embeddings for {len(texts)} texts (batch_size={batch_size})")

            all_embeddings = []

            # process in chunks
            for i in range(0, len(texts), batch_size):
                batch = texts[i:i + batch_size]

                # clean up batch
                processed_batch = []
                for text in batch:
                    if text and text.strip():
                        # truncate if needed
                        if len(text) > 30000:
                            processed_batch.append(text[:30000])
                        else:
                            processed_batch.append(text)
                    else:
                        processed_batch.append(" ")  # fallback for empty

                try:
                    start_time = time.time()
                    response = self.client.embeddings.create(
                        model=self.model,
                        input=processed_batch,
                        encoding_format="float"
                    )

                    batch_embeddings = [data.embedding for data in response.data]
                    all_embeddings.extend(batch_embeddings)

                    elapsed = time.time() - start_time
                    batch_num = i//batch_size + 1
                    logger.debug(f"Batch {batch_num}: {len(batch)} texts in {elapsed:.2f}s")

                except Exception as e:
                    logger.error(f"Error in batch {i//batch_size + 1}: {str(e)}")
                    # add None for failed ones
                    for _ in range(len(batch)):
                        all_embeddings.append(None)

            # count successful embeddings
            successful = 0
            for e in all_embeddings:
                if e is not None:
                    successful += 1

            success_rate = (successful / len(texts)) * 100
            logger.success(f"Created {successful}/{len(texts)} embeddings ({success_rate:.1f}% success)")

            return all_embeddings

        except Exception as e:
            logger.error(f"Error in batch embedding: {str(e)}")
            return None

    def cosine_similarity(self, vec1, vec2):
        """calculate cosine similarity between two vectors"""
        try:
            v1 = np.array(vec1)
            v2 = np.array(vec2)

            dot_product = np.dot(v1, v2)
            norm1 = np.linalg.norm(v1)
            norm2 = np.linalg.norm(v2)

            # handle zero vectors
            if norm1 == 0 or norm2 == 0:
                logger.warning("Zero vector in cosine similarity")
                return 0.0

            similarity = dot_product / (norm1 * norm2)

            # clip to [0, 1]: guards against values just above 1 from floating-point
            # error, and floors negative similarities to 0 (treated as irrelevant)
            similarity = float(np.clip(similarity, 0.0, 1.0))

            return similarity

        except Exception as e:
            logger.error(f"Error calculating cosine similarity: {str(e)}")
            return 0.0

    def find_most_similar(self, query_embedding, candidate_embeddings,
                          candidate_texts=None, candidate_metadata=None,
                          top_k=5, min_similarity=0.0):
        """
        find most similar embeddings using cosine similarity
        returns sorted list of results
        """
        try:
            if not query_embedding or not candidate_embeddings:
                logger.warning("Empty embeddings for similarity search")
                return []

            logger.info(f"Finding top {top_k} from {len(candidate_embeddings)} candidates (min={min_similarity})")

            similarities = []

            for idx, candidate in enumerate(candidate_embeddings):
                # skip None embeddings
                if candidate is None:
                    continue

                try:
                    similarity = self.cosine_similarity(query_embedding, candidate)

                    # filter by threshold
                    if similarity >= min_similarity:
                        text = ""
                        if candidate_texts:
                            text = candidate_texts[idx]

                        metadata = {}
                        if candidate_metadata:
                            metadata = candidate_metadata[idx]

                        result = SimilarityResult(
                            index=idx,
                            similarity=similarity,
                            text=text,
                            metadata=metadata
                        )
                        similarities.append(result)

                except Exception as e:
                    logger.warning(f"Error computing similarity for idx {idx}: {str(e)}")
                    continue

            # sort by similarity descending
            similarities.sort(key=lambda x: x.similarity, reverse=True)

            # get top k
            top_results = similarities[:top_k]

            if top_results:
                logger.success(f"Found {len(top_results)} results (top: {top_results[0].similarity:.3f})")
            else:
                logger.warning("No results above threshold")

            return top_results

        except Exception as e:
            logger.error(f"Error in find_most_similar: {str(e)}")
            return []

    def embed_documents(self, texts, metadata=None):
        """embed multiple documents with metadata"""
        try:
            logger.info(f"Embedding {len(texts)} documents")

            embeddings = self.create_embeddings_batch(texts)

            if embeddings is None:
                logger.error("Batch embedding failed")
                return [], []

            # create metadata if missing
            if metadata is None:
                metadata = []
                for _ in texts:
                    metadata.append({})

            # add embedding info to metadata
            for i in range(len(embeddings)):
                embedding = embeddings[i]
                meta = metadata[i]

                if embedding is not None:
                    meta["embedding_dim"] = len(embedding)
                    meta["has_embedding"] = True
                else:
                    meta["has_embedding"] = False

            return embeddings, metadata

        except Exception as e:
            logger.error(f"Error embedding documents: {str(e)}")
            return [], []

    def get_embedding_stats(self, embeddings):
        """get stats about embeddings"""
        try:
            valid_embeddings = []
            for e in embeddings:
                if e is not None:
                    valid_embeddings.append(e)

            if not valid_embeddings:
                return {
                    "total": len(embeddings),
                    "valid": 0,
                    "invalid": len(embeddings),
                    "success_rate": 0.0
                }

            # convert to numpy for calculations
            emb_array = np.array(valid_embeddings)

            # calculate norms
            norms = []
            for e in valid_embeddings:
                norms.append(np.linalg.norm(e))

            stats = {
                "total": len(embeddings),
                "valid": len(valid_embeddings),
                "invalid": len(embeddings) - len(valid_embeddings),
                "success_rate": len(valid_embeddings) / len(embeddings),
                "dimensions": len(valid_embeddings[0]) if valid_embeddings else 0,
                "mean_norm": float(np.mean(norms)),
                "std_norm": float(np.std(norms))
            }

            logger.info(f"Embedding stats: {stats['valid']}/{stats['total']} valid ({stats['success_rate']*100:.1f}%)")

            return stats

        except Exception as e:
            logger.error(f"Error calculating stats: {str(e)}")
            return {}
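
A minimal end-to-end sketch of the embed-then-search flow above (placeholder key; the calls hit the real OpenAI embeddings API):

from core.embeddings import EmbeddingService

service = EmbeddingService(api_key="sk-...")  # placeholder key
docs = ["Outward taxable supplies reported in Table 3.1(a)",
        "Bureau credit score ranges from 300 to 900"]
doc_embs = service.create_embeddings_batch(docs)

query_emb = service.create_embedding("What is the GST sales figure?")
results = service.find_most_similar(query_emb, doc_embs, candidate_texts=docs, top_k=1)
for r in results:
    print(f"{r.similarity:.3f}  {r.text}")
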
core/extractor.py
ADDED
@@ -0,0 +1,141 @@
from openai import OpenAI
import json
from typing import Dict, Any, List


class ExtractionService:
    """extract parameters using GPT-4"""

    def __init__(self, api_key):
        self.client = OpenAI(api_key=api_key)
        self.model = "gpt-4-turbo-preview"

    def extract_parameter(self, parameter_name, parameter_description, context, source_location):
        """
        extract single parameter using GPT-4
        """
        # build the prompt
        prompt = f"""Extract the following parameter from the document:

PARAMETER: {parameter_name}
DESCRIPTION: {parameter_description}

DOCUMENT CONTEXT:
{context}

INSTRUCTIONS:
1. Extract the exact value (no formatting - remove commas, currency symbols)
2. If not found, return null
3. Be precise - no approximations

Return ONLY a JSON object:
{{
    "value": <number or string or null>,
    "source": "{source_location}"
}}"""

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "system",
                        "content": "You are an expert at extracting financial data from documents. Extract exact values without any formatting."
                    },
                    {
                        "role": "user",
                        "content": prompt
                    }
                ],
                temperature=0.0,
                response_format={"type": "json_object"}
            )

            result = json.loads(response.choices[0].message.content)
            return result

        except Exception as e:
            return {
                "value": None,
                "source": f"Error: {str(e)}"
            }

    def extract_from_table(self, parameter_name, tables):
        """
        extract parameter from tables
        TODO: maybe optimize this for large tables
        """
        # convert tables to text format
        tables_text = ""
        table_count = 0
        for i, table in enumerate(tables):
            if table_count >= 5:  # limit to 5 tables
                break

            tables_text += f"\n[Table {i+1} - Page {table['page']}]\n"

            headers = table.get("headers", [])
            rows = table.get("rows", [])

            # add headers
            if headers:
                header_str = ""
                for h in headers:
                    header_str += str(h) + " | "
                tables_text += header_str + "\n"

            # add rows (max 20 per table)
            row_count = 0
            for row in rows:
                if row_count >= 20:
                    break

                row_str = ""
                for c in row:
                    if c:
                        row_str += str(c) + " | "
                    else:
                        row_str += " | "
                tables_text += row_str + "\n"
                row_count += 1

            table_count += 1

        # build prompt
        prompt = f"""Find '{parameter_name}' in these tables:

{tables_text}

Return JSON:
{{
    "value": <extracted value without formatting>,
    "source": "<table number and location>"
}}"""

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "system",
                        "content": "Extract data from tables. Return exact numbers without formatting."
                    },
                    {
                        "role": "user",
                        "content": prompt
                    }
                ],
                temperature=0.0,
                response_format={"type": "json_object"}
            )

            result = json.loads(response.choices[0].message.content)
            return result

        except Exception as e:
            return {
                "value": None,
                "source": f"Error: {str(e)}"
            }
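
A minimal sketch of calling the extractor (placeholder key; the context string echoes the CRIF score pattern used in document_parser.py, and the model's JSON output shape follows the prompt above but is not guaranteed):

from core.extractor import ExtractionService

svc = ExtractionService(api_key="sk-...")  # placeholder key
result = svc.extract_parameter(
    parameter_name="bureau_score",
    parameter_description="CRIF/CIBIL credit score between 300 and 900",
    context="PERFORM CONSUMER 2.2 300-900 627",
    source_location="CRIF Report - Score Section",
)
print(result)  # expected shape: {"value": ..., "source": "..."}
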
core/rag_pipeline.py
ADDED
@@ -0,0 +1,348 @@
| 1 |
+
from typing import List, Dict, Any, Optional, Tuple
|
| 2 |
+
from dataclasses import dataclass
|
| 3 |
+
import numpy as np
|
| 4 |
+
from loguru import logger
|
| 5 |
+
from openai import OpenAI
|
| 6 |
+
|
| 7 |
+
from core.document_parser import ParsedDocument
|
| 8 |
+
from core.embeddings import EmbeddingService
|
| 9 |
+
from core.vision_parser import VisionDocumentParser, VisionExtractionResult
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@dataclass
|
| 13 |
+
class ExtractionResult:
|
| 14 |
+
"""result from parameter extraction"""
|
| 15 |
+
parameter_id: str
|
| 16 |
+
parameter_name: str
|
| 17 |
+
value: Any
|
| 18 |
+
source: str
|
| 19 |
+
confidence: float
|
| 20 |
+
context_used: str
|
| 21 |
+
metadata: Dict[str, Any]
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class EnhancedRAGPipeline:
|
| 25 |
+
"""
|
| 26 |
+
RAG Pipeline with Vision support
|
| 27 |
+
tries vision first, falls back to traditional RAG
|
| 28 |
+
"""
|
| 29 |
+
|
| 30 |
+
def __init__(self, embedding_service, openai_client, domain_rag=None,
|
| 31 |
+
top_k=5, similarity_threshold=0.3, model="gpt-4o-mini",
|
| 32 |
+
vision_model="gpt-4o", temperature=0.0, use_vision=True):
|
| 33 |
+
self.embedding_service = embedding_service
|
| 34 |
+
self.client = openai_client
|
| 35 |
+
self.domain_rag = domain_rag
|
| 36 |
+
self.top_k = top_k
|
| 37 |
+
self.similarity_threshold = similarity_threshold
|
| 38 |
+
self.model = model
|
| 39 |
+
self.temperature = temperature
|
| 40 |
+
self.use_vision = use_vision
|
| 41 |
+
|
| 42 |
+
# setup vision parser if needed
|
| 43 |
+
if use_vision:
|
| 44 |
+
self.vision_parser = VisionDocumentParser(
|
| 45 |
+
openai_client=openai_client,
|
| 46 |
+
model=vision_model
|
| 47 |
+
)
|
| 48 |
+
logger.success("Vision parser ready")
|
| 49 |
+
else:
|
| 50 |
+
self.vision_parser = None
|
| 51 |
+
logger.info("Vision extraction disabled")
|
| 52 |
+
|
| 53 |
+
def extract_parameter_with_vision(self, pdf_path, parameter_id,
|
| 54 |
+
parameter_name, parameter_description):
|
| 55 |
+
"""
|
| 56 |
+
extract using GPT-4 Vision (most accurate method)
|
| 57 |
+
"""
|
| 58 |
+
if not self.use_vision or not self.vision_parser:
|
| 59 |
+
return None
|
| 60 |
+
|
| 61 |
+
try:
|
| 62 |
+
logger.info(f"[VISION] Extracting {parameter_name}...")
|
| 63 |
+
|
| 64 |
+
# figure out what type of parameter this is
|
| 65 |
+
param_type = self._infer_parameter_type(parameter_id, parameter_description)
|
| 66 |
+
|
| 67 |
+
# use vision to extract
|
| 68 |
+
vision_result = self.vision_parser.extract_parameter_from_pdf(
|
| 69 |
+
pdf_path=pdf_path,
|
| 70 |
+
parameter_name=parameter_name,
|
| 71 |
+
parameter_description=parameter_description,
|
| 72 |
+
parameter_type=param_type,
|
| 73 |
+
search_all_pages=True
|
| 74 |
+
)
|
| 75 |
+
|
| 76 |
+
if vision_result:
|
| 77 |
+
# convert to our format
|
| 78 |
+
result = ExtractionResult(
|
| 79 |
+
parameter_id=parameter_id,
|
| 80 |
+
parameter_name=parameter_name,
|
| 81 |
+
value=vision_result.value,
|
| 82 |
+
source=vision_result.source,
|
| 83 |
+
confidence=vision_result.confidence,
|
| 84 |
+
context_used=vision_result.context,
|
| 85 |
+
metadata={
|
| 86 |
+
"method": "vision",
|
| 87 |
+
"page_number": vision_result.page_number,
|
| 88 |
+
"model": self.vision_parser.model
|
| 89 |
+
}
|
| 90 |
+
)
|
| 91 |
+
|
| 92 |
+
logger.success(f"[VISION] Found: {result.value} (conf: {result.confidence:.2f})")
|
| 93 |
+
return result
|
| 94 |
+
else:
|
| 95 |
+
logger.warning(f"[VISION] Not found: {parameter_name}")
|
| 96 |
+
return None
|
| 97 |
+
|
| 98 |
+
except Exception as e:
|
| 99 |
+
logger.error(f"[VISION] Error: {str(e)}")
|
| 100 |
+
return None
|
| 101 |
+
|
| 102 |
+
    def _infer_parameter_type(self, parameter_id, description):
        """guess parameter type from id and description"""
        param_lower = parameter_id.lower()
        desc_lower = description.lower()

        # boolean keywords
        boolean_keywords = ["accepted", "flag", "status", "yes/no", "true/false",
                            "settlement", "writeoff", "suit", "default"]
        for keyword in boolean_keywords:
            if keyword in param_lower or keyword in desc_lower:
                return "boolean"

        # numeric keywords
        numeric_keywords = ["amount", "count", "number", "dpd", "loans",
                            "threshold", "score", "inquiries"]
        for keyword in numeric_keywords:
            if keyword in param_lower or keyword in desc_lower:
                return "number"

        # dates
        if "date" in param_lower or "date" in desc_lower:
            return "date"

        return "text"
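
    # Illustrative results of the inference above:
    #   _infer_parameter_type("settlement_flag", "...") -> "boolean"
    #   _infer_parameter_type("dpd_30", "Count of 30-day delinquencies") -> "number"
    #   _infer_parameter_type("report_date", "...") -> "date"
    # Anything without a matching keyword falls through to "text".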
    def prepare_document(self, parsed_doc):
        """
        prepare doc for traditional RAG (fallback)
        """
        try:
            chunk_texts = []
            chunk_metadata = []

            for chunk in parsed_doc.chunks:
                chunk_texts.append(chunk.text)
                chunk_metadata.append({
                    "chunk_id": chunk.chunk_id,
                    "page_num": chunk.page_num,
                    "start_char": chunk.start_char,
                    "end_char": chunk.end_char
                })

            # create embeddings
            embeddings_list = self.embedding_service.create_embeddings_batch(chunk_texts)

            if embeddings_list is None:
                logger.error("Failed to create embeddings")
                return None, None, None

            # convert to numpy
            embeddings = np.array(embeddings_list)

            return embeddings, chunk_texts, chunk_metadata

        except Exception as e:
            logger.error(f"Error preparing document: {str(e)}")
            return None, None, None
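
    # Note: the (embeddings, chunk_texts, chunk_metadata) triple returned above is
    # exactly what extract_parameter_full_pipeline below expects as its
    # chunk_embeddings / chunk_texts / chunk_metadata arguments.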
    def extract_parameter_full_pipeline(self, parameter_id, parameter_name,
                                        parameter_description, parsed_doc,
                                        chunk_embeddings, chunk_texts,
                                        chunk_metadata, pdf_path=None):
        """
        full extraction pipeline
        tries vision first, then RAG as fallback
        """
        try:
            # initialize so the comparison below is safe even when vision is skipped
            vision_result = None

            # try vision first (best accuracy)
            if pdf_path and self.use_vision:
                vision_result = self.extract_parameter_with_vision(
                    pdf_path=pdf_path,
                    parameter_id=parameter_id,
                    parameter_name=parameter_name,
                    parameter_description=parameter_description
                )

                # if vision found it with good confidence, use that
                if vision_result and vision_result.confidence >= 0.7:
                    logger.success(f"[PIPELINE] Using VISION result (conf: {vision_result.confidence:.2f})")
                    return vision_result

            # try traditional RAG
            logger.info(f"[PIPELINE] Trying traditional RAG for {parameter_name}...")
            rag_result = self._extract_with_rag(
                parameter_id=parameter_id,
                parameter_name=parameter_name,
                parameter_description=parameter_description,
                chunk_embeddings=chunk_embeddings,
                chunk_texts=chunk_texts,
                chunk_metadata=chunk_metadata,
                parsed_doc=parsed_doc
            )

            # if we have both, compare them
            if vision_result and rag_result:
                if vision_result.confidence > rag_result.confidence:
                    logger.info(f"[PIPELINE] Vision wins: {vision_result.confidence:.2f} > {rag_result.confidence:.2f}")
                    return vision_result
                else:
                    logger.info(f"[PIPELINE] RAG wins: {rag_result.confidence:.2f} >= {vision_result.confidence:.2f}")
                    return rag_result

            # return whatever worked
            if vision_result:
                return vision_result
            return rag_result

        except Exception as e:
            logger.error(f"Error in extraction pipeline: {str(e)}")
            return None

    def _extract_with_rag(self, parameter_id, parameter_name, parameter_description,
                          chunk_embeddings, chunk_texts, chunk_metadata, parsed_doc):
        """traditional RAG extraction (fallback method)"""
        try:
            # build query
            query = f"{parameter_name}: {parameter_description}"
            query_embedding = self.embedding_service.create_embedding(query)

            if query_embedding is None:
                return None

            # get relevant chunks (dot product equals cosine similarity
            # when the embeddings are unit-normalized)
            similarities = np.dot(chunk_embeddings, query_embedding)
            top_indices = np.argsort(similarities)[::-1][:self.top_k]

            # filter by threshold
            relevant_chunks = []
            for idx in top_indices:
                if similarities[idx] >= self.similarity_threshold:
                    relevant_chunks.append({
                        "text": chunk_texts[idx],
                        "similarity": float(similarities[idx]),
                        "metadata": chunk_metadata[idx]
                    })

            if not relevant_chunks:
                return None

            # get domain knowledge if available
            domain_context = ""
            if self.domain_rag:
                domain_snippets = self.domain_rag.retrieve(query, top_k=3)
                if domain_snippets:
                    formatted_snippets = []
                    for s in domain_snippets:
                        # handle different snippet types
                        if isinstance(s, str):
                            formatted_snippets.append(f"- {s}")
                        elif hasattr(s, 'text'):
                            formatted_snippets.append(f"- {s.text}")
                        else:
                            formatted_snippets.append(f"- {str(s)}")
                    domain_context = "\n".join(formatted_snippets)

            # build context from chunks
            context_parts = []
            for i, c in enumerate(relevant_chunks):
                chunk_text = f"[Chunk {i+1}, Page {c['metadata']['page_num']}, Similarity: {c['similarity']:.2f}]\n{c['text']}"
                context_parts.append(chunk_text)
            context_text = "\n\n".join(context_parts)

            # build prompt
            prompt = f"""Extract the following parameter from the document context.

Parameter: {parameter_name}
Description: {parameter_description}

"""
            if domain_context:
                prompt += f"Domain Knowledge:\n{domain_context}\n\n"

            prompt += f"""Document Context:
{context_text}

Extract the value and provide the specific source location (e.g., "Account Summary Table, Row 3" not just the filename).

Return JSON:
{{
    "value": <extracted value or null>,
    "source": "<specific section/table/location>",
    "confidence": <0.0-1.0>
}}"""

            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                temperature=self.temperature,
                max_tokens=300
            )

            result_text = response.choices[0].message.content

            # parse JSON response (strip markdown code fences if present)
            import json
            json_text = result_text.strip()
            if "```json" in json_text:
                json_text = json_text.split("```json")[1].split("```")[0].strip()
            elif "```" in json_text:
                json_text = json_text.split("```")[1].split("```")[0].strip()

            data = json.loads(json_text)

            if data.get("value") is not None:
                return ExtractionResult(
                    parameter_id=parameter_id,
                    parameter_name=parameter_name,
                    value=data["value"],
                    source=data.get("source", f"Page {relevant_chunks[0]['metadata']['page_num']}"),
                    confidence=float(data.get("confidence", 0.5)),
                    context_used=context_text[:200],
                    metadata={"method": "rag", "chunks_used": len(relevant_chunks)}
                )

            return None

        except Exception as e:
            logger.error(f"RAG extraction error: {str(e)}")
            return None
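
    # Note: parsed_doc is accepted above for interface symmetry with
    # extract_parameter_full_pipeline but is not used inside _extract_with_rag.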
    def calculate_overall_confidence(self, results):
        """calculate overall confidence score"""
        if not results:
            return 0.0

        # count successful extractions
        successful = []
        for r in results:
            if r.value is not None:
                successful.append(r)

        if not successful:
            return 0.0

        # average confidence
        total_conf = 0.0
        for r in successful:
            total_conf += r.confidence
        avg_confidence = total_conf / len(successful)

        # success rate
        success_rate = len(successful) / len(results)

        # combine them
        overall = (avg_confidence * 0.7) + (success_rate * 0.3)

        return round(overall, 2)
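
    # Worked example: 2 of 4 parameters extracted, each at confidence 0.9
    # -> avg_confidence = 0.9, success_rate = 0.5,
    #    overall = 0.9 * 0.7 + 0.5 * 0.3 = 0.78
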
core/vision_parser.py
ADDED
@@ -0,0 +1,620 @@
import base64
import io
import json  # For parsing Vision API responses
from pathlib import Path
from typing import Dict, Any, List, Optional, Tuple
from dataclasses import dataclass
from loguru import logger
from PIL import Image
import fitz  # PyMuPDF - Pure Python, no poppler needed!
from openai import OpenAI


@dataclass
class VisionExtractionResult:
    """Result from vision-based extraction"""
    parameter_id: str
    parameter_name: str
    value: Any
    source: str  # Specific section/location
    page_number: int
    confidence: float
    context: str  # Surrounding text/context


class VisionDocumentParser:
    """Parses PDFs with GPT-4 Vision by rendering pages to images."""

    def __init__(self, openai_client: OpenAI, model: str = "gpt-4o"):
        self.client = openai_client
        self.model = model
        self._image_cache = {}  # Cache converted images by PDF path
        logger.info(f"VisionDocumentParser initialized with model: {model}")

    def pdf_to_images(self, pdf_path: str, dpi: int = 200) -> List[Image.Image]:
        """Convert each PDF page to a PIL image, caching results per (path, dpi)."""
        try:
            # Check cache first so the same PDF is only converted once
            cache_key = f"{pdf_path}_{dpi}"
            if cache_key in self._image_cache:
                logger.info(f"✅ Using CACHED images for: {Path(pdf_path).name} (skipping conversion)")
                return self._image_cache[cache_key]

            logger.info(f"Converting PDF to images: {Path(pdf_path).name} (DPI: {dpi})")

            # Open PDF with PyMuPDF
            doc = fitz.open(pdf_path)
            images = []

            # Convert each page to image
            for page_num in range(len(doc)):
                page = doc[page_num]

                # Calculate zoom factor for DPI
                # 72 DPI is default, so zoom = target_dpi / 72
                zoom = dpi / 72
                mat = fitz.Matrix(zoom, zoom)

                # Render page to pixmap
                pix = page.get_pixmap(matrix=mat)

                # Convert pixmap to PIL Image
                img_data = pix.tobytes("png")
                img = Image.open(io.BytesIO(img_data))

                images.append(img)

            doc.close()

            # Cache for reuse
            self._image_cache[cache_key] = images

            logger.success(f"Converted {len(images)} pages to images (PyMuPDF) - CACHED for reuse ✅")
            return images

        except Exception as e:
            logger.error(f"Error converting PDF to images: {str(e)}")
            return []

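    # Illustrative arithmetic for the zoom factor above: at dpi=200 the zoom is
    # 200 / 72 ≈ 2.78, so an A4 page (8.27in x 11.69in) renders at roughly
    # 1654 x 2338 pixels.
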
    def image_to_base64(self, image: Image.Image) -> str:
        """Encode a PIL image as a base64 PNG string for the Vision API."""
        try:
            buffered = io.BytesIO()
            image.save(buffered, format="PNG")
            img_str = base64.b64encode(buffered.getvalue()).decode()
            return img_str

        except Exception as e:
            logger.error(f"Error encoding image: {str(e)}")
            return ""

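    # Each entry in the `parameters` list consumed below is a dict of this shape
    # (values here are illustrative):
    #   {"id": "dpd_30", "name": "DPD 30 Days",
    #    "description": "Count of 30-day delinquencies", "type": "number"}
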
    def extract_all_parameters_from_page(
        self,
        image: Image.Image,
        page_num: int,
        parameters: List[Dict[str, str]]
    ) -> Dict[str, VisionExtractionResult]:
        """Extract every requested parameter from one page image in a single Vision call."""
        try:
            # Build comprehensive prompt for ALL parameters
            param_descriptions = []
            for i, param in enumerate(parameters, 1):
                param_type = param.get('type', 'text')
                type_hint = {
                    'boolean': '(true/false)',
                    'number': '(numeric value)',
                    'date': '(date format)',
                    'text': '(text value)'
                }.get(param_type, '')

                param_descriptions.append(
                    f"{i}. **{param['name']}** {type_hint}: {param['description']}"
                )

            params_text = "\n".join(param_descriptions)

            prompt = f"""Analyze this document page and extract ALL of the following parameters that you can find:

{params_text}

IMPORTANT INSTRUCTIONS:
1. Return a JSON object with ONLY the parameters you found on this page
2. For each parameter found, provide:
   - "value": The actual value (use correct data type: number, boolean, string, or null)
   - "source": SPECIFIC location (e.g., "Account Summary Table - Settlement column, Row 2")
   - "confidence": Your confidence level (0.0 to 1.0)
   - "context": Brief surrounding text for verification

3. Skip parameters not visible on this page (don't include them in response)
4. Be precise with sources - include table names, section headers, row/column identifiers
5. For booleans, return true/false, NOT "yes"/"no" or 1/0

Return ONLY valid JSON, no markdown formatting:
{{
    "parameter_id_1": {{
        "found": true,
        "value": <actual_value>,
        "source": "Specific location with details",
        "confidence": 0.95,
        "context": "Surrounding text..."
    }},
    "parameter_id_2": {{
        "found": true,
        "value": <actual_value>,
        "source": "Another specific location",
        "confidence": 0.90,
        "context": "More context..."
    }}
}}

Parameter IDs to use: {', '.join([p['id'] for p in parameters])}"""

            # Convert image to base64
            buffered = io.BytesIO()
            image.save(buffered, format="PNG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode()

            # Single API call for ALL parameters!
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{img_base64}"
                                }
                            },
                            {
                                "type": "text",
                                "text": prompt
                            }
                        ]
                    }
                ],
                max_tokens=2000,
                temperature=0.0
            )

            # Parse response
            content = response.choices[0].message.content.strip()

            # Remove markdown if present
            if content.startswith("```json"):
                content = content[7:]
            if content.startswith("```"):
                content = content[3:]
            if content.endswith("```"):
                content = content[:-3]
            content = content.strip()

            # Parse JSON
            results_dict = json.loads(content)

            # Create mapping of param_id to param_name for lookup
            param_name_map = {p['id']: p['name'] for p in parameters}

            # Convert to VisionExtractionResult objects
            extraction_results = {}
            for param_id, result_data in results_dict.items():
                if result_data.get('found', False):
                    extraction_results[param_id] = VisionExtractionResult(
                        parameter_id=param_id,
                        parameter_name=param_name_map.get(param_id, param_id),  # Get name from map
                        value=result_data.get('value'),
                        source=result_data.get('source', f'Page {page_num}'),
                        page_number=page_num,
                        confidence=result_data.get('confidence', 0.7),
                        context=result_data.get('context', '')
                    )

            logger.success(
                f"Page {page_num}: Found {len(extraction_results)}/{len(parameters)} parameters "
                f"in ONE call ⚡"
            )

            return extraction_results

        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse JSON from page {page_num}: {str(e)}")
            return {}
        except Exception as e:
            logger.error(f"Error extracting from page {page_num}: {str(e)}")
            return {}

    def extract_all_parameters_batch(
        self,
        pdf_path: str,
        parameters: List[Dict[str, str]]
    ) -> Dict[str, VisionExtractionResult]:
        """Extract all parameters from a PDF with one Vision call per page,
        keeping the highest-confidence hit for each parameter."""
        try:
            logger.info(
                f"⚡ BATCH EXTRACTION: Processing {len(parameters)} parameters "
                f"from {Path(pdf_path).name}"
            )

            # Convert PDF to images (uses cache!)
            images = self.pdf_to_images(pdf_path, dpi=200)
            if not images:
                logger.error("Failed to convert PDF to images")
                return {}

            # Store best result for each parameter
            best_results = {}

            # Process each page once, extracting ALL parameters
            for page_num, image in enumerate(images, start=1):
                logger.info(f"⚡ Page {page_num}/{len(images)}: Extracting ALL parameters...")

                # Extract all parameters from this page in ONE call!
                page_results = self.extract_all_parameters_from_page(
                    image=image,
                    page_num=page_num,
                    parameters=parameters
                )

                # Update best results (keep highest confidence for each parameter)
                for param_id, result in page_results.items():
                    if param_id not in best_results:
                        best_results[param_id] = result
                        logger.info(f"  ✓ {param_id}: {result.value} (conf: {result.confidence})")
                    elif result.confidence > best_results[param_id].confidence:
                        logger.info(
                            f"  ↑ {param_id}: {result.value} (conf: {result.confidence}) "
                            f"[better than {best_results[param_id].confidence}]"
                        )
                        best_results[param_id] = result

            found_count = len(best_results)
            logger.success(
                f"⚡ BATCH COMPLETE: Found {found_count}/{len(parameters)} parameters "
                f"in {len(images)} API calls (vs {len(parameters) * len(images)} with old method!)"
            )

            return best_results

        except Exception as e:
            logger.error(f"Error in batch extraction: {str(e)}")
            return {}

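    # Illustrative cost of the batching above: a 10-page PDF with 20 parameters
    # needs 10 Vision calls here, versus 20 * 10 = 200 with one call per
    # parameter per page.
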
    def extract_parameter_from_page(
        self,
        image: Image.Image,
        page_num: int,
        parameter_name: str,
        parameter_description: str,
        parameter_type: str = "text"
    ) -> Optional[VisionExtractionResult]:
        """Extract a single parameter from one page image."""
        try:
            # Convert image to base64
            img_base64 = self.image_to_base64(image)
            if not img_base64:
                return None

            # Build prompt based on parameter type
            prompt = self._build_extraction_prompt(
                parameter_name,
                parameter_description,
                parameter_type
            )

            # Call GPT-4 Vision
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": prompt
                            },
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{img_base64}",
                                    "detail": "high"
                                }
                            }
                        ]
                    }
                ],
                max_tokens=500,
                temperature=0.0  # Deterministic for data extraction
            )

            # Parse response
            result_text = response.choices[0].message.content

            # Parse structured response
            return self._parse_vision_response(
                result_text,
                parameter_name,
                page_num
            )

        except Exception as e:
            logger.error(f"Error extracting {parameter_name} from page {page_num}: {str(e)}")
            return None

    def _build_extraction_prompt(
        self,
        parameter_name: str,
        parameter_description: str,
        parameter_type: str
    ) -> str:
        """Build prompt for GPT-4 Vision extraction"""

        prompt = f"""You are analyzing a financial document (Bureau Credit Report or GST Return).

**TASK:** Extract the following parameter from this document page.

**Parameter Name:** {parameter_name}
**Description:** {parameter_description}
**Expected Type:** {parameter_type}

**INSTRUCTIONS:**
1. Look for this parameter in the document
2. If found, extract the exact value
3. Note the specific section/location where you found it (e.g., "Account Summary Table, Row 3" or "DPD History Section")
4. Provide surrounding context (nearby text)

**OUTPUT FORMAT (JSON):**
{{
    "found": true/false,
    "value": <extracted value or null>,
    "source": "<specific section/table/location>",
    "confidence": <0.0-1.0>,
    "context": "<surrounding text for verification>"
}}

**EXAMPLES:**

For "DPD 30 Days" in a credit report:
{{
    "found": true,
    "value": 2,
    "source": "Payment History Table - DPD 30 Days column",
    "confidence": 0.95,
    "context": "DPD History: 0-30 days: 2 occurrences"
}}

For "Settlement/Write-off" flag:
{{
    "found": true,
    "value": false,
    "source": "Account Status Summary - Settlement Status field",
    "confidence": 0.90,
    "context": "Settlement Status: Not Applicable, Write-off Status: No"
}}

If parameter not found on this page:
{{
    "found": false,
    "value": null,
    "source": "Not found on this page",
    "confidence": 0.0,
    "context": ""
}}

**CRITICAL RULES:**
- Be precise with locations (section names, table names, row/column)
- Extract EXACT values, don't interpret
- For boolean parameters, return true/false
- For numeric parameters, return numbers (not strings)
- If unsure, set confidence < 0.7
- Return ONLY valid JSON, no other text

Now analyze the document image and extract the parameter:"""

        return prompt

    def _parse_vision_response(
        self,
        response_text: str,
        parameter_id: str,
        page_num: int
    ) -> Optional[VisionExtractionResult]:
        """Parse GPT-4 Vision response into structured result"""
        try:
            # json is already imported at module level

            # Extract JSON from response (handle markdown code blocks)
            json_text = response_text.strip()
            if "```json" in json_text:
                json_text = json_text.split("```json")[1].split("```")[0].strip()
            elif "```" in json_text:
                json_text = json_text.split("```")[1].split("```")[0].strip()

            # Parse JSON
            data = json.loads(json_text)

            # Check if found
            if not data.get("found", False):
                return None

            # Build result
            result = VisionExtractionResult(
                parameter_id=parameter_id,
                parameter_name=parameter_id.replace("_", " ").title(),
                value=data.get("value"),
                source=data.get("source", "Unknown location"),
                page_number=page_num,
                confidence=float(data.get("confidence", 0.5)),
                context=data.get("context", "")
            )

            return result

        except Exception as e:
            logger.error(f"Error parsing vision response: {str(e)}")
            logger.debug(f"Response text: {response_text}")
            return None

    def extract_parameter_from_pdf(
        self,
        pdf_path: str,
        parameter_name: str,
        parameter_description: str,
        parameter_type: str = "text",
        search_all_pages: bool = True  # Search all pages for best accuracy
    ) -> Optional[VisionExtractionResult]:
        """Search a PDF page by page for one parameter and return the
        highest-confidence match."""
        try:
            logger.info(f"Extracting '{parameter_name}' from {Path(pdf_path).name}")

            # Convert PDF to images (uses cache if already converted)
            images = self.pdf_to_images(pdf_path, dpi=200)
            if not images:
                logger.error("Failed to convert PDF to images")
                return None

            # Search pages
            results = []

            for page_num, image in enumerate(images, start=1):
                logger.info(f"Searching page {page_num}/{len(images)}...")

                result = self.extract_parameter_from_page(
                    image=image,
                    page_num=page_num,
                    parameter_name=parameter_name,
                    parameter_description=parameter_description,
                    parameter_type=parameter_type
                )

                if result and result.value is not None:
                    logger.success(f"Found on page {page_num}: {result.value} (confidence: {result.confidence})")
                    results.append(result)

                    # Stop if we found a good match and not searching all pages
                    if not search_all_pages and result.confidence > 0.7:
                        break

            # Return best result
            if results:
                best_result = max(results, key=lambda r: r.confidence)
                logger.success(f"Best match: page {best_result.page_number}, confidence {best_result.confidence}")
                return best_result
            else:
                logger.warning(f"Parameter '{parameter_name}' not found in document")
                return None

        except Exception as e:
            logger.error(f"Error extracting parameter from PDF: {str(e)}")
            return None

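    # Illustrative single-parameter usage (file and parameter names here are
    # hypothetical):
    #   parser = VisionDocumentParser(openai_client=client)
    #   res = parser.extract_parameter_from_pdf(
    #       "bureau_report.pdf", "Credit Score", "Bureau credit score", "number")
    #   if res: print(res.value, res.source, res.page_number)
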
    def extract_gst_sales_with_vision(
        self,
        pdf_path: str
    ) -> Optional[Dict[str, Any]]:
        """Extract total taxable sales from a GSTR-3B return (Table 3.1(a))."""
        try:
            logger.info(f"Extracting GST sales from {Path(pdf_path).name}")

            # Convert PDF to images
            images = self.pdf_to_images(pdf_path)
            if not images:
                return None

            # Prompt for GST sales (plain string, so braces need no escaping)
            prompt = """You are analyzing a GSTR-3B (GST Return) document.

**TASK:** Extract the total taxable sales value from Table 3.1(a).

**WHAT TO LOOK FOR:**
- Table 3.1(a): "Details of Outward Supplies and inward supplies liable to reverse charge"
- Look for "Taxable value" or "Total Taxable value"
- This is usually in the first row of Table 3.1

**OUTPUT FORMAT (JSON):**
{
    "found": true/false,
    "month": "<month and year, e.g., January 2025>",
    "sales": <numeric value>,
    "source": "GSTR-3B Table 3.1(a)",
    "confidence": <0.0-1.0>
}

**EXAMPLE:**
{
    "found": true,
    "month": "January 2025",
    "sales": 951381,
    "source": "GSTR-3B Table 3.1(a) - Taxable outward supplies",
    "confidence": 0.95
}

Return ONLY valid JSON, no other text."""

            # Try each page
            for page_num, image in enumerate(images, start=1):
                try:
                    img_base64 = self.image_to_base64(image)

                    response = self.client.chat.completions.create(
                        model=self.model,
                        messages=[
                            {
                                "role": "user",
                                "content": [
                                    {"type": "text", "text": prompt},
                                    {
                                        "type": "image_url",
                                        "image_url": {
                                            "url": f"data:image/png;base64,{img_base64}",
                                            "detail": "high"
                                        }
                                    }
                                ]
                            }
                        ],
                        max_tokens=300,
                        temperature=0.0
                    )

                    result_text = response.choices[0].message.content

                    # Parse JSON (json is imported at module level)
                    json_text = result_text.strip()
                    if "```json" in json_text:
                        json_text = json_text.split("```json")[1].split("```")[0].strip()
                    elif "```" in json_text:
                        json_text = json_text.split("```")[1].split("```")[0].strip()

                    data = json.loads(json_text)

                    if data.get("found") and data.get("sales"):
                        logger.success(f"Found GST sales on page {page_num}: {data['sales']}")
                        return {
                            "month": data.get("month", "Unknown"),
                            "sales": data["sales"],
                            "source": data.get("source", "GSTR-3B Table 3.1(a)")
                        }

                except Exception as e:
                    logger.debug(f"Page {page_num} - no sales data: {str(e)}")
                    continue

            logger.warning("GST sales not found in document")
            return None

        except Exception as e:
            logger.error(f"Error extracting GST sales: {str(e)}")
            return None
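
    # Illustrative usage (file name is hypothetical):
    #   sales = parser.extract_gst_sales_with_vision("gstr3b_jan_2025.pdf")
    #   if sales: print(sales["month"], sales["sales"], sales["source"])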
|