datasciencesage committed on
Commit
95ff1e1
·
verified ·
1 Parent(s): 363db8d

Upload 8 files

Browse files
core/document_parser.py ADDED
@@ -0,0 +1,343 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ import pdfplumber
4
+ import re
5
+ from pathlib import Path
6
+ from typing import Dict, Any, Optional, List, Tuple
7
+ from dataclasses import dataclass
8
+ from loguru import logger
9
+
10
+
11
@dataclass
class DocumentChunk:
    """A contiguous span of text carved out of a parsed document.

    Produced by DocumentParser._create_chunks; consumed downstream for
    embedding / RAG retrieval.
    """
    chunk_id: str             # stable identifier, e.g. "chunk_0"
    text: str                 # the chunk's text content (stripped)
    page_num: int             # source page; 0 when page tracking is disabled
    start_char: int           # offset of the chunk start in the full document text
    end_char: int             # offset just past the chunk end
    metadata: Dict[str, Any]  # extra info, e.g. source_file and chunk_length
20
+
21
+
22
@dataclass
class ParsedDocument:
    """Full result of DocumentParser.parse_pdf for one PDF file."""
    file_name: str                # base name of the parsed file
    total_pages: int              # page count reported by pdfplumber
    text_content: str             # all page texts joined with blank lines
    pages: List[Dict[str, Any]]   # per-page info (page_num, text, sizes, counts)
    tables: List[Dict[str, Any]]  # extracted tables (headers, rows, raw_data)
    chunks: List[DocumentChunk]   # overlapping text chunks for embedding
    metadata: Dict[str, Any]      # file-level stats (path, counts, text length)
32
+
33
+
34
class DocumentParser:
    """PDF parser with overlap chunking for RAG.

    Uses pdfplumber to pull text and tables out of a PDF, splits the full
    text into overlapping chunks for embedding, and offers targeted
    extractors for CRIF bureau scores and GSTR-3B sales figures.
    """

    def __init__(self, chunk_size=1000, chunk_overlap=200):
        """chunk_size: max characters per chunk.
        chunk_overlap: characters repeated from the tail of one chunk at
        the head of the next, to preserve context across chunk borders."""
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        logger.info(f"Parser initialized - chunk_size={chunk_size}, overlap={chunk_overlap}")

    def parse_pdf(self, pdf_path):
        """
        Parse a PDF and return a ParsedDocument, or None on failure.

        Pages that fail to parse are skipped so one bad page does not
        abort the whole document.
        """
        logger.info(f"Parsing: {Path(pdf_path).name}")

        try:
            with pdfplumber.open(pdf_path) as pdf:
                all_text = []
                pages_data = []
                tables_data = []

                # go through each page; per-page failures are logged and skipped
                for page_num, page in enumerate(pdf.pages, start=1):
                    try:
                        page_result = self._parse_page(page, page_num)

                        all_text.append(page_result["text"])
                        pages_data.append(page_result["page_data"])
                        tables_data.extend(page_result["tables"])

                        logger.debug(f"Page {page_num}: {len(page_result['text'])} chars, {len(page_result['tables'])} tables")

                    except Exception as e:
                        logger.error(f"Error on page {page_num}: {str(e)}")
                        continue  # skip problematic pages

                full_text = "\n\n".join(all_text)

                # create chunks for embeddings
                chunks = self._create_chunks(full_text, Path(pdf_path).name)

                metadata = {
                    "file_path": pdf_path,
                    "file_name": Path(pdf_path).name,
                    "total_pages": len(pdf.pages),
                    "total_tables": len(tables_data),
                    "total_chunks": len(chunks),
                    "text_length": len(full_text)
                }

                parsed_doc = ParsedDocument(
                    file_name=Path(pdf_path).name,
                    total_pages=len(pdf.pages),
                    text_content=full_text,
                    pages=pages_data,
                    tables=tables_data,
                    chunks=chunks,
                    metadata=metadata
                )

                logger.success(f"Parsed {len(pdf.pages)} pages, {len(tables_data)} tables, {len(chunks)} chunks")

                return parsed_doc

        except FileNotFoundError:
            logger.error(f"File not found: {pdf_path}")
            return None
        except Exception as e:
            logger.error(f"Failed to parse {pdf_path}: {str(e)}")
            return None

    def _parse_page(self, page, page_num):
        """Parse a single pdfplumber page into text, tables and page_data.

        Never raises: on error it returns an empty-but-schema-complete
        result so callers can index the same keys on every page.
        """
        try:
            # grab text (extract_text may return None on image-only pages)
            page_text = page.extract_text()
            if page_text is None:
                page_text = ""

            # extract tables; table[0] is treated as the header row
            tables = []
            raw_tables = page.extract_tables()

            for table_idx, table in enumerate(raw_tables):
                if table and len(table) > 0:
                    try:
                        table_data = {
                            "page": page_num,
                            "table_id": f"p{page_num}_t{table_idx + 1}",
                            "headers": table[0] if table else [],
                            "rows": table[1:] if len(table) > 1 else [],
                            "raw_data": table
                        }
                        tables.append(table_data)
                    except Exception as e:
                        logger.warning(f"Table {table_idx} error on page {page_num}: {str(e)}")

            page_data = {
                "page_num": page_num,
                "text": page_text,
                "text_length": len(page_text),
                "tables_count": len(tables),
                "width": page.width,
                "height": page.height
            }

            return {
                "text": page_text,
                "tables": tables,
                "page_data": page_data
            }

        except Exception as e:
            logger.error(f"_parse_page error for page {page_num}: {str(e)}")
            # fallback keeps the same page_data schema as the success path
            # (width/height were previously missing here, which could
            # KeyError downstream consumers)
            return {
                "text": "",
                "tables": [],
                "page_data": {
                    "page_num": page_num,
                    "text": "",
                    "text_length": 0,
                    "tables_count": 0,
                    "width": 0,
                    "height": 0
                }
            }

    def _create_chunks(self, text, file_name):
        """
        Break text into chunks of roughly chunk_size chars with
        chunk_overlap chars of carry-over between consecutive chunks.
        Splits on blank-line paragraph boundaries.
        TODO: maybe improve the chunking logic later
        """
        try:
            chunks = []

            if not text:
                logger.warning("Empty text for chunking")
                return chunks

            # split by paragraphs
            paragraphs = text.split('\n\n')

            current_chunk = ""
            current_start = 0
            chunk_id = 0

            for para in paragraphs:
                para = para.strip()
                if not para:
                    continue

                # check if adding para exceeds size
                if len(current_chunk) + len(para) > self.chunk_size and current_chunk:
                    # save chunk
                    chunk = DocumentChunk(
                        chunk_id=f"chunk_{chunk_id}",
                        text=current_chunk.strip(),
                        page_num=0,  # not tracking page num for now
                        start_char=current_start,
                        end_char=current_start + len(current_chunk),
                        metadata={
                            "source_file": file_name,
                            "chunk_length": len(current_chunk)
                        }
                    )
                    chunks.append(chunk)
                    chunk_id += 1

                    # start new chunk seeded with the tail of the previous one
                    if len(current_chunk) > self.chunk_overlap:
                        overlap_text = current_chunk[-self.chunk_overlap:]
                    else:
                        overlap_text = current_chunk
                    # start offsets are approximate (strip/joins shift them)
                    current_start = current_start + len(current_chunk) - len(overlap_text)
                    current_chunk = overlap_text + "\n\n" + para
                else:
                    # add to current chunk
                    if current_chunk:
                        current_chunk += "\n\n" + para
                    else:
                        current_chunk = para

            # add final chunk
            if current_chunk:
                chunk = DocumentChunk(
                    chunk_id=f"chunk_{chunk_id}",
                    text=current_chunk.strip(),
                    page_num=0,
                    start_char=current_start,
                    end_char=current_start + len(current_chunk),
                    metadata={
                        "source_file": file_name,
                        "chunk_length": len(current_chunk)
                    }
                )
                chunks.append(chunk)

            logger.info(f"Created {len(chunks)} chunks")
            return chunks

        except Exception as e:
            logger.error(f"Chunking error: {str(e)}")
            return []

    def extract_bureau_score(self, parsed_doc):
        """
        Grab the bureau (CIBIL) score from a CRIF report.
        Primary pattern: "PERFORM CONSUMER 2.2 300-900 627" — the score is
        the 3-digit number following the 300-900 range marker.
        Returns {"value": int, "source": str} or None.
        """
        try:
            text = parsed_doc.text_content

            # main pattern - score after range
            pattern = r'PERFORM\s+CONSUMER.*?300-900\s+(\d{3})'
            match = re.search(pattern, text, re.IGNORECASE)

            if match:
                score = int(match.group(1))
                if 300 <= score <= 900:
                    logger.info(f"Found bureau score: {score}")
                    return {
                        "value": score,
                        "source": "CRIF Report – Score Section"
                    }

            # fallback - scan the first couple of pages for a 3-digit number
            # near score-related keywords
            for page in parsed_doc.pages[:2]:
                page_text = page["text"]

                # use finditer so the context window is taken around the
                # actual match position (str.find could return an earlier,
                # unrelated occurrence of the same digits and the keyword
                # check would then look at the wrong context)
                for m in re.finditer(r'\b(\d{3})\b', page_text):
                    num = int(m.group(1))
                    if 300 <= num <= 900:
                        # check if it's actually a score via nearby keywords
                        context = page_text[max(0, m.start() - 100):m.end() + 100]

                        keywords = ['score', 'cibil', 'credit', 'bureau']
                        if any(kw in context.lower() for kw in keywords):
                            logger.info(f"Found score (fallback): {num}")
                            return {
                                "value": num,
                                "source": f"CRIF Report – Page {page['page_num']}"
                            }

            logger.warning("Bureau score not found")
            return None

        except Exception as e:
            logger.error(f"Error extracting bureau score: {str(e)}")
            return None

    def extract_gst_sales(self, parsed_doc):
        """Extract taxable sales from the GSTR-3B Table 3.1 row (a).

        Returns {"month": str, "sales": float, "source": str} or None.
        """
        try:
            text = parsed_doc.text_content
            filename = parsed_doc.file_name

            # get month from the document body ("Period <Month>")
            month_match = re.search(r'Period\s+(\w+)', text)
            month_name = month_match.group(1) if month_match else "Unknown"

            # extract year from filename (GSTR3B_..._012025.pdf format: MMYYYY)
            filename_year_match = re.search(r'_(\d{2})(\d{4})\.pdf', filename)
            if filename_year_match:
                year = filename_year_match.group(2)
            else:
                # fallback: "Year YYYY" in the document text, else assume 2025
                year_match = re.search(r'Year\s+(\d{4})', text)
                year = year_match.group(1) if year_match else "2025"

            formatted_month = f"{month_name} {year}"

            # search extracted tables for the outward-supplies row
            for table in parsed_doc.tables:
                rows = table.get("rows", [])

                for row in rows:
                    if row and len(row) > 1:
                        first_cell = str(row[0]).replace('\n', ' ')

                        # find row (a) with outward supplies
                        if "(a)" in first_cell and "Outward taxable supplies" in first_cell:
                            if len(row) > 1 and row[1]:
                                value_str = str(row[1])
                                # strip currency symbols/commas, keep digits and dot
                                clean_value = re.sub(r'[^\d.]', '', value_str)

                                if clean_value:
                                    try:
                                        sales = float(clean_value)
                                        logger.info(f"GST sales: {sales} for {formatted_month}")
                                        return {
                                            "month": formatted_month,
                                            "sales": sales,
                                            "source": "GSTR-3B Table 3.1(a)"
                                        }
                                    except ValueError as e:
                                        logger.warning(f"Couldn't parse sales value '{clean_value}': {str(e)}")

            logger.warning(f"Sales data not found for {formatted_month}")
            return None

        except Exception as e:
            logger.error(f"Error extracting GST sales: {str(e)}")
            return None

    def get_chunks_text(self, chunks):
        """Return the plain text of each chunk (for embedding)."""
        try:
            return [chunk.text for chunk in chunks]
        except Exception as e:
            logger.error(f"Error getting chunks text: {str(e)}")
            return []
core/domain_knowledge.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
# Hardcoded domain knowledge snippets for the credit/GST document pipeline.
# Each entry is one self-contained definition; DomainRAG embeds every
# snippet at startup and retrieves the most similar ones per query.

# Credit Bureau Terminology
BUREAU_TERMINOLOGY = [
    "Bureau credit score: Numerical representation of creditworthiness ranging from 300 (poor) to 900 (excellent). Higher scores indicate better credit history and lower risk.",
    "DPD (Days Past Due): Number of days a payment is overdue beyond the due date. Common thresholds monitored are 30+, 60+, and 90+ DPD.",
    "30+ DPD: Count of accounts with payments overdue by 30 or more days in the specified monitoring period. Indicates early stage delinquency.",
    "60+ DPD: Count of accounts with payments overdue by 60 or more days in the specified monitoring period. Indicates moderate delinquency.",
    "90+ DPD: Count of accounts with payments overdue by 90 or more days in the specified monitoring period. Indicates serious delinquency.",
    "Settlement: Debt resolved by borrower paying less than the full amount owed, typically after negotiation with creditor. Marked negatively on credit report.",
    "Write-off: Debt declared unrecoverable by lender and removed from active accounts. Severely impacts credit score and indicates non-payment.",
    "NTC (No-Track-Case): Credit applicants with insufficient credit history or no previous credit accounts in bureau database. Also called 'New to Credit'.",
    "Suit Filed: Legal action initiated by creditor for debt recovery through courts. Indicates serious delinquency and unwillingness to pay.",
    "Wilful Default: Deliberate non-payment of debt despite having the financial ability to pay. Considered fraudulent behavior and severely impacts creditworthiness.",
    "Live PL/BL: Active Personal Loan or Business Loan currently being serviced by the borrower with regular payments.",
    "Overdue amount: Total unpaid amount across all accounts that is past the due date. Sum of all overdue balances.",
    "Credit inquiry: Request made by lender to check credit report when applicant applies for credit. Too many inquiries indicate credit hunger.",
    "Active loans: Loans currently being serviced by borrower, not yet closed or settled. Indicates current credit obligations.",
    "Loan exposure: Total outstanding amount across all loans. Also called total debt or credit exposure.",
]

# GST and GSTR-3B Terminology
GST_TERMINOLOGY = [
    "GSTR-3B: Monthly return filing summarizing outward supplies, input tax credit claimed, and net tax liability for the tax period.",
    "Table 3.1(a): Section in GSTR-3B reporting outward taxable supplies (other than zero rated, nil rated and exempted). This is the main sales figure.",
    "Outward supplies: Goods or services provided by the registered GST taxpayer to customers. This is the sales/revenue of the business.",
    "Taxable supplies: Supplies on which GST is levied at applicable rates (5%, 12%, 18%, or 28%). Excludes exempted and nil-rated supplies.",
    "Taxable value: The base value on which GST is calculated, excluding the GST amount itself. This is the pre-tax revenue.",
    "Outward taxable supplies: Sales of goods/services on which GST is applicable. Found in GSTR-3B Table 3.1, row (a).",
    "GSTR-3B structure: Contains multiple tables - Table 3.1 for outward supplies, Table 3.2 for inter-state supplies, Table 4 for input tax credit.",
    "Tax period: The month and year for which the GST return is filed. Format is usually 'Month YYYY' (e.g., January 2025).",
    "GSTIN: GST Identification Number, unique 15-digit alphanumeric code assigned to each registered taxpayer.",
]

# Validation and Business Rules
VALIDATION_RULES = [
    "Valid bureau credit scores: Must be between 300 and 900 inclusive. Scores outside this range are invalid.",
    "Credit score interpretation: 300-579 is Poor, 580-669 is Fair, 670-739 is Good, 740-799 is Very Good, 800-900 is Excellent.",
    "DPD hierarchy rule: 90+ DPD count ≤ 60+ DPD count ≤ 30+ DPD count. If this is violated, data may be incorrect.",
    "GST sales validation: Taxable value should be non-negative numbers. Negative sales indicate data entry error.",
    "Suspicious GST amounts: Values over 10 crore (100,000,000 rupees) should be flagged for verification as potentially incorrect.",
    "Written-off debt amount: Should be non-negative. Negative values indicate error in extraction or data.",
    "Loan counts validation: Max loans and max active loans should be non-negative integers. Cannot have negative loan counts.",
    "Overdue threshold: Maximum allowable overdue amount, typically ranging from 0 to several lakhs. Depends on risk appetite.",
    "Credit inquiry limits: Excessive inquiries (>5 in 6 months) indicate credit hunger and should be flagged.",
    "Zero values interpretation: Zero or null values may indicate either absence of the attribute or that the parameter is not applicable.",
]

# Extraction Hints and Location Guidance
EXTRACTION_HINTS = [
    "Bureau credit score location: Typically appears near terms like 'PERFORM', 'CONSUMER', 'Score', 'CIBIL', or in a dedicated score section on first page.",
    "Credit score format: Usually displayed as a 3-digit number between 300-900, sometimes with a gauge or range indicator.",
    "DPD information location: Often found in payment history tables, delinquency sections, or account performance summary.",
    "Settlement and write-off status: Usually marked explicitly in account status columns with keywords 'Settled', 'Written Off', or status codes.",
    "Live loan indicators: Marked with 'Active', 'Current', 'Live', or similar status in account listings.",
    "GSTR-3B sales extraction: Sales figures are in Table 3.1, row labeled '(a) Outward taxable supplies', second column shows taxable value.",
    "GSTR-3B month extraction: Month information appears as 'Period' followed by month name (January, February, etc.).",
    "GSTR-3B year extraction: Year appears in 'Year' field in format 'YYYY-YY' (e.g., 2024-25) or in filename as MMYYYY (e.g., 012025).",
    "Table structure in PDFs: Tables may span multiple pages. Look for continuation rows and merged cells.",
    "Multiple bureau reports: When processing multiple reports, extract parameters separately for each person/entity.",
    "NTC acceptance: Check for explicit mentions of 'No Track Case', 'NTC', 'New to Credit' status in summary or remarks.",
    "Suit filed indicators: Look for keywords 'Suit Filed', 'Legal Action', 'Court Case' in account remarks or status.",
]

# Common Patterns and Formats
COMMON_PATTERNS = [
    "Date formats in bureau reports: DD-MM-YYYY, DD/MM/YYYY, or MMM-YYYY for month-year format.",
    "Currency representation: Indian Rupees shown as '₹', 'Rs.', 'INR', or just numbers with commas (e.g., 1,50,000).",
    "Percentage formats: Shown with '%' symbol or as decimals (0.15 = 15%).",
    "Boolean values: Yes/No, True/False, Y/N, 1/0, or Present/Absent for presence/absence of attributes.",
    "Account types: PL (Personal Loan), BL (Business Loan), CC (Credit Card), HL (Home Loan), AL (Auto Loan).",
    "Status codes in bureau: STD (Standard), SMA (Special Mention Account), SUB (Sub-standard), DBT (Doubtful), LSS (Loss).",
]

# All knowledge combined for easy iteration (this is what DomainRAG embeds)
ALL_KNOWLEDGE = (
    BUREAU_TERMINOLOGY +
    GST_TERMINOLOGY +
    VALIDATION_RULES +
    EXTRACTION_HINTS +
    COMMON_PATTERNS
)

# Category mapping for retrieval filtering
KNOWLEDGE_CATEGORIES = {
    "bureau_terminology": BUREAU_TERMINOLOGY,
    "gst_terminology": GST_TERMINOLOGY,
    "validation_rules": VALIDATION_RULES,
    "extraction_hints": EXTRACTION_HINTS,
    "common_patterns": COMMON_PATTERNS,
}
core/domain_rag.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict, Any, Optional
2
+ from dataclasses import dataclass
3
+ import numpy as np
4
+ from loguru import logger
5
+
6
+ from .domain_knowledge import ALL_KNOWLEDGE, KNOWLEDGE_CATEGORIES
7
+ from .embeddings import EmbeddingService
8
+
9
+
10
@dataclass
class DomainSnippet:
    """A domain-knowledge snippet together with its embedding."""
    text: str              # the snippet sentence(s)
    embedding: np.ndarray  # embedding vector as returned by the embedding service
    category: str          # heuristic label from DomainRAG._categorize_snippet
    index: int             # position of the snippet within ALL_KNOWLEDGE
17
+
18
+
19
class DomainRAG:
    """
    RAG over the hardcoded domain knowledge notes.

    Every snippet in ALL_KNOWLEDGE is embedded once at construction time;
    retrieval is then a cosine-similarity search over those embeddings.
    """

    def __init__(self, embedding_service):
        """embedding_service: EmbeddingService used to embed snippets and queries."""
        try:
            self.embedding_service = embedding_service
            self.snippets = []  # list of DomainSnippet; stays empty if init fails
            self._initialize_knowledge()

        except Exception as e:
            logger.error(f"Failed to init DomainRAG: {str(e)}")
            raise

    def _count_categories(self):
        """Return {category: snippet count} for the loaded snippets.

        Shared by _initialize_knowledge and get_statistics (the counting
        loop was previously duplicated in both).
        """
        counts = {}
        for snippet in self.snippets:
            counts[snippet.category] = counts.get(snippet.category, 0) + 1
        return counts

    def _initialize_knowledge(self):
        """Embed all domain knowledge on startup (best effort)."""
        try:
            logger.info("Initializing domain knowledge RAG...")
            logger.info(f"Total snippets to embed: {len(ALL_KNOWLEDGE)}")

            # batch embed all snippets
            embeddings = self.embedding_service.create_embeddings_batch(
                texts=ALL_KNOWLEDGE
            )

            if not embeddings or len(embeddings) != len(ALL_KNOWLEDGE):
                logger.error("Failed to create embeddings for domain knowledge")
                return

            # store snippets with their embeddings
            for i, text in enumerate(ALL_KNOWLEDGE):
                category = self._categorize_snippet(text)
                self.snippets.append(DomainSnippet(
                    text=text,
                    embedding=embeddings[i],
                    category=category,
                    index=i
                ))

            logger.success(f"Domain RAG ready: {len(self.snippets)} snippets embedded")

            # log category breakdown
            for category, count in self._count_categories().items():
                logger.info(f"  - {category}: {count} snippets")

        except Exception as e:
            logger.error(f"Error initializing domain knowledge: {str(e)}")
            # don't crash - system can work without domain knowledge
            self.snippets = []

    def _categorize_snippet(self, text):
        """Heuristically assign a snippet to a category by keyword match.

        NOTE(review): checks run in a fixed order (bureau → gst →
        validation → hint → pattern), so a snippet matching several
        keyword lists gets the first matching category.
        """
        try:
            text_lower = text.lower()

            # bureau stuff
            bureau_keywords = ['bureau', 'credit', 'dpd', 'score', 'cibil', 'loan',
                               'settlement', 'write-off', 'ntc', 'suit']
            if any(kw in text_lower for kw in bureau_keywords):
                return "bureau"

            # gst related
            gst_keywords = ['gst', 'gstr', 'table', 'supply', 'taxable', 'outward',
                            'gstin', 'tax period']
            if any(kw in text_lower for kw in gst_keywords):
                return "gst"

            # validation rules
            validation_keywords = ['valid', 'should', 'must', 'rule', 'between', 'range',
                                   'validation', 'suspicious', 'negative']
            if any(kw in text_lower for kw in validation_keywords):
                return "validation"

            # extraction hints
            hint_keywords = ['location', 'found', 'appears', 'extraction', 'look for',
                             'typically', 'usually', 'marked']
            if any(kw in text_lower for kw in hint_keywords):
                return "extraction_hint"

            # pattern stuff
            pattern_keywords = ['format', 'pattern', 'representation', 'shown as',
                                'display', 'code', 'type']
            if any(kw in text_lower for kw in pattern_keywords):
                return "common_pattern"

            return "general"

        except Exception as e:
            logger.error(f"Error categorizing snippet: {str(e)}")
            return "general"

    def retrieve(self, query, top_k=3, min_similarity=0.3, category_filter=None):
        """
        Return the texts of the most relevant domain snippets for query.

        category_filter: optional category name (as produced by
        _categorize_snippet) to restrict the candidate pool.
        """
        try:
            if not self.snippets:
                logger.warning("No domain knowledge available")
                return []

            logger.info(f"Retrieving domain knowledge for: '{query[:100]}...'")

            # embed the query
            query_embedding = self.embedding_service.create_embedding(query)

            if query_embedding is None:
                logger.error("Failed to create query embedding")
                return []

            # filter by category if needed
            filtered_snippets = self.snippets
            if category_filter:
                filtered_snippets = [s for s in self.snippets if s.category == category_filter]
                logger.info(f"Filtered to {len(filtered_snippets)} snippets in '{category_filter}'")

            if not filtered_snippets:
                logger.warning(f"No snippets for category: {category_filter}")
                return []

            # prepare parallel lists for the similarity search
            snippet_embeddings = [s.embedding for s in filtered_snippets]
            snippet_texts = [s.text for s in filtered_snippets]
            snippet_metadata = [{"category": s.category, "index": s.index} for s in filtered_snippets]

            # find similar snippets
            results = self.embedding_service.find_most_similar(
                query_embedding=query_embedding,
                candidate_embeddings=snippet_embeddings,
                candidate_texts=snippet_texts,
                candidate_metadata=snippet_metadata,
                top_k=top_k,
                min_similarity=min_similarity
            )

            if results:
                logger.success(f"Retrieved {len(results)} snippets (top: {results[0].similarity:.3f})")
                return [r.text for r in results]
            else:
                logger.warning(f"No snippets above threshold {min_similarity}")
                return []

        except Exception as e:
            logger.error(f"Error retrieving domain knowledge: {str(e)}")
            return []

    def retrieve_by_category(self, query, categories, snippets_per_category=2):
        """Retrieve snippets for each requested category; returns
        {category: [snippet text, ...]} with empty categories omitted."""
        try:
            results = {}

            for category in categories:
                snippets = self.retrieve(
                    query=query,
                    top_k=snippets_per_category,
                    category_filter=category
                )
                if snippets:
                    results[category] = snippets

            return results

        except Exception as e:
            logger.error(f"Error retrieving by category: {str(e)}")
            return {}

    def get_all_snippets(self, category=None):
        """Return all snippet texts, optionally restricted to one category."""
        try:
            if category:
                return [s.text for s in self.snippets if s.category == category]
            else:
                return [s.text for s in self.snippets]

        except Exception as e:
            logger.error(f"Error getting snippets: {str(e)}")
            return []

    def get_statistics(self):
        """Summary stats: snippet total, per-category counts, embedding dim."""
        try:
            category_counts = self._count_categories()

            embedding_dim = 0
            if self.snippets:
                embedding_dim = len(self.snippets[0].embedding)

            return {
                "total_snippets": len(self.snippets),
                "categories": category_counts,
                "embedding_dimension": embedding_dim
            }

        except Exception as e:
            logger.error(f"Error getting stats: {str(e)}")
            return {"total_snippets": 0, "categories": {}, "embedding_dimension": 0}
core/embeddings.py ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from openai import OpenAI
2
+ import numpy as np
3
+ from typing import List, Dict, Any, Optional, Tuple
4
+ from loguru import logger
5
+ from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
6
+ from dataclasses import dataclass
7
+ import time
8
+
9
+
10
@dataclass
class SimilarityResult:
    """One hit from a cosine-similarity search."""
    index: int                # position of the candidate in the input list
    similarity: float         # cosine similarity to the query
    text: str                 # candidate text ("" when no texts were supplied)
    metadata: Dict[str, Any]  # candidate metadata ({} when none was supplied)
17
+
18
+
19
class EmbeddingService:
    """
    OpenAI embeddings service.

    Handles single and batched embedding creation (with retry/backoff)
    plus cosine-similarity search over candidate embeddings.
    """

    def __init__(self, api_key, model="text-embedding-3-large"):
        """api_key: OpenAI API key; model: embedding model name."""
        try:
            self.client = OpenAI(api_key=api_key)
            self.model = model
            # output dimensions per the OpenAI embeddings documentation:
            # text-embedding-3-large -> 3072, text-embedding-3-small -> 1536.
            # (The previous values 1536/1024 were wrong for both models.)
            if "large" in model:
                self.embedding_dim = 3072
            else:
                self.embedding_dim = 1536
            self.max_tokens = 8191  # per-input token limit for these models

            logger.info(f"EmbeddingService ready (model={model}, dims={self.embedding_dim})")

        except Exception as e:
            logger.error(f"Failed to init EmbeddingService: {str(e)}")
            raise

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=2, max=10),
        retry=retry_if_exception_type(Exception),
        reraise=True
    )
    def create_embedding(self, text):
        """Create an embedding for a single text (with retry).

        Returns the embedding as a list of floats, or None for empty input.
        Raises on API failure after retries are exhausted.
        """
        try:
            # check if text is valid
            if not text or not text.strip():
                logger.warning("Empty text for embedding")
                return None

            # truncate long text (rough estimate: 4 chars per token)
            if len(text) > 30000:
                text = text[:30000]
                logger.warning("Text truncated to 30k chars")

            start_time = time.time()
            response = self.client.embeddings.create(
                model=self.model,
                input=text,
                encoding_format="float"
            )

            embedding = response.data[0].embedding
            elapsed = time.time() - start_time

            logger.debug(f"Created embedding in {elapsed:.2f}s ({len(text)} chars → {len(embedding)} dims)")

            return embedding

        except Exception as e:
            logger.error(f"Error creating embedding: {str(e)}")
            raise

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=2, max=10),
        retry=retry_if_exception_type(Exception),
        reraise=True
    )
    def create_embeddings_batch(self, texts, batch_size=100):
        """Create embeddings for multiple texts in batches.

        Returns a list aligned with texts; failed batches contribute None
        entries. Returns [] for empty input and None on a top-level error.
        """
        try:
            if not texts:
                logger.warning("Empty text list for batch embedding")
                return []

            logger.info(f"Creating embeddings for {len(texts)} texts (batch_size={batch_size})")

            all_embeddings = []

            # process in chunks
            for i in range(0, len(texts), batch_size):
                batch = texts[i:i + batch_size]

                # clean up batch: truncate long texts, replace empties with
                # a single space so positions stay aligned with the input
                processed_batch = []
                for text in batch:
                    if text and text.strip():
                        if len(text) > 30000:
                            processed_batch.append(text[:30000])
                        else:
                            processed_batch.append(text)
                    else:
                        processed_batch.append(" ")  # fallback for empty

                try:
                    start_time = time.time()
                    response = self.client.embeddings.create(
                        model=self.model,
                        input=processed_batch,
                        encoding_format="float"
                    )

                    batch_embeddings = [data.embedding for data in response.data]
                    all_embeddings.extend(batch_embeddings)

                    elapsed = time.time() - start_time
                    batch_num = i//batch_size + 1
                    logger.debug(f"Batch {batch_num}: {len(batch)} texts in {elapsed:.2f}s")

                except Exception as e:
                    logger.error(f"Error in batch {i//batch_size + 1}: {str(e)}")
                    # keep alignment: add None for every text of the failed batch
                    for _ in range(len(batch)):
                        all_embeddings.append(None)

            # count successful embeddings
            successful = sum(1 for e in all_embeddings if e is not None)

            success_rate = (successful / len(texts)) * 100
            logger.success(f"Created {successful}/{len(texts)} embeddings ({success_rate:.1f}% success)")

            return all_embeddings

        except Exception as e:
            logger.error(f"Error in batch embedding: {str(e)}")
            return None

    def cosine_similarity(self, vec1, vec2):
        """Return the cosine similarity of two vectors (0.0 on error)."""
        try:
            v1 = np.array(vec1)
            v2 = np.array(vec2)

            dot_product = np.dot(v1, v2)
            norm1 = np.linalg.norm(v1)
            norm2 = np.linalg.norm(v2)

            # handle zero vectors
            if norm1 == 0 or norm2 == 0:
                logger.warning("Zero vector in cosine similarity")
                return 0.0

            similarity = dot_product / (norm1 * norm2)

            # clip to the mathematically valid range [-1, 1]; numerical error
            # can push the ratio slightly outside it. (Previously clipped to
            # [0, 1], which silently mapped anti-correlated vectors to 0.)
            similarity = float(np.clip(similarity, -1.0, 1.0))

            return similarity

        except Exception as e:
            logger.error(f"Error calculating cosine similarity: {str(e)}")
            return 0.0

    def find_most_similar(self, query_embedding, candidate_embeddings,
                          candidate_texts=None, candidate_metadata=None,
                          top_k=5, min_similarity=0.0):
        """
        Find the candidates most similar to query_embedding.

        Returns up to top_k SimilarityResult objects with similarity >=
        min_similarity, sorted descending; None candidates are skipped.
        """
        try:
            if not query_embedding or not candidate_embeddings:
                logger.warning("Empty embeddings for similarity search")
                return []

            logger.info(f"Finding top {top_k} from {len(candidate_embeddings)} candidates (min={min_similarity})")

            similarities = []

            for idx, candidate in enumerate(candidate_embeddings):
                # skip None embeddings (failed batch entries)
                if candidate is None:
                    continue

                try:
                    similarity = self.cosine_similarity(query_embedding, candidate)

                    # filter by threshold
                    if similarity >= min_similarity:
                        text = ""
                        if candidate_texts:
                            text = candidate_texts[idx]

                        metadata = {}
                        if candidate_metadata:
                            metadata = candidate_metadata[idx]

                        result = SimilarityResult(
                            index=idx,
                            similarity=similarity,
                            text=text,
                            metadata=metadata
                        )
                        similarities.append(result)

                except Exception as e:
                    logger.warning(f"Error computing similarity for idx {idx}: {str(e)}")
                    continue

            # sort by similarity descending and keep the top k
            similarities.sort(key=lambda x: x.similarity, reverse=True)
            top_results = similarities[:top_k]

            if top_results:
                logger.success(f"Found {len(top_results)} results (top: {top_results[0].similarity:.3f})")
            else:
                logger.warning("No results above threshold")

            return top_results

        except Exception as e:
            logger.error(f"Error in find_most_similar: {str(e)}")
            return []

    def embed_documents(self, texts, metadata=None):
        """Embed multiple documents and annotate metadata with embedding info.

        Returns (embeddings, metadata); both empty lists on failure.
        """
        try:
            logger.info(f"Embedding {len(texts)} documents")

            embeddings = self.create_embeddings_batch(texts)

            if embeddings is None:
                logger.error("Batch embedding failed")
                return [], []

            # create metadata if missing
            if metadata is None:
                metadata = [{} for _ in texts]

            # add embedding info to each metadata dict
            for i in range(len(embeddings)):
                embedding = embeddings[i]
                meta = metadata[i]

                if embedding is not None:
                    meta["embedding_dim"] = len(embedding)
                    meta["has_embedding"] = True
                else:
                    meta["has_embedding"] = False

            return embeddings, metadata

        except Exception as e:
            logger.error(f"Error embedding documents: {str(e)}")
            return [], []

    def get_embedding_stats(self, embeddings):
        """Return counts, success rate and norm statistics for embeddings."""
        try:
            valid_embeddings = [e for e in embeddings if e is not None]

            if not valid_embeddings:
                return {
                    "total": len(embeddings),
                    "valid": 0,
                    "invalid": len(embeddings),
                    "success_rate": 0.0
                }

            # calculate vector norms for all valid embeddings
            norms = [np.linalg.norm(e) for e in valid_embeddings]

            stats = {
                "total": len(embeddings),
                "valid": len(valid_embeddings),
                "invalid": len(embeddings) - len(valid_embeddings),
                "success_rate": len(valid_embeddings) / len(embeddings),
                "dimensions": len(valid_embeddings[0]) if valid_embeddings else 0,
                "mean_norm": float(np.mean(norms)),
                "std_norm": float(np.std(norms))
            }

            logger.info(f"Embedding stats: {stats['valid']}/{stats['total']} valid ({stats['success_rate']*100:.1f}%)")

            return stats

        except Exception as e:
            logger.error(f"Error calculating stats: {str(e)}")
            return {}
core/extractor.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from openai import OpenAI
2
+ import json
3
+ from typing import Dict, Any, List
4
+
5
+
6
class ExtractionService:
    """Extracts document parameters via GPT-4 chat completions."""

    def __init__(self, api_key):
        # Dedicated client per service instance; model pinned for determinism.
        self.client = OpenAI(api_key=api_key)
        self.model = "gpt-4-turbo-preview"

    def extract_parameter(self, parameter_name, parameter_description, context, source_location):
        """Ask the model for a single parameter value from free-text context.

        Returns a dict {"value": ..., "source": ...}; on API failure the
        value is None and the source carries the error message.
        """
        prompt = f"""Extract the following parameter from the document:

PARAMETER: {parameter_name}
DESCRIPTION: {parameter_description}

DOCUMENT CONTEXT:
{context}

INSTRUCTIONS:
1. Extract the exact value (no formatting - remove commas, currency symbols)
2. If not found, return null
3. Be precise - no approximations

Return ONLY a JSON object:
{{
    "value": <number or string or null>,
    "source": "{source_location}"
}}"""

        messages = [
            {
                "role": "system",
                "content": "You are an expert at extracting financial data from documents. Extract exact values without any formatting."
            },
            {
                "role": "user",
                "content": prompt
            }
        ]

        try:
            completion = self.client.chat.completions.create(
                model=self.model,
                messages=messages,
                temperature=0.0,
                response_format={"type": "json_object"}
            )
            return json.loads(completion.choices[0].message.content)

        except Exception as e:
            return {
                "value": None,
                "source": f"Error: {str(e)}"
            }

    def extract_from_table(self, parameter_name, tables):
        """Ask the model for a parameter value using extracted tables.

        Only the first 5 tables and the first 20 rows of each are rendered
        (pipe-separated) to keep the prompt bounded.
        TODO: maybe optimize this for large tables
        """
        # Render at most 5 tables as pipe-separated text.
        pieces = []
        for idx, table in enumerate(tables[:5]):
            pieces.append(f"\n[Table {idx+1} - Page {table['page']}]\n")

            headers = table.get("headers", [])
            if headers:
                pieces.append("".join(str(h) + " | " for h in headers) + "\n")

            # Cap each table at 20 rows; falsy cells render as empty columns.
            for row in table.get("rows", [])[:20]:
                pieces.append("".join((str(c) + " | ") if c else " | " for c in row) + "\n")

        tables_text = "".join(pieces)

        prompt = f"""Find '{parameter_name}' in these tables:

{tables_text}

Return JSON:
{{
    "value": <extracted value without formatting>,
    "source": "<table number and location>"
}}"""

        try:
            completion = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "system",
                        "content": "Extract data from tables. Return exact numbers without formatting."
                    },
                    {
                        "role": "user",
                        "content": prompt
                    }
                ],
                temperature=0.0,
                response_format={"type": "json_object"}
            )
            return json.loads(completion.choices[0].message.content)

        except Exception as e:
            return {
                "value": None,
                "source": f"Error: {str(e)}"
            }
core/rag_pipeline.py ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict, Any, Optional, Tuple
2
+ from dataclasses import dataclass
3
+ import numpy as np
4
+ from loguru import logger
5
+ from openai import OpenAI
6
+
7
+ from core.document_parser import ParsedDocument
8
+ from core.embeddings import EmbeddingService
9
+ from core.vision_parser import VisionDocumentParser, VisionExtractionResult
10
+
11
+
12
@dataclass
class ExtractionResult:
    """Result of extracting one parameter from a parsed document."""
    parameter_id: str         # stable identifier of the parameter
    parameter_name: str       # human-readable parameter name
    value: Any                # extracted value; None when not found
    source: str               # where in the document the value was found
    confidence: float         # extraction confidence, 0.0-1.0
    context_used: str         # snippet of the context given to the model
    metadata: Dict[str, Any]  # extraction details (e.g. method "vision"/"rag")
22
+
23
+
24
class EnhancedRAGPipeline:
    """
    RAG pipeline with GPT-4 Vision support.

    For each parameter the pipeline tries vision-based extraction first
    (when enabled and a PDF path is available) and falls back to
    embedding-based RAG; when both succeed, the higher-confidence result
    wins.
    """

    def __init__(self, embedding_service, openai_client, domain_rag=None,
                 top_k=5, similarity_threshold=0.3, model="gpt-4o-mini",
                 vision_model="gpt-4o", temperature=0.0, use_vision=True):
        """
        Args:
            embedding_service: Service used to embed queries and chunks.
            openai_client: Configured OpenAI client.
            domain_rag: Optional domain-knowledge retriever (may be None).
            top_k: Number of chunks retrieved per query.
            similarity_threshold: Minimum similarity for a chunk to be used.
            model: Chat model for text (RAG) extraction.
            vision_model: Model used by the vision parser.
            temperature: Sampling temperature for extraction calls.
            use_vision: Whether the vision path is attempted at all.
        """
        self.embedding_service = embedding_service
        self.client = openai_client
        self.domain_rag = domain_rag
        self.top_k = top_k
        self.similarity_threshold = similarity_threshold
        self.model = model
        self.temperature = temperature
        self.use_vision = use_vision

        # Vision parser is only constructed when requested.
        if use_vision:
            self.vision_parser = VisionDocumentParser(
                openai_client=openai_client,
                model=vision_model
            )
            logger.success("Vision parser ready")
        else:
            self.vision_parser = None
            logger.info("Vision extraction disabled")

    def extract_parameter_with_vision(self, pdf_path, parameter_id,
                                      parameter_name, parameter_description):
        """
        Extract one parameter with GPT-4 Vision (most accurate method).

        Returns an ExtractionResult, or None when vision is disabled,
        the parameter is not found, or an error occurs.
        """
        if not self.use_vision or not self.vision_parser:
            return None

        try:
            logger.info(f"[VISION] Extracting {parameter_name}...")

            # Coarse type hint helps the vision prompt ask for the right shape.
            param_type = self._infer_parameter_type(parameter_id, parameter_description)

            vision_result = self.vision_parser.extract_parameter_from_pdf(
                pdf_path=pdf_path,
                parameter_name=parameter_name,
                parameter_description=parameter_description,
                parameter_type=param_type,
                search_all_pages=True
            )

            if vision_result:
                # Convert the vision parser's result into our common format.
                result = ExtractionResult(
                    parameter_id=parameter_id,
                    parameter_name=parameter_name,
                    value=vision_result.value,
                    source=vision_result.source,
                    confidence=vision_result.confidence,
                    context_used=vision_result.context,
                    metadata={
                        "method": "vision",
                        "page_number": vision_result.page_number,
                        "model": self.vision_parser.model
                    }
                )

                logger.success(f"[VISION] Found: {result.value} (conf: {result.confidence:.2f})")
                return result
            else:
                logger.warning(f"[VISION] Not found: {parameter_name}")
                return None

        except Exception as e:
            logger.error(f"[VISION] Error: {str(e)}")
            return None

    def _infer_parameter_type(self, parameter_id, description):
        """Guess a coarse type ("boolean"/"number"/"date"/"text") from
        keywords in the parameter id and description."""
        param_lower = parameter_id.lower()
        desc_lower = description.lower()

        # Boolean flags are checked first so e.g. "settlement_flag" is not
        # misclassified by a later numeric keyword.
        boolean_keywords = ["accepted", "flag", "status", "yes/no", "true/false",
                            "settlement", "writeoff", "suit", "default"]
        if any(k in param_lower or k in desc_lower for k in boolean_keywords):
            return "boolean"

        numeric_keywords = ["amount", "count", "number", "dpd", "loans",
                            "threshold", "score", "inquiries"]
        if any(k in param_lower or k in desc_lower for k in numeric_keywords):
            return "number"

        if "date" in param_lower or "date" in desc_lower:
            return "date"

        return "text"

    def prepare_document(self, parsed_doc):
        """
        Embed a parsed document's chunks for traditional RAG (fallback).

        Returns (embeddings ndarray, chunk_texts, chunk_metadata), or
        (None, None, None) on failure.
        """
        try:
            chunk_texts = []
            chunk_metadata = []

            for chunk in parsed_doc.chunks:
                chunk_texts.append(chunk.text)
                chunk_metadata.append({
                    "chunk_id": chunk.chunk_id,
                    "page_num": chunk.page_num,
                    "start_char": chunk.start_char,
                    "end_char": chunk.end_char
                })

            embeddings_list = self.embedding_service.create_embeddings_batch(chunk_texts)

            if embeddings_list is None:
                logger.error("Failed to create embeddings")
                return None, None, None

            # numpy array enables the vectorized dot-product retrieval below.
            embeddings = np.array(embeddings_list)

            return embeddings, chunk_texts, chunk_metadata

        except Exception as e:
            logger.error(f"Error preparing document: {str(e)}")
            return None, None, None

    def extract_parameter_full_pipeline(self, parameter_id, parameter_name,
                                        parameter_description, parsed_doc,
                                        chunk_embeddings, chunk_texts,
                                        chunk_metadata, pdf_path=None):
        """
        Full extraction pipeline: vision first, then RAG as fallback.

        Returns the best ExtractionResult, or None when neither method
        finds the parameter.
        """
        try:
            # BUG FIX: vision_result must be bound even when the vision
            # branch is skipped (no pdf_path / vision disabled); previously
            # the comparison below raised NameError in that case.
            vision_result = None

            # Try vision first (best accuracy).
            if pdf_path and self.use_vision:
                vision_result = self.extract_parameter_with_vision(
                    pdf_path=pdf_path,
                    parameter_id=parameter_id,
                    parameter_name=parameter_name,
                    parameter_description=parameter_description
                )

                # A confident vision hit short-circuits the RAG fallback.
                if vision_result and vision_result.confidence >= 0.7:
                    logger.success(f"[PIPELINE] Using VISION result (conf: {vision_result.confidence:.2f})")
                    return vision_result

            # Traditional RAG fallback.
            logger.info(f"[PIPELINE] Trying traditional RAG for {parameter_name}...")
            rag_result = self._extract_with_rag(
                parameter_id=parameter_id,
                parameter_name=parameter_name,
                parameter_description=parameter_description,
                chunk_embeddings=chunk_embeddings,
                chunk_texts=chunk_texts,
                chunk_metadata=chunk_metadata,
                parsed_doc=parsed_doc
            )

            # If both methods produced a result, keep the more confident one.
            if vision_result and rag_result:
                if vision_result.confidence > rag_result.confidence:
                    logger.info(f"[PIPELINE] Vision wins: {vision_result.confidence:.2f} > {rag_result.confidence:.2f}")
                    return vision_result
                else:
                    logger.info(f"[PIPELINE] RAG wins: {rag_result.confidence:.2f} > {vision_result.confidence:.2f}")
                    return rag_result

            # Otherwise return whichever one worked (possibly None).
            if vision_result:
                return vision_result
            return rag_result

        except Exception as e:
            logger.error(f"Error in extraction pipeline: {str(e)}")
            return None

    def _extract_with_rag(self, parameter_id, parameter_name, parameter_description,
                          chunk_embeddings, chunk_texts, chunk_metadata, parsed_doc):
        """Embedding-retrieval + chat-completion extraction (fallback path).

        Returns an ExtractionResult or None (not found / no relevant
        chunks / any error).
        """
        import json

        try:
            # Embed the query built from name + description.
            query = f"{parameter_name}: {parameter_description}"
            query_embedding = self.embedding_service.create_embedding(query)

            if query_embedding is None:
                return None

            # Rank chunks by dot-product similarity; take the top_k.
            similarities = np.dot(chunk_embeddings, query_embedding)
            top_indices = np.argsort(similarities)[::-1][:self.top_k]

            # Keep only chunks above the similarity threshold.
            relevant_chunks = []
            for idx in top_indices:
                if similarities[idx] >= self.similarity_threshold:
                    relevant_chunks.append({
                        "text": chunk_texts[idx],
                        "similarity": float(similarities[idx]),
                        "metadata": chunk_metadata[idx]
                    })

            if not relevant_chunks:
                return None

            # Optionally augment with domain-knowledge snippets.
            domain_context = ""
            if self.domain_rag:
                domain_snippets = self.domain_rag.retrieve(query, top_k=3)
                if domain_snippets:
                    formatted_snippets = []
                    for s in domain_snippets:
                        # Snippets may be plain strings or objects with .text.
                        if isinstance(s, str):
                            formatted_snippets.append(f"- {s}")
                        elif hasattr(s, 'text'):
                            formatted_snippets.append(f"- {s.text}")
                        else:
                            formatted_snippets.append(f"- {str(s)}")
                    domain_context = "\n".join(formatted_snippets)

            # Assemble the retrieved chunks into a single context string.
            context_parts = []
            for i, c in enumerate(relevant_chunks):
                chunk_text = f"[Chunk {i+1}, Page {c['metadata']['page_num']}, Similarity: {c['similarity']:.2f}]\n{c['text']}"
                context_parts.append(chunk_text)
            context_text = "\n\n".join(context_parts)

            prompt = f"""Extract the following parameter from the document context.

Parameter: {parameter_name}
Description: {parameter_description}

"""
            if domain_context:
                prompt += f"Domain Knowledge:\n{domain_context}\n\n"

            prompt += f"""Document Context:
{context_text}

Extract the value and provide the specific source location (e.g., "Account Summary Table, Row 3" not just the filename).

Return JSON:
{{
    "value": <extracted value or null>,
    "source": "<specific section/table/location>",
    "confidence": <0.0-1.0>
}}"""

            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                temperature=self.temperature,
                max_tokens=300
            )

            result_text = response.choices[0].message.content

            # Strip markdown code fences if the model added them.
            json_text = result_text.strip()
            if "```json" in json_text:
                json_text = json_text.split("```json")[1].split("```")[0].strip()
            elif "```" in json_text:
                json_text = json_text.split("```")[1].split("```")[0].strip()

            data = json.loads(json_text)

            if data.get("value") is not None:
                return ExtractionResult(
                    parameter_id=parameter_id,
                    parameter_name=parameter_name,
                    value=data["value"],
                    source=data.get("source", f"Page {relevant_chunks[0]['metadata']['page_num']}"),
                    confidence=float(data.get("confidence", 0.5)),
                    context_used=context_text[:200],
                    metadata={"method": "rag", "chunks_used": len(relevant_chunks)}
                )

            return None

        except Exception as e:
            logger.error(f"RAG extraction error: {str(e)}")
            return None

    def calculate_overall_confidence(self, results):
        """Overall score: 70% average confidence of successful extractions
        plus 30% success rate; 0.0 when nothing was extracted."""
        if not results:
            return 0.0

        successful = [r for r in results if r.value is not None]
        if not successful:
            return 0.0

        avg_confidence = sum(r.confidence for r in successful) / len(successful)
        success_rate = len(successful) / len(results)

        overall = (avg_confidence * 0.7) + (success_rate * 0.3)
        return round(overall, 2)
core/vision_parser.py ADDED
@@ -0,0 +1,620 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import base64
3
+ import io
4
+ import json # For parsing Vision API responses
5
+ from pathlib import Path
6
+ from typing import Dict, Any, List, Optional, Tuple
7
+ from dataclasses import dataclass
8
+ from loguru import logger
9
+ from PIL import Image
10
+ import fitz # PyMuPDF - Pure Python, no poppler needed!
11
+ from openai import OpenAI
12
+
13
+
14
@dataclass
class VisionExtractionResult:
    """Result from vision-based extraction of a single parameter."""
    parameter_id: str
    parameter_name: str
    value: Any          # extracted value as reported by the model
    source: str         # Specific section/location
    page_number: int    # 1-based page the value was found on
    confidence: float   # model-reported confidence, 0.0-1.0
    context: str        # Surrounding text/context
24
+
25
+
26
+ class VisionDocumentParser:
27
+
28
+ def __init__(self, openai_client: OpenAI, model: str = "gpt-4o"):
29
+
30
+ self.client = openai_client
31
+ self.model = model
32
+ self._image_cache = {} # Cache converted images by PDF path
33
+ logger.info(f"VisionDocumentParser initialized with model: {model}")
34
+
35
+
36
+ def pdf_to_images(self, pdf_path: str, dpi: int = 200) -> List[Image.Image]:
37
+
38
+ try:
39
+ # Check cache first - ONLY OPTIMIZATION!
40
+ cache_key = f"{pdf_path}_{dpi}"
41
+ if cache_key in self._image_cache:
42
+ logger.info(f"✅ Using CACHED images for: {Path(pdf_path).name} (skipping conversion)")
43
+ return self._image_cache[cache_key]
44
+
45
+ logger.info(f"Converting PDF to images: {Path(pdf_path).name} (DPI: {dpi})")
46
+
47
+ # Open PDF with PyMuPDF
48
+ doc = fitz.open(pdf_path)
49
+ images = []
50
+
51
+ # Convert each page to image
52
+ for page_num in range(len(doc)):
53
+ page = doc[page_num]
54
+
55
+ # Calculate zoom factor for DPI
56
+ # 72 DPI is default, so zoom = target_dpi / 72
57
+ zoom = dpi / 72
58
+ mat = fitz.Matrix(zoom, zoom)
59
+
60
+ # Render page to pixmap
61
+ pix = page.get_pixmap(matrix=mat)
62
+
63
+ # Convert pixmap to PIL Image
64
+ img_data = pix.tobytes("png")
65
+ img = Image.open(io.BytesIO(img_data))
66
+
67
+ images.append(img)
68
+
69
+ doc.close()
70
+
71
+ # Cache for reuse - ONLY OPTIMIZATION!
72
+ self._image_cache[cache_key] = images
73
+
74
+ logger.success(f"Converted {len(images)} pages to images (PyMuPDF) - CACHED for reuse ✅")
75
+ return images
76
+
77
+ except Exception as e:
78
+ logger.error(f"Error converting PDF to images: {str(e)}")
79
+ return []
80
+
81
+
82
+ def image_to_base64(self, image: Image.Image) -> str:
83
+
84
+ try:
85
+ buffered = io.BytesIO()
86
+ image.save(buffered, format="PNG")
87
+ img_str = base64.b64encode(buffered.getvalue()).decode()
88
+ return img_str
89
+
90
+ except Exception as e:
91
+ logger.error(f"Error encoding image: {str(e)}")
92
+ return ""
93
+
94
+
95
    def extract_all_parameters_from_page(
        self,
        image: Image.Image,
        page_num: int,
        parameters: List[Dict[str, str]]
    ) -> Dict[str, VisionExtractionResult]:
        """Extract every listed parameter from one page image in a single
        GPT-4 Vision call.

        Args:
            image: Rendered page image.
            page_num: 1-based page number (used for logging and fallback source).
            parameters: Dicts with "id", "name", "description" and optional
                "type" ("boolean" | "number" | "date" | "text").

        Returns:
            Mapping of parameter id -> VisionExtractionResult for the
            parameters the model reported as found; {} on any error.
        """
        try:
            # Build comprehensive prompt for ALL parameters
            param_descriptions = []
            for i, param in enumerate(parameters, 1):
                param_type = param.get('type', 'text')
                # Human-readable hint steering the model toward the right data type.
                type_hint = {
                    'boolean': '(true/false)',
                    'number': '(numeric value)',
                    'date': '(date format)',
                    'text': '(text value)'
                }.get(param_type, '')

                param_descriptions.append(
                    f"{i}. **{param['name']}** {type_hint}: {param['description']}"
                )

            params_text = "\n".join(param_descriptions)

            prompt = f"""Analyze this document page and extract ALL of the following parameters that you can find:

{params_text}

IMPORTANT INSTRUCTIONS:
1. Return a JSON object with ONLY the parameters you found on this page
2. For each parameter found, provide:
   - "value": The actual value (use correct data type: number, boolean, string, or null)
   - "source": SPECIFIC location (e.g., "Account Summary Table - Settlement column, Row 2")
   - "confidence": Your confidence level (0.0 to 1.0)
   - "context": Brief surrounding text for verification

3. Skip parameters not visible on this page (don't include them in response)
4. Be precise with sources - include table names, section headers, row/column identifiers
5. For booleans, return true/false, NOT "yes"/"no" or 1/0

Return ONLY valid JSON, no markdown formatting:
{{
    "parameter_id_1": {{
        "found": true,
        "value": <actual_value>,
        "source": "Specific location with details",
        "confidence": 0.95,
        "context": "Surrounding text..."
    }},
    "parameter_id_2": {{
        "found": true,
        "value": <actual_value>,
        "source": "Another specific location",
        "confidence": 0.90,
        "context": "More context..."
    }}
}}

Parameter IDs to use: {', '.join([p['id'] for p in parameters])}"""

            # Convert image to base64
            buffered = io.BytesIO()
            image.save(buffered, format="PNG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode()

            # Single API call for ALL parameters!
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{img_base64}"
                                }
                            },
                            {
                                "type": "text",
                                "text": prompt
                            }
                        ]
                    }
                ],
                max_tokens=2000,
                temperature=0.0
            )

            # Parse response
            content = response.choices[0].message.content.strip()

            # Remove markdown if present (the model sometimes wraps JSON in fences)
            if content.startswith("```json"):
                content = content[7:]
            if content.startswith("```"):
                content = content[3:]
            if content.endswith("```"):
                content = content[:-3]
            content = content.strip()

            # Parse JSON
            results_dict = json.loads(content)

            # Create mapping of param_id to param_name for lookup
            param_name_map = {p['id']: p['name'] for p in parameters}

            # Convert to VisionExtractionResult objects; only entries the
            # model marked "found" are kept.
            extraction_results = {}
            for param_id, result_data in results_dict.items():
                if result_data.get('found', False):
                    extraction_results[param_id] = VisionExtractionResult(
                        parameter_id=param_id,
                        parameter_name=param_name_map.get(param_id, param_id),  # Get name from map
                        value=result_data.get('value'),
                        source=result_data.get('source', f'Page {page_num}'),
                        page_number=page_num,
                        confidence=result_data.get('confidence', 0.7),
                        context=result_data.get('context', '')
                    )

            logger.success(
                f"Page {page_num}: Found {len(extraction_results)}/{len(parameters)} parameters "
                f"in ONE call ⚡"
            )

            return extraction_results

        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse JSON from page {page_num}: {str(e)}")
            return {}
        except Exception as e:
            logger.error(f"Error extracting from page {page_num}: {str(e)}")
            return {}
230
+
231
+ def extract_all_parameters_batch(
232
+ self,
233
+ pdf_path: str,
234
+ parameters: List[Dict[str, str]]
235
+ ) -> Dict[str, VisionExtractionResult]:
236
+
237
+ try:
238
+ logger.info(
239
+ f"⚡ BATCH EXTRACTION: Processing {len(parameters)} parameters "
240
+ f"from {Path(pdf_path).name}"
241
+ )
242
+
243
+ # Convert PDF to images (uses cache!)
244
+ images = self.pdf_to_images(pdf_path, dpi=200)
245
+ if not images:
246
+ logger.error("Failed to convert PDF to images")
247
+ return {}
248
+
249
+ # Store best result for each parameter
250
+ best_results = {}
251
+
252
+ # Process each page once, extracting ALL parameters
253
+ for page_num, image in enumerate(images, start=1):
254
+ logger.info(f"⚡ Page {page_num}/{len(images)}: Extracting ALL parameters...")
255
+
256
+ # Extract all parameters from this page in ONE call!
257
+ page_results = self.extract_all_parameters_from_page(
258
+ image=image,
259
+ page_num=page_num,
260
+ parameters=parameters
261
+ )
262
+
263
+ # Update best results (keep highest confidence for each parameter)
264
+ for param_id, result in page_results.items():
265
+ if param_id not in best_results:
266
+ best_results[param_id] = result
267
+ logger.info(f" ✓ {param_id}: {result.value} (conf: {result.confidence})")
268
+ elif result.confidence > best_results[param_id].confidence:
269
+ logger.info(
270
+ f" ↑ {param_id}: {result.value} (conf: {result.confidence}) "
271
+ f"[better than {best_results[param_id].confidence}]"
272
+ )
273
+ best_results[param_id] = result
274
+
275
+ found_count = len(best_results)
276
+ logger.success(
277
+ f"⚡ BATCH COMPLETE: Found {found_count}/{len(parameters)} parameters "
278
+ f"in {len(images)} API calls (vs {len(parameters) * len(images)} with old method!)"
279
+ )
280
+
281
+ return best_results
282
+
283
+ except Exception as e:
284
+ logger.error(f"Error in batch extraction: {str(e)}")
285
+ return {}
286
+
287
+ def extract_parameter_from_page(
288
+ self,
289
+ image: Image.Image,
290
+ page_num: int,
291
+ parameter_name: str,
292
+ parameter_description: str,
293
+ parameter_type: str = "text"
294
+ ) -> Optional[VisionExtractionResult]:
295
+
296
+ try:
297
+ # Convert image to base64
298
+ img_base64 = self.image_to_base64(image)
299
+ if not img_base64:
300
+ return None
301
+
302
+ # Build prompt based on parameter type
303
+ prompt = self._build_extraction_prompt(
304
+ parameter_name,
305
+ parameter_description,
306
+ parameter_type
307
+ )
308
+
309
+ # Call GPT-4 Vision
310
+ response = self.client.chat.completions.create(
311
+ model=self.model,
312
+ messages=[
313
+ {
314
+ "role": "user",
315
+ "content": [
316
+ {
317
+ "type": "text",
318
+ "text": prompt
319
+ },
320
+ {
321
+ "type": "image_url",
322
+ "image_url": {
323
+ "url": f"data:image/png;base64,{img_base64}",
324
+ "detail": "high"
325
+ }
326
+ }
327
+ ]
328
+ }
329
+ ],
330
+ max_tokens=500,
331
+ temperature=0.0 # Deterministic for data extraction
332
+ )
333
+
334
+ # Parse response
335
+ result_text = response.choices[0].message.content
336
+
337
+ # Parse structured response
338
+ return self._parse_vision_response(
339
+ result_text,
340
+ parameter_name,
341
+ page_num
342
+ )
343
+
344
+ except Exception as e:
345
+ logger.error(f"Error extracting {parameter_name} from page {page_num}: {str(e)}")
346
+ return None
347
+
348
+
349
    def _build_extraction_prompt(
        self,
        parameter_name: str,
        parameter_description: str,
        parameter_type: str
    ) -> str:
        """Build the single-parameter extraction prompt for GPT-4 Vision.

        Args:
            parameter_name: Display name of the parameter to find.
            parameter_description: What the parameter means / where it may appear.
            parameter_type: Expected type hint ("boolean"/"number"/"date"/"text").

        Returns:
            Prompt text instructing the model to reply with a single JSON
            object ({found, value, source, confidence, context}).
        """

        prompt = f"""You are analyzing a financial document (Bureau Credit Report or GST Return).

**TASK:** Extract the following parameter from this document page.

**Parameter Name:** {parameter_name}
**Description:** {parameter_description}
**Expected Type:** {parameter_type}

**INSTRUCTIONS:**
1. Look for this parameter in the document
2. If found, extract the exact value
3. Note the specific section/location where you found it (e.g., "Account Summary Table, Row 3" or "DPD History Section")
4. Provide surrounding context (nearby text)

**OUTPUT FORMAT (JSON):**
{{
    "found": true/false,
    "value": <extracted value or null>,
    "source": "<specific section/table/location>",
    "confidence": <0.0-1.0>,
    "context": "<surrounding text for verification>"
}}

**EXAMPLES:**

For "DPD 30 Days" in a credit report:
{{
    "found": true,
    "value": 2,
    "source": "Payment History Table - DPD 30 Days column",
    "confidence": 0.95,
    "context": "DPD History: 0-30 days: 2 occurrences"
}}

For "Settlement/Write-off" flag:
{{
    "found": true,
    "value": false,
    "source": "Account Status Summary - Settlement Status field",
    "confidence": 0.90,
    "context": "Settlement Status: Not Applicable, Write-off Status: No"
}}

If parameter not found on this page:
{{
    "found": false,
    "value": null,
    "source": "Not found on this page",
    "confidence": 0.0,
    "context": ""
}}

**CRITICAL RULES:**
- Be precise with locations (section names, table names, row/column)
- Extract EXACT values, don't interpret
- For boolean parameters, return true/false
- For numeric parameters, return numbers (not strings)
- If unsure, set confidence < 0.7
- Return ONLY valid JSON, no other text

Now analyze the document image and extract the parameter:"""

        return prompt
420
+
421
+
422
+ def _parse_vision_response(
423
+ self,
424
+ response_text: str,
425
+ parameter_id: str,
426
+ page_num: int
427
+ ) -> Optional[VisionExtractionResult]:
428
+ """Parse GPT-4 Vision response into structured result"""
429
+ try:
430
+ import json
431
+
432
+ # Extract JSON from response (handle markdown code blocks)
433
+ json_text = response_text.strip()
434
+ if "```json" in json_text:
435
+ json_text = json_text.split("```json")[1].split("```")[0].strip()
436
+ elif "```" in json_text:
437
+ json_text = json_text.split("```")[1].split("```")[0].strip()
438
+
439
+ # Parse JSON
440
+ data = json.loads(json_text)
441
+
442
+ # Check if found
443
+ if not data.get("found", False):
444
+ return None
445
+
446
+ # Build result
447
+ result = VisionExtractionResult(
448
+ parameter_id=parameter_id,
449
+ parameter_name=parameter_id.replace("_", " ").title(),
450
+ value=data.get("value"),
451
+ source=data.get("source", "Unknown location"),
452
+ page_number=page_num,
453
+ confidence=float(data.get("confidence", 0.5)),
454
+ context=data.get("context", "")
455
+ )
456
+
457
+ return result
458
+
459
+ except Exception as e:
460
+ logger.error(f"Error parsing vision response: {str(e)}")
461
+ logger.debug(f"Response text: {response_text}")
462
+ return None
463
+
464
+
465
+ def extract_parameter_from_pdf(
466
+ self,
467
+ pdf_path: str,
468
+ parameter_name: str,
469
+ parameter_description: str,
470
+ parameter_type: str = "text",
471
+ search_all_pages: bool = True # Search all pages for best accuracy
472
+ ) -> Optional[VisionExtractionResult]:
473
+
474
+ try:
475
+ logger.info(f"Extracting '{parameter_name}' from {Path(pdf_path).name}")
476
+
477
+ # Convert PDF to images (uses cache if already converted! - ONLY OPTIMIZATION)
478
+ images = self.pdf_to_images(pdf_path, dpi=200)
479
+ if not images:
480
+ logger.error("Failed to convert PDF to images")
481
+ return None
482
+
483
+ # Search pages
484
+ results = []
485
+
486
+ for page_num, image in enumerate(images, start=1):
487
+ logger.info(f"Searching page {page_num}/{len(images)}...")
488
+
489
+ result = self.extract_parameter_from_page(
490
+ image=image,
491
+ page_num=page_num,
492
+ parameter_name=parameter_name,
493
+ parameter_description=parameter_description,
494
+ parameter_type=parameter_type
495
+ )
496
+
497
+ if result and result.value is not None:
498
+ logger.success(f"Found on page {page_num}: {result.value} (confidence: {result.confidence})")
499
+ results.append(result)
500
+
501
+ # Stop if we found a good match and not searching all pages
502
+ if not search_all_pages and result.confidence > 0.7:
503
+ break
504
+
505
+ # Return best result
506
+ if results:
507
+ best_result = max(results, key=lambda r: r.confidence)
508
+ logger.success(f"Best match: page {best_result.page_number}, confidence {best_result.confidence}")
509
+ return best_result
510
+ else:
511
+ logger.warning(f"Parameter '{parameter_name}' not found in document")
512
+ return None
513
+
514
+ except Exception as e:
515
+ logger.error(f"Error extracting parameter from PDF: {str(e)}")
516
+ return None
517
+
518
+
519
+ def extract_gst_sales_with_vision(
520
+ self,
521
+ pdf_path: str
522
+ ) -> Optional[Dict[str, Any]]:
523
+
524
+ try:
525
+ logger.info(f"Extracting GST sales from {Path(pdf_path).name}")
526
+
527
+ # Convert PDF to images
528
+ images = self.pdf_to_images(pdf_path)
529
+ if not images:
530
+ return None
531
+
532
+ # Prompt for GST sales
533
+ prompt = """You are analyzing a GSTR-3B (GST Return) document.
534
+
535
+ **TASK:** Extract the total taxable sales value from Table 3.1(a).
536
+
537
+ **WHAT TO LOOK FOR:**
538
+ - Table 3.1(a): "Details of Outward Supplies and inward supplies liable to reverse charge"
539
+ - Look for "Taxable value" or "Total Taxable value"
540
+ - This is usually in the first row of Table 3.1
541
+
542
+ **OUTPUT FORMAT (JSON):**
543
+ {{
544
+ "found": true/false,
545
+ "month": "<month and year, e.g., January 2025>",
546
+ "sales": <numeric value>,
547
+ "source": "GSTR-3B Table 3.1(a)",
548
+ "confidence": <0.0-1.0>
549
+ }}
550
+
551
+ **EXAMPLE:**
552
+ {{
553
+ "found": true,
554
+ "month": "January 2025",
555
+ "sales": 951381,
556
+ "source": "GSTR-3B Table 3.1(a) - Taxable outward supplies",
557
+ "confidence": 0.95
558
+ }}
559
+
560
+ Return ONLY valid JSON, no other text."""
561
+
562
+ # Try each page
563
+ for page_num, image in enumerate(images, start=1):
564
+ try:
565
+ img_base64 = self.image_to_base64(image)
566
+
567
+ response = self.client.chat.completions.create(
568
+ model=self.model,
569
+ messages=[
570
+ {
571
+ "role": "user",
572
+ "content": [
573
+ {"type": "text", "text": prompt},
574
+ {
575
+ "type": "image_url",
576
+ "image_url": {
577
+ "url": f"data:image/png;base64,{img_base64}",
578
+ "detail": "high"
579
+ }
580
+ }
581
+ ]
582
+ }
583
+ ],
584
+ max_tokens=300,
585
+ temperature=0.0
586
+ )
587
+
588
+ result_text = response.choices[0].message.content
589
+
590
+ # Parse JSON
591
+ import json
592
+ json_text = result_text.strip()
593
+ if "```json" in json_text:
594
+ json_text = json_text.split("```json")[1].split("```")[0].strip()
595
+ elif "```" in json_text:
596
+ json_text = json_text.split("```")[1].split("```")[0].strip()
597
+
598
+ data = json.loads(json_text)
599
+
600
+ if data.get("found") and data.get("sales"):
601
+ logger.success(f"Found GST sales on page {page_num}: {data['sales']}")
602
+ return {
603
+ "month": data.get("month", "Unknown"),
604
+ "sales": data["sales"],
605
+ "source": data.get("source", "GSTR-3B Table 3.1(a)")
606
+ }
607
+
608
+ except Exception as e:
609
+ logger.debug(f"Page {page_num} - no sales data: {str(e)}")
610
+ continue
611
+
612
+ logger.warning("GST sales not found in document")
613
+ return None
614
+
615
+ except Exception as e:
616
+ logger.error(f"Error extracting GST sales: {str(e)}")
617
+ return None
618
+
619
+
620
+