# File size: 12,729 Bytes
# 95ff1e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344


import pdfplumber
import re
from pathlib import Path
from typing import Dict, Any, Optional, List, Tuple
from dataclasses import dataclass
from loguru import logger


@dataclass
class DocumentChunk:
    """One chunk of document text, as produced by ``DocumentParser._create_chunks``."""
    # identifier of the form "chunk_<n>", numbered in creation order
    chunk_id: str
    # chunk text with leading/trailing whitespace stripped
    text: str
    # page number; _create_chunks does not track pages and always stores 0
    page_num: int
    # character offsets of the chunk as computed by _create_chunks
    start_char: int
    end_char: int
    # extra info; _create_chunks stores "source_file" and "chunk_length" here
    metadata: Dict[str, Any]


@dataclass
class ParsedDocument:
    """Everything ``DocumentParser.parse_pdf`` extracted from one PDF file."""
    # base name of the parsed file (no directory part)
    file_name: str
    # number of pages reported by the PDF, including pages that were skipped
    total_pages: int
    # text of all parsed pages joined with a blank line between pages
    text_content: str
    # one dict per parsed page: page_num, text, text_length, tables_count
    # (plus width and height when the page was read without error)
    pages: List[Dict[str, Any]]
    # one dict per extracted table: page, table_id, headers, rows, raw_data
    tables: List[Dict[str, Any]]
    # overlapping text chunks built from text_content
    chunks: List[DocumentChunk]
    # summary: file_path, file_name, total_pages, total_tables, total_chunks,
    # text_length
    metadata: Dict[str, Any]


class DocumentParser:
    # PDF parser with chunking for RAG

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200) -> None:
        """Store the chunking configuration and log it.

        Args:
            chunk_size: Character budget ``_create_chunks`` compares against
                when deciding whether to start a new chunk.
            chunk_overlap: Number of trailing characters of a finished chunk
                that are carried over into the next one.
        """
        self.chunk_size, self.chunk_overlap = chunk_size, chunk_overlap
        logger.info(f"Parser initialized - chunk_size={chunk_size}, overlap={chunk_overlap}")

    def parse_pdf(self, pdf_path):
        """
        Open a PDF with pdfplumber and collect its text, tables and chunks.

        Pages are handled one at a time through ``_parse_page``; a page that
        raises is logged and skipped. The page texts are joined with a blank
        line and passed to ``_create_chunks``.

        Args:
            pdf_path: Location of the PDF file to read.

        Returns:
            A ``ParsedDocument`` on success, or ``None`` when the file does not
            exist or any other error occurs while parsing.
        """
        source_name = Path(pdf_path).name
        logger.info(f"Parsing: {source_name}")

        try:
            with pdfplumber.open(pdf_path) as pdf:
                page_texts, page_records, table_records = [], [], []

                for page_num, page in enumerate(pdf.pages, start=1):
                    try:
                        parsed = self._parse_page(page, page_num)

                        page_texts.append(parsed["text"])
                        page_records.append(parsed["page_data"])
                        table_records.extend(parsed["tables"])

                        char_count = len(parsed["text"])
                        table_count = len(parsed["tables"])
                        logger.debug(f"Page {page_num}: {char_count} chars, {table_count} tables")
                    except Exception as e:
                        # best effort: one bad page must not abort the document
                        logger.error(f"Error on page {page_num}: {str(e)}")

                full_text = "\n\n".join(page_texts)

                # chunks are what later gets embedded
                chunks = self._create_chunks(full_text, source_name)
                page_count = len(pdf.pages)

                summary = {
                    "file_path": pdf_path,
                    "file_name": source_name,
                    "total_pages": page_count,
                    "total_tables": len(table_records),
                    "total_chunks": len(chunks),
                    "text_length": len(full_text)
                }

                document = ParsedDocument(
                    file_name=source_name,
                    total_pages=page_count,
                    text_content=full_text,
                    pages=page_records,
                    tables=table_records,
                    chunks=chunks,
                    metadata=summary
                )

                logger.success(f"Parsed {page_count} pages, {len(table_records)} tables, {len(chunks)} chunks")
                return document

        except FileNotFoundError:
            logger.error(f"File not found: {pdf_path}")
        except Exception as e:
            logger.error(f"Failed to parse {pdf_path}: {str(e)}")

        # both failure paths end here
        return None

    def _parse_page(self, page, page_num):
        """parse single page"""
        try:
            # grab text
            page_text = page.extract_text()
            if page_text is None:
                page_text = ""

            # extract tables
            tables = []
            raw_tables = page.extract_tables()

            for table_idx, table in enumerate(raw_tables):
                if table and len(table) > 0:
                    try:
                        table_data = {
                            "page": page_num,
                            "table_id": f"p{page_num}_t{table_idx + 1}",
                            "headers": table[0] if table else [],
                            "rows": table[1:] if len(table) > 1 else [],
                            "raw_data": table
                        }
                        tables.append(table_data)
                    except Exception as e:
                        logger.warning(f"Table {table_idx} error on page {page_num}: {str(e)}")

            page_data = {
                "page_num": page_num,
                "text": page_text,
                "text_length": len(page_text),
                "tables_count": len(tables),
                "width": page.width,
                "height": page.height
            }

            return {
                "text": page_text,
                "tables": tables,
                "page_data": page_data
            }

        except Exception as e:
            logger.error(f"_parse_page error for page {page_num}: {str(e)}")
            return {
                "text": "",
                "tables": [],
                "page_data": {
                    "page_num": page_num,
                    "text": "",
                    "text_length": 0,
                    "tables_count": 0
                }
            }

    def _create_chunks(self, text, file_name):
        """
        Break text into paragraph-aligned chunks with character overlap.

        The text is split on blank lines. Stripped, non-empty paragraphs are
        accumulated until adding the next one would push the current chunk past
        ``self.chunk_size``; the chunk is then emitted and the next chunk is
        seeded with its last ``self.chunk_overlap`` characters. A zero or
        negative ``chunk_overlap`` means no carry-over at all.

        Notes:
            * A single paragraph longer than ``chunk_size`` is never split, so
              a chunk can exceed ``chunk_size``.
            * ``start_char`` / ``end_char`` are offsets into the normalised
              text (stripped paragraphs joined by a blank line), which can
              differ from offsets into ``text`` itself.
            * ``page_num`` is always 0; pages are not tracked here.

        TODO: maybe improve the chunking logic later

        Args:
            text: Full document text.
            file_name: Stored in each chunk's metadata as ``source_file``.

        Returns:
            List of ``DocumentChunk``; empty when ``text`` is empty or when an
            error occurs.
        """
        try:
            chunks = []

            if not text:
                logger.warning("Empty text for chunking")
                return chunks

            separator = "\n\n"

            def make_chunk(index, body, start):
                # single place that turns an accumulated buffer into a chunk
                return DocumentChunk(
                    chunk_id=f"chunk_{index}",
                    text=body.strip(),
                    page_num=0,  # not tracking page num for now
                    start_char=start,
                    end_char=start + len(body),
                    metadata={
                        "source_file": file_name,
                        "chunk_length": len(body)
                    }
                )

            current_chunk = ""
            current_start = 0

            for para in text.split(separator):
                para = para.strip()
                if not para:
                    continue

                # check if adding para exceeds size
                if current_chunk and len(current_chunk) + len(para) > self.chunk_size:
                    chunks.append(make_chunk(len(chunks), current_chunk, current_start))

                    if self.chunk_overlap <= 0:
                        # s[-0:] is the WHOLE string, so a non-positive overlap
                        # must not be used as a slice bound: start clean instead
                        current_start += len(current_chunk) + len(separator)
                        current_chunk = para
                    else:
                        # yields the whole chunk when it is shorter than the overlap
                        overlap_text = current_chunk[-self.chunk_overlap:]
                        current_start += len(current_chunk) - len(overlap_text)
                        current_chunk = overlap_text + separator + para
                elif current_chunk:
                    current_chunk += separator + para
                else:
                    current_chunk = para

            # add final chunk
            if current_chunk:
                chunks.append(make_chunk(len(chunks), current_chunk, current_start))

            logger.info(f"Created {len(chunks)} chunks")
            return chunks

        except Exception as e:
            logger.error(f"Chunking error: {str(e)}")
            return []

    def extract_bureau_score(self, parsed_doc):
        """
        Grab the CIBIL score from a CRIF report.

        First looks for a pattern like "PERFORM CONSUMER 2.2 300-900 627" in
        the full text. If that yields no score in the 300-900 range, scans the
        first two pages for standalone three-digit numbers in that range and
        accepts the first one that has a score-related keyword within 100
        characters of the number's position.

        Args:
            parsed_doc: Object with ``text_content`` (str) and ``pages`` (list
                of dicts with ``text`` and ``page_num``).

        Returns:
            dict with ``value`` (int) and ``source`` (str), or ``None`` when no
            score is found or an error occurs.
        """
        try:
            text = parsed_doc.text_content

            # main pattern - score after range
            pattern = r'PERFORM\s+CONSUMER.*?300-900\s+(\d{3})'
            match = re.search(pattern, text, re.IGNORECASE)

            if match:
                score = int(match.group(1))
                if 300 <= score <= 900:
                    logger.info(f"Found bureau score: {score}")
                    return {
                        "value": score,
                        "source": "CRIF Report – Score Section"
                    }

            keywords = ('score', 'cibil', 'credit', 'bureau')

            # fallback - check first couple pages
            for page in parsed_doc.pages[:2]:
                page_text = page["text"]

                for candidate in re.finditer(r'\b(\d{3})\b', page_text):
                    num = int(candidate.group(1))
                    if not 300 <= num <= 900:
                        continue

                    # Use the position of THIS match; searching for the digit
                    # string again could land inside a longer number or on an
                    # earlier, unrelated occurrence.
                    idx = candidate.start()
                    context = page_text[max(0, idx - 100):idx + 100].lower()

                    # check if its actually a score
                    if any(kw in context for kw in keywords):
                        logger.info(f"Found score (fallback): {num}")
                        return {
                            "value": num,
                            "source": f"CRIF Report – Page {page['page_num']}"
                        }

            logger.warning("Bureau score not found")
            return None

        except Exception as e:
            logger.error(f"Error extracting bureau score: {str(e)}")
            return None

    def extract_gst_sales(self, parsed_doc):
        """
        Extract the outward-supplies sales figure from a GSTR-3B document.

        The month is the word following "Period" in the text ("Unknown" when
        absent). The year comes from a ``_MMYYYY.pdf`` suffix in the file name,
        otherwise from "Year NNNN" in the text, otherwise "2025". Table rows
        are then scanned for a first cell containing both "(a)" and
        "Outward taxable supplies"; the second cell of the first such row that
        parses as a number is returned.

        Args:
            parsed_doc: Object with ``text_content``, ``file_name`` and
                ``tables`` (list of dicts with a ``rows`` list).

        Returns:
            dict with ``month``, ``sales`` (float) and ``source``, or ``None``
            when no sales row is found or an error occurs.
        """
        try:
            doc_text = parsed_doc.text_content
            doc_name = parsed_doc.file_name

            # month comes from the document body
            period = re.search(r'Period\s+(\w+)', doc_text)
            month_name = "Unknown" if period is None else period.group(1)

            # year: file name first (GSTR3B_..._012025.pdf format), then text
            from_name = re.search(r'_(\d{2})(\d{4})\.pdf', doc_name)
            if from_name is not None:
                year = from_name.group(2)
            else:
                from_text = re.search(r'Year\s+(\d{4})', doc_text)
                year = "2025" if from_text is None else from_text.group(1)

            formatted_month = f"{month_name} {year}"

            for table in parsed_doc.tables:
                for row in table.get("rows", []):
                    if not row or len(row) <= 1:
                        continue

                    label = str(row[0]).replace('\n', ' ')
                    if "(a)" not in label or "Outward taxable supplies" not in label:
                        continue

                    if not row[1]:
                        continue

                    # keep digits and dots only
                    digits = re.sub(r'[^\d.]', '', str(row[1]))
                    if not digits:
                        continue

                    try:
                        sales = float(digits)
                        logger.info(f"GST sales: {sales} for {formatted_month}")
                        return {
                            "month": formatted_month,
                            "sales": sales,
                            "source": "GSTR-3B Table 3.1(a)"
                        }
                    except ValueError as e:
                        # unparsable value: keep scanning the remaining rows
                        logger.warning(f"Couldn't parse sales value '{digits}': {str(e)}")

            logger.warning(f"Sales data not found for {formatted_month}")
            return None

        except Exception as e:
            logger.error(f"Error extracting GST sales: {str(e)}")
            return None

    def get_chunks_text(self, chunks):
        """get text from chunks for embedding"""
        try:
            return [chunk.text for chunk in chunks]
        except Exception as e:
            logger.error(f"Error getting chunks text: {str(e)}")
            return []