# Source: Hugging Face Space
#   datasciencesage/Document-Intelligence-Bureau-GST-Data-Extraction
# Space status at capture: Sleeping
# File size: 12,729 bytes
# Revision: 95ff1e1



import pdfplumber
import re
from pathlib import Path
from typing import Dict, Any, Optional, List, Tuple
from dataclasses import dataclass
from loguru import logger


@dataclass
class DocumentChunk:
    """One chunk of document text, as produced by DocumentParser._create_chunks."""
    # identifier of the form "chunk_<n>", numbered in creation order
    chunk_id: str
    # chunk text with leading/trailing whitespace stripped
    text: str
    # page the chunk belongs to; the chunker currently always stores 0
    page_num: int
    # character offsets of the (unstripped) chunk within the chunker's running text
    start_char: int
    end_char: int
    # extra info; the chunker stores "source_file" and "chunk_length" here
    metadata: Dict[str, Any]


@dataclass
class ParsedDocument:
    """Everything DocumentParser.parse_pdf extracts from one PDF file."""
    # base name of the parsed file (no directory part)
    file_name: str
    # number of pages in the PDF, including pages that failed to parse
    total_pages: int
    # text of all parsed pages joined with blank lines
    text_content: str
    # one summary dict per parsed page ("page_num", "text", "text_length", "tables_count", ...)
    pages: List[Dict[str, Any]]
    # one dict per extracted table ("page", "table_id", "headers", "rows", "raw_data")
    tables: List[Dict[str, Any]]
    # chunks created from text_content
    chunks: List[DocumentChunk]
    # summary values: "file_path", "file_name", "total_pages", "total_tables",
    # "total_chunks", "text_length"
    metadata: Dict[str, Any]


class DocumentParser:
    # PDF parser with chunking for RAG

    def __init__(self, chunk_size=1000, chunk_overlap=200):
        """
        Store the chunking parameters and log them.

        Args:
            chunk_size: character budget a chunk may reach before it is flushed.
            chunk_overlap: number of trailing characters of a flushed chunk that
                are repeated at the start of the next chunk.
        """
        self.chunk_size, self.chunk_overlap = chunk_size, chunk_overlap
        logger.info(f"Parser initialized - chunk_size={chunk_size}, overlap={chunk_overlap}")

    def parse_pdf(self, pdf_path):
        """
        Read a PDF with pdfplumber and bundle its content into a ParsedDocument.

        Every page goes through `_parse_page`; the page texts are joined with
        blank lines and the joined text is split by `_create_chunks`. A page
        that raises is logged and left out of the result.

        Args:
            pdf_path: location of the PDF; handed to `pdfplumber.open` and to
                `Path` (for the file name).

        Returns:
            A ParsedDocument, or None when the file does not exist or any other
            error occurs while opening/processing it (the error is logged).
        """
        file_name = Path(pdf_path).name
        logger.info(f"Parsing: {file_name}")

        try:
            with pdfplumber.open(pdf_path) as pdf:
                page_texts, page_records, table_records = [], [], []

                for page_num, page in enumerate(pdf.pages, start=1):
                    try:
                        parsed = self._parse_page(page, page_num)

                        page_texts.append(parsed["text"])
                        page_records.append(parsed["page_data"])
                        table_records.extend(parsed["tables"])

                        logger.debug(f"Page {page_num}: {len(parsed['text'])} chars, {len(parsed['tables'])} tables")
                    except Exception as e:
                        # a broken page is skipped so the rest of the document still parses
                        logger.error(f"Error on page {page_num}: {str(e)}")

                full_text = "\n\n".join(page_texts)

                # chunks are what later gets embedded
                chunks = self._create_chunks(full_text, file_name)
                page_count = len(pdf.pages)

                summary = {
                    "file_path": pdf_path,
                    "file_name": file_name,
                    "total_pages": page_count,
                    "total_tables": len(table_records),
                    "total_chunks": len(chunks),
                    "text_length": len(full_text)
                }

                document = ParsedDocument(
                    file_name=file_name,
                    total_pages=page_count,
                    text_content=full_text,
                    pages=page_records,
                    tables=table_records,
                    chunks=chunks,
                    metadata=summary
                )

                logger.success(f"Parsed {page_count} pages, {len(table_records)} tables, {len(chunks)} chunks")
                return document

        except FileNotFoundError:
            logger.error(f"File not found: {pdf_path}")
            return None
        except Exception as e:
            logger.error(f"Failed to parse {pdf_path}: {str(e)}")
            return None

    def _parse_page(self, page, page_num):
        """parse single page"""
        try:
            # grab text
            page_text = page.extract_text()
            if page_text is None:
                page_text = ""

            # extract tables
            tables = []
            raw_tables = page.extract_tables()

            for table_idx, table in enumerate(raw_tables):
                if table and len(table) > 0:
                    try:
                        table_data = {
                            "page": page_num,
                            "table_id": f"p{page_num}_t{table_idx + 1}",
                            "headers": table[0] if table else [],
                            "rows": table[1:] if len(table) > 1 else [],
                            "raw_data": table
                        }
                        tables.append(table_data)
                    except Exception as e:
                        logger.warning(f"Table {table_idx} error on page {page_num}: {str(e)}")

            page_data = {
                "page_num": page_num,
                "text": page_text,
                "text_length": len(page_text),
                "tables_count": len(tables),
                "width": page.width,
                "height": page.height
            }

            return {
                "text": page_text,
                "tables": tables,
                "page_data": page_data
            }

        except Exception as e:
            logger.error(f"_parse_page error for page {page_num}: {str(e)}")
            return {
                "text": "",
                "tables": [],
                "page_data": {
                    "page_num": page_num,
                    "text": "",
                    "text_length": 0,
                    "tables_count": 0
                }
            }

    def _create_chunks(self, text, file_name):
        """
        Break text into paragraph-aligned chunks with character overlap.

        Paragraphs (split on blank lines) are stripped, empty ones are dropped,
        and the rest are accumulated until adding the next paragraph would push
        the buffer past `self.chunk_size`. The buffer is then emitted as a
        chunk and the next buffer starts with the last `self.chunk_overlap`
        characters of the emitted one. A zero or negative overlap means no
        carry-over at all. A single paragraph is never split, so a chunk can
        exceed `chunk_size`.

        Args:
            text: full document text.
            file_name: stored in every chunk's metadata as "source_file".

        Returns:
            List of DocumentChunk; empty for empty text or when an error occurs
            (the error is logged). `start_char`/`end_char` are offsets of the
            unstripped buffer within the normalised text (stripped paragraphs
            re-joined with blank lines), not within `text` itself. `page_num`
            is always 0.

        TODO: maybe improve the chunking logic later
        """
        try:
            chunks = []

            if not text:
                logger.warning("Empty text for chunking")
                return chunks

            separator = "\n\n"
            # s[-0:] is the whole string, so "no overlap" needs its own branch
            overlap_size = max(self.chunk_overlap, 0)

            def emit(buffer, start):
                # append one chunk built from the current buffer
                chunks.append(DocumentChunk(
                    chunk_id=f"chunk_{len(chunks)}",
                    text=buffer.strip(),
                    page_num=0,  # not tracking page num for now
                    start_char=start,
                    end_char=start + len(buffer),
                    metadata={
                        "source_file": file_name,
                        "chunk_length": len(buffer)
                    }
                ))

            current_chunk = ""
            current_start = 0

            for para in text.split(separator):
                para = para.strip()
                if not para:
                    continue

                if not current_chunk:
                    current_chunk = para
                    continue

                if len(current_chunk) + len(para) <= self.chunk_size:
                    current_chunk += separator + para
                    continue

                # buffer is full: flush it and seed the next one
                emit(current_chunk, current_start)
                chunk_end = current_start + len(current_chunk)

                overlap_text = current_chunk[-overlap_size:] if overlap_size else ""
                if overlap_text:
                    current_start = chunk_end - len(overlap_text)
                    current_chunk = overlap_text + separator + para
                else:
                    # next chunk begins right after the paragraph separator
                    current_start = chunk_end + len(separator)
                    current_chunk = para

            # whatever is left becomes the final chunk
            if current_chunk:
                emit(current_chunk, current_start)

            logger.info(f"Created {len(chunks)} chunks")
            return chunks

        except Exception as e:
            logger.error(f"Chunking error: {str(e)}")
            return []

    def extract_bureau_score(self, parsed_doc):
        """
        Grab the bureau score from a CRIF report.

        First looks for a line like "PERFORM CONSUMER 2.2 300-900 627" in the
        full text and takes the three digits after the range. If that fails,
        scans the first two pages for a standalone three-digit number between
        300 and 900 that has one of the keywords "score", "cibil", "credit" or
        "bureau" within 100 characters. Numbers that are bounds of a printed
        range (such as the "300" and "900" of "300-900") are ignored in the
        fallback.

        Args:
            parsed_doc: object with `text_content` (str) and `pages` (list of
                dicts carrying "text" and "page_num"), e.g. a ParsedDocument.

        Returns:
            {"value": int, "source": str} when a score is found, otherwise
            None (also on any error, which is logged).
        """
        try:
            text = parsed_doc.text_content

            # main pattern - score after range
            pattern = r'PERFORM\s+CONSUMER.*?300-900\s+(\d{3})'
            match = re.search(pattern, text, re.IGNORECASE)

            if match:
                score = int(match.group(1))
                if 300 <= score <= 900:
                    logger.info(f"Found bureau score: {score}")
                    return {
                        "value": score,
                        "source": "CRIF Report – Score Section"
                    }

            # fallback - check first couple pages
            keywords = ('score', 'cibil', 'credit', 'bureau')

            for page in parsed_doc.pages[:2]:
                page_text = page["text"]

                # finditer gives the position of the matched token itself;
                # str.find would return the first substring hit, which may sit
                # inside a longer number somewhere else on the page
                for candidate in re.finditer(r'\b(\d{3})\b', page_text):
                    num = int(candidate.group(1))
                    if not 300 <= num <= 900:
                        continue

                    start, end = candidate.span()

                    # skip the bounds of a printed range such as "300-900"
                    before = page_text[max(0, start - 4):start]
                    after = page_text[end:end + 4]
                    if re.search(r'\d\s?[-–]\s?$', before) or re.match(r'\s?[-–]\s?\d', after):
                        continue

                    # check if its actually a score
                    context = page_text[max(0, start - 100):start + 100].lower()
                    if any(kw in context for kw in keywords):
                        logger.info(f"Found score (fallback): {num}")
                        return {
                            "value": num,
                            "source": f"CRIF Report – Page {page['page_num']}"
                        }

            logger.warning("Bureau score not found")
            return None

        except Exception as e:
            logger.error(f"Error extracting bureau score: {str(e)}")
            return None

    def extract_gst_sales(self, parsed_doc, default_year="2025"):
        """
        Extract the outward taxable supplies value from a GSTR-3B document.

        The month name is taken from the word after "Period" in the text. The
        year is taken from the file name (pattern "_<2 digits><4 digits>.pdf",
        e.g. GSTR3B_..._012025.pdf; the extension is matched
        case-insensitively), then from "Year <yyyy>" in the text, and finally
        from `default_year`. The sales value is the second cell of the first
        table row whose first cell contains both "(a)" and
        "Outward taxable supplies" and whose value parses as a number.

        Args:
            parsed_doc: object with `text_content`, `file_name` and `tables`
                (list of dicts with a "rows" list), e.g. a ParsedDocument.
            default_year: year string used when neither the file name nor the
                text provides one.

        Returns:
            {"month": "<Month> <year>", "sales": float, "source": str} when the
            row is found, otherwise None (also on any error, which is logged).
            A value written with a leading minus sign is returned as negative.
        """
        try:
            text = parsed_doc.text_content
            filename = parsed_doc.file_name

            # get month from document
            month_match = re.search(r'Period\s+(\w+)', text)
            month_name = month_match.group(1) if month_match else "Unknown"

            # extract year from filename (GSTR3B_..._012025.pdf format)
            filename_year_match = re.search(r'_(\d{2})(\d{4})\.pdf', filename, re.IGNORECASE)
            if filename_year_match:
                year = filename_year_match.group(2)
            else:
                # fallback
                year_match = re.search(r'Year\s+(\d{4})', text)
                year = year_match.group(1) if year_match else default_year

            formatted_month = f"{month_name} {year}"

            # search tables for sales
            for table in parsed_doc.tables:
                for row in table.get("rows", []) or []:
                    if not row or len(row) < 2:
                        continue

                    first_cell = str(row[0]).replace('\n', ' ')

                    # find row (a) with outward supplies
                    if "(a)" not in first_cell or "Outward taxable supplies" not in first_cell:
                        continue
                    if not row[1]:
                        continue

                    value_str = str(row[1]).strip()
                    # drop currency symbols, thousands separators, spaces, ...
                    clean_value = re.sub(r'[^\d.]', '', value_str)
                    if not clean_value:
                        continue

                    try:
                        sales = float(clean_value)
                    except ValueError as e:
                        logger.warning(f"Couldn't parse sales value '{clean_value}': {str(e)}")
                        continue

                    # the cleanup above removes the sign, so restore it here
                    if value_str.startswith('-') and sales:
                        sales = -sales

                    logger.info(f"GST sales: {sales} for {formatted_month}")
                    return {
                        "month": formatted_month,
                        "sales": sales,
                        "source": "GSTR-3B Table 3.1(a)"
                    }

            logger.warning(f"Sales data not found for {formatted_month}")
            return None

        except Exception as e:
            logger.error(f"Error extracting GST sales: {str(e)}")
            return None

    def get_chunks_text(self, chunks):
        """get text from chunks for embedding"""
        try:
            return [chunk.text for chunk in chunks]
        except Exception as e:
            logger.error(f"Error getting chunks text: {str(e)}")
            return []