Spaces:

babupallam
/

knowflow-ai-rag-document-chatbot

Sleeping

File size: 4,464 Bytes

c37cfba

# ============================================================
# FILE: src/document_loader.py
# ============================================================
# PURPOSE:
# Load documents from the local knowledge base folder.
#
# SUPPORTED FILE TYPES:
# - .txt
# - .md
# - .csv
# - .pdf
#
# In production, document loading becomes an ingestion pipeline.
# You may need:
# - file validation
# - file size limits
# - malware scanning
# - OCR for scanned PDFs
# - metadata extraction
# - document versioning
# - access control rules
# ============================================================

from dataclasses import dataclass
from pathlib import Path
from typing import List

import pandas as pd

"""
Why dataclass decoration?
- Cleaner syntax for simple data containers.
- Automatic generation of __init__, __repr__, and other methods.
- Ideal for the Document class, which is just a structured way to hold data.
"""

@dataclass
class Document:
    """
    Container for a single document after it has been loaded.

    Attributes:
        source: File path relative to the project root, kept so answers
            can be attributed to the file they came from.
        text: Plain text extracted from the file.
        file_type: The file's extension (for example ".txt" or ".pdf").
        character_count: Length of ``text``; handy for debugging and
            monitoring.
    """

    source: str
    text: str
    file_type: str
    character_count: int


def read_text_file(path: Path) -> str:
    """
    Read a plain text file (.txt / .md) and return its contents.

    Args:
        path: Location of the file to read.

    Returns:
        The decoded file contents.

    Notes:
        - The "utf-8-sig" codec decodes ordinary UTF-8 and additionally
          drops a leading byte-order mark (BOM) when one is present.
          With plain "utf-8" the BOM would survive as an invisible
          U+FEFF character at the start of the text; str.strip() does
          not remove it, so it would end up inside the document text.
        - errors='ignore' prevents a full crash if the file contains
          bytes that are not valid UTF-8; those bytes are dropped.
    """
    return path.read_text(encoding="utf-8-sig", errors="ignore")


def read_csv_file(path: Path) -> str:
    """
    Read a CSV file and convert each row into one line of readable text.

    Why convert CSV to text?
    RAG retrieval works on text chunks. A row must become text before
    it can be embedded and retrieved.

    Output format (one line per data row, numbered from 1):
        Row 1: <column>: <value> | <column>: <value> | ...

    Args:
        path: Location of the CSV file.

    Returns:
        All rows joined with newlines; an empty string when the file
        has a header but no data rows.
    """

    df = pd.read_csv(path)
    column_names = list(df.columns)
    lines = []

    # itertuples() is used instead of iterrows() because iterrows()
    # builds one Series per row, which upcasts mixed int/float rows to
    # float (an integer id of 1 would be rendered as "1.0").
    # itertuples() keeps every value in its own column's type.
    rows = df.itertuples(index=False, name=None)

    for row_number, values in enumerate(rows, start=1):
        row_text = " | ".join(
            f"{column_name}: {value}"
            for column_name, value in zip(column_names, values)
        )
        lines.append(f"Row {row_number}: {row_text}")

    return "\n".join(lines)


def read_pdf_file(path: Path) -> str:
    """
    Pull the text out of a PDF, one labelled section per page.

    Each page is emitted as a "--- Page N ---" marker line followed by
    whatever text was extracted for that page (an empty string when
    extraction yields nothing).

    Known limitation:
    pypdf handles text-based PDFs. Scanned, image-only PDFs may yield
    no text; those need an OCR step, for example:
    - Tesseract OCR
    - AWS Textract
    - Azure Document Intelligence
    - Google Document AI

    Raises:
        ImportError: when the pypdf package is not installed.
    """

    # Imported lazily so pypdf is only required when a PDF is actually read.
    try:
        from pypdf import PdfReader
    except ImportError as error:
        raise ImportError("pypdf is not installed. Run: pip install pypdf") from error

    pdf = PdfReader(str(path))

    page_sections = [
        f"\n--- Page {number} ---\n{pdf_page.extract_text() or ''}"
        for number, pdf_page in enumerate(pdf.pages, start=1)
    ]

    return "\n".join(page_sections)


def load_single_document(path: Path, project_root: Path) -> Document:
    """
    Read one supported file and wrap its text in a Document.

    All file-type-specific branching lives here: the (lower-cased)
    extension selects which reader is used.

    Args:
        path: File to load.
        project_root: Base directory; the Document's source is the
            file's path relative to it.

    Returns:
        A Document whose text has leading/trailing whitespace stripped.

    Raises:
        ValueError: when the extension is not one of the supported
            types, or when path is not located under project_root.
    """

    # Extension -> reader function.
    readers = {
        ".txt": read_text_file,
        ".md": read_text_file,
        ".csv": read_csv_file,
        ".pdf": read_pdf_file,
    }

    suffix = path.suffix.lower()
    reader = readers.get(suffix)

    if reader is None:
        raise ValueError(f"Unsupported file type: {suffix}")

    cleaned_text = reader(path).strip()
    relative_source = str(path.relative_to(project_root))

    return Document(
        source=relative_source,
        text=cleaned_text,
        file_type=suffix,
        character_count=len(cleaned_text),
    )


def load_documents(folder: Path, project_root: Path) -> List[Document]:
    """
    Recursively load every supported document found under a folder.

    Files are visited in sorted path order. Anything that is not a
    regular file, or whose extension is not supported, is skipped.
    A file that fails to load is reported on stdout and skipped, so
    one bad file does not stop the rest from loading. Documents whose
    extracted text is empty are left out of the result.

    Args:
        folder: Directory to scan (including sub-directories).
        project_root: Base directory used to build each Document's
            relative source path.

    Returns:
        List[Document]

    AI ENGINEER PRODUCTION TIP:
    Always keep source metadata. Without source metadata, your app
    cannot explain where an answer came from.
    """

    allowed_suffixes = {".txt", ".md", ".csv", ".pdf"}
    loaded = []

    for candidate in sorted(folder.rglob("*")):
        is_supported_file = (
            candidate.is_file()
            and candidate.suffix.lower() in allowed_suffixes
        )
        if not is_supported_file:
            continue

        # Best-effort boundary: report the failure and move on to the next file.
        try:
            loaded_document = load_single_document(
                path=candidate,
                project_root=project_root,
            )

            if loaded_document.text:
                loaded.append(loaded_document)

        except Exception as error:
            print(f"Could not load file: {candidate}")
            print(f"Reason: {error}")

    return loaded