knowflow-ai-rag-document-chatbot / src /document_loader.py
Babu Pallam
Add document loading and text cleaning modules
c37cfba
Raw
History Blame Contribute Delete
4.46 kB
# ============================================================
# FILE: src/document_loader.py
# ============================================================
# PURPOSE:
# Load documents from the local knowledge base folder.
#
# SUPPORTED FILE TYPES:
# - .txt
# - .md
# - .csv
# - .pdf
#
# In production, document loading becomes an ingestion pipeline.
# You may need:
# - file validation
# - file size limits
# - malware scanning
# - OCR for scanned PDFs
# - metadata extraction
# - document versioning
# - access control rules
# ============================================================
from dataclasses import dataclass
from pathlib import Path
from typing import List
import pandas as pd
"""
Why use the @dataclass decorator?
- Cleaner syntax for simple data containers.
- Automatic generation of __init__, __repr__, and other methods.
- Ideal for the Document class, which is just a structured way to hold data.
"""
@dataclass
class Document:
    """
    A single document after it has been loaded and converted to text.

    Attributes:
        source: Path of the originating file, kept so answers can be
            attributed to the file they came from.
        text: Plain text extracted from the file.
        file_type: Extension of the originating file.
        character_count: Size of the text in characters; handy when
            debugging or monitoring ingestion.
    """

    source: str
    text: str
    file_type: str
    character_count: int
def read_text_file(path: Path) -> str:
    """
    Read a plain text file (.txt, .md) and return its contents.

    The file is decoded as UTF-8. "utf-8-sig" is used instead of "utf-8"
    so that a leading byte-order mark (written by some Windows editors)
    is dropped rather than kept as an invisible U+FEFF character at the
    start of the text; files without a BOM decode exactly as before.

    errors="ignore" prevents a full crash if the file contains bytes
    that are not valid UTF-8; those bytes are silently dropped.

    Args:
        path: Location of the file to read.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    return path.read_text(encoding="utf-8-sig", errors="ignore")
def read_csv_file(path: Path) -> str:
    """
    Read a CSV file and convert each row into one line of readable text.

    Why convert CSV to text?
    RAG retrieval works on text chunks. A row must become text before
    it can be embedded and retrieved.

    Every cell is read as a string exactly as it appears in the file
    (dtype=str, keep_default_na=False). Letting pandas infer types would
    rewrite values before they are embedded: "007" would become 7, "3"
    in a column with decimals would become 3.0, and blank or "NA" cells
    would show up as "nan".

    Output format, one line per data row (numbered from 1):
        Row 1: <column>: <value> | <column>: <value> | ...

    Args:
        path: Location of the CSV file.

    Returns:
        The rows joined with newlines; an empty string when the file has
        a header but no data rows.

    Raises:
        Exception: Whatever pandas.read_csv raises for an unreadable or
            malformed file.
    """
    df = pd.read_csv(path, dtype=str, keep_default_na=False)

    lines = []
    for row_number, (_, row) in enumerate(df.iterrows(), start=1):
        cells = " | ".join(
            f"{column_name}: {value}" for column_name, value in row.items()
        )
        lines.append(f"Row {row_number}: {cells}")
    return "\n".join(lines)
def read_pdf_file(path: Path) -> str:
    """
    Pull the text out of a PDF, page by page.

    Each page contributes a "--- Page N ---" marker (N counted from 1)
    followed by whatever text pypdf extracts for that page; a page with
    no extractable text contributes only its marker.

    Important limitation:
    pypdf works for text-based PDFs and may return nothing for scanned
    image PDFs. Production options for scanned PDFs include Tesseract
    OCR, AWS Textract, Azure Document Intelligence and Google
    Document AI.

    Args:
        path: Location of the PDF file.

    Returns:
        The per-page sections joined with newlines.

    Raises:
        ImportError: If pypdf is not installed.
    """
    # Imported lazily so the module stays usable without pypdf when no
    # PDF is ever loaded.
    try:
        from pypdf import PdfReader
    except ImportError as missing_dependency:
        raise ImportError(
            "pypdf is not installed. Run: pip install pypdf"
        ) from missing_dependency

    pdf = PdfReader(str(path))
    return "\n".join(
        f"\n--- Page {number} ---\n{page.extract_text() or ''}"
        for number, page in enumerate(pdf.pages, start=1)
    )
def load_single_document(path: Path, project_root: Path) -> Document:
    """
    Load one supported document and return a Document object.

    This function keeps file-type-specific logic in one place: the
    lower-cased file extension selects the reader, the extracted text is
    stripped of surrounding whitespace, and the result is wrapped
    together with its source metadata.

    Args:
        path: File to load (.txt, .md, .csv or .pdf).
        project_root: Directory that `source` is made relative to.

    Returns:
        A Document whose `source` is `path` relative to `project_root`,
        or `path` as given when it does not lie under `project_root`.

    Raises:
        ValueError: If the file extension is not supported.
        Exception: Whatever the selected reader raises for an unreadable
            file.
    """
    extension = path.suffix.lower()
    if extension in {".txt", ".md"}:
        text = read_text_file(path)
    elif extension == ".csv":
        text = read_csv_file(path)
    elif extension == ".pdf":
        text = read_pdf_file(path)
    else:
        raise ValueError(f"Unsupported file type: {extension}")

    try:
        source = str(path.relative_to(project_root))
    except ValueError:
        # relative_to() raises when `path` is not located under
        # `project_root` (for example a knowledge base kept outside the
        # project). Keep the document and attribute it to the path as
        # given instead of discarding text that was already extracted.
        source = str(path)

    text = text.strip()
    return Document(
        source=source,
        text=text,
        file_type=extension,
        character_count=len(text),
    )
def load_documents(folder: Path, project_root: Path) -> List[Document]:
    """
    Walk a folder recursively and load every supported document in it.

    Files are visited in sorted path order. Anything that is not a
    regular file, or whose extension is not .txt, .md, .csv or .pdf, is
    skipped. A file that fails to load is reported on stdout and
    skipped, so one bad file does not abort the whole run. Documents
    whose text is empty are left out of the result.

    AI ENGINEER PRODUCTION TIP:
    Always keep source metadata. Without source metadata, your app
    cannot explain where an answer came from.

    Args:
        folder: Directory to search.
        project_root: Passed through to load_single_document, which uses
            it to build each document's source path.

    Returns:
        The loaded, non-empty documents.
    """
    allowed_suffixes = {".txt", ".md", ".csv", ".pdf"}
    loaded = []

    for candidate in sorted(folder.rglob("*")):
        if not (candidate.is_file() and candidate.suffix.lower() in allowed_suffixes):
            continue

        try:
            doc = load_single_document(path=candidate, project_root=project_root)
        except Exception as failure:
            # Best effort: report the problem and carry on with the
            # remaining files.
            print(f"Could not load file: {candidate}")
            print(f"Reason: {failure}")
        else:
            if doc.text:
                loaded.append(doc)

    return loaded