Spaces:

babupallam
/

knowflow-ai-rag-document-chatbot

Sleeping

knowflow-ai-rag-document-chatbot / src /document_loader.py

Babu Pallam

Add document loading and text cleaning modules

c37cfba about 1 month ago

4.46 kB

	# ============================================================
	# FILE: src/document_loader.py
	# ============================================================
	# PURPOSE:
	# Load documents from the local knowledge base folder.
	#
	# SUPPORTED FILE TYPES:
	# - .txt
	# - .md
	# - .csv
	# - .pdf
	#
	# In production, document loading becomes an ingestion pipeline.
	# You may need:
	# - file validation
	# - file size limits
	# - malware scanning
	# - OCR for scanned PDFs
	# - metadata extraction
	# - document versioning
	# - access control rules
	# ============================================================

	from dataclasses import dataclass
	from pathlib import Path
	from typing import List

	import pandas as pd

	"""
	Why use the @dataclass decorator?
	- Cleaner syntax for simple data containers.
	- Automatic generation of __init__, __repr__, and other methods.
	- Ideal for the Document class, which is just a structured way to hold data.
	"""

	@dataclass
	class Document:
	    """
	    One document that has been loaded from disk.

	    Attributes:
	        source: File path relative to the project root, stored as a
	            string and used for source attribution.
	        text: Plain text extracted from the file.
	        file_type: Extension of the original file.
	        character_count: Length of the text; handy for debugging and
	            monitoring.
	    """

	    source: str
	    text: str
	    file_type: str
	    character_count: int


	def read_text_file(path: Path) -> str:
	    """
	    Read a plain text file and return its contents as a string.

	    Decoding details:
	    - "utf-8-sig" decodes ordinary UTF-8 and additionally drops a leading
	      byte-order mark (U+FEFF) when the file starts with one. With plain
	      "utf-8" the mark would stay at the start of the text, and
	      str.strip() does not remove it.
	    - errors="ignore" prevents a full crash if the file contains bytes
	      that are not valid UTF-8; those bytes are silently dropped.

	    Args:
	        path: Location of the file to read.

	    Returns:
	        The decoded file contents.
	    """
	    return path.read_text(encoding="utf-8-sig", errors="ignore")


	def read_csv_file(path: Path) -> str:
	    """
	    Read a CSV file and convert each row into one line of readable text.

	    Why convert CSV to text?
	    RAG retrieval works on text chunks. A row must become text before
	    it can be embedded and retrieved.

	    Output format, one line per data row:
	        Row 1: column_a: value | column_b: value

	    Args:
	        path: Location of the CSV file.

	    Returns:
	        All rows joined with newlines. A file that has a header but no
	        data rows yields an empty string.

	    Errors raised by pd.read_csv are not caught here and propagate to
	    the caller.
	    """
	    df = pd.read_csv(path)
	    lines = []

	    # enumerate() supplies the 1-based row number directly, so the label
	    # does not depend on the index values being integers.
	    for row_number, (_, row) in enumerate(df.iterrows(), start=1):
	        row_parts = [f"{column_name}: {value}" for column_name, value in row.items()]

	        # Plain " | " separator: writing "\|" is an invalid escape sequence
	        # and would put a literal backslash into every row.
	        lines.append(f"Row {row_number}: " + " | ".join(row_parts))

	    return "\n".join(lines)


	def read_pdf_file(path: Path) -> str:
	    """
	    Pull the text out of a PDF, one page at a time.

	    Every page contributes a "--- Page N ---" marker line (N starts at 1)
	    followed by that page's extracted text. A page for which
	    extract_text() returns nothing still gets its marker, with empty
	    text after it. The page sections are joined with newlines.

	    Important limitation:
	    pypdf works for text-based PDFs.
	    It may not work for scanned image PDFs.

	    Production options for scanned PDFs:
	    - Tesseract OCR
	    - AWS Textract
	    - Azure Document Intelligence
	    - Google Document AI

	    Raises:
	        ImportError: if pypdf is not installed.
	    """
	    # Imported at call time, so pypdf is only required when a PDF is
	    # actually read.
	    try:
	        from pypdf import PdfReader
	    except ImportError as error:
	        raise ImportError("pypdf is not installed. Run: pip install pypdf") from error

	    pdf = PdfReader(str(path))

	    # Fall back to "" when a page yields no text.
	    page_texts = (page.extract_text() or "" for page in pdf.pages)

	    return "\n".join(
	        f"\n--- Page {number} ---\n{body}"
	        for number, body in enumerate(page_texts, start=1)
	    )


	def load_single_document(path: Path, project_root: Path) -> Document:
	    """
	    Turn one supported file into a Document.

	    The reader is chosen from the lower-cased file extension, so all
	    file-type-specific logic stays in this one function. The extracted
	    text is stripped of leading and trailing whitespace before it is
	    stored, and character_count is the length of that stripped text.

	    Args:
	        path: File to load.
	        project_root: Base directory; the Document's source is the path
	            relative to it.

	    Returns:
	        A Document describing the loaded file.

	    Raises:
	        ValueError: if the extension is not .txt, .md, .csv or .pdf.
	            Path.relative_to also raises ValueError when path does not
	            lie under project_root.
	    """
	    suffix = path.suffix.lower()

	    # Dispatch table: extension -> reader function.
	    readers = {
	        ".txt": read_text_file,
	        ".md": read_text_file,
	        ".csv": read_csv_file,
	        ".pdf": read_pdf_file,
	    }

	    reader = readers.get(suffix)
	    if reader is None:
	        raise ValueError(f"Unsupported file type: {suffix}")

	    cleaned_text = reader(path).strip()
	    relative_source = str(path.relative_to(project_root))

	    return Document(
	        source=relative_source,
	        text=cleaned_text,
	        file_type=suffix,
	        character_count=len(cleaned_text),
	    )


	def load_documents(folder: Path, project_root: Path) -> List[Document]:
	    """
	    Collect every supported document found under a folder.

	    The folder is walked recursively and files are visited in sorted
	    path order. Only .txt, .md, .csv and .pdf files (extension compared
	    case-insensitively) are loaded. A file that fails to load is
	    reported on standard output and skipped; a file whose text is empty
	    is skipped without a message.

	    Args:
	        folder: Directory to search.
	        project_root: Passed through to load_single_document.

	    Returns:
	        List[Document]

	    AI ENGINEER PRODUCTION TIP:
	    Always keep source metadata. Without source metadata, your app
	    cannot explain where an answer came from.
	    """
	    allowed_suffixes = (".txt", ".md", ".csv", ".pdf")

	    # Lazy generator: each entry is checked right before it is loaded,
	    # so filtering and loading stay interleaved.
	    candidates = (
	        entry
	        for entry in sorted(folder.rglob("*"))
	        if entry.is_file() and entry.suffix.lower() in allowed_suffixes
	    )

	    loaded = []

	    for candidate in candidates:
	        # Best-effort loading: one bad file must not abort the whole run.
	        try:
	            loaded_doc = load_single_document(path=candidate, project_root=project_root)

	            if loaded_doc.text:
	                loaded.append(loaded_doc)

	        except Exception as error:
	            print(f"Could not load file: {candidate}")
	            print(f"Reason: {error}")

	    return loaded