Spaces:

Abdul2000
/

Ragbase_Studio

Sleeping

App Files Files Community

Ragbase_Studio / src /document_loader.py

Abdul2000

Rename document_loader.py to src/document_loader.py

6558fe5 verified 19 days ago

Raw

History Blame Contribute Delete

5.3 kB

	"""
	document_loader.py
	------------------
	Handles loading and extracting text from different file types.

	Supported formats:
	- .txt (plain text)
	- .pdf (PDF documents)
	- .csv (comma-separated values)
	- .docx (Microsoft Word documents)

	Each loader returns a list of LangChain Document objects.
	A Document has two fields:
	- page_content : the extracted text
	- metadata : a dict with extra info such as the source file path and, where available, the page, row, or paragraph number
	"""

	import os
	from langchain_core.documents import Document

	# ── helpers ──────────────────────────────────────────────────────────────────

	def _make_doc(text: str, source: str) -> Document:
	    """Build a LangChain Document holding *text*, tagged with its *source*."""
	    metadata = {"source": source}
	    return Document(page_content=text, metadata=metadata)


	# ── per-format loaders ────────────────────────────────────────────────────────

	def load_txt(file_path: str) -> list[Document]:
	    """
	    Load a plain-text file and return it as a single Document.

	    The file is decoded as UTF-8. A leading byte-order mark is stripped and
	    undecodable bytes are dropped instead of raising.

	    Returns an empty list when the file contains no non-whitespace text,
	    in line with the PDF and DOCX loaders, which also skip blank content.
	    """
	    # "utf-8-sig" decodes exactly like "utf-8" but also consumes a leading
	    # BOM, so it does not end up as a stray "\ufeff" at the start of the text.
	    with open(file_path, "r", encoding="utf-8-sig", errors="ignore") as f:
	        text = f.read()

	    # A blank file carries nothing to index; report it as "no documents".
	    if not text.strip():
	        return []
	    return [_make_doc(text, file_path)]


	def load_pdf(file_path: str) -> list[Document]:
	    """
	    Load a PDF file page-by-page.

	    Each page with extractable text becomes its own Document whose metadata
	    holds the source path and the 1-based page number, so the exact page can
	    be cited later. Pages without text are skipped.

	    Requires: pypdf

	    Raises
	    ------
	    ImportError – if pypdf is not installed.
	    """
	    try:
	        from pypdf import PdfReader
	    except ImportError as exc:
	        # Chain the original error so the underlying import failure stays visible.
	        raise ImportError(
	            "pypdf is required for PDF support. Run: pip install pypdf"
	        ) from exc

	    reader = PdfReader(file_path)
	    documents = []
	    # Number pages from 1; skipped blank pages leave gaps, so the stored
	    # number still matches the page's position in the PDF.
	    for page_num, page in enumerate(reader.pages, start=1):
	        text = page.extract_text() or ""  # fall back to "" on a falsy result
	        if not text.strip():
	            continue  # skip blank pages
	        documents.append(
	            Document(
	                page_content=text,
	                metadata={"source": file_path, "page": page_num},
	            )
	        )
	    return documents


	def load_csv(file_path: str) -> list[Document]:
	    """
	    Load a CSV file.

	    Each data row is rendered as newline-separated 'column: value' pairs and
	    stored as its own Document so every row is individually searchable.
	    The metadata holds the source path and the 1-based position of the row
	    among the data rows (the header line is not counted).

	    Requires: pandas

	    Raises
	    ------
	    ImportError – if pandas is not installed.
	    """
	    try:
	        import pandas as pd
	    except ImportError as exc:
	        # Chain the original error so the underlying import failure stays visible.
	        raise ImportError(
	            "pandas is required for CSV support. Run: pip install pandas"
	        ) from exc

	    df = pd.read_csv(file_path)
	    columns = list(df.columns)
	    documents = []
	    # itertuples keeps each column's own dtype. iterrows builds one Series per
	    # row and coerces it to a single dtype, which turns integer cells into
	    # floats ("1" -> "1.0") whenever the row also contains float columns.
	    rows = df.itertuples(index=False, name=None)
	    for row_num, values in enumerate(rows, start=1):
	        # Build a human-readable string from each row
	        row_text = "\n".join(
	            f"{col}: {val}" for col, val in zip(columns, values)
	        )
	        documents.append(
	            Document(
	                page_content=row_text,
	                metadata={"source": file_path, "row": row_num},
	            )
	        )
	    return documents


	def load_docx(file_path: str) -> list[Document]:
	    """
	    Load a Microsoft Word (.docx) file.

	    Each non-empty paragraph becomes its own Document. The metadata holds the
	    source path and the paragraph's 1-based position among all paragraphs
	    returned by python-docx (empty ones are counted but not emitted).

	    Requires: python-docx

	    Raises
	    ------
	    ImportError – if python-docx is not installed.
	    """
	    try:
	        from docx import Document as WordDocument
	    except ImportError as exc:
	        # Chain the original error so the underlying import failure stays visible.
	        raise ImportError(
	            "python-docx is required for DOCX support. Run: pip install python-docx"
	        ) from exc

	    word_doc = WordDocument(file_path)
	    documents = []
	    for para_num, para in enumerate(word_doc.paragraphs, start=1):
	        text = para.text.strip()
	        if not text:
	            continue  # skip empty paragraphs
	        documents.append(
	            Document(
	                page_content=text,
	                metadata={"source": file_path, "paragraph": para_num},
	            )
	        )
	    return documents


	# ── main entry point ──────────────────────────────────────────────────────────

	def load_document(file_path: str) -> list[Document]:
	    """
	    Pick a loader from the file extension and run it on the file.

	    The extension is compared case-insensitively against the supported
	    formats (.txt, .pdf, .csv, .docx). On success a short status line is
	    printed to stdout.

	    Parameters
	    ----------
	    file_path : str
	        Path to the file on disk.

	    Returns
	    -------
	    list[Document]
	        The non-empty list of Documents produced by the selected loader.

	    Raises
	    ------
	    FileNotFoundError – if nothing exists at file_path.
	    ValueError – if the extension is not supported, or the loader
	        returned no documents.
	    Any exception raised by the selected loader propagates unchanged.
	    """
	    # Guard: the path must exist before anything else is checked.
	    if not os.path.exists(file_path):
	        raise FileNotFoundError(f"File not found: {file_path}")

	    # Dispatch table: lower-cased extension -> loader function.
	    loaders = {
	        ".txt": load_txt,
	        ".pdf": load_pdf,
	        ".csv": load_csv,
	        ".docx": load_docx,
	    }

	    _, raw_extension = os.path.splitext(file_path)
	    extension = raw_extension.lower()

	    loader = loaders.get(extension)
	    if loader is None:
	        raise ValueError(
	            f"Unsupported file type: '{extension}'. "
	            f"Supported types: {', '.join(loaders.keys())}"
	        )

	    documents = loader(file_path)

	    # An empty result means the file parsed but held no usable text.
	    if not documents:
	        raise ValueError(f"No readable text found in: {file_path}")

	    print(f" OK: Loaded {len(documents)} chunk(s) from '{os.path.basename(file_path)}'")
	    return documents