Spaces:

Harshdhsvguyt
/

policy_rag_assistant

Sleeping

App Files Files Community

policy_rag_assistant / src /loader.py

Harshdhsvguyt

Update src/loader.py

bb76352 verified about 1 month ago

raw

history blame contribute delete

3.83 kB

	import os
	from pathlib import Path
	from typing import List, Dict
	import PyPDF2


	# ---------------------------------------------------------
	# Main Loader
	# ---------------------------------------------------------
	def load_documents(directory: str = "data/policies") -> List[Dict]:
	    """
	    Load every supported document found directly inside ``directory``.

	    Only regular files at the top level are considered (no recursion).
	    ``.pdf`` files are read with ``load_pdf``; ``.txt`` and ``.md`` files
	    are read with ``load_text``. Files with any other suffix are reported
	    and skipped. Files are visited in sorted path order so the resulting
	    list is stable across runs and platforms.

	    Args:
	        directory: Path of the folder to scan.

	    Returns:
	        A list of dicts, one per file that produced non-blank text:
	            "text":     the extracted text, unmodified
	            "metadata": {"source": <file name>, "type": <suffix, no dot>}
	        An empty list is returned when ``directory`` does not exist, is
	        not a directory, or cannot be listed.
	    """
	    documents: List[Dict] = []
	    policy_dir = Path(directory)

	    if not policy_dir.exists():
	        print(f"[Loader] Warning: {directory} does not exist")
	        return documents

	    # A path that exists but is a regular file would make iterdir() raise
	    # NotADirectoryError; treat it like a missing directory instead.
	    if not policy_dir.is_dir():
	        print(f"[Loader] Warning: {directory} is not a directory")
	        return documents

	    try:
	        # iterdir() yields entries in arbitrary filesystem order; sort them
	        # so the document order is reproducible.
	        entries = sorted(policy_dir.iterdir())
	    except OSError as e:
	        print(f"[Loader] Warning: cannot list {directory}: {e}")
	        return documents

	    for file_path in entries:
	        if not file_path.is_file():
	            continue

	        # Deliberately broad: one unreadable file must not abort the
	        # whole load, so any failure is reported and the loop moves on.
	        try:
	            suffix = file_path.suffix.lower()

	            if suffix == ".pdf":
	                text = load_pdf(file_path)
	            elif suffix in (".txt", ".md"):
	                text = load_text(file_path)
	            else:
	                print(f"[Loader] Skipped unsupported file: {file_path.name}")
	                continue

	            # Keep only files that yielded something other than whitespace.
	            if text and text.strip():
	                documents.append({
	                    "text": text,
	                    "metadata": {
	                        "source": file_path.name,
	                        "type": suffix.lstrip("."),
	                    },
	                })
	                print(f"[Loader] Loaded: {file_path.name} | chars={len(text)}")
	            else:
	                print(f"[Loader] Empty or image-only file skipped: {file_path.name}")

	        except Exception as e:
	            print(f"[Loader] Error loading {file_path.name}: {e}")

	    return documents


	# ---------------------------------------------------------
	# PDF Loader (Robust Version)
	# ---------------------------------------------------------
	def load_pdf(file_path: Path) -> str:
	    """
	    Return the text of a PDF, one page after another, joined by newlines.

	    Behaviour:
	    - A page whose extraction yields nothing (or only whitespace) is
	      reported and left out.
	    - A page whose extraction raises is reported and left out.
	    - If the file cannot be opened or parsed, or it has no pages, the
	      problem is reported and "" is returned.
	    - If no page produced any text, a "likely scanned PDF" notice is
	      printed and the (empty) result is returned.
	    """
	    collected = []

	    try:
	        with open(file_path, "rb") as handle:
	            pdf = PyPDF2.PdfReader(handle)
	            pages = pdf.pages

	            if not pages:
	                print(f"[Loader] PDF has no pages: {file_path.name}")
	                return ""

	            # Number pages from 1 so log messages match what a viewer shows.
	            for page_number, page in enumerate(pages, start=1):
	                try:
	                    extracted = page.extract_text()

	                    if not extracted or not extracted.strip():
	                        print(f"[Loader] Page {page_number} empty or image-only")
	                        continue

	                    collected.append(extracted)

	                except Exception as page_error:
	                    print(f"[Loader] Failed reading page {page_number}: {page_error}")

	    except Exception as open_error:
	        print(f"[Loader] Failed opening PDF {file_path.name}: {open_error}")
	        return ""

	    combined = "\n".join(collected)

	    # Nothing extractable on any page usually means the PDF is a scan.
	    if combined.strip():
	        return combined

	    print(f"[Loader] No extractable text found (likely scanned PDF): {file_path.name}")
	    return combined


	# ---------------------------------------------------------
	# Text Loader
	# ---------------------------------------------------------
	def load_text(file_path: Path) -> str:
	    """
	    Read a TXT or MD file and return its contents as a string.

	    The file is decoded as UTF-8 first; a leading byte-order mark, if
	    present, is stripped so it does not end up in the returned text. If
	    the bytes are not valid UTF-8 the file is re-read as latin-1, which
	    accepts every byte value.

	    Args:
	        file_path: Path of the file to read.

	    Returns:
	        The decoded text, or "" if the file could not be read (the error
	        is printed rather than raised).
	    """
	    # "utf-8-sig" reads plain UTF-8 exactly like "utf-8" but drops a BOM.
	    for encoding in ("utf-8-sig", "latin-1"):
	        try:
	            with open(file_path, "r", encoding=encoding) as f:
	                return f.read()
	        except UnicodeDecodeError:
	            # Not valid in this encoding; fall through to the next one.
	            continue
	        except Exception as e:
	            # Best-effort loader: report the failure, never raise. This
	            # also covers errors while re-opening for the fallback read.
	            print(f"[Loader] Error reading text file {file_path.name}: {e}")
	            return ""

	    # Not expected in practice, since latin-1 decodes any byte sequence.
	    return ""