# Harshdhsvguyt: Update src/loader.py
# Commit bb76352 (verified)
import os
from pathlib import Path
from typing import List, Dict
import PyPDF2
# ---------------------------------------------------------
# Main Loader
# ---------------------------------------------------------
def load_documents(directory: str = "data/policies") -> List[Dict]:
    """
    Load every supported document found directly inside ``directory``.

    PDF files are read with ``load_pdf``; TXT and MD files with ``load_text``.
    Sub-directories and files with any other extension are skipped, as are
    files whose extracted text is empty or whitespace-only. A failure on one
    file is reported and does not stop the remaining files from loading.

    Args:
        directory: Path of the folder to scan (not recursive).

    Returns:
        List of dicts with 'text' and 'metadata', ordered by file path.
        'metadata' holds 'source' (the file name) and 'type' (the
        lower-cased extension without the dot). The list is empty when
        ``directory`` is missing or is not a directory.
    """
    documents = []
    policy_dir = Path(directory)

    if not policy_dir.exists():
        print(f"[Loader] Warning: {directory} does not exist")
        return documents

    # A regular file passes exists(), but iterdir() on it raises
    # NotADirectoryError; report it and return instead of crashing.
    if not policy_dir.is_dir():
        print(f"[Loader] Warning: {directory} is not a directory")
        return documents

    # iterdir() yields entries in arbitrary order; sorting keeps the
    # returned document order reproducible across runs and platforms.
    for file_path in sorted(policy_dir.iterdir()):
        if not file_path.is_file():
            continue

        try:
            suffix = file_path.suffix.lower()

            if suffix == ".pdf":
                text = load_pdf(file_path)
            elif suffix in (".txt", ".md"):
                text = load_text(file_path)
            else:
                print(f"[Loader] Skipped unsupported file: {file_path.name}")
                continue

            # -------------------------------------------------
            # Validate extracted text
            # -------------------------------------------------
            if text and text.strip():
                documents.append({
                    "text": text,
                    "metadata": {
                        "source": file_path.name,
                        "type": suffix.replace(".", ""),
                    },
                })
                print(f"[Loader] Loaded: {file_path.name} | chars={len(text)}")
            else:
                print(f"[Loader] Empty or image-only file skipped: {file_path.name}")

        except Exception as e:
            # Best-effort boundary: one unreadable file must not abort the
            # whole load, so report it and move on to the next file.
            print(f"[Loader] Error loading {file_path.name}: {e}")

    return documents
# ---------------------------------------------------------
# PDF Loader (Robust Version)
# ---------------------------------------------------------
def load_pdf(file_path: Path) -> str:
    """
    Read a PDF and return the text of its pages joined by newlines.

    Pages that yield no text, or whose extraction raises, are reported
    and left out of the result. If the file cannot be opened or parsed,
    or has no pages, an empty string is returned.
    """
    extracted = []

    try:
        with open(file_path, "rb") as handle:
            pdf = PyPDF2.PdfReader(handle)

            if not pdf.pages:
                print(f"[Loader] PDF has no pages: {file_path.name}")
                return ""

            for page_number, page in enumerate(pdf.pages, start=1):
                try:
                    content = page.extract_text()

                    # Nothing usable on this page: report it and move on
                    if not content or not content.strip():
                        print(f"[Loader] Page {page_number} empty or image-only")
                        continue

                    extracted.append(content)
                except Exception as err:
                    print(f"[Loader] Failed reading page {page_number}: {err}")

    except Exception as err:
        print(f"[Loader] Failed opening PDF {file_path.name}: {err}")
        return ""

    combined = "\n".join(extracted)

    # No page produced any text at all
    if not combined.strip():
        print(f"[Loader] No extractable text found (likely scanned PDF): {file_path.name}")

    return combined
# ---------------------------------------------------------
# Text Loader
# ---------------------------------------------------------
def load_text(file_path: Path) -> str:
    """
    Load text from a TXT or MD file without raising.

    The file is decoded as UTF-8 first, with any leading byte-order mark
    dropped; if that fails it is decoded as latin-1 instead.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file's contents, or an empty string if the file cannot be read
        (the error is reported on stdout).
    """
    # "utf-8-sig" reads plain UTF-8 and also strips a leading BOM, which
    # plain "utf-8" would leave in the text as "\ufeff".
    # latin-1 maps every byte to a character, so it cannot fail to decode.
    for encoding in ("utf-8-sig", "latin-1"):
        try:
            with open(file_path, "r", encoding=encoding) as f:
                return f.read()
        except UnicodeDecodeError:
            # Try the next encoding. Looping (rather than retrying inside
            # this handler) keeps errors from the retry covered below.
            continue
        except Exception as e:
            print(f"[Loader] Error reading text file {file_path.name}: {e}")
            return ""

    return ""