import os
from pathlib import Path
from typing import Dict, List, Union

try:
    import PyPDF2
except ImportError:
    # The PDF backend is optional: TXT/MD loading must keep working without
    # it. load_pdf() reports the missing dependency when it is actually needed.
    PyPDF2 = None

# Suffixes (lower-case, with the dot) that are read as plain text.
_TEXT_SUFFIXES = (".txt", ".md")


# ---------------------------------------------------------
# Main Loader
# ---------------------------------------------------------
def load_documents(directory: str = "data/policies") -> List[Dict]:
    """
    Load every supported document found directly inside *directory*.

    Supported suffixes (case-insensitive) are .pdf, .txt and .md. Files are
    visited in sorted path order so the result is reproducible between runs.
    Sub-directories, unsupported files and files with no non-whitespace text
    are skipped with a console message. A failure on one file is reported and
    does not stop the remaining files from loading.

    Args:
        directory: Folder to scan (not recursive).

    Returns:
        List of dicts shaped like
        {"text": str, "metadata": {"source": <file name>, "type": <suffix
        without the dot>}}. Empty if the folder is missing or is not a
        directory.
    """
    documents: List[Dict] = []
    policy_dir = Path(directory)

    if not policy_dir.exists():
        print(f"[Loader] Warning: {directory} does not exist")
        return documents

    # A regular file passes exists() but iterdir() would raise on it.
    if not policy_dir.is_dir():
        print(f"[Loader] Warning: {directory} is not a directory")
        return documents

    # iterdir() yields entries in arbitrary order; sort for determinism.
    for file_path in sorted(policy_dir.iterdir()):
        if not file_path.is_file():
            continue

        suffix = file_path.suffix.lower()

        try:
            if suffix == ".pdf":
                text = load_pdf(file_path)
            elif suffix in _TEXT_SUFFIXES:
                text = load_text(file_path)
            else:
                print(f"[Loader] Skipped unsupported file: {file_path.name}")
                continue
        except Exception as e:
            # Deliberately broad: loading is best-effort per file.
            print(f"[Loader] Error loading {file_path.name}: {e}")
            continue

        # Keep only documents that contain some non-whitespace text.
        if text and text.strip():
            documents.append({
                "text": text,
                "metadata": {
                    "source": file_path.name,
                    "type": suffix.lstrip("."),
                },
            })
            print(f"[Loader] Loaded: {file_path.name} | chars={len(text)}")
        else:
            print(f"[Loader] Empty or image-only file skipped: {file_path.name}")

    return documents


# ---------------------------------------------------------
# PDF Loader (Robust Version)
# ---------------------------------------------------------
def load_pdf(file_path: Union[str, Path]) -> str:
    """
    Extract the text of a PDF, page by page, without raising.

    Pages whose extraction returns nothing (None, empty or whitespace-only)
    or raises are reported and skipped; the remaining pages are joined with
    newlines.

    Args:
        file_path: Location of the PDF (str or Path).

    Returns:
        The extracted text, or "" when PyPDF2 is not installed, the file
        cannot be opened or parsed, it has no pages, or no page yields text.
    """
    # Accept plain strings too; the messages below rely on Path.name.
    file_path = Path(file_path)

    if PyPDF2 is None:
        print(f"[Loader] Failed opening PDF {file_path.name}: PyPDF2 is not installed")
        return ""

    text_parts = []

    try:
        with open(file_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)

            if not reader.pages:
                print(f"[Loader] PDF has no pages: {file_path.name}")
                return ""

            for i, page in enumerate(reader.pages):
                # A single bad page must not discard the rest of the file.
                try:
                    page_text = page.extract_text()
                except Exception as e:
                    print(f"[Loader] Failed reading page {i+1}: {e}")
                    continue

                if page_text and page_text.strip():
                    text_parts.append(page_text)
                else:
                    print(f"[Loader] Page {i+1} empty or image-only")
    except Exception as e:
        # Deliberately broad: any open/parse failure degrades to "no text".
        print(f"[Loader] Failed opening PDF {file_path.name}: {e}")
        return ""

    final_text = "\n".join(text_parts)

    # Nothing extractable at all usually means a scanned (image-only) PDF.
    if not final_text.strip():
        print(f"[Loader] No extractable text found (likely scanned PDF): {file_path.name}")

    return final_text


# ---------------------------------------------------------
# Text Loader
# ---------------------------------------------------------
def load_text(file_path: Union[str, Path]) -> str:
    """
    Read a TXT or MD file as text without raising.

    The file is decoded as UTF-8, with a leading byte-order mark removed if
    present. If that fails, it is decoded again as latin-1, which accepts any
    byte sequence.

    Args:
        file_path: Location of the text file (str or Path).

    Returns:
        The file content, or "" if the file cannot be read.
    """
    # Accept plain strings too; the error message relies on Path.name.
    file_path = Path(file_path)

    try:
        try:
            # "utf-8-sig" behaves like UTF-8 but drops a leading BOM, which
            # would otherwise end up as "\ufeff" at the start of the text.
            return file_path.read_text(encoding="utf-8-sig")
        except UnicodeDecodeError:
            # Fallback encoding for files that are not valid UTF-8.
            return file_path.read_text(encoding="latin-1")
    except Exception as e:
        # The outer handler also covers failures of the fallback read, which
        # a sibling except clause on the same try would not catch.
        print(f"[Loader] Error reading text file {file_path.name}: {e}")
        return ""