Spaces:
Sleeping
Sleeping
| import os | |
| from pathlib import Path | |
| from typing import List, Dict | |
| import PyPDF2 | |
| # --------------------------------------------------------- | |
| # Main Loader | |
| # --------------------------------------------------------- | |
def load_documents(directory: str = "data/policies") -> List[Dict]:
    """
    Load all supported documents from the policies directory.

    Supports PDF, TXT, and MD files (matched case-insensitively by file
    extension). Sub-directories and unsupported file types are skipped.
    Files are processed in sorted path order so the result is
    reproducible across platforms.

    Args:
        directory: Path of the folder to scan.

    Returns:
        List of dicts, each with:
          - 'text': the extracted text of one file
          - 'metadata': {'source': file name, 'type': extension without dot}
        An empty list is returned when the directory is missing or the
        path does not point to a directory.
    """
    documents: List[Dict] = []
    policy_dir = Path(directory)

    if not policy_dir.exists():
        print(f"[Loader] Warning: {directory} does not exist")
        return documents

    # A path that exists but is a regular file would make iterdir() raise
    # NotADirectoryError; treat it like a missing directory instead.
    if not policy_dir.is_dir():
        print(f"[Loader] Warning: {directory} is not a directory")
        return documents

    # iterdir() yields entries in arbitrary, OS-dependent order; sort them
    # so callers get a stable document order.
    for file_path in sorted(policy_dir.iterdir()):
        if not file_path.is_file():
            continue

        try:
            suffix = file_path.suffix.lower()

            if suffix == ".pdf":
                text = load_pdf(file_path)
            elif suffix in (".txt", ".md"):
                text = load_text(file_path)
            else:
                print(f"[Loader] Skipped unsupported file: {file_path.name}")
                continue

            # -------------------------------------------------
            # Validate extracted text
            # -------------------------------------------------
            if text and text.strip():
                documents.append({
                    "text": text,
                    "metadata": {
                        "source": file_path.name,
                        "type": suffix.lstrip("."),
                    },
                })
                print(f"[Loader] Loaded: {file_path.name} | chars={len(text)}")
            else:
                print(f"[Loader] Empty or image-only file skipped: {file_path.name}")
        except Exception as e:
            # Best-effort boundary: one bad file must not abort the whole load.
            print(f"[Loader] Error loading {file_path.name}: {e}")

    return documents
| # --------------------------------------------------------- | |
| # PDF Loader (Robust Version) | |
| # --------------------------------------------------------- | |
def load_pdf(file_path: Path) -> str:
    """
    Pull the text out of a PDF, page by page, without raising.

    Behaviour:
    - Pages whose extraction yields nothing (None, empty, or only
      whitespace) are skipped and reported.
    - A page that raises during extraction is reported and skipped; the
      remaining pages are still processed.
    - Any failure while opening or parsing the file is reported and an
      empty string is returned.

    Returns:
        The text of all non-empty pages joined with newlines, or an
        empty string when nothing could be extracted.
    """
    collected = []

    try:
        with open(file_path, "rb") as handle:
            pdf = PyPDF2.PdfReader(handle)

            if not pdf.pages:
                print(f"[Loader] PDF has no pages: {file_path.name}")
                return ""

            # Page numbers in log messages are 1-based.
            for page_number, page in enumerate(pdf.pages, start=1):
                try:
                    extracted = page.extract_text()
                    has_text = bool(extracted) and bool(extracted.strip())
                    if not has_text:
                        print(f"[Loader] Page {page_number} empty or image-only")
                        continue
                    collected.append(extracted)
                except Exception as page_error:
                    print(f"[Loader] Failed reading page {page_number}: {page_error}")
    except Exception as open_error:
        print(f"[Loader] Failed opening PDF {file_path.name}: {open_error}")
        return ""

    combined = "\n".join(collected)

    # No text on any page usually means the PDF holds only scanned images.
    if combined.strip() == "":
        print(f"[Loader] No extractable text found (likely scanned PDF): {file_path.name}")

    return combined
| # --------------------------------------------------------- | |
| # Text Loader | |
| # --------------------------------------------------------- | |
def load_text(file_path: Path) -> str:
    """
    Load text from a TXT or MD file safely.

    The file is decoded as UTF-8 first; a leading byte-order mark (often
    written by Windows editors) is stripped rather than returned as a
    stray U+FEFF character. If the bytes are not valid UTF-8, the file is
    re-read as latin-1.

    Returns:
        The file contents, or an empty string if the file could not be
        read (the error is printed).
    """
    # "utf-8-sig" decodes plain UTF-8 identically but drops a leading BOM.
    # latin-1 maps every byte to a character, so it acts as a catch-all.
    for encoding in ("utf-8-sig", "latin-1"):
        try:
            with open(file_path, "r", encoding=encoding) as f:
                return f.read()
        except UnicodeDecodeError:
            # Not decodable with this encoding; try the next one.
            continue
        except Exception as e:
            # Covers failures on either attempt (e.g. missing file,
            # permission denied), including the fallback read.
            print(f"[Loader] Error reading text file {file_path.name}: {e}")
            return ""

    # Defensive: only reached if every encoding raised UnicodeDecodeError.
    return ""