Spaces:
Sleeping
Sleeping
File size: 3,829 Bytes
bb76352 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 | import os
from pathlib import Path
from typing import List, Dict
import PyPDF2
# ---------------------------------------------------------
# Main Loader
# ---------------------------------------------------------
def load_documents(directory: str = "data/policies") -> List[Dict]:
    """
    Load all supported documents found directly inside a directory.

    Supports PDF, TXT, and MD files (suffix matched case-insensitively).
    Sub-directories and unsupported file types are skipped. Entries are
    processed in sorted path order so the result is deterministic.

    Args:
        directory: Path of the folder to scan (not recursive).

    Returns:
        List of dicts with 'text' and 'metadata', where metadata holds
        'source' (file name) and 'type' (lower-cased suffix without the
        dot). Empty when the path is missing, is not a directory, or
        contains no document with extractable text.
    """
    documents = []
    policy_dir = Path(directory)

    if not policy_dir.exists():
        print(f"[Loader] Warning: {directory} does not exist")
        return documents

    # iterdir() raises NotADirectoryError on a regular file, so reject
    # anything that is not a directory up front.
    if not policy_dir.is_dir():
        print(f"[Loader] Warning: {directory} is not a directory")
        return documents

    # iterdir() yields entries in arbitrary order; sorting keeps the
    # returned document order stable.
    for file_path in sorted(policy_dir.iterdir()):
        if not file_path.is_file():
            continue

        # Best-effort per file: one unreadable document must not abort
        # the whole load.
        try:
            suffix = file_path.suffix.lower()

            if suffix == ".pdf":
                text = load_pdf(file_path)
            elif suffix in (".txt", ".md"):
                text = load_text(file_path)
            else:
                print(f"[Loader] Skipped unsupported file: {file_path.name}")
                continue

            # -------------------------------------------------
            # Validate extracted text
            # -------------------------------------------------
            if text and text.strip():
                documents.append({
                    "text": text,
                    "metadata": {
                        "source": file_path.name,
                        "type": suffix.replace(".", ""),
                    },
                })
                print(f"[Loader] Loaded: {file_path.name} | chars={len(text)}")
            else:
                print(f"[Loader] Empty or image-only file skipped: {file_path.name}")
        except Exception as e:
            print(f"[Loader] Error loading {file_path.name}: {e}")

    return documents
# ---------------------------------------------------------
# PDF Loader (Robust Version)
# ---------------------------------------------------------
def load_pdf(file_path: Path) -> str:
    """
    Pull the extractable text out of a PDF, page by page.

    Pages that yield no text (None, empty or whitespace-only) and pages
    whose extraction raises are reported and skipped; the remaining page
    texts are joined with newlines.

    Returns:
        The combined text, or "" when the file cannot be opened or
        parsed, has no pages, or yields no text at all.
    """
    collected = []

    try:
        with open(file_path, "rb") as handle:
            pdf = PyPDF2.PdfReader(handle)

            if not pdf.pages:
                print(f"[Loader] PDF has no pages: {file_path.name}")
                return ""

            for i, page in enumerate(pdf.pages):
                # A single bad page is reported and skipped, not fatal.
                try:
                    extracted = page.extract_text()
                    has_text = bool(extracted and extracted.strip())
                    if not has_text:
                        print(f"[Loader] Page {i+1} empty or image-only")
                        continue
                    collected.append(extracted)
                except Exception as e:
                    print(f"[Loader] Failed reading page {i+1}: {e}")
    except Exception as e:
        print(f"[Loader] Failed opening PDF {file_path.name}: {e}")
        return ""

    combined = "\n".join(collected)

    # Nothing extractable on any page: most likely a scanned document.
    if combined.strip():
        return combined

    print(f"[Loader] No extractable text found (likely scanned PDF): {file_path.name}")
    return combined
# ---------------------------------------------------------
# Text Loader
# ---------------------------------------------------------
def load_text(file_path: Path) -> str:
    """
    Load text from a TXT or MD file safely.

    Decoding is attempted as UTF-8 first (a leading byte-order mark, if
    present, is stripped rather than leaking into the text as '\\ufeff'),
    then as latin-1, which accepts any byte sequence.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file contents, or "" if the file cannot be read.
    """
    try:
        # "utf-8-sig" reads plain UTF-8 and also drops a leading BOM;
        # latin-1 is the catch-all fallback and never fails to decode.
        for encoding in ("utf-8-sig", "latin-1"):
            try:
                with open(file_path, "r", encoding=encoding) as f:
                    return f.read()
            except UnicodeDecodeError:
                continue
    except Exception as e:
        # Covers I/O errors on either attempt, including the fallback.
        print(f"[Loader] Error reading text file {file_path.name}: {e}")
    return ""
|