File size: 3,829 Bytes
bb76352
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import os
from pathlib import Path
from typing import List, Dict
import PyPDF2


# ---------------------------------------------------------
# Main Loader
# ---------------------------------------------------------
def load_documents(directory: str = "data/policies") -> List[Dict]:
    """
    Load every supported document found directly inside a directory.

    Files ending in .pdf are read with load_pdf; .txt and .md files are read
    with load_text. Suffix matching is case-insensitive. Sub-directories are
    not descended into, and files with any other suffix are skipped with a
    log line. Entries are visited in sorted path order so the result does not
    depend on the order in which the filesystem lists them.

    A failure while loading one file is logged and does not stop the
    remaining files from being loaded.

    Args:
        directory: Path of the folder to scan.

    Returns:
        List of dicts, one per file that produced non-blank text, each with:
            'text': the extracted text
            'metadata': {'source': file name, 'type': suffix without the dot}
        An empty list is returned when the path is missing or is not a
        directory.
    """
    documents = []
    policy_dir = Path(directory)

    if not policy_dir.exists():
        print(f"[Loader] Warning: {directory} does not exist")
        return documents

    # A regular file passes exists() but cannot be iterated; bail out here
    # instead of letting iterdir() raise NotADirectoryError.
    if not policy_dir.is_dir():
        print(f"[Loader] Warning: {directory} is not a directory")
        return documents

    # Sorted so the document order is reproducible across runs and platforms.
    for file_path in sorted(policy_dir.iterdir()):
        if not file_path.is_file():
            continue

        try:
            suffix = file_path.suffix.lower()

            if suffix == ".pdf":
                text = load_pdf(file_path)
            elif suffix in (".txt", ".md"):
                text = load_text(file_path)
            else:
                print(f"[Loader] Skipped unsupported file: {file_path.name}")
                continue

            # Keep only files that produced something other than whitespace.
            if text and text.strip():
                documents.append({
                    "text": text,
                    "metadata": {
                        "source": file_path.name,
                        "type": suffix.replace(".", ""),
                    },
                })
                print(f"[Loader] Loaded: {file_path.name} | chars={len(text)}")
            else:
                print(f"[Loader] Empty or image-only file skipped: {file_path.name}")

        except Exception as e:
            # Best-effort loading: one bad file must not abort the whole scan.
            print(f"[Loader] Error loading {file_path.name}: {e}")

    return documents


# ---------------------------------------------------------
# PDF Loader (Robust Version)
# ---------------------------------------------------------
def load_pdf(file_path: Path) -> str:
    """
    Extract the text of a PDF, page by page, without raising.

    Pages whose extraction yields nothing (or only whitespace) are logged and
    left out; a page that raises during extraction is logged and skipped
    while the remaining pages are still processed. The surviving page texts
    are joined with newlines.

    Args:
        file_path: Location of the PDF file.

    Returns:
        The joined page text. An empty string is returned when the file
        cannot be opened or parsed, when it has no pages, or when no page
        produced any text.
    """
    collected = []

    try:
        with open(file_path, "rb") as handle:
            pdf = PyPDF2.PdfReader(handle)

            if not pdf.pages:
                print(f"[Loader] PDF has no pages: {file_path.name}")
                return ""

            for page_no, page in enumerate(pdf.pages, start=1):
                try:
                    extracted = page.extract_text()

                    # Nothing usable on this page: report it and move on.
                    if not extracted or not extracted.strip():
                        print(f"[Loader] Page {page_no} empty or image-only")
                        continue

                    collected.append(extracted)

                except Exception as e:
                    print(f"[Loader] Failed reading page {page_no}: {e}")

    except Exception as e:
        print(f"[Loader] Failed opening PDF {file_path.name}: {e}")
        return ""

    combined = "\n".join(collected)

    # No page yielded text at all: report the file as likely scanned.
    if combined.strip() == "":
        print(f"[Loader] No extractable text found (likely scanned PDF): {file_path.name}")

    return combined


# ---------------------------------------------------------
# Text Loader
# ---------------------------------------------------------
def load_text(file_path: Path) -> str:
    """
    Read a TXT or MD file as text without raising.

    The file is decoded as UTF-8 first; a leading UTF-8 byte-order mark, if
    present, is dropped so it does not end up in the returned text. If the
    bytes are not valid UTF-8 the file is re-read as latin-1, which accepts
    any byte sequence.

    Args:
        file_path: Location of the text file.

    Returns:
        The file contents, or an empty string if the file could not be
        opened or read (the error is logged).
    """
    # "utf-8-sig" reads plain UTF-8 identically but strips a leading BOM.
    for encoding in ("utf-8-sig", "latin-1"):
        try:
            with open(file_path, "r", encoding=encoding) as f:
                return f.read()

        except UnicodeDecodeError:
            # Not valid in this encoding; try the next, more permissive one.
            continue

        except Exception as e:
            # Covers both attempts, so a failure during the fallback read is
            # reported here instead of escaping the function.
            print(f"[Loader] Error reading text file {file_path.name}: {e}")
            return ""

    return ""