Harshdhsvguyt committed on
Commit
bb76352
·
verified ·
1 Parent(s): 5e96842

Update src/loader.py

Browse files
Files changed (1) hide show
  1. src/loader.py +129 -60
src/loader.py CHANGED
@@ -1,60 +1,129 @@
1
- import os
2
- from pathlib import Path
3
- from typing import List
4
- import PyPDF2
5
-
6
-
7
- def load_documents(directory: str = "data/policies") -> List[dict]:
8
- """
9
- Load all documents from the policies directory.
10
- Supports PDF, TXT, and MD files.
11
-
12
- Returns:
13
- List of dicts with 'text' and 'metadata' keys
14
- """
15
- documents = []
16
- policy_dir = Path(directory)
17
-
18
- if not policy_dir.exists():
19
- print(f"Warning: {directory} does not exist")
20
- return documents
21
-
22
- for file_path in policy_dir.iterdir():
23
- if file_path.is_file():
24
- try:
25
- if file_path.suffix.lower() == ".pdf":
26
- text = load_pdf(file_path)
27
- elif file_path.suffix.lower() in [".txt", ".md"]:
28
- text = load_text(file_path)
29
- else:
30
- continue
31
-
32
- if text.strip():
33
- documents.append({
34
- "text": text,
35
- "metadata": {
36
- "source": file_path.name,
37
- "type": file_path.suffix[1:]
38
- }
39
- })
40
- print(f"Loaded: {file_path.name}")
41
- except Exception as e:
42
- print(f"Error loading {file_path.name}: {e}")
43
-
44
- return documents
45
-
46
-
47
- def load_pdf(file_path: Path) -> str:
48
- """Extract text from PDF file."""
49
- text = []
50
- with open(file_path, "rb") as f:
51
- reader = PyPDF2.PdfReader(f)
52
- for page in reader.pages:
53
- text.append(page.extract_text())
54
- return "\n".join(text)
55
-
56
-
57
- def load_text(file_path: Path) -> str:
58
- """Load text from TXT or MD file."""
59
- with open(file_path, "r", encoding="utf-8") as f:
60
- return f.read()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ from typing import List, Dict
4
+ import PyPDF2
5
+
6
+
7
+ # ---------------------------------------------------------
8
+ # Main Loader
9
+ # ---------------------------------------------------------
10
def load_documents(directory: str = "data/policies") -> List[Dict]:
    """
    Load every supported document found directly inside ``directory``.

    PDF files are read with ``load_pdf``; TXT and MD files are read with
    ``load_text``. Sub-directories are not descended into, and files with
    any other extension are reported and skipped.

    Args:
        directory: Path of the folder that holds the policy files.

    Returns:
        A list with one dict per non-empty document, shaped as
        ``{"text": ..., "metadata": {"source": <file name>, "type": <ext>}}``.
        The list is empty when ``directory`` is missing or is not a
        directory. Entries follow the sorted order of the file paths.
    """
    documents: List[Dict] = []
    policy_dir = Path(directory)

    if not policy_dir.exists():
        print(f"[Loader] Warning: {directory} does not exist")
        return documents

    # A regular file passes exists() but would make iterdir() raise
    # NotADirectoryError, so reject it explicitly.
    if not policy_dir.is_dir():
        print(f"[Loader] Warning: {directory} is not a directory")
        return documents

    # iterdir() yields entries in arbitrary, OS-dependent order; sorting
    # keeps the returned list reproducible between runs.
    for file_path in sorted(policy_dir.iterdir()):
        if not file_path.is_file():
            continue

        try:
            suffix = file_path.suffix.lower()

            if suffix == ".pdf":
                text = load_pdf(file_path)
            elif suffix in (".txt", ".md"):
                text = load_text(file_path)
            else:
                print(f"[Loader] Skipped unsupported file: {file_path.name}")
                continue

            # Keep only documents that produced real (non-whitespace) text.
            if text and text.strip():
                documents.append({
                    "text": text,
                    "metadata": {
                        "source": file_path.name,
                        "type": suffix.replace(".", ""),
                    },
                })
                print(f"[Loader] Loaded: {file_path.name} | chars={len(text)}")
            else:
                print(f"[Loader] Empty or image-only file skipped: {file_path.name}")

        except Exception as e:
            # Best-effort loading: one unreadable file must not abort the
            # rest of the directory, so report it and move on.
            print(f"[Loader] Error loading {file_path.name}: {e}")

    return documents
61
+
62
+
63
+ # ---------------------------------------------------------
64
+ # PDF Loader (Robust Version)
65
+ # ---------------------------------------------------------
66
def load_pdf(file_path: Path) -> str:
    """
    Extract the text of a PDF file, tolerating unreadable content.

    Pages whose extraction yields nothing (``None``, empty, or
    whitespace-only text, presumably what image-only pages produce) are
    reported and skipped. A page that raises during extraction is also
    reported and skipped. If the file cannot be opened or parsed at all,
    the failure is reported and an empty string is returned.

    Args:
        file_path: Location of the PDF file.

    Returns:
        The text of all readable pages joined by newlines, or ``""`` when
        nothing could be extracted.
    """
    text_parts = []

    try:
        with open(file_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)

            if not reader.pages:
                print(f"[Loader] PDF has no pages: {file_path.name}")
                return ""

            # Page numbers in the messages are 1-based.
            for page_number, page in enumerate(reader.pages, start=1):
                try:
                    page_text = page.extract_text()

                    if page_text and page_text.strip():
                        text_parts.append(page_text)
                    else:
                        print(f"[Loader] Page {page_number} empty or image-only")

                except Exception as e:
                    # A single broken page should not discard the others.
                    print(f"[Loader] Failed reading page {page_number}: {e}")

    except Exception as e:
        print(f"[Loader] Failed opening PDF {file_path.name}: {e}")
        return ""

    final_text = "\n".join(text_parts)

    # No page produced text: most likely a scanned / image-only PDF.
    if not final_text.strip():
        print(f"[Loader] No extractable text found (likely scanned PDF): {file_path.name}")

    return final_text
109
+
110
+
111
+ # ---------------------------------------------------------
112
+ # Text Loader
113
+ # ---------------------------------------------------------
114
def load_text(file_path: Path) -> str:
    """
    Read a TXT or MD file as text, trying UTF-8 first and Latin-1 second.

    Args:
        file_path: Location of the text file.

    Returns:
        The decoded file content. A leading UTF-8 byte-order mark is
        dropped. ``""`` is returned when the file cannot be read; the
        error is printed rather than raised.
    """
    # "utf-8-sig" decodes plain UTF-8 and also strips a leading BOM.
    # "latin-1" maps every byte to a character, so it serves as the
    # last-resort fallback for files that are not valid UTF-8.
    for encoding in ("utf-8-sig", "latin-1"):
        try:
            with open(file_path, "r", encoding=encoding) as f:
                return f.read()
        except UnicodeDecodeError:
            continue
        except Exception as e:
            # Covers both attempts, so an I/O failure during the fallback
            # read is reported the same way as one during the first read.
            print(f"[Loader] Error reading text file {file_path.name}: {e}")
            return ""

    return ""