Spaces:

Harshdhsvguyt
/

policy_rag_assistant

Sleeping

App Files Community

Harshdhsvguyt committed on Feb 13

Commit

bb76352

verified ·

1 Parent(s): 5e96842

Update src/loader.py

Browse files

Files changed (1) hide show

src/loader.py +129 -60

src/loader.py CHANGED Viewed

@@ -1,60 +1,129 @@
-import os
-from pathlib import Path
-from typing import List
-import PyPDF2
-def load_documents(directory: str = "data/policies") -> List[dict]:
-    """
-    Load all documents from the policies directory.
-    Supports PDF, TXT, and MD files.
-    Returns:
-        List of dicts with 'text' and 'metadata' keys
-    """
-    documents = []
-    policy_dir = Path(directory)
-    if not policy_dir.exists():
-        print(f"Warning: {directory} does not exist")
-        return documents
-    for file_path in policy_dir.iterdir():
-        if file_path.is_file():
-            try:
-                if file_path.suffix.lower() == ".pdf":
-                    text = load_pdf(file_path)
-                elif file_path.suffix.lower() in [".txt", ".md"]:
-                    text = load_text(file_path)
-                else:
-                    continue
-                if text.strip():
-                    documents.append({
-                        "text": text,
-                        "metadata": {
-                            "source": file_path.name,
-                            "type": file_path.suffix[1:]
-                        }
-                    })
-                    print(f"Loaded: {file_path.name}")
-            except Exception as e:
-                print(f"Error loading {file_path.name}: {e}")
-    return documents
-def load_pdf(file_path: Path) -> str:
-    """Extract text from PDF file."""
-    text = []
-    with open(file_path, "rb") as f:
-        reader = PyPDF2.PdfReader(f)
-        for page in reader.pages:
-            text.append(page.extract_text())
-    return "\n".join(text)
-def load_text(file_path: Path) -> str:
-    """Load text from TXT or MD file."""
-    with open(file_path, "r", encoding="utf-8") as f:
-        return f.read()

+import os
+from pathlib import Path
+from typing import List, Dict
+import PyPDF2
+# ---------------------------------------------------------
+# Main Loader
+# ---------------------------------------------------------
def load_documents(directory: str = "data/policies") -> List[Dict]:
    """
    Load every supported document found directly inside a directory.

    PDF files are read with ``load_pdf`` and TXT/MD files with ``load_text``.
    Sub-directories are not descended into. Files are visited in sorted
    order so the returned list is the same on every run and platform.

    Args:
        directory: Path of the folder holding the policy documents.

    Returns:
        List of dicts with 'text' and 'metadata' keys. 'metadata' carries the
        file name ('source') and the lower-cased extension without its dot
        ('type'). The list is empty when the path is missing or is not a
        directory.
    """
    documents: List[Dict] = []
    policy_dir = Path(directory)

    # is_dir() instead of exists(): a regular file at this path passes
    # exists() and would then make iterdir() raise NotADirectoryError.
    if not policy_dir.is_dir():
        print(f"[Loader] Warning: {directory} does not exist or is not a directory")
        return documents

    # iterdir() yields entries in arbitrary order; sorting keeps the
    # document order stable between runs.
    for file_path in sorted(policy_dir.iterdir()):
        if not file_path.is_file():
            continue

        # Best-effort per file: one unreadable document must not abort the
        # whole load, so the failure is reported and the loop moves on.
        try:
            suffix = file_path.suffix.lower()
            if suffix == ".pdf":
                text = load_pdf(file_path)
            elif suffix in (".txt", ".md"):
                text = load_text(file_path)
            else:
                print(f"[Loader] Skipped unsupported file: {file_path.name}")
                continue

            # Keep only documents that produced some non-whitespace text.
            if text and text.strip():
                documents.append({
                    "text": text,
                    "metadata": {
                        "source": file_path.name,
                        "type": suffix[1:],
                    },
                })
                print(f"[Loader] Loaded: {file_path.name} | chars={len(text)}")
            else:
                print(f"[Loader] Empty or image-only file skipped: {file_path.name}")
        except Exception as e:
            print(f"[Loader] Error loading {file_path.name}: {e}")

    return documents
+# ---------------------------------------------------------
+# PDF Loader (Robust Version)
+# ---------------------------------------------------------
def load_pdf(file_path: Path) -> str:
    """
    Pull the extractable text out of a PDF, one page at a time.

    Pages that yield no text (or only whitespace) are skipped with a log
    line, as are pages that raise while being read. Any failure while
    opening or parsing the file results in an empty string. The text of the
    remaining pages is joined with newlines.
    """
    collected = []

    try:
        with open(file_path, "rb") as handle:
            pdf = PyPDF2.PdfReader(handle)

            if not pdf.pages:
                print(f"[Loader] PDF has no pages: {file_path.name}")
                return ""

            for page_number, page in enumerate(pdf.pages, start=1):
                # A single bad page is reported and skipped; the rest of the
                # document is still processed.
                try:
                    extracted = page.extract_text()
                    if not extracted or not extracted.strip():
                        print(f"[Loader] Page {page_number} empty or image-only")
                    else:
                        collected.append(extracted)
                except Exception as e:
                    print(f"[Loader] Failed reading page {page_number}: {e}")
    except Exception as e:
        print(f"[Loader] Failed opening PDF {file_path.name}: {e}")
        return ""

    combined = "\n".join(collected)

    # Nothing extractable on any page usually means a scanned document.
    if not combined.strip():
        print(f"[Loader] No extractable text found (likely scanned PDF): {file_path.name}")

    return combined
+# ---------------------------------------------------------
+# Text Loader
+# ---------------------------------------------------------
def load_text(file_path: Path) -> str:
    """
    Read a TXT or MD file and return its contents as a string.

    The file is decoded as UTF-8 first, dropping a leading byte-order mark
    if one is present. If that fails the read is retried as latin-1, which
    accepts any byte sequence.

    Args:
        file_path: Location of the text file.

    Returns:
        The decoded file contents, or an empty string if the file cannot be
        opened or read.
    """
    # "utf-8-sig" decodes plain UTF-8 as well, but strips the BOM that some
    # editors write; plain "utf-8" would leave "\ufeff" at the start.
    for encoding in ("utf-8-sig", "latin-1"):
        try:
            with open(file_path, "r", encoding=encoding) as f:
                return f.read()
        except UnicodeDecodeError:
            # Not valid in this encoding; fall through to the next one.
            continue
        except Exception as e:
            # Deliberate best-effort: report the failure and hand back an
            # empty string. This also covers errors raised during the
            # fallback attempt, not just the first read.
            print(f"[Loader] Error reading text file {file_path.name}: {e}")
            return ""
    return ""