Spaces:

Harshdhsvguyt
/

policy_rag_assistant

Sleeping

File size: 3,829 Bytes

bb76352

import os
from pathlib import Path
from typing import List, Dict
import PyPDF2


# ---------------------------------------------------------
# Main Loader
# ---------------------------------------------------------
def load_documents(directory: str = "data/policies") -> List[Dict]:
    """
    Load all documents from the policies directory.
    Supports PDF, TXT, and MD files (matched case-insensitively by suffix).

    Only the top level of the directory is scanned; sub-directories are
    ignored. Entries are processed in sorted path order so the resulting
    list is the same on every run and every filesystem.

    Args:
        directory: Path of the folder to scan.

    Returns:
        List of dicts with 'text' and 'metadata', where metadata holds
        'source' (the file name) and 'type' (the lower-cased suffix without
        the dot). The list is empty when the path is missing, is not a
        directory, or contains no loadable files.
    """
    documents = []
    policy_dir = Path(directory)

    if not policy_dir.exists():
        print(f"[Loader] Warning: {directory} does not exist")
        return documents

    # iterdir() raises NotADirectoryError on a regular file; report it the
    # same way as a missing path instead of crashing the caller.
    if not policy_dir.is_dir():
        print(f"[Loader] Warning: {directory} is not a directory")
        return documents

    # iterdir() yields entries in arbitrary filesystem order; sort them so
    # the document order is reproducible.
    for file_path in sorted(policy_dir.iterdir()):
        if not file_path.is_file():
            continue

        # One bad file must not abort the whole load, hence the broad catch.
        try:
            suffix = file_path.suffix.lower()

            if suffix == ".pdf":
                text = load_pdf(file_path)

            elif suffix in (".txt", ".md"):
                text = load_text(file_path)

            else:
                print(f"[Loader] Skipped unsupported file: {file_path.name}")
                continue

            # -------------------------------------------------
            # Validate extracted text
            # -------------------------------------------------
            if text and text.strip():
                documents.append({
                    "text": text,
                    "metadata": {
                        "source": file_path.name,
                        "type": suffix.replace(".", "")
                    }
                })
                print(f"[Loader] Loaded: {file_path.name} | chars={len(text)}")
            else:
                print(f"[Loader] Empty or image-only file skipped: {file_path.name}")

        except Exception as e:
            print(f"[Loader] Error loading {file_path.name}: {e}")

    return documents


# ---------------------------------------------------------
# PDF Loader (Robust Version)
# ---------------------------------------------------------
def load_pdf(file_path: Path) -> str:
    """
    Pull the text layer out of a PDF, one page at a time.

    Problems are reported on stdout with a "[Loader]" prefix rather than
    raised:
    - the file cannot be opened or parsed -> returns ""
    - the PDF reports no pages            -> returns ""
    - a page fails or has only whitespace -> that page is skipped

    Returns:
        The text of every non-empty page joined with newlines. This is an
        empty string when no page produced text (the log message suggests
        a scanned PDF in that case).
    """
    extracted = []

    try:
        with open(file_path, "rb") as pdf_file:
            pdf = PyPDF2.PdfReader(pdf_file)

            if not pdf.pages:
                print(f"[Loader] PDF has no pages: {file_path.name}")
                return ""

            # Number pages from 1 so log lines match what a viewer shows.
            for page_number, page in enumerate(pdf.pages, start=1):
                try:
                    content = page.extract_text()

                    # Nothing usable on this page: log it and move on.
                    if not content or not content.strip():
                        print(f"[Loader] Page {page_number} empty or image-only")
                        continue

                    extracted.append(content)

                except Exception as exc:
                    # A single broken page must not discard the others.
                    print(f"[Loader] Failed reading page {page_number}: {exc}")

    except Exception as exc:
        print(f"[Loader] Failed opening PDF {file_path.name}: {exc}")
        return ""

    combined = "\n".join(extracted)

    # Every page came back blank.
    if not combined.strip():
        print(f"[Loader] No extractable text found (likely scanned PDF): {file_path.name}")

    return combined


# ---------------------------------------------------------
# Text Loader
# ---------------------------------------------------------
def load_text(file_path: Path) -> str:
    """
    Load text from TXT or MD safely.

    The file is decoded as UTF-8 first; a leading byte-order mark, if any,
    is dropped so it does not end up in the returned text. If the bytes are
    not valid UTF-8 the file is re-read as latin-1, which accepts any byte
    sequence.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded file contents, or "" when the file cannot be read
        (the error is printed with a "[Loader]" prefix, not raised).
    """
    # "utf-8-sig" behaves like "utf-8" but strips a leading BOM.
    for encoding in ("utf-8-sig", "latin-1"):
        try:
            with open(file_path, "r", encoding=encoding) as f:
                return f.read()

        except UnicodeDecodeError:
            # Not valid in this encoding; try the next candidate.
            continue

        except Exception as e:
            # Same handling on every attempt, so an I/O error during the
            # fallback read is reported instead of escaping to the caller.
            print(f"[Loader] Error reading text file {file_path.name}: {e}")
            return ""

    # Only reached if every candidate encoding rejected the bytes.
    return ""