""" data_loader.py ────────────── Loads the Parquet-backed PDF dataset from Hugging Face Hub and returns a list of LangChain Document objects ready for indexing. Responsibilities: - Connect to HF Hub (handles both public and private datasets) - Auto-detect the text column - Yield Document objects with rich metadata (source file, page number, etc.) """ import logging from typing import Optional import pandas as pd from datasets import load_dataset from langchain_core.documents import Document from config import cfg logger = logging.getLogger(__name__) # ── Public API ──────────────────────────────────────────────────────────────── def load_documents() -> list[Document]: """ Entry point: load HF dataset and return chunked-ready Document objects. Returns ------- list[Document] One Document per non-empty row, with metadata preserved. Raises ------ ValueError If the dataset is not configured or no usable text column is found. """ if not cfg.hf_dataset: raise ValueError( "HF_DATASET env var is not set. " "Set it to 'username/dataset-name' in your Space secrets." ) df = _fetch_dataframe() text_col = _detect_text_column(df) documents = _build_documents(df, text_col) logger.info("Loaded %d documents from '%s' (column: '%s')", len(documents), cfg.hf_dataset, text_col) return documents # ── Internal helpers ────────────────────────────────────────────────────────── def _fetch_dataframe() -> pd.DataFrame: """Download the dataset split from HF Hub and return as a DataFrame.""" logger.info("Fetching dataset '%s' split='%s' …", cfg.hf_dataset, cfg.dataset_split) ds = load_dataset( cfg.hf_dataset, split=cfg.dataset_split, token=cfg.hf_token or None, ) df = ds.to_pandas() logger.info("Dataset shape: %s | columns: %s", df.shape, df.columns.tolist()) return df def _detect_text_column(df: pd.DataFrame) -> str: """ Find the first column whose lowercase name matches a known text-column name. Falls back to the first column if none match. """ col_lower = {c.lower(): c for c in df.columns} for candidate in cfg.text_column_candidates: if candidate in col_lower: return col_lower[candidate] fallback = df.columns[0] logger.warning( "No known text column found. Falling back to '%s'. " "Expected one of: %s", fallback, cfg.text_column_candidates, ) return fallback def _build_documents(df: pd.DataFrame, text_col: str) -> list[Document]: """Convert DataFrame rows into LangChain Document objects with metadata.""" meta_cols = [c for c in df.columns if c != text_col] documents: list[Document] = [] for row_idx, row in df.iterrows(): text = str(row[text_col]).strip() if not text or text.lower() == "nan": continue # skip empty rows metadata = {col: str(row.get(col, "")) for col in meta_cols} metadata["source_row"] = int(row_idx) # type: ignore[arg-type] documents.append(Document(page_content=text, metadata=metadata)) return documents