Spaces:
Sleeping
Sleeping
| """ | |
| data_loader.py | |
| ββββββββββββββ | |
| Loads the Parquet-backed PDF dataset from Hugging Face Hub and returns | |
| a list of LangChain Document objects ready for indexing. | |
| Responsibilities: | |
| - Connect to HF Hub (handles both public and private datasets) | |
| - Auto-detect the text column | |
| - Yield Document objects with rich metadata (source file, page number, etc.) | |
| """ | |
| import logging | |
| from typing import Optional | |
| import pandas as pd | |
| from datasets import load_dataset | |
| from langchain_core.documents import Document | |
| from config import cfg | |
| logger = logging.getLogger(__name__) | |
# -- Public API ----------------------------------------------------------------
| def load_documents() -> list[Document]: | |
| """ | |
| Entry point: load HF dataset and return chunked-ready Document objects. | |
| Returns | |
| ------- | |
| list[Document] | |
| One Document per non-empty row, with metadata preserved. | |
| Raises | |
| ------ | |
| ValueError | |
| If the dataset is not configured or no usable text column is found. | |
| """ | |
| if not cfg.hf_dataset: | |
| raise ValueError( | |
| "HF_DATASET env var is not set. " | |
| "Set it to 'username/dataset-name' in your Space secrets." | |
| ) | |
| df = _fetch_dataframe() | |
| text_col = _detect_text_column(df) | |
| documents = _build_documents(df, text_col) | |
| logger.info("Loaded %d documents from '%s' (column: '%s')", | |
| len(documents), cfg.hf_dataset, text_col) | |
| return documents | |
# -- Internal helpers ----------------------------------------------------------
| def _fetch_dataframe() -> pd.DataFrame: | |
| """Download the dataset split from HF Hub and return as a DataFrame.""" | |
| logger.info("Fetching dataset '%s' split='%s' β¦", cfg.hf_dataset, cfg.dataset_split) | |
| ds = load_dataset( | |
| cfg.hf_dataset, | |
| split=cfg.dataset_split, | |
| token=cfg.hf_token or None, | |
| ) | |
| df = ds.to_pandas() | |
| logger.info("Dataset shape: %s | columns: %s", df.shape, df.columns.tolist()) | |
| return df | |
| def _detect_text_column(df: pd.DataFrame) -> str: | |
| """ | |
| Find the first column whose lowercase name matches a known text-column | |
| name. Falls back to the first column if none match. | |
| """ | |
| col_lower = {c.lower(): c for c in df.columns} | |
| for candidate in cfg.text_column_candidates: | |
| if candidate in col_lower: | |
| return col_lower[candidate] | |
| fallback = df.columns[0] | |
| logger.warning( | |
| "No known text column found. Falling back to '%s'. " | |
| "Expected one of: %s", | |
| fallback, cfg.text_column_candidates, | |
| ) | |
| return fallback | |
| def _build_documents(df: pd.DataFrame, text_col: str) -> list[Document]: | |
| """Convert DataFrame rows into LangChain Document objects with metadata.""" | |
| meta_cols = [c for c in df.columns if c != text_col] | |
| documents: list[Document] = [] | |
| for row_idx, row in df.iterrows(): | |
| text = str(row[text_col]).strip() | |
| if not text or text.lower() == "nan": | |
| continue # skip empty rows | |
| metadata = {col: str(row.get(col, "")) for col in meta_cols} | |
| metadata["source_row"] = int(row_idx) # type: ignore[arg-type] | |
| documents.append(Document(page_content=text, metadata=metadata)) | |
| return documents | |