# DemoChatBot / data_loader.py
# (Hugging Face Hub page header — uploader: OnlyTheTruth03, commit: "Initial Commit", rev 721ca73 verified)
"""
data_loader.py
──────────────
Loads the Parquet-backed PDF dataset from Hugging Face Hub and returns
a list of LangChain Document objects ready for indexing.
Responsibilities:
- Connect to HF Hub (handles both public and private datasets)
- Auto-detect the text column
- Yield Document objects with rich metadata (source file, page number, etc.)
"""
import logging
from typing import Optional
import pandas as pd
from datasets import load_dataset
from langchain_core.documents import Document
from config import cfg
logger = logging.getLogger(__name__)
# ── Public API ────────────────────────────────────────────────────────────────
def load_documents() -> list[Document]:
    """
    Entry point: load the configured HF dataset and convert it into
    index-ready LangChain Document objects.

    Returns
    -------
    list[Document]
        One Document per non-empty row, with metadata preserved.

    Raises
    ------
    ValueError
        If the dataset is not configured or no usable text column is found.
    """
    # Fail fast with an actionable message when the Space is misconfigured.
    if not cfg.hf_dataset:
        raise ValueError(
            "HF_DATASET env var is not set. "
            "Set it to 'username/dataset-name' in your Space secrets."
        )

    frame = _fetch_dataframe()
    column = _detect_text_column(frame)
    docs = _build_documents(frame, column)

    logger.info(
        "Loaded %d documents from '%s' (column: '%s')",
        len(docs),
        cfg.hf_dataset,
        column,
    )
    return docs
# ── Internal helpers ──────────────────────────────────────────────────────────
def _fetch_dataframe() -> pd.DataFrame:
    """Pull the configured split from HF Hub and materialize it as a DataFrame."""
    logger.info("Fetching dataset '%s' split='%s' …", cfg.hf_dataset, cfg.dataset_split)
    # token=None (when cfg.hf_token is falsy) keeps public datasets working
    # while still supporting private ones when a token is provided.
    dataset = load_dataset(
        cfg.hf_dataset,
        split=cfg.dataset_split,
        token=cfg.hf_token or None,
    )
    frame = dataset.to_pandas()
    logger.info("Dataset shape: %s | columns: %s", frame.shape, frame.columns.tolist())
    return frame
def _detect_text_column(df: pd.DataFrame) -> str:
    """
    Pick the column that holds the document text.

    Candidates from cfg.text_column_candidates are matched case-insensitively
    against the DataFrame's columns; the first hit wins. When nothing matches,
    the first column is used and a warning is logged.
    """
    by_lower = {name.lower(): name for name in df.columns}
    # First candidate present in the frame, or None if no candidate matches.
    found = next(
        (by_lower[cand] for cand in cfg.text_column_candidates if cand in by_lower),
        None,
    )
    if found is not None:
        return found

    fallback = df.columns[0]
    logger.warning(
        "No known text column found. Falling back to '%s'. "
        "Expected one of: %s",
        fallback, cfg.text_column_candidates,
    )
    return fallback
def _build_documents(df: pd.DataFrame, text_col: str) -> list[Document]:
    """
    Convert DataFrame rows into LangChain Document objects with metadata.

    Parameters
    ----------
    df : pd.DataFrame
        Source rows; every column except *text_col* is copied into metadata.
    text_col : str
        Name of the column holding the document text.

    Returns
    -------
    list[Document]
        One Document per row with non-empty text; the original row index is
        recorded under metadata["source_row"].
    """
    meta_cols = [c for c in df.columns if c != text_col]
    documents: list[Document] = []
    for row_idx, row in df.iterrows():
        raw = row[text_col]
        # BUG FIX: None/NaT cells previously slipped through as the literal
        # string "None" (only "nan" was filtered below) and were indexed as
        # bogus documents. pd.isna catches NaN, None and NaT uniformly
        # before string conversion.
        if pd.isna(raw):
            continue
        text = str(raw).strip()
        if not text or text.lower() == "nan":
            continue  # skip empty rows
        # Stringify metadata values so they stay serializable downstream.
        metadata = {col: str(row.get(col, "")) for col in meta_cols}
        metadata["source_row"] = int(row_idx)  # type: ignore[arg-type]
        documents.append(Document(page_content=text, metadata=metadata))
    return documents