Spaces:
Sleeping
Sleeping
| """ | |
| data_loader.py | |
| ββββββββββββββ | |
| Loads the Parquet-backed PDF dataset from Hugging Face Hub and returns | |
| a list of LangChain Document objects ready for indexing. | |
| Responsibilities: | |
| - Connect to HF Hub (handles both public and private datasets) | |
| - Auto-detect the text column | |
| - Yield Document objects with rich metadata (source file, page number, etc.) | |
| """ | |
| import logging | |
| from typing import Optional | |
| import pandas as pd | |
| from datasets import load_dataset | |
| from langchain_core.documents import Document | |
| from config import cfg | |
| logger = logging.getLogger(__name__) | |
# -- Public API ----------------------------------------------------------------
| def load_documents() -> list[Document]: | |
| """ | |
| Entry point: load HF dataset and return chunked-ready Document objects. | |
| Returns | |
| ------- | |
| list[Document] | |
| One Document per non-empty row, with metadata preserved. | |
| Raises | |
| ------ | |
| ValueError | |
| If the dataset is not configured or no usable text column is found. | |
| """ | |
| if not cfg.hf_dataset: | |
| raise ValueError( | |
| "HF_DATASET env var is not set. " | |
| "Set it to 'username/dataset-name' in your Space secrets." | |
| ) | |
| df = _fetch_dataframe() | |
| text_col = _detect_text_column(df) | |
| documents = _build_documents(df, text_col) | |
| logger.info("Loaded %d documents from '%s' (column: '%s')", | |
| len(documents), cfg.hf_dataset, text_col) | |
| return documents | |
# -- Internal helpers ----------------------------------------------------------
| def _fetch_dataframe() -> pd.DataFrame: | |
| """Download the dataset split from HF Hub and return as a DataFrame.""" | |
| logger.info("Fetching dataset '%s' split='%s' β¦", cfg.hf_dataset, cfg.dataset_split) | |
| ds = load_dataset( | |
| cfg.hf_dataset, | |
| split=cfg.dataset_split, | |
| token=cfg.hf_token or None, | |
| ) | |
| df = ds.to_pandas() | |
| logger.info("Dataset shape: %s | columns: %s", df.shape, df.columns.tolist()) | |
| return df | |
| def _detect_text_column(df: pd.DataFrame) -> str: | |
| """ | |
| Find the first column whose lowercase name matches a known text-column | |
| name. Falls back to the first column if none match. | |
| """ | |
| col_lower = {c.lower(): c for c in df.columns} | |
| for candidate in cfg.text_column_candidates: | |
| if candidate in col_lower: | |
| return col_lower[candidate] | |
| fallback = df.columns[0] | |
| logger.warning( | |
| "No known text column found. Falling back to '%s'. " | |
| "Expected one of: %s", | |
| fallback, cfg.text_column_candidates, | |
| ) | |
| return fallback | |
| def _build_documents(df: pd.DataFrame, text_col: str) -> list[Document]: | |
| """Convert DataFrame rows into LangChain Document objects with metadata.""" | |
| meta_cols = [c for c in df.columns if c != text_col] | |
| documents: list[Document] = [] | |
| for row_idx, row in df.iterrows(): | |
| text = str(row[text_col]).strip() | |
| if not text or text.lower() == "nan": | |
| continue # skip empty rows | |
| metadata = {col: str(row.get(col, "")) for col in meta_cols} | |
| metadata["source_row"] = int(row_idx) # type: ignore[arg-type] | |
| documents.append(Document(page_content=text, metadata=metadata)) | |
| return documents | |