"""

data_loader.py

──────────────

Loads the Parquet-backed PDF dataset from Hugging Face Hub and returns

a list of LangChain Document objects ready for indexing.



Responsibilities:

  - Connect to HF Hub (handles both public and private datasets)

  - Auto-detect the text column

  - Yield Document objects with rich metadata (source file, page number, etc.)

"""

import logging
from typing import Optional

import pandas as pd
from datasets import load_dataset
from langchain_core.documents import Document

from config import cfg

logger = logging.getLogger(__name__)


# ── Public API ────────────────────────────────────────────────────────────────

def load_documents() -> list[Document]:
    """Load the configured HF dataset and return indexing-ready Documents.

    Returns
    -------
    list[Document]
        One Document per non-empty row, with metadata preserved.

    Raises
    ------
    ValueError
        If the dataset is not configured or no usable text column is found.
    """
    # Fail fast with an actionable message before any network work.
    if not cfg.hf_dataset:
        raise ValueError(
            "HF_DATASET env var is not set. "
            "Set it to 'username/dataset-name' in your Space secrets."
        )

    frame = _fetch_dataframe()
    column = _detect_text_column(frame)
    docs = _build_documents(frame, column)

    logger.info(
        "Loaded %d documents from '%s' (column: '%s')",
        len(docs),
        cfg.hf_dataset,
        column,
    )
    return docs


# ── Internal helpers ──────────────────────────────────────────────────────────

def _fetch_dataframe() -> pd.DataFrame:
    """Materialize the configured HF Hub dataset split as a pandas DataFrame."""
    logger.info("Fetching dataset '%s' split='%s' …", cfg.hf_dataset, cfg.dataset_split)
    # Token is passed only when set; `or None` maps "" to anonymous access.
    dataset = load_dataset(
        cfg.hf_dataset,
        split=cfg.dataset_split,
        token=cfg.hf_token or None,
    )
    frame = dataset.to_pandas()
    logger.info("Dataset shape: %s | columns: %s", frame.shape, frame.columns.tolist())
    return frame


def _detect_text_column(df: pd.DataFrame) -> str:
    """Pick the column that holds the document text.

    Column names are compared case-insensitively against the configured
    candidate list; the first candidate that matches wins. When nothing
    matches, the first column is used and a warning is emitted.
    """
    by_lower = {name.lower(): name for name in df.columns}
    chosen = next(
        (by_lower[cand] for cand in cfg.text_column_candidates if cand in by_lower),
        None,
    )
    if chosen is not None:
        return chosen

    fallback = df.columns[0]
    logger.warning(
        "No known text column found. Falling back to '%s'. "
        "Expected one of: %s",
        fallback, cfg.text_column_candidates,
    )
    return fallback


def _build_documents(df: pd.DataFrame, text_col: str) -> list[Document]:
    """Convert DataFrame rows into LangChain Document objects with metadata.

    Parameters
    ----------
    df : pd.DataFrame
        Source table, one row per text fragment.
    text_col : str
        Name of the column containing the text to index.

    Returns
    -------
    list[Document]
        One Document per row with usable text. Every other column is
        carried along (stringified) in the metadata, plus the originating
        row index under 'source_row'.
    """
    meta_cols = [c for c in df.columns if c != text_col]

    documents: list[Document] = []
    for row_idx, row in df.iterrows():
        raw = row[text_col]
        # Check for real missing values (NaN/None/NaT) BEFORE stringifying:
        # the previous `str(...).lower() == "nan"` check also dropped rows
        # whose legitimate text was literally "nan"/"NaN".
        if pd.isna(raw):
            continue
        text = str(raw).strip()
        if not text:
            continue  # skip empty rows

        # Stringify metadata values so downstream stores get scalar-safe data.
        metadata = {col: str(row.get(col, "")) for col in meta_cols}
        metadata["source_row"] = int(row_idx)  # type: ignore[arg-type]

        documents.append(Document(page_content=text, metadata=metadata))

    return documents