File size: 4,782 Bytes
ca96eb9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import logging
import gradio as gr
from typing import Dict, Any
from hf_eda_mcp.services.dataset_service import (
    DatasetServiceError,
    DatasetNotParquetError,
    NoTextColumnsError,
    get_dataset_service
)
from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError
from hf_eda_mcp.validation import (
    validate_dataset_id,
    validate_config_name,
    validate_split_name,
    ValidationError,
    format_validation_error,
)
from hf_eda_mcp.error_handling import format_error_response, log_error_with_context


logger = logging.getLogger(__name__)


def search_text_in_dataset(
    dataset_id: str,
    config_name: str,
    split: str,
    query: str,
    offset: int = 0,
    length: int = 10,
    hf_api_token: gr.Header = "",
) -> Dict[str, Any]:
    """
    Search for text in text columns of a dataset using the Dataset Viewer API.
    Only text columns are searched and only parquet datasets are supported (builder_name="parquet")

    Useful for finding relevant examples or debugging issues.

    Args:
        dataset_id: HuggingFace full dataset identifier (e.g., 'stanfordnlp/imdb', 'rajpurkar/squad', 'nyu-mll/glue')
        config_name: Configuration name
        split: Split name
        query: Search query
        offset: Offset for pagination (default: 0)
        length: Number of examples to return (default: 10). Means that we search in [offset, offset+length[
        hf_api_token: Header parsed by Gradio when hf_api_token is provided in MCP configuration headers

    Returns:
        Dictionary containing search results including:
        - features: List of features from the dataset, including column names and data types
        - rows: List of slice of rows of a dataset and the content contained in each column of a specific row.
        - num_rows_total: Total number of examples in the split
        - num_rows_per_page: Number of examples in the current page
        - partial: Whether the response is partial. If True, it means that the search couldn’t be run on the full dataset because it’s too big.

    Raises:
        ValueError: If input validation fails, or the dataset is not parquet / has no text columns.
        DatasetNotFoundError: If the dataset, config, or split does not exist.
        AuthenticationError: If the provided token is missing/invalid for a gated dataset.
        NetworkError: If the Dataset Viewer API cannot be reached.
        DatasetServiceError: For any other unexpected failure.
    """
    # Gradio passes "" for an omitted optional field; normalize to None so
    # the service treats it as "use the default config".
    if config_name == "":
        config_name = None

    # Input validation using centralized validation helpers; surface failures
    # to the caller as a plain ValueError with a user-facing message.
    try:
        dataset_id = validate_dataset_id(dataset_id)
        config_name = validate_config_name(config_name)
        split = validate_split_name(split)
    except ValidationError as e:
        logger.error("Validation error: %s", format_validation_error(e))
        # Chain the original exception, consistent with the handlers below.
        raise ValueError(format_validation_error(e)) from e

    # Context dict attached to every logged error for traceability.
    context = {
        "dataset_id": dataset_id,
        "config_name": config_name,
        "split": split,
        "query": query,
        "offset": offset,
        "length": length,
        "operation": "search_text_in_dataset"
    }

    # Lazy %-style args: the message is only formatted if INFO is enabled.
    logger.info(
        "Searching text %s in dataset: %s, split: %s, config: %s, offset: %s, length: %s",
        query, dataset_id, split, config_name, offset, length,
    )

    try:
        # Get dataset service (token enables access to gated/private datasets)
        service = get_dataset_service(hf_api_token=hf_api_token)

        # Delegate the actual search to the service layer
        search_results = service.search_text_in_dataset(
            dataset_id=dataset_id,
            config_name=config_name,
            split_name=split,
            query=query,
            offset=offset,
            length=length
        )

        return search_results

    # Expected user-facing conditions are converted to ValueError so the MCP
    # client receives an actionable message rather than an internal type.
    except DatasetNotParquetError as e:
        log_error_with_context(e, context, level=logging.WARNING)
        logger.info("Dataset is not in parquet format: %s", e)
        raise ValueError(str(e)) from e

    except NoTextColumnsError as e:
        log_error_with_context(e, context, level=logging.WARNING)
        logger.info("Dataset has no text columns: %s", e)
        raise ValueError(str(e)) from e

    # The following errors are re-raised as-is; we only log remediation hints.
    except DatasetNotFoundError as e:
        log_error_with_context(e, context, level=logging.WARNING)
        error_response = format_error_response(e, context)
        logger.info("Dataset/split not found suggestions: %s", error_response.get('suggestions', []))
        raise

    except AuthenticationError as e:
        log_error_with_context(e, context, level=logging.WARNING)
        error_response = format_error_response(e, context)
        logger.info("Authentication error guidance: %s", error_response.get('suggestions', []))
        raise

    except NetworkError as e:
        log_error_with_context(e, context)
        error_response = format_error_response(e, context)
        logger.info("Network error guidance: %s", error_response.get('suggestions', []))
        raise

    # Last-resort boundary: wrap anything unexpected in the package error type.
    except Exception as e:
        log_error_with_context(e, context)
        raise DatasetServiceError(f"Failed to search in dataset: {str(e)}") from e