import logging
import gradio as gr
from typing import Dict, Any
from hf_eda_mcp.services.dataset_service import (
DatasetServiceError,
DatasetNotParquetError,
NoTextColumnsError,
get_dataset_service
)
from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError
from hf_eda_mcp.validation import (
validate_dataset_id,
validate_config_name,
validate_split_name,
ValidationError,
format_validation_error,
)
from hf_eda_mcp.error_handling import format_error_response, log_error_with_context
logger = logging.getLogger(__name__)
def search_text_in_dataset(
    dataset_id: str,
    config_name: str,
    split: str,
    query: str,
    offset: int = 0,
    length: int = 10,
    hf_api_token: gr.Header = "",
) -> Dict[str, Any]:
    """
    Search for text in text columns of a dataset using the Dataset Viewer API.

    Only text columns are searched and only parquet datasets are supported
    (builder_name="parquet"). Useful for finding relevant examples or
    debugging issues.

    Args:
        dataset_id: HuggingFace full dataset identifier (e.g., 'stanfordnlp/imdb',
            'rajpurkar/squad', 'nyu-mll/glue')
        config_name: Configuration name
        split: Split name
        query: Search query
        offset: Offset for pagination (default: 0)
        length: Number of examples to return (default: 10). Means that we
            search in [offset, offset+length[
        hf_api_token: Header parsed by Gradio when hf_api_token is provided
            in MCP configuration headers

    Returns:
        Dictionary containing search results including:
        - features: List of features from the dataset, including column names and data types
        - rows: List of slice of rows of a dataset and the content contained in each column of a specific row.
        - num_rows_total: Total number of examples in the split
        - num_rows_per_page: Number of examples in the current page
        - partial: Whether the response is partial. If True, it means that the search couldn’t be run on the full dataset because it’s too big.

    Raises:
        ValueError: If input validation fails, the dataset is not in parquet
            format, or the dataset has no text columns.
        DatasetNotFoundError: If the dataset/config/split cannot be found.
        AuthenticationError: If the provided token is missing or invalid.
        NetworkError: If the Dataset Viewer API cannot be reached.
        DatasetServiceError: For any other unexpected failure.
    """
    # Handle empty strings from Gradio (convert to None)
    if config_name == "":
        config_name = None

    # Input validation using centralized validation; surface failures as
    # ValueError (chained, so the original ValidationError is preserved).
    try:
        dataset_id = validate_dataset_id(dataset_id)
        config_name = validate_config_name(config_name)
        split = validate_split_name(split)
    except ValidationError as e:
        # Lazy %-style args avoid formatting when the log level is disabled.
        logger.error("Validation error: %s", format_validation_error(e))
        raise ValueError(format_validation_error(e)) from e

    # Context attached to every logged error for diagnostics.
    context = {
        "dataset_id": dataset_id,
        "config_name": config_name,
        "split": split,
        "query": query,
        "offset": offset,
        "length": length,
        "operation": "search_text_in_dataset"
    }
    logger.info(
        "Searching text %s in dataset: %s, split: %s, config: %s, offset: %s, length: %s",
        query, dataset_id, split, config_name, offset, length,
    )

    try:
        # Get dataset service (authenticated if a token was supplied)
        service = get_dataset_service(hf_api_token=hf_api_token)
        # Search in dataset
        search_results = service.search_text_in_dataset(
            dataset_id=dataset_id,
            config_name=config_name,
            split_name=split,
            query=query,
            offset=offset,
            length=length
        )
        return search_results
    except DatasetNotParquetError as e:
        # Expected limitation, not a server fault: report to caller as ValueError.
        log_error_with_context(e, context, level=logging.WARNING)
        logger.info("Dataset is not in parquet format: %s", e)
        raise ValueError(str(e)) from e
    except NoTextColumnsError as e:
        # Nothing searchable in this dataset; also a caller-facing ValueError.
        log_error_with_context(e, context, level=logging.WARNING)
        logger.info("Dataset has no text columns: %s", e)
        raise ValueError(str(e)) from e
    except DatasetNotFoundError as e:
        # Log formatted suggestions, then re-raise unchanged for upstream handling.
        log_error_with_context(e, context, level=logging.WARNING)
        error_response = format_error_response(e, context)
        logger.info("Dataset/split not found suggestions: %s", error_response.get('suggestions', []))
        raise
    except AuthenticationError as e:
        log_error_with_context(e, context, level=logging.WARNING)
        error_response = format_error_response(e, context)
        logger.info("Authentication error guidance: %s", error_response.get('suggestions', []))
        raise
    except NetworkError as e:
        log_error_with_context(e, context)
        error_response = format_error_response(e, context)
        logger.info("Network error guidance: %s", error_response.get('suggestions', []))
        raise
    except Exception as e:
        # Boundary catch-all: wrap anything unexpected in the service's base error.
        log_error_with_context(e, context)
        raise DatasetServiceError(f"Failed to search in dataset: {str(e)}") from e