Spaces:
Running
Running
| import logging | |
| import gradio as gr | |
| from typing import Dict, Any | |
| from hf_eda_mcp.services.dataset_service import ( | |
| DatasetServiceError, | |
| DatasetNotParquetError, | |
| NoTextColumnsError, | |
| get_dataset_service | |
| ) | |
| from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError | |
| from hf_eda_mcp.validation import ( | |
| validate_dataset_id, | |
| validate_config_name, | |
| validate_split_name, | |
| ValidationError, | |
| format_validation_error, | |
| ) | |
| from hf_eda_mcp.error_handling import format_error_response, log_error_with_context | |
| logger = logging.getLogger(__name__) | |
def search_text_in_dataset(
    dataset_id: str,
    config_name: str,
    split: str,
    query: str,
    offset: int = 0,
    length: int = 10,
    hf_api_token: gr.Header = "",
) -> Dict[str, Any]:
    """
    Search for text in text columns of a dataset using the Dataset Viewer API.

    Only text columns are searched and only parquet datasets are supported (builder_name="parquet").
    Useful for finding relevant examples or debugging issues.

    Args:
        dataset_id: HuggingFace full dataset identifier (e.g., 'stanfordnlp/imdb', 'rajpurkar/squad', 'nyu-mll/glue')
        config_name: Configuration name (an empty string is treated as "not provided")
        split: Split name
        query: Search query
        offset: Offset for pagination (default: 0)
        length: Number of examples to return (default: 10). The search covers the half-open range [offset, offset + length)
        hf_api_token: Header parsed by Gradio when hf_api_token is provided in MCP configuration headers

    Returns:
        Dictionary containing search results including:
        - features: List of features from the dataset, including column names and data types
        - rows: List of slice of rows of a dataset and the content contained in each column of a specific row.
        - num_rows_total: Total number of examples in the split
        - num_rows_per_page: Number of examples in the current page
        - partial: Whether the response is partial. If True, it means that the search couldn't be run on the full dataset because it's too big.

    Raises:
        ValueError: If an input fails validation, the dataset is not in parquet format, or the dataset has no text columns
        DatasetNotFoundError: Re-raised unchanged when raised by the dataset service
        AuthenticationError: Re-raised unchanged when raised by the dataset service
        NetworkError: Re-raised unchanged when raised by the dataset service
        DatasetServiceError: Wraps any other unexpected failure during the search
    """
    # Handle empty strings from Gradio (convert to None)
    if config_name == "":
        config_name = None

    # Input validation using centralized validation
    try:
        dataset_id = validate_dataset_id(dataset_id)
        config_name = validate_config_name(config_name)
        split = validate_split_name(split)
    except ValidationError as e:
        # Format once, then chain the cause so the original ValidationError
        # stays attached to the ValueError surfaced to the caller.
        validation_message = format_validation_error(e)
        logger.error(f"Validation error: {validation_message}")
        raise ValueError(validation_message) from e

    # Context attached to every error log emitted below
    context = {
        "dataset_id": dataset_id,
        "config_name": config_name,
        "split": split,
        "query": query,
        "offset": offset,
        "length": length,
        "operation": "search_text_in_dataset"
    }

    logger.info(
        f"Searching text {query} in dataset: {dataset_id}, split: {split}, "
        f"config: {config_name}, offset: {offset}, length: {length}"
    )

    try:
        # Get dataset service
        service = get_dataset_service(hf_api_token=hf_api_token)

        # Search in dataset
        search_results = service.search_text_in_dataset(
            dataset_id=dataset_id,
            config_name=config_name,
            split_name=split,
            query=query,
            offset=offset,
            length=length
        )
        return search_results

    except DatasetNotParquetError as e:
        # Unsupported dataset format: reported to the caller as a ValueError
        log_error_with_context(e, context, level=logging.WARNING)
        logger.info(f"Dataset is not in parquet format: {str(e)}")
        raise ValueError(str(e)) from e

    except NoTextColumnsError as e:
        # Nothing searchable in this dataset: reported to the caller as a ValueError
        log_error_with_context(e, context, level=logging.WARNING)
        logger.info(f"Dataset has no text columns: {str(e)}")
        raise ValueError(str(e)) from e

    except DatasetNotFoundError as e:
        # Log the suggestions, then propagate the original exception unchanged
        log_error_with_context(e, context, level=logging.WARNING)
        error_response = format_error_response(e, context)
        logger.info(f"Dataset/split not found suggestions: {error_response.get('suggestions', [])}")
        raise

    except AuthenticationError as e:
        log_error_with_context(e, context, level=logging.WARNING)
        error_response = format_error_response(e, context)
        logger.info(f"Authentication error guidance: {error_response.get('suggestions', [])}")
        raise

    except NetworkError as e:
        # Logged at the helper's default level (no explicit level passed here)
        log_error_with_context(e, context)
        error_response = format_error_response(e, context)
        logger.info(f"Network error guidance: {error_response.get('suggestions', [])}")
        raise

    except Exception as e:
        # Boundary handler: wrap anything unexpected in the service-level error type
        log_error_with_context(e, context)
        raise DatasetServiceError(f"Failed to search in dataset: {str(e)}") from e