KhalilGuetari committed on
Commit
e93499e
·
1 Parent(s): 59f0b85

fix checking statistics capabilities

Browse files
scripts/playground/metadata_tool_playground.py CHANGED
@@ -25,7 +25,7 @@ logging.basicConfig(
25
  logger = logging.getLogger(__name__)
26
 
27
 
28
- def test_merged_metadata(dataset_name = "rajpurkar/squad"):
29
  """Test merged metadata retrieval."""
30
  print("=" * 80)
31
  print("Testing Merged Metadata from DatasetService")
@@ -43,7 +43,7 @@ def test_merged_metadata(dataset_name = "rajpurkar/squad"):
43
  # Test with squad dataset
44
  print(f"\n### Testing: {dataset_name} ###\n")
45
  try:
46
- metadata = service.load_dataset_info(dataset_name)
47
 
48
  print("Key Information:")
49
  print(f" Dataset ID: {metadata.get('id')}")
@@ -124,4 +124,4 @@ def test_multi_config_dataset(dataset_name = "stanfordnlp/imdb"):
124
 
125
  if __name__ == "__main__":
126
  test_merged_metadata()
127
- test_multi_config_dataset()
 
25
  logger = logging.getLogger(__name__)
26
 
27
 
28
+ def test_merged_metadata(dataset_name = "wikitext", config_name = "wikitext-2-raw-v1"):
29
  """Test merged metadata retrieval."""
30
  print("=" * 80)
31
  print("Testing Merged Metadata from DatasetService")
 
43
  # Test with squad dataset
44
  print(f"\n### Testing: {dataset_name} ###\n")
45
  try:
46
+ metadata = service.load_dataset_info(dataset_name, config_name)
47
 
48
  print("Key Information:")
49
  print(f" Dataset ID: {metadata.get('id')}")
 
124
 
125
  if __name__ == "__main__":
126
  test_merged_metadata()
127
+ #test_multi_config_dataset()
src/hf_eda_mcp/services/dataset_service.py CHANGED
@@ -808,7 +808,7 @@ class DatasetService:
808
  def _check_statistics_availability(
809
  self,
810
  dataset_name: str,
811
- config: Optional[str] = None
812
  ) -> dict:
813
  """
814
  Check if statistics are available for a dataset.
@@ -818,7 +818,7 @@ class DatasetService:
818
 
819
  Args:
820
  dataset_name: HuggingFace dataset identifier
821
- config: Optional configuration name
822
 
823
  Returns:
824
  Dictionary with availability information:
@@ -830,34 +830,40 @@ class DatasetService:
830
  DatasetViewerError: If the API request fails
831
  """
832
  try:
833
- dataset_info = self.load_dataset_info(dataset_name, config)
834
  full_dataset_id = dataset_info.get('id', dataset_name)
835
 
836
  if len(dataset_info["configs"]) == 1:
837
  # Single config format
838
  builder_name = dataset_info.get('builder_name', '')
839
  is_parquet = builder_name == 'parquet'
840
-
841
- return {
842
- "available": is_parquet,
843
- "full_dataset_id": full_dataset_id,
844
- "configs": [dataset_info.get('config_name')] if is_parquet else [],
845
- "reason": 'Statistics available' if is_parquet else f'Statistics only available for parquet datasets (found: {builder_name})'
846
- }
847
  else:
848
  # Multiple configs format
849
- parquet_configs = []
850
- for cfg_data in dataset_info["config_details"]:
851
- if cfg_data.get('builder_name') == 'parquet':
852
- parquet_configs.append(cfg_data.get("config_name"))
853
-
854
- return {
855
- "full_dataset_id": full_dataset_id,
856
- "available": len(parquet_configs) > 0,
857
- "configs": parquet_configs,
858
- "reason": f'Statistics available for {len(parquet_configs)} config(s)' if parquet_configs else 'No parquet configs found'
859
- }
860
-
 
 
 
 
 
 
 
 
 
 
861
  except Exception as e:
862
  error_msg = f"Unexpected error checking statistics availability: {str(e)}"
863
  logger.error(error_msg)
@@ -867,8 +873,8 @@ class DatasetService:
867
  def get_dataset_service(hf_api_token: str) -> DatasetService:
868
  """Get or create the global dataset service instance using current config."""
869
  config = get_config()
870
- #if hf_api_token is None or len(hf_api_token) == 0:
871
- hf_api_token = config.hf_token
872
  dataset_service = DatasetService(
873
  cache_dir=config.cache_dir,
874
  token=hf_api_token
 
808
  def _check_statistics_availability(
809
  self,
810
  dataset_name: str,
811
+ config_name: Optional[str] = None
812
  ) -> dict:
813
  """
814
  Check if statistics are available for a dataset.
 
818
 
819
  Args:
820
  dataset_name: HuggingFace dataset identifier
821
+ config_name: Optional configuration name
822
 
823
  Returns:
824
  Dictionary with availability information:
 
830
  DatasetViewerError: If the API request fails
831
  """
832
  try:
833
+ dataset_info = self.load_dataset_info(dataset_name, config_name)
834
  full_dataset_id = dataset_info.get('id', dataset_name)
835
 
836
  if len(dataset_info["configs"]) == 1:
837
  # Single config format
838
  builder_name = dataset_info.get('builder_name', '')
839
  is_parquet = builder_name == 'parquet'
840
+ configs = [dataset_info["configs"][0]] if is_parquet else [],
841
+ reason = 'Statistics available' if is_parquet else f'Statistics only available for parquet datasets (found: {builder_name})'
842
+
 
 
 
 
843
  else:
844
  # Multiple configs format
845
+ if config_name is None:
846
+ # Take every configs
847
+ configs = []
848
+ for cfg_data in dataset_info["config_details"]:
849
+ if cfg_data.get('builder_name') == 'parquet':
850
+ configs.append(cfg_data.get("config_name"))
851
+ is_parquet = len(configs) > 0
852
+ reason = f'Statistics available for {len(configs)} config(s)' if configs else 'No parquet configs found'
853
+
854
+ else:
855
+ configs = [config_name]
856
+ builder_name = dataset_info.get('builder_name', '')
857
+ is_parquet = builder_name == 'parquet'
858
+ reason = f'Statistics available for provided config {config_name}' if is_parquet else f'No parquet found for config {config_name}'
859
+
860
+ return {
861
+ "available": is_parquet,
862
+ "full_dataset_id": full_dataset_id,
863
+ "configs": configs,
864
+ "reason": reason
865
+ }
866
+
867
  except Exception as e:
868
  error_msg = f"Unexpected error checking statistics availability: {str(e)}"
869
  logger.error(error_msg)
 
873
  def get_dataset_service(hf_api_token: str) -> DatasetService:
874
  """Get or create the global dataset service instance using current config."""
875
  config = get_config()
876
+ if hf_api_token is None or len(hf_api_token) == 0:
877
+ hf_api_token = config.hf_token
878
  dataset_service = DatasetService(
879
  cache_dir=config.cache_dir,
880
  token=hf_api_token