Update processor.py
Browse files- processor.py +58 -38
processor.py
CHANGED
|
@@ -12,51 +12,71 @@ class DatasetCommandCenter:
|
|
| 12 |
self.token = token
|
| 13 |
|
| 14 |
def get_dataset_metadata(self, dataset_id):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
try:
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
splits = list(infos.values())[0].splits.keys()
|
| 27 |
-
except:
|
| 28 |
-
splits = ['train', 'test', 'validation']
|
| 29 |
-
# license
|
| 30 |
-
try:
|
| 31 |
-
configs = get_dataset_config_names(dataset_id, token=self.token)
|
| 32 |
-
except:
|
| 33 |
-
configs = ['default']
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
| 50 |
except Exception as e:
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
def get_splits_for_config(self, dataset_id, config_name):
|
|
|
|
| 54 |
try:
|
| 55 |
infos = get_dataset_infos(dataset_id, config_name=config_name, token=self.token)
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
# --- HELPER: Recursive JSON/Dot Notation Getter ---
|
| 62 |
def _get_value_by_path(self, obj, path):
|
|
|
|
| 12 |
self.token = token
|
| 13 |
|
| 14 |
def get_dataset_metadata(self, dataset_id):
    """Collect config names, split names and license for a dataset.

    Both lookups are best-effort: a failure is logged as a warning and
    replaced by a fallback value, so this method does not propagate
    exceptions raised by the lookups themselves.

    Args:
        dataset_id: Dataset identifier passed through unchanged to
            ``get_dataset_config_names`` and ``get_dataset_infos``.

    Returns:
        A dict with the keys ``"status"`` (always ``"success"``),
        ``"configs"``, ``"splits"`` and ``"license_detected"``. The two
        list entries are never empty or ``None``.
    """
    license_name = "unknown"
    splits = []

    # Step 1: config names. Any failure degrades to a single 'default' config.
    try:
        configs = get_dataset_config_names(dataset_id, token=self.token)
    except Exception as e:
        logger.warning(f"Could not fetch configs for {dataset_id}: {e}")
        configs = ['default']

    # Step 2: splits and license. The infos lookup is expected to fail for
    # some datasets, so the whole step shares one handler with safe fallbacks.
    try:
        preferred = configs[0] if configs else 'default'
        infos = get_dataset_infos(dataset_id, token=self.token)

        # Lookup order: the first config, then 'default', then whatever
        # entry comes first when neither name is present.
        for candidate in (preferred, 'default'):
            if candidate in infos:
                chosen = infos[candidate]
                break
        else:
            chosen = list(infos.values())[0] if len(infos) > 0 else None

        if chosen:
            splits = list(chosen.splits.keys())
            license_name = chosen.license or "unknown"
    except Exception as e:
        logger.warning(f"Could not fetch dataset_infos (using fallbacks): {e}")
        splits = ['train', 'test', 'validation']
        license_name = "unknown"

    # Empty lists are replaced so callers always receive something usable.
    return {
        "status": "success",
        "configs": configs or ['default'],
        "splits": splits or ['train'],
        "license_detected": license_name,
    }
|
| 62 |
|
| 63 |
def get_splits_for_config(self, dataset_id, config_name):
    """Return the split names available for one config of a dataset.

    The lookup is best-effort: any failure is logged as a warning and
    replaced by a generic fallback list instead of being raised.

    Args:
        dataset_id: Dataset identifier passed to ``get_dataset_infos``.
        config_name: Config whose splits are wanted. When it is not among
            the returned infos, the first available entry is used instead.

    Returns:
        A dict ``{"status": "success", "splits": [...]}`` whose ``"splits"``
        list is never empty.
    """
    splits = []
    try:
        # config_name is intentionally NOT forwarded as a keyword argument.
        # NOTE(review): get_dataset_infos appears to build one info per config
        # and pass extra keyword arguments on to a per-config loader that it
        # already calls with config_name=..., so forwarding it here would
        # raise a duplicate-keyword TypeError that the handler below swallows,
        # leaving this method permanently on the fallback list. Confirm
        # against the installed `datasets` version.
        infos = get_dataset_infos(dataset_id, token=self.token)

        if config_name in infos:
            splits = list(infos[config_name].splits.keys())
        elif len(infos) > 0:
            # Requested config not present: fall back to the first entry.
            splits = list(next(iter(infos.values())).splits.keys())
    except Exception as e:
        logger.warning(f"Could not fetch splits for config {config_name}: {e}")
        # Generic guess used only when the lookup itself failed.
        splits = ['train', 'test', 'validation']

    return {"status": "success", "splits": splits if splits else ['train']}
|
| 80 |
|
| 81 |
# --- HELPER: Recursive JSON/Dot Notation Getter ---
|
| 82 |
def _get_value_by_path(self, obj, path):
|