broadfield-dev committed on
Commit
a9a8eca
·
verified ·
1 Parent(s): cf942b8

Update processor.py

Browse files
Files changed (1) hide show
  1. processor.py +58 -38
processor.py CHANGED
@@ -12,51 +12,71 @@ class DatasetCommandCenter:
12
  self.token = token
13
 
14
  def get_dataset_metadata(self, dataset_id):
 
 
 
 
 
15
  try:
16
- try:
17
- configs = get_dataset_config_names(dataset_id, token=self.token)
18
- except:
19
- configs = ['default']
20
- try:
21
- infos = get_dataset_infos(dataset_id, token=self.token)
22
- first_conf = configs[0]
23
- if first_conf in infos:
24
- splits = list(infos[first_conf].splits.keys())
25
- else:
26
- splits = list(infos.values())[0].splits.keys()
27
- except:
28
- splits = ['train', 'test', 'validation']
29
- # license
30
- try:
31
- configs = get_dataset_config_names(dataset_id, token=self.token)
32
- except:
33
- configs = ['default']
34
 
35
- license_name = "unknown"
36
- try:
37
- infos = get_dataset_infos(dataset_id, token=self.token)
38
- # Try to grab license from first config
39
- first = list(infos.values())[0]
40
- license_name = first.license or "unknown"
41
- except:
42
- pass
43
-
44
- return {
45
- "status": "success",
46
- "configs": configs,
47
- # We assume user will pick splits later, just return configs + license hint
48
- "license_detected": license_name
49
- }
 
 
50
  except Exception as e:
51
- return {"status": "error", "message": str(e)}
 
 
 
 
 
 
 
 
 
 
 
52
 
53
  def get_splits_for_config(self, dataset_id, config_name):
 
54
  try:
55
  infos = get_dataset_infos(dataset_id, config_name=config_name, token=self.token)
56
- splits = list(infos[config_name].splits.keys())
57
- return {"status": "success", "splits": splits}
58
- except:
59
- return {"status": "success", "splits": ['train', 'test', 'validation']}
 
 
 
 
 
 
 
 
 
60
 
61
  # --- HELPER: Recursive JSON/Dot Notation Getter ---
62
  def _get_value_by_path(self, obj, path):
 
12
  self.token = token
13
 
14
  def get_dataset_metadata(self, dataset_id):
15
+ configs = []
16
+ splits = []
17
+ license_name = "unknown"
18
+
19
+ # 1. Get Configs
20
  try:
21
+ configs = get_dataset_config_names(dataset_id, token=self.token)
22
+ except Exception as e:
23
+ logger.warning(f"Could not fetch configs for {dataset_id}: {e}")
24
+ # Fallback: if we can't get configs, assume 'default'
25
+ configs = ['default']
26
+
27
+ # 2. Get Splits & License
28
+ # Many datasets return 404 on dataset_infos.json. We must catch this.
29
+ try:
30
+ selected_config = configs[0] if configs else 'default'
 
 
 
 
 
 
 
 
31
 
32
+ # This API call frequently fails on datasets without metadata cards
33
+ infos = get_dataset_infos(dataset_id, token=self.token)
34
+
35
+ # Attempt to find the info object for our config
36
+ info_obj = None
37
+ if selected_config in infos:
38
+ info_obj = infos[selected_config]
39
+ elif 'default' in infos:
40
+ info_obj = infos['default']
41
+ elif len(infos) > 0:
42
+ # Fallback to the first available if names don't match
43
+ info_obj = list(infos.values())[0]
44
+
45
+ if info_obj:
46
+ splits = list(info_obj.splits.keys())
47
+ license_name = info_obj.license or "unknown"
48
+
49
  except Exception as e:
50
+ logger.warning(f"Could not fetch dataset_infos (using fallbacks): {e}")
51
+ # Safe Fallback if metadata fails
52
+ splits = ['train', 'test', 'validation']
53
+ license_name = "unknown"
54
+
55
+ # Ensure we NEVER return None for lists
56
+ return {
57
+ "status": "success",
58
+ "configs": configs if configs else ['default'],
59
+ "splits": splits if splits else ['train'],
60
+ "license_detected": license_name
61
+ }
62
 
63
  def get_splits_for_config(self, dataset_id, config_name):
64
+ splits = []
65
  try:
66
  infos = get_dataset_infos(dataset_id, config_name=config_name, token=self.token)
67
+
68
+ if config_name in infos:
69
+ splits = list(infos[config_name].splits.keys())
70
+ elif len(infos) > 0:
71
+ # Fallback to first available
72
+ splits = list(infos.values())[0].splits.keys()
73
+
74
+ except Exception as e:
75
+ logger.warning(f"Could not fetch splits for config {config_name}: {e}")
76
+ # Fallback
77
+ splits = ['train', 'test', 'validation']
78
+
79
+ return {"status": "success", "splits": list(splits) if splits else ['train']}
80
 
81
  # --- HELPER: Recursive JSON/Dot Notation Getter ---
82
  def _get_value_by_path(self, obj, path):