broadfield-dev committed on
Commit
0020ded
·
verified ·
1 Parent(s): cca5bb1

Update processor.py

Browse files
Files changed (1) hide show
  1. processor.py +25 -24
processor.py CHANGED
@@ -21,8 +21,7 @@ class DatasetCommandCenter:
21
 
22
  def get_dataset_metadata(self, dataset_id):
23
  """
24
- Fetches Configs, Splits, and License info.
25
- Gracefully handles missing metadata/404s.
26
  """
27
  configs = ['default']
28
  splits = ['train', 'test', 'validation']
@@ -80,8 +79,7 @@ class DatasetCommandCenter:
80
 
81
  def _sanitize_for_json(self, obj):
82
  """
83
- Recursively cleans data for JSON serialization.
84
- FIXES: 'Preview' crashes caused by NaN, Infinity, or complex Objects.
85
  """
86
  if isinstance(obj, float):
87
  if math.isnan(obj) or math.isinf(obj):
@@ -98,12 +96,11 @@ class DatasetCommandCenter:
98
 
99
  def _flatten_object(self, obj, parent_key='', sep='.'):
100
  """
101
- Recursively finds all keys in nested dicts/JSON to populate
102
- the 'Simple Path' dropdown in the UI.
103
  """
104
  items = {}
105
 
106
- # Transparently parse JSON strings for Schema Discovery
107
  if isinstance(obj, str):
108
  s = obj.strip()
109
  if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
@@ -136,7 +133,7 @@ class DatasetCommandCenter:
136
  for i, row in enumerate(ds_stream):
137
  if i >= 10: break
138
 
139
- # Clean row for UI Preview
140
  clean_row = self._sanitize_for_json(row)
141
  sample_rows.append(clean_row)
142
 
@@ -181,32 +178,38 @@ class DatasetCommandCenter:
181
 
182
  def _get_value_by_path(self, obj, path):
183
  """
184
- Extracts values using dot notation.
185
- FIX: Lazy Parsing. Only parses JSON strings if we strictly need to
186
- traverse deeper. This preserves raw strings for top-level columns.
187
  """
188
  if not path: return obj
 
 
 
 
 
 
 
 
189
  keys = path.split('.')
190
  current = obj
191
 
192
  for i, key in enumerate(keys):
193
- # 1. Access Key
194
- if isinstance(current, dict) and key in current:
195
  current = current[key]
196
- else:
197
- return None
198
 
199
- # 2. Traverse Deeper?
200
  is_last_key = (i == len(keys) - 1)
201
-
202
- # Only parse if we are NOT at the end (we need to go inside)
203
  if not is_last_key and isinstance(current, str):
204
  s = current.strip()
205
  if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
206
  try:
207
  current = json.loads(s)
208
  except:
209
- return None # Broken JSON in path
210
 
211
  return current
212
 
@@ -216,7 +219,6 @@ class DatasetCommandCenter:
216
  """
217
  data = row.get(source_col)
218
 
219
- # Parse if string
220
  if isinstance(data, str):
221
  try:
222
  data = json.loads(data)
@@ -268,7 +270,6 @@ class DatasetCommandCenter:
268
  new_row[target_col] = val
269
 
270
  except Exception as e:
271
- # Fail Fast
272
  raise ValueError(f"Column '{target_col}' failed: {str(e)}")
273
 
274
  return new_row
@@ -281,7 +282,7 @@ class DatasetCommandCenter:
281
  card_data = DatasetCardData(
282
  language="en",
283
  license=license_name,
284
- tags=["dataset-command-center", "etl"],
285
  base_model=source_id,
286
  )
287
 
@@ -389,7 +390,7 @@ The following operations were applied to the source data:
389
  for i, row in enumerate(ds_stream):
390
  if len(processed) >= 5: break
391
 
392
- # Filter Check
393
  passed = True
394
  if recipe.get('filter_rule'):
395
  try:
@@ -405,7 +406,7 @@ The following operations were applied to the source data:
405
  if passed:
406
  try:
407
  new_row = self._apply_projection(row, recipe)
408
- # Sanitize to prevent JSON crashes (NaN, Infinity, Images)
409
  clean_new_row = self._sanitize_for_json(new_row)
410
  processed.append(clean_new_row)
411
  except Exception as e:
 
21
 
22
  def get_dataset_metadata(self, dataset_id):
23
  """
24
+ Fetches Configs and Splits.
 
25
  """
26
  configs = ['default']
27
  splits = ['train', 'test', 'validation']
 
79
 
80
  def _sanitize_for_json(self, obj):
81
  """
82
+ Ensures data is safe for JSON serialization to prevent UI crashes (NaN, NaT, Infinity).
 
83
  """
84
  if isinstance(obj, float):
85
  if math.isnan(obj) or math.isinf(obj):
 
96
 
97
  def _flatten_object(self, obj, parent_key='', sep='.'):
98
  """
99
+ Recursively finds keys for the UI dropdowns.
 
100
  """
101
  items = {}
102
 
103
+ # Transparently parse JSON strings
104
  if isinstance(obj, str):
105
  s = obj.strip()
106
  if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
 
133
  for i, row in enumerate(ds_stream):
134
  if i >= 10: break
135
 
136
+ # Clean row for UI
137
  clean_row = self._sanitize_for_json(row)
138
  sample_rows.append(clean_row)
139
 
 
178
 
179
  def _get_value_by_path(self, obj, path):
180
  """
181
+ Retrieves a value from the row.
182
+ PRIORITY 1: Exact Key Match (Simplest, safest path).
183
+ PRIORITY 2: Dot Notation Traversal (for nested JSON).
184
  """
185
  if not path: return obj
186
+
187
+ # 1. Try Direct Access (Fixes "Simple Path" for columns with dots in name)
188
+ try:
189
+ if isinstance(obj, dict) and path in obj:
190
+ return obj[path]
191
+ except: pass
192
+
193
+ # 2. Try Dot Notation
194
  keys = path.split('.')
195
  current = obj
196
 
197
  for i, key in enumerate(keys):
198
+ # Access key with duck-typing support (works on dicts, UserDicts, etc)
199
+ try:
200
  current = current[key]
201
+ except:
202
+ return None # Key not found
203
 
204
+ # Lazy Parsing: Only parse string if we need to go deeper
205
  is_last_key = (i == len(keys) - 1)
 
 
206
  if not is_last_key and isinstance(current, str):
207
  s = current.strip()
208
  if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
209
  try:
210
  current = json.loads(s)
211
  except:
212
+ return None # Broken JSON
213
 
214
  return current
215
 
 
219
  """
220
  data = row.get(source_col)
221
 
 
222
  if isinstance(data, str):
223
  try:
224
  data = json.loads(data)
 
270
  new_row[target_col] = val
271
 
272
  except Exception as e:
 
273
  raise ValueError(f"Column '{target_col}' failed: {str(e)}")
274
 
275
  return new_row
 
282
  card_data = DatasetCardData(
283
  language="en",
284
  license=license_name,
285
+ tags=["dataset-command-center", "etl", "generated-dataset"],
286
  base_model=source_id,
287
  )
288
 
 
390
  for i, row in enumerate(ds_stream):
391
  if len(processed) >= 5: break
392
 
393
+ # Check Filter
394
  passed = True
395
  if recipe.get('filter_rule'):
396
  try:
 
406
  if passed:
407
  try:
408
  new_row = self._apply_projection(row, recipe)
409
+ # Sanitize to prevent JSON crashes
410
  clean_new_row = self._sanitize_for_json(new_row)
411
  processed.append(clean_new_row)
412
  except Exception as e: