Update processor.py
Browse files- processor.py +25 -24
processor.py
CHANGED
|
@@ -21,8 +21,7 @@ class DatasetCommandCenter:
|
|
| 21 |
|
| 22 |
def get_dataset_metadata(self, dataset_id):
|
| 23 |
"""
|
| 24 |
-
Fetches Configs
|
| 25 |
-
Gracefully handles missing metadata/404s.
|
| 26 |
"""
|
| 27 |
configs = ['default']
|
| 28 |
splits = ['train', 'test', 'validation']
|
|
@@ -80,8 +79,7 @@ class DatasetCommandCenter:
|
|
| 80 |
|
| 81 |
def _sanitize_for_json(self, obj):
|
| 82 |
"""
|
| 83 |
-
|
| 84 |
-
FIXES: 'Preview' crashes caused by NaN, Infinity, or complex Objects.
|
| 85 |
"""
|
| 86 |
if isinstance(obj, float):
|
| 87 |
if math.isnan(obj) or math.isinf(obj):
|
|
@@ -98,12 +96,11 @@ class DatasetCommandCenter:
|
|
| 98 |
|
| 99 |
def _flatten_object(self, obj, parent_key='', sep='.'):
|
| 100 |
"""
|
| 101 |
-
Recursively finds
|
| 102 |
-
the 'Simple Path' dropdown in the UI.
|
| 103 |
"""
|
| 104 |
items = {}
|
| 105 |
|
| 106 |
-
# Transparently parse JSON strings
|
| 107 |
if isinstance(obj, str):
|
| 108 |
s = obj.strip()
|
| 109 |
if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
|
|
@@ -136,7 +133,7 @@ class DatasetCommandCenter:
|
|
| 136 |
for i, row in enumerate(ds_stream):
|
| 137 |
if i >= 10: break
|
| 138 |
|
| 139 |
-
# Clean row for UI
|
| 140 |
clean_row = self._sanitize_for_json(row)
|
| 141 |
sample_rows.append(clean_row)
|
| 142 |
|
|
@@ -181,32 +178,38 @@ class DatasetCommandCenter:
|
|
| 181 |
|
| 182 |
def _get_value_by_path(self, obj, path):
|
| 183 |
"""
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
"""
|
| 188 |
if not path: return obj
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
keys = path.split('.')
|
| 190 |
current = obj
|
| 191 |
|
| 192 |
for i, key in enumerate(keys):
|
| 193 |
-
#
|
| 194 |
-
|
| 195 |
current = current[key]
|
| 196 |
-
|
| 197 |
-
return None
|
| 198 |
|
| 199 |
-
#
|
| 200 |
is_last_key = (i == len(keys) - 1)
|
| 201 |
-
|
| 202 |
-
# Only parse if we are NOT at the end (we need to go inside)
|
| 203 |
if not is_last_key and isinstance(current, str):
|
| 204 |
s = current.strip()
|
| 205 |
if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
|
| 206 |
try:
|
| 207 |
current = json.loads(s)
|
| 208 |
except:
|
| 209 |
-
return None # Broken JSON
|
| 210 |
|
| 211 |
return current
|
| 212 |
|
|
@@ -216,7 +219,6 @@ class DatasetCommandCenter:
|
|
| 216 |
"""
|
| 217 |
data = row.get(source_col)
|
| 218 |
|
| 219 |
-
# Parse if string
|
| 220 |
if isinstance(data, str):
|
| 221 |
try:
|
| 222 |
data = json.loads(data)
|
|
@@ -268,7 +270,6 @@ class DatasetCommandCenter:
|
|
| 268 |
new_row[target_col] = val
|
| 269 |
|
| 270 |
except Exception as e:
|
| 271 |
-
# Fail Fast
|
| 272 |
raise ValueError(f"Column '{target_col}' failed: {str(e)}")
|
| 273 |
|
| 274 |
return new_row
|
|
@@ -281,7 +282,7 @@ class DatasetCommandCenter:
|
|
| 281 |
card_data = DatasetCardData(
|
| 282 |
language="en",
|
| 283 |
license=license_name,
|
| 284 |
-
tags=["dataset-command-center", "etl"],
|
| 285 |
base_model=source_id,
|
| 286 |
)
|
| 287 |
|
|
@@ -389,7 +390,7 @@ The following operations were applied to the source data:
|
|
| 389 |
for i, row in enumerate(ds_stream):
|
| 390 |
if len(processed) >= 5: break
|
| 391 |
|
| 392 |
-
#
|
| 393 |
passed = True
|
| 394 |
if recipe.get('filter_rule'):
|
| 395 |
try:
|
|
@@ -405,7 +406,7 @@ The following operations were applied to the source data:
|
|
| 405 |
if passed:
|
| 406 |
try:
|
| 407 |
new_row = self._apply_projection(row, recipe)
|
| 408 |
-
# Sanitize to prevent JSON crashes
|
| 409 |
clean_new_row = self._sanitize_for_json(new_row)
|
| 410 |
processed.append(clean_new_row)
|
| 411 |
except Exception as e:
|
|
|
|
| 21 |
|
| 22 |
def get_dataset_metadata(self, dataset_id):
|
| 23 |
"""
|
| 24 |
+
Fetches Configs and Splits.
|
|
|
|
| 25 |
"""
|
| 26 |
configs = ['default']
|
| 27 |
splits = ['train', 'test', 'validation']
|
|
|
|
| 79 |
|
| 80 |
def _sanitize_for_json(self, obj):
|
| 81 |
"""
|
| 82 |
+
Ensures data is safe for JSON serialization to prevent UI crashes (NaN, NaT, Infinity).
|
|
|
|
| 83 |
"""
|
| 84 |
if isinstance(obj, float):
|
| 85 |
if math.isnan(obj) or math.isinf(obj):
|
|
|
|
| 96 |
|
| 97 |
def _flatten_object(self, obj, parent_key='', sep='.'):
|
| 98 |
"""
|
| 99 |
+
Recursively finds keys for the UI dropdowns.
|
|
|
|
| 100 |
"""
|
| 101 |
items = {}
|
| 102 |
|
| 103 |
+
# Transparently parse JSON strings
|
| 104 |
if isinstance(obj, str):
|
| 105 |
s = obj.strip()
|
| 106 |
if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
|
|
|
|
| 133 |
for i, row in enumerate(ds_stream):
|
| 134 |
if i >= 10: break
|
| 135 |
|
| 136 |
+
# Clean row for UI
|
| 137 |
clean_row = self._sanitize_for_json(row)
|
| 138 |
sample_rows.append(clean_row)
|
| 139 |
|
|
|
|
| 178 |
|
| 179 |
def _get_value_by_path(self, obj, path):
|
| 180 |
"""
|
| 181 |
+
Retrieves a value from the row.
|
| 182 |
+
PRIORITY 1: Exact Key Match (Simplest, safest path).
|
| 183 |
+
PRIORITY 2: Dot Notation Traversal (for nested JSON).
|
| 184 |
"""
|
| 185 |
if not path: return obj
|
| 186 |
+
|
| 187 |
+
# 1. Try Direct Access (Fixes "Simple Path" for columns with dots in name)
|
| 188 |
+
try:
|
| 189 |
+
if isinstance(obj, dict) and path in obj:
|
| 190 |
+
return obj[path]
|
| 191 |
+
except: pass
|
| 192 |
+
|
| 193 |
+
# 2. Try Dot Notation
|
| 194 |
keys = path.split('.')
|
| 195 |
current = obj
|
| 196 |
|
| 197 |
for i, key in enumerate(keys):
|
| 198 |
+
# Access key with duck-typing support (works on dicts, UserDicts, etc)
|
| 199 |
+
try:
|
| 200 |
current = current[key]
|
| 201 |
+
except:
|
| 202 |
+
return None # Key not found
|
| 203 |
|
| 204 |
+
# Lazy Parsing: Only parse string if we need to go deeper
|
| 205 |
is_last_key = (i == len(keys) - 1)
|
|
|
|
|
|
|
| 206 |
if not is_last_key and isinstance(current, str):
|
| 207 |
s = current.strip()
|
| 208 |
if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
|
| 209 |
try:
|
| 210 |
current = json.loads(s)
|
| 211 |
except:
|
| 212 |
+
return None # Broken JSON
|
| 213 |
|
| 214 |
return current
|
| 215 |
|
|
|
|
| 219 |
"""
|
| 220 |
data = row.get(source_col)
|
| 221 |
|
|
|
|
| 222 |
if isinstance(data, str):
|
| 223 |
try:
|
| 224 |
data = json.loads(data)
|
|
|
|
| 270 |
new_row[target_col] = val
|
| 271 |
|
| 272 |
except Exception as e:
|
|
|
|
| 273 |
raise ValueError(f"Column '{target_col}' failed: {str(e)}")
|
| 274 |
|
| 275 |
return new_row
|
|
|
|
| 282 |
card_data = DatasetCardData(
|
| 283 |
language="en",
|
| 284 |
license=license_name,
|
| 285 |
+
tags=["dataset-command-center", "etl", "generated-dataset"],
|
| 286 |
base_model=source_id,
|
| 287 |
)
|
| 288 |
|
|
|
|
| 390 |
for i, row in enumerate(ds_stream):
|
| 391 |
if len(processed) >= 5: break
|
| 392 |
|
| 393 |
+
# Check Filter
|
| 394 |
passed = True
|
| 395 |
if recipe.get('filter_rule'):
|
| 396 |
try:
|
|
|
|
| 406 |
if passed:
|
| 407 |
try:
|
| 408 |
new_row = self._apply_projection(row, recipe)
|
| 409 |
+
# Sanitize to prevent JSON crashes
|
| 410 |
clean_new_row = self._sanitize_for_json(new_row)
|
| 411 |
processed.append(clean_new_row)
|
| 412 |
except Exception as e:
|