broadfield-dev committed on
Commit
0020ded
·
verified ·
1 Parent(s): cca5bb1

Update processor.py

Browse files
Files changed (1) hide show
  1. processor.py +25 -24
processor.py CHANGED
@@ -21,8 +21,7 @@ class DatasetCommandCenter:
21
 
22
  def get_dataset_metadata(self, dataset_id):
23
  """
24
- Fetches Configs, Splits, and License info.
25
- Gracefully handles missing metadata/404s.
26
  """
27
  configs = ['default']
28
  splits = ['train', 'test', 'validation']
@@ -80,8 +79,7 @@ class DatasetCommandCenter:
80
 
81
  def _sanitize_for_json(self, obj):
82
  """
83
- Recursively cleans data for JSON serialization.
84
- FIXES: 'Preview' crashes caused by NaN, Infinity, or complex Objects.
85
  """
86
  if isinstance(obj, float):
87
  if math.isnan(obj) or math.isinf(obj):
@@ -98,12 +96,11 @@ class DatasetCommandCenter:
98
 
99
  def _flatten_object(self, obj, parent_key='', sep='.'):
100
  """
101
- Recursively finds all keys in nested dicts/JSON to populate
102
- the 'Simple Path' dropdown in the UI.
103
  """
104
  items = {}
105
 
106
- # Transparently parse JSON strings for Schema Discovery
107
  if isinstance(obj, str):
108
  s = obj.strip()
109
  if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
@@ -136,7 +133,7 @@ class DatasetCommandCenter:
136
  for i, row in enumerate(ds_stream):
137
  if i >= 10: break
138
 
139
- # Clean row for UI Preview
140
  clean_row = self._sanitize_for_json(row)
141
  sample_rows.append(clean_row)
142
 
@@ -181,32 +178,38 @@ class DatasetCommandCenter:
181
 
182
  def _get_value_by_path(self, obj, path):
183
  """
184
- Extracts values using dot notation.
185
- FIX: Lazy Parsing. Only parses JSON strings if we strictly need to
186
- traverse deeper. This preserves raw strings for top-level columns.
187
  """
188
  if not path: return obj
 
 
 
 
 
 
 
 
189
  keys = path.split('.')
190
  current = obj
191
 
192
  for i, key in enumerate(keys):
193
- # 1. Access Key
194
- if isinstance(current, dict) and key in current:
195
  current = current[key]
196
- else:
197
- return None
198
 
199
- # 2. Traverse Deeper?
200
  is_last_key = (i == len(keys) - 1)
201
-
202
- # Only parse if we are NOT at the end (we need to go inside)
203
  if not is_last_key and isinstance(current, str):
204
  s = current.strip()
205
  if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
206
  try:
207
  current = json.loads(s)
208
  except:
209
- return None # Broken JSON in path
210
 
211
  return current
212
 
@@ -216,7 +219,6 @@ class DatasetCommandCenter:
216
  """
217
  data = row.get(source_col)
218
 
219
- # Parse if string
220
  if isinstance(data, str):
221
  try:
222
  data = json.loads(data)
@@ -268,7 +270,6 @@ class DatasetCommandCenter:
268
  new_row[target_col] = val
269
 
270
  except Exception as e:
271
- # Fail Fast
272
  raise ValueError(f"Column '{target_col}' failed: {str(e)}")
273
 
274
  return new_row
@@ -281,7 +282,7 @@ class DatasetCommandCenter:
281
  card_data = DatasetCardData(
282
  language="en",
283
  license=license_name,
284
- tags=["dataset-command-center", "etl"],
285
  base_model=source_id,
286
  )
287
 
@@ -389,7 +390,7 @@ The following operations were applied to the source data:
389
  for i, row in enumerate(ds_stream):
390
  if len(processed) >= 5: break
391
 
392
- # Filter Check
393
  passed = True
394
  if recipe.get('filter_rule'):
395
  try:
@@ -405,7 +406,7 @@ The following operations were applied to the source data:
405
  if passed:
406
  try:
407
  new_row = self._apply_projection(row, recipe)
408
- # Sanitize to prevent JSON crashes (NaN, Infinity, Images)
409
  clean_new_row = self._sanitize_for_json(new_row)
410
  processed.append(clean_new_row)
411
  except Exception as e:
 
21
 
22
  def get_dataset_metadata(self, dataset_id):
23
  """
24
+ Fetches Configs and Splits.
 
25
  """
26
  configs = ['default']
27
  splits = ['train', 'test', 'validation']
 
79
 
80
  def _sanitize_for_json(self, obj):
81
  """
82
+ Ensures data is safe for JSON serialization to prevent UI crashes (NaN, NaT, Infinity).
 
83
  """
84
  if isinstance(obj, float):
85
  if math.isnan(obj) or math.isinf(obj):
 
96
 
97
  def _flatten_object(self, obj, parent_key='', sep='.'):
98
  """
99
+ Recursively finds keys for the UI dropdowns.
 
100
  """
101
  items = {}
102
 
103
+ # Transparently parse JSON strings
104
  if isinstance(obj, str):
105
  s = obj.strip()
106
  if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
 
133
  for i, row in enumerate(ds_stream):
134
  if i >= 10: break
135
 
136
+ # Clean row for UI
137
  clean_row = self._sanitize_for_json(row)
138
  sample_rows.append(clean_row)
139
 
 
178
 
179
  def _get_value_by_path(self, obj, path):
180
  """
181
+ Retrieves a value from the row.
182
+ PRIORITY 1: Exact Key Match (Simplest, safest path).
183
+ PRIORITY 2: Dot Notation Traversal (for nested JSON).
184
  """
185
  if not path: return obj
186
+
187
+ # 1. Try Direct Access (Fixes "Simple Path" for columns with dots in name)
188
+ try:
189
+ if isinstance(obj, dict) and path in obj:
190
+ return obj[path]
191
+ except: pass
192
+
193
+ # 2. Try Dot Notation
194
  keys = path.split('.')
195
  current = obj
196
 
197
  for i, key in enumerate(keys):
198
+ # Access key with duck-typing support (works on dicts, UserDicts, etc)
199
+ try:
200
  current = current[key]
201
+ except:
202
+ return None # Key not found
203
 
204
+ # Lazy Parsing: Only parse string if we need to go deeper
205
  is_last_key = (i == len(keys) - 1)
 
 
206
  if not is_last_key and isinstance(current, str):
207
  s = current.strip()
208
  if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
209
  try:
210
  current = json.loads(s)
211
  except:
212
+ return None # Broken JSON
213
 
214
  return current
215
 
 
219
  """
220
  data = row.get(source_col)
221
 
 
222
  if isinstance(data, str):
223
  try:
224
  data = json.loads(data)
 
270
  new_row[target_col] = val
271
 
272
  except Exception as e:
 
273
  raise ValueError(f"Column '{target_col}' failed: {str(e)}")
274
 
275
  return new_row
 
282
  card_data = DatasetCardData(
283
  language="en",
284
  license=license_name,
285
+ tags=["dataset-command-center", "etl", "generated-dataset"],
286
  base_model=source_id,
287
  )
288
 
 
390
  for i, row in enumerate(ds_stream):
391
  if len(processed) >= 5: break
392
 
393
+ # Check Filter
394
  passed = True
395
  if recipe.get('filter_rule'):
396
  try:
 
406
  if passed:
407
  try:
408
  new_row = self._apply_projection(row, recipe)
409
+ # Sanitize to prevent JSON crashes
410
  clean_new_row = self._sanitize_for_json(new_row)
411
  processed.append(clean_new_row)
412
  except Exception as e: