Spaces:

broadfield-dev
/

HF-Dataset-Commander

Paused

App Files Files Community

broadfield-dev committed on Dec 30, 2025

Commit

cca5bb1

verified ·

1 Parent(s): 2f3537a

Update processor.py

Browse files

Files changed (1) hide show

processor.py +41 -59

processor.py CHANGED Viewed

@@ -21,8 +21,8 @@ class DatasetCommandCenter:
     def get_dataset_metadata(self, dataset_id):
         """
-        Robustly fetches Configs and Splits.
-        Handles 404 errors gracefully if metadata files are missing.
         """
         configs = ['default']
         splits = ['train', 'test', 'validation']
@@ -35,12 +35,11 @@ class DatasetCommandCenter:
                 if found_configs:
                     configs = found_configs
             except Exception:
-                pass # Keep default
             # 2. Fetch Metadata (Splits & License)
             try:
                 selected = configs[0]
-                # This API call can fail on some datasets, so we wrap it safely
                 infos = get_dataset_infos(dataset_id, token=self.token)
                 info = None
@@ -55,7 +54,7 @@ class DatasetCommandCenter:
                     splits = list(info.splits.keys())
                     license_name = info.license or "unknown"
             except Exception:
-                pass # Keep defaults if metadata fails
             return {
                 "status": "success",
@@ -67,9 +66,6 @@ class DatasetCommandCenter:
             return {"status": "error", "message": str(e)}
     def get_splits_for_config(self, dataset_id, config_name):
-        """
-        Updates the Split dropdown when the user changes the Config.
-        """
         try:
             infos = get_dataset_infos(dataset_id, config_name=config_name, token=self.token)
             if config_name in infos:
@@ -85,7 +81,7 @@ class DatasetCommandCenter:
     def _sanitize_for_json(self, obj):
         """
         Recursively cleans data for JSON serialization.
-        CRITICAL FIX: Prevents 'Preview' crashes caused by NaN, Infinity, or Timestamps.
         """
         if isinstance(obj, float):
             if math.isnan(obj) or math.isinf(obj):
@@ -98,17 +94,16 @@ class DatasetCommandCenter:
         elif isinstance(obj, (str, int, bool, type(None))):
             return obj
         else:
-            # Convert complex objects (Images, Dates) to string
             return str(obj)
     def _flatten_object(self, obj, parent_key='', sep='.'):
         """
-        Recursively finds all keys in nested dicts or JSON strings
-        to populate the 'Simple Path' dropdown in the UI.
         """
         items = {}
-        # Transparently parse JSON strings
         if isinstance(obj, str):
             s = obj.strip()
             if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
@@ -122,39 +117,34 @@ class DatasetCommandCenter:
                 new_key = f"{parent_key}{sep}{k}" if parent_key else k
                 items.update(self._flatten_object(v, new_key, sep=sep))
         elif isinstance(obj, list):
-            # We mark lists but do not recurse infinitely
             new_key = f"{parent_key}" if parent_key else "list_content"
             items[new_key] = "List"
         else:
-            # Leaf node
             items[parent_key] = type(obj).__name__
         return items
     def inspect_dataset(self, dataset_id, config, split):
-        """
-        Scans the first 10 rows to build a Schema Tree for the UI.
-        """
         try:
             conf = config if config != 'default' else None
             ds_stream = load_dataset(dataset_id, name=conf, split=split, streaming=True, token=self.token)
             sample_rows = []
             available_paths = set()
-            schema_map = {} # Used for List Mode detection
             for i, row in enumerate(ds_stream):
                 if i >= 10: break
-                # 1. Clean row for UI Preview (Handle NaN/Objects)
                 clean_row = self._sanitize_for_json(row)
                 sample_rows.append(clean_row)
-                # 2. Deep Flattening for "Simple Path" dropdowns
                 flattened = self._flatten_object(row)
                 available_paths.update(flattened.keys())
-                # 3. Top Level Analysis for "List Mode" detection
                 for k, v in row.items():
                     if k not in schema_map:
                         schema_map[k] = {"type": "Object"}
@@ -167,7 +157,6 @@ class DatasetCommandCenter:
                     if isinstance(val, list):
                         schema_map[k]["type"] = "List"
-            # Reconstruct Schema Tree for UI grouping
             sorted_paths = sorted(list(available_paths))
             schema_tree = {}
             for path in sorted_paths:
@@ -179,8 +168,8 @@ class DatasetCommandCenter:
             return {
                 "status": "success",
                 "samples": sample_rows,
-                "schema_tree": schema_tree, # Used by Simple Path Dropdown
-                "schema": schema_map,       # Used by List Mode Dropdown
                 "dataset_id": dataset_id
             }
         except Exception as e:
@@ -192,32 +181,38 @@ class DatasetCommandCenter:
     def _get_value_by_path(self, obj, path):
         """
-        Navigates dot notation (meta.user.id), automatically parsing
-        JSON strings if encountered along the path.
         """
         if not path: return obj
         keys = path.split('.')
         current = obj
-        for key in keys:
-            # Auto-parse JSON string if encountered
-            if isinstance(current, str):
                 s = current.strip()
                 if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
                     try:
                         current = json.loads(s)
                     except:
-                        pass
-            if isinstance(current, dict) and key in current:
-                current = current[key]
-            else:
-                return None # Path broken
         return current
     def _extract_from_list_logic(self, row, source_col, filter_key, filter_val, target_path):
         """
-        Logic for: FROM source_col FIND ITEM WHERE filter_key == filter_val EXTRACT target_path
         """
         data = row.get(source_col)
@@ -233,7 +228,6 @@ class DatasetCommandCenter:
         matched_item = None
         for item in data:
-            # String comparison for safety
             if str(item.get(filter_key, '')) == str(filter_val):
                 matched_item = item
                 break
@@ -244,13 +238,9 @@ class DatasetCommandCenter:
         return None
     def _apply_projection(self, row, recipe):
-        """
-        Builds the new row based on the recipe.
-        Raises ValueError if user Python code fails (Fail Fast).
-        """
         new_row = {}
-        # Setup Eval Context (Variables available in Python Mode)
         eval_context = row.copy()
         eval_context['row'] = row
         eval_context['json'] = json
@@ -274,13 +264,11 @@ class DatasetCommandCenter:
                     )
                 elif t_type == 'python':
-                    # Execute user code
-                    expression = col_def['expression']
-                    val = eval(expression, {}, eval_context)
                     new_row[target_col] = val
             except Exception as e:
-                # Fail Fast: Stop the generator immediately if a column fails
                 raise ValueError(f"Column '{target_col}' failed: {str(e)}")
         return new_row
@@ -290,13 +278,10 @@ class DatasetCommandCenter:
     # ==========================================
     def _generate_card(self, source_id, target_id, recipe, license_name):
-        """
-        Creates a high-quality README.md with a Markdown table of operations.
-        """
         card_data = DatasetCardData(
             language="en",
             license=license_name,
-            tags=["dataset-command-center", "etl", "generated-dataset"],
             base_model=source_id,
         )
@@ -368,17 +353,16 @@ The following operations were applied to the source data:
                     yield self._apply_projection(row, recipe)
                     count += 1
                 except ValueError as ve:
-                    # Pass the specific column error up
                     raise ve
                 except Exception as e:
                     raise ValueError(f"Unexpected crash on row {i}: {e}")
         try:
-            # 1. Process & Push Data
             new_dataset = datasets.Dataset.from_generator(gen)
             new_dataset.push_to_hub(target_id, token=self.token)
-            # 2. Generate & Push Card
             try:
                 card = self._generate_card(source_id, target_id, recipe, new_license or "unknown")
                 card.push_to_hub(target_id, token=self.token)
@@ -405,7 +389,7 @@ The following operations were applied to the source data:
             for i, row in enumerate(ds_stream):
                 if len(processed) >= 5: break
-                # Check Filter
                 passed = True
                 if recipe.get('filter_rule'):
                     try:
@@ -416,19 +400,17 @@ The following operations were applied to the source data:
                         if not eval(recipe['filter_rule'], {}, ctx):
                             passed = False
                     except:
-                        passed = False # Skip invalid rows in preview
                 if passed:
                     try:
                         new_row = self._apply_projection(row, recipe)
-                        # CRITICAL: Sanitize for JSON (handles NaNs, Dates, Images)
                         clean_new_row = self._sanitize_for_json(new_row)
                         processed.append(clean_new_row)
                     except Exception as e:
-                        # In preview, we want to see the error, not crash
                         processed.append({"_preview_error": f"Error: {str(e)}"})
             return processed
         except Exception as e:
-             # Return global error if loading fails
              raise e

     def get_dataset_metadata(self, dataset_id):
         """
+        Fetches Configs, Splits, and License info.
+        Gracefully handles missing metadata/404s.
         """
         configs = ['default']
         splits = ['train', 'test', 'validation']
                 if found_configs:
                     configs = found_configs
             except Exception:
+                pass
             # 2. Fetch Metadata (Splits & License)
             try:
                 selected = configs[0]
                 infos = get_dataset_infos(dataset_id, token=self.token)
                 info = None
                     splits = list(info.splits.keys())
                     license_name = info.license or "unknown"
             except Exception:
+                pass
             return {
                 "status": "success",
             return {"status": "error", "message": str(e)}
     def get_splits_for_config(self, dataset_id, config_name):
         try:
             infos = get_dataset_infos(dataset_id, config_name=config_name, token=self.token)
             if config_name in infos:
     def _sanitize_for_json(self, obj):
         """
         Recursively cleans data for JSON serialization.
+        FIXES: 'Preview' crashes caused by NaN, Infinity, or complex Objects.
         """
         if isinstance(obj, float):
             if math.isnan(obj) or math.isinf(obj):
         elif isinstance(obj, (str, int, bool, type(None))):
             return obj
         else:
             return str(obj)
     def _flatten_object(self, obj, parent_key='', sep='.'):
         """
+        Recursively finds all keys in nested dicts/JSON to populate
+        the 'Simple Path' dropdown in the UI.
         """
         items = {}
+        # Transparently parse JSON strings for Schema Discovery
         if isinstance(obj, str):
             s = obj.strip()
             if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
                 new_key = f"{parent_key}{sep}{k}" if parent_key else k
                 items.update(self._flatten_object(v, new_key, sep=sep))
         elif isinstance(obj, list):
             new_key = f"{parent_key}" if parent_key else "list_content"
             items[new_key] = "List"
         else:
             items[parent_key] = type(obj).__name__
         return items
     def inspect_dataset(self, dataset_id, config, split):
         try:
             conf = config if config != 'default' else None
             ds_stream = load_dataset(dataset_id, name=conf, split=split, streaming=True, token=self.token)
             sample_rows = []
             available_paths = set()
+            schema_map = {}
             for i, row in enumerate(ds_stream):
                 if i >= 10: break
+                # Clean row for UI Preview
                 clean_row = self._sanitize_for_json(row)
                 sample_rows.append(clean_row)
+                # Schema Discovery
                 flattened = self._flatten_object(row)
                 available_paths.update(flattened.keys())
+                # List Mode Detection
                 for k, v in row.items():
                     if k not in schema_map:
                         schema_map[k] = {"type": "Object"}
                     if isinstance(val, list):
                         schema_map[k]["type"] = "List"
             sorted_paths = sorted(list(available_paths))
             schema_tree = {}
             for path in sorted_paths:
             return {
                 "status": "success",
                 "samples": sample_rows,
+                "schema_tree": schema_tree,
+                "schema": schema_map,
                 "dataset_id": dataset_id
             }
         except Exception as e:
     def _get_value_by_path(self, obj, path):
         """
+        Extracts values using dot notation.
+        FIX: Lazy Parsing. Only parses JSON strings if we strictly need to
+        traverse deeper. This preserves raw strings for top-level columns.
         """
         if not path: return obj
         keys = path.split('.')
         current = obj
+        for i, key in enumerate(keys):
+            # 1. Access Key
+            if isinstance(current, dict) and key in current:
+                current = current[key]
+            else:
+                return None
+            # 2. Traverse Deeper?
+            is_last_key = (i == len(keys) - 1)
+            # Only parse if we are NOT at the end (we need to go inside)
+            if not is_last_key and isinstance(current, str):
                 s = current.strip()
                 if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
                     try:
                         current = json.loads(s)
                     except:
+                        return None # Broken JSON in path
         return current
     def _extract_from_list_logic(self, row, source_col, filter_key, filter_val, target_path):
         """
+        FROM source_col FIND ITEM WHERE filter_key == filter_val EXTRACT target_path
         """
         data = row.get(source_col)
         matched_item = None
         for item in data:
             if str(item.get(filter_key, '')) == str(filter_val):
                 matched_item = item
                 break
         return None
     def _apply_projection(self, row, recipe):
         new_row = {}
+        # Eval Context
         eval_context = row.copy()
         eval_context['row'] = row
         eval_context['json'] = json
                     )
                 elif t_type == 'python':
+                    val = eval(col_def['expression'], {}, eval_context)
                     new_row[target_col] = val
             except Exception as e:
+                # Fail Fast
                 raise ValueError(f"Column '{target_col}' failed: {str(e)}")
         return new_row
     # ==========================================
     def _generate_card(self, source_id, target_id, recipe, license_name):
         card_data = DatasetCardData(
             language="en",
             license=license_name,
+            tags=["dataset-command-center", "etl"],
             base_model=source_id,
         )
                     yield self._apply_projection(row, recipe)
                     count += 1
                 except ValueError as ve:
                     raise ve
                 except Exception as e:
                     raise ValueError(f"Unexpected crash on row {i}: {e}")
         try:
+            # 1. Process & Push
             new_dataset = datasets.Dataset.from_generator(gen)
             new_dataset.push_to_hub(target_id, token=self.token)
+            # 2. Card
             try:
                 card = self._generate_card(source_id, target_id, recipe, new_license or "unknown")
                 card.push_to_hub(target_id, token=self.token)
             for i, row in enumerate(ds_stream):
                 if len(processed) >= 5: break
+                # Filter Check
                 passed = True
                 if recipe.get('filter_rule'):
                     try:
                         if not eval(recipe['filter_rule'], {}, ctx):
                             passed = False
                     except:
+                        passed = False
                 if passed:
                     try:
                         new_row = self._apply_projection(row, recipe)
+                        # Sanitize to prevent JSON crashes (NaN, Infinity, Images)
                         clean_new_row = self._sanitize_for_json(new_row)
                         processed.append(clean_new_row)
                     except Exception as e:
                         processed.append({"_preview_error": f"Error: {str(e)}"})
             return processed
         except Exception as e:
              raise e