broadfield-dev committed on
Commit
4425935
·
verified ·
1 Parent(s): 1a2363c

Update processor.py

Browse files
Files changed (1) hide show
  1. processor.py +117 -140
processor.py CHANGED
@@ -2,7 +2,6 @@ import json
2
  import logging
3
  import datasets
4
  from datasets import load_dataset, get_dataset_config_names, get_dataset_infos
5
- from huggingface_hub import HfApi
6
 
7
  # Configure logging
8
  logging.basicConfig(level=logging.INFO)
@@ -13,185 +12,166 @@ class DatasetCommandCenter:
13
  self.token = token
14
 
15
  def get_dataset_metadata(self, dataset_id):
16
- """
17
- Step 1: Get available Configs (subsets) and Splits without downloading data.
18
- """
19
  try:
20
- # 1. Get Configs (e.g., 'en', 'fr' or 'default')
21
  try:
22
  configs = get_dataset_config_names(dataset_id, token=self.token)
23
- except Exception:
24
- # Some datasets have no configs or throw errors, default to 'default' or None
25
  configs = ['default']
26
-
27
- # 2. Get Splits for the first config (to pre-populate)
28
- # We will fetch specific splits for other configs dynamically if needed
29
- selected_config = configs[0]
30
 
 
31
  try:
32
- # This fetches metadata (splits, columns) without downloading rows
33
  infos = get_dataset_infos(dataset_id, token=self.token)
34
- # If multiple configs, infos is a dict keyed by config name
35
- if selected_config in infos:
36
- splits = list(infos[selected_config].splits.keys())
37
  else:
38
- # Fallback if structure is flat
39
  splits = list(infos.values())[0].splits.keys()
40
  except:
41
- # Fallback: try to just list simple splits
42
  splits = ['train', 'test', 'validation']
43
 
44
- return {
45
- "status": "success",
46
- "configs": configs,
47
- "splits": list(splits)
48
- }
49
  except Exception as e:
50
  return {"status": "error", "message": str(e)}
51
 
52
  def get_splits_for_config(self, dataset_id, config_name):
53
- """
54
- Helper to update splits when user changes the Config dropdown
55
- """
56
  try:
57
  infos = get_dataset_infos(dataset_id, config_name=config_name, token=self.token)
58
  splits = list(infos[config_name].splits.keys())
59
  return {"status": "success", "splits": splits}
60
- except Exception as e:
61
- # Fallback
62
  return {"status": "success", "splits": ['train', 'test', 'validation']}
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  def inspect_dataset(self, dataset_id, config, split):
65
  """
66
- Step 2: Stream actual rows and detect JSON.
67
  """
68
  try:
69
- # Handle 'default' config edge cases
70
  conf = config if config != 'default' else None
71
-
72
  ds_stream = load_dataset(dataset_id, name=conf, split=split, streaming=True, token=self.token)
73
 
74
  sample_rows = []
 
 
 
75
  for i, row in enumerate(ds_stream):
76
- if i >= 5: break
77
- # Convert non-serializable objects (like PIL Images) to strings for preview
 
78
  clean_row = {}
79
  for k, v in row.items():
 
80
  if not isinstance(v, (str, int, float, bool, list, dict, type(None))):
81
- clean_row[k] = str(v)
82
  else:
83
  clean_row[k] = v
84
  sample_rows.append(clean_row)
85
 
86
- if not sample_rows:
87
- return {"status": "error", "message": "Dataset is empty."}
 
88
 
89
- # Analyze Columns
90
- analysis = {}
91
- keys = sample_rows[0].keys()
92
 
93
- for k in keys:
94
- sample_val = sample_rows[0][k]
95
- col_type = type(sample_val).__name__
96
- is_json_str = False
97
-
98
- # Check if string looks like JSON
99
- if isinstance(sample_val, str):
100
- s = sample_val.strip()
101
- if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
102
- try:
103
- json.loads(s)
104
- is_json_str = True
105
- except:
106
- pass
107
-
108
- analysis[k] = {
109
- "type": col_type,
110
- "is_json_string": is_json_str
111
- }
112
 
113
  return {
114
  "status": "success",
115
- "samples": sample_rows,
116
- "analysis": analysis
 
117
  }
118
  except Exception as e:
119
  return {"status": "error", "message": str(e)}
120
 
121
- def _apply_transformations(self, row, recipe):
122
  """
123
- Apply Parsing, Renaming, Dropping, Filtering
124
  """
125
- new_row = row.copy()
 
126
 
127
- # 1. JSON Expansions
128
- if "json_expansions" in recipe:
129
- for item in recipe["json_expansions"]:
130
- col_name = item["col"]
131
- target_keys = item["keys"]
132
-
133
- # Check if we need to parse string-json first
134
- source_data = new_row.get(col_name)
135
-
136
- parsed_obj = None
137
-
138
- # Case A: It is already a dict (Struct)
139
- if isinstance(source_data, dict):
140
- parsed_obj = source_data
141
- # Case B: It is a string (JSON String)
142
- elif isinstance(source_data, str):
143
  try:
144
- parsed_obj = json.loads(source_data)
145
  except:
146
- pass
147
-
148
- if parsed_obj:
149
- for key in target_keys:
150
- # Handle Nested Dot Notation (e.g. "meta.url")
151
- val = parsed_obj
152
- parts = key.split('.')
153
- try:
154
- for p in parts:
155
- val = val[p]
156
-
157
- # Create new column name (replace dots with underscores)
158
- clean_key = key.replace('.', '_')
159
- new_col_name = f"{col_name}_{clean_key}"
160
- new_row[new_col_name] = val
161
- except:
162
- # Key not found
163
- clean_key = key.replace('.', '_')
164
- new_row[f"{col_name}_{clean_key}"] = None
165
-
166
- # 2. Renames
167
- if "renames" in recipe:
168
- for old, new in recipe["renames"].items():
169
- if old in new_row:
170
- new_row[new] = new_row.pop(old)
171
-
172
- # 3. Drops
173
- if "drops" in recipe:
174
- for drop_col in recipe["drops"]:
175
- if drop_col in new_row:
176
- del new_row[drop_col]
177
 
 
 
 
 
 
 
 
 
 
178
  return new_row
179
 
180
- def _passes_filter(self, row, filters):
181
- if not filters: return True
182
- context = row.copy()
183
- for f_str in filters:
184
- try:
185
- # Safety: very basic eval.
186
- if not eval(f_str, {}, context):
187
- return False
188
- except:
189
- return False
190
- return True
 
 
 
 
 
191
 
192
  def process_and_push(self, source_id, config, split, target_id, recipe, max_rows=None):
193
- logger.info(f"Starting job: {source_id} ({config}/{split}) -> {target_id}")
194
-
195
  conf = config if config != 'default' else None
196
 
197
  def gen():
@@ -201,35 +181,32 @@ class DatasetCommandCenter:
201
  if max_rows and count >= int(max_rows):
202
  break
203
 
204
- # Transform first (so filters apply to NEW schema if needed,
205
- # OR change order depending on preference. Here we filter RAW data usually,
206
- # but for JSON extraction we often filter on extracted fields.
207
- # Let's Apply Transform -> Then Filter to allow filtering on extracted JSON fields)
208
-
209
- trans_row = self._apply_transformations(row, recipe)
210
-
211
- if self._passes_filter(trans_row, recipe.get("filters", [])):
212
- yield trans_row
213
  count += 1
214
 
215
- # Push to Hub
216
- # Note: We must infer features or let HF do it.
217
- # Using a generator allows HF to auto-detect the new schema.
218
  try:
 
219
  new_dataset = datasets.Dataset.from_generator(gen)
220
  new_dataset.push_to_hub(target_id, token=self.token)
221
  return {"status": "success", "rows_processed": len(new_dataset)}
222
  except Exception as e:
223
  logger.error(e)
224
  raise e
225
-
226
  def preview_transform(self, dataset_id, config, split, recipe):
227
  conf = config if config != 'default' else None
228
  ds_stream = load_dataset(dataset_id, name=conf, split=split, streaming=True, token=self.token)
229
  processed = []
230
- for i, row in enumerate(ds_stream):
 
231
  if len(processed) >= 5: break
232
- trans_row = self._apply_transformations(row, recipe)
233
- if self._passes_filter(trans_row, recipe.get("filters", [])):
234
- processed.append(trans_row)
 
 
235
  return processed
 
2
  import logging
3
  import datasets
4
  from datasets import load_dataset, get_dataset_config_names, get_dataset_infos
 
5
 
6
  # Configure logging
7
  logging.basicConfig(level=logging.INFO)
 
12
  self.token = token
13
 
14
  def get_dataset_metadata(self, dataset_id):
 
 
 
15
  try:
 
16
  try:
17
  configs = get_dataset_config_names(dataset_id, token=self.token)
18
+ except:
 
19
  configs = ['default']
 
 
 
 
20
 
21
+ # Try to fetch splits for the first config
22
  try:
 
23
  infos = get_dataset_infos(dataset_id, token=self.token)
24
+ first_conf = configs[0]
25
+ if first_conf in infos:
26
+ splits = list(infos[first_conf].splits.keys())
27
  else:
 
28
  splits = list(infos.values())[0].splits.keys()
29
  except:
 
30
  splits = ['train', 'test', 'validation']
31
 
32
+ return {"status": "success", "configs": configs, "splits": list(splits)}
 
 
 
 
33
  except Exception as e:
34
  return {"status": "error", "message": str(e)}
35
 
36
  def get_splits_for_config(self, dataset_id, config_name):
 
 
 
37
  try:
38
  infos = get_dataset_infos(dataset_id, config_name=config_name, token=self.token)
39
  splits = list(infos[config_name].splits.keys())
40
  return {"status": "success", "splits": splits}
41
+ except:
 
42
  return {"status": "success", "splits": ['train', 'test', 'validation']}
43
 
44
+ def _flatten_object(self, obj, parent_key='', sep='.'):
45
+ """
46
+ Recursively finds all keys in a nested dictionary (or JSON string).
47
+ Returns a dict of { 'path': sample_value }.
48
+ """
49
+ items = {}
50
+
51
+ # If it's a string, try to parse it as JSON first
52
+ if isinstance(obj, str):
53
+ obj = obj.strip()
54
+ if (obj.startswith('{') and obj.endswith('}')):
55
+ try:
56
+ obj = json.loads(obj)
57
+ except:
58
+ pass # It's just a string
59
+
60
+ if isinstance(obj, dict):
61
+ for k, v in obj.items():
62
+ new_key = f"{parent_key}{sep}{k}" if parent_key else k
63
+ items.update(self._flatten_object(v, new_key, sep=sep))
64
+ else:
65
+ # It's a leaf node (int, str, list, etc.)
66
+ items[parent_key] = obj
67
+
68
+ return items
69
+
70
  def inspect_dataset(self, dataset_id, config, split):
71
  """
72
+ Scans first N rows to build a map of ALL available fields (including nested JSON).
73
  """
74
  try:
 
75
  conf = config if config != 'default' else None
 
76
  ds_stream = load_dataset(dataset_id, name=conf, split=split, streaming=True, token=self.token)
77
 
78
  sample_rows = []
79
+ available_paths = set()
80
+
81
+ # We scan 20 rows to find schema variations (some JSON keys might be optional)
82
  for i, row in enumerate(ds_stream):
83
+ if i >= 20: break
84
+
85
+ # Store a clean version for UI preview
86
  clean_row = {}
87
  for k, v in row.items():
88
+ # Handle bytes/images
89
  if not isinstance(v, (str, int, float, bool, list, dict, type(None))):
90
+ clean_row[k] = f"<{type(v).__name__}>"
91
  else:
92
  clean_row[k] = v
93
  sample_rows.append(clean_row)
94
 
95
+ # Schema Inference: Flatten this row to find all possible dot-notation paths
96
+ flattened = self._flatten_object(row)
97
+ available_paths.update(flattened.keys())
98
 
99
+ # Sort paths naturally
100
+ sorted_paths = sorted(list(available_paths))
 
101
 
102
+ # Group paths by top-level column for the UI
103
+ schema_tree = {}
104
+ for path in sorted_paths:
105
+ root = path.split('.')[0]
106
+ if root not in schema_tree:
107
+ schema_tree[root] = []
108
+ schema_tree[root].append(path)
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
  return {
111
  "status": "success",
112
+ "samples": sample_rows[:5], # Send 5 to frontend
113
+ "schema_tree": schema_tree, # { 'meta': ['meta', 'meta.url', 'meta.id'] }
114
+ "dataset_id": dataset_id
115
  }
116
  except Exception as e:
117
  return {"status": "error", "message": str(e)}
118
 
119
+ def _get_value_by_path(self, row, path):
120
  """
121
+ Extracts value using dot notation, parsing JSON strings on the fly if needed.
122
  """
123
+ keys = path.split('.')
124
+ current_data = row
125
 
126
+ try:
127
+ for i, key in enumerate(keys):
128
+ # 1. If current_data is a JSON string, parse it
129
+ if isinstance(current_data, str):
 
 
 
 
 
 
 
 
 
 
 
 
130
  try:
131
+ current_data = json.loads(current_data)
132
  except:
133
+ return None # Parsing failed
134
+
135
+ # 2. Access key
136
+ if isinstance(current_data, dict) and key in current_data:
137
+ current_data = current_data[key]
138
+ else:
139
+ return None # Key missing
140
+
141
+ return current_data
142
+ except:
143
+ return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
 
145
+ def _apply_projection(self, row, recipe):
146
+ """
147
+ Constructs a NEW row based on the target columns defined in recipe.
148
+ """
149
+ new_row = {}
150
+ for target in recipe['columns']:
151
+ # target = { "name": "new_col_name", "source": "old_col.nested.key" }
152
+ val = self._get_value_by_path(row, target['source'])
153
+ new_row[target['name']] = val
154
  return new_row
155
 
156
+ def _passes_filter(self, row, filter_str):
157
+ """
158
+ Filters are applied to the SOURCE row structure (before projection).
159
+ """
160
+ if not filter_str or not filter_str.strip():
161
+ return True
162
+ try:
163
+ # We must handle cases where 'row' has nested objects unparsed?
164
+ # For simplicity, we eval on the raw row dictionary.
165
+ # Users can use python: `json.loads(row['meta'])['url'] == ...`
166
+ # Or we can support the flattened context?
167
+ # Let's stick to raw row context for now.
168
+ context = row.copy()
169
+ return eval(filter_str, {}, context)
170
+ except:
171
+ return False # Fail safe
172
 
173
  def process_and_push(self, source_id, config, split, target_id, recipe, max_rows=None):
174
+ logger.info(f"Starting projection job: {source_id} -> {target_id}")
 
175
  conf = config if config != 'default' else None
176
 
177
  def gen():
 
181
  if max_rows and count >= int(max_rows):
182
  break
183
 
184
+ # 1. Filter (Source)
185
+ if self._passes_filter(row, recipe.get('filter_rule')):
186
+ # 2. Project (Build new row)
187
+ new_row = self._apply_projection(row, recipe)
188
+ yield new_row
 
 
 
 
189
  count += 1
190
 
 
 
 
191
  try:
192
+ # Create new dataset from generator (Auto-infers schema from first yielded dict)
193
  new_dataset = datasets.Dataset.from_generator(gen)
194
  new_dataset.push_to_hub(target_id, token=self.token)
195
  return {"status": "success", "rows_processed": len(new_dataset)}
196
  except Exception as e:
197
  logger.error(e)
198
  raise e
199
+
200
  def preview_transform(self, dataset_id, config, split, recipe):
201
  conf = config if config != 'default' else None
202
  ds_stream = load_dataset(dataset_id, name=conf, split=split, streaming=True, token=self.token)
203
  processed = []
204
+
205
+ for row in ds_stream:
206
  if len(processed) >= 5: break
207
+
208
+ if self._passes_filter(row, recipe.get('filter_rule')):
209
+ new_row = self._apply_projection(row, recipe)
210
+ processed.append(new_row)
211
+
212
  return processed