Update processor.py
processor.py · CHANGED · +145 −106
@@ -3,7 +3,6 @@ import logging
 import datasets
 from datasets import load_dataset, get_dataset_config_names, get_dataset_infos
 
-# Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
@@ -17,8 +16,6 @@ class DatasetCommandCenter:
                 configs = get_dataset_config_names(dataset_id, token=self.token)
             except:
                 configs = ['default']
-
-            # Try to fetch splits for the first config
             try:
                 infos = get_dataset_infos(dataset_id, token=self.token)
                 first_conf = configs[0]
@@ -28,7 +25,6 @@ class DatasetCommandCenter:
                 splits = list(infos.values())[0].splits.keys()
             except:
                 splits = ['train', 'test', 'validation']
-
             return {"status": "success", "configs": configs, "splits": list(splits)}
         except Exception as e:
             return {"status": "error", "message": str(e)}
@@ -41,172 +37,215 @@ class DatasetCommandCenter:
         except:
             return {"status": "success", "splits": ['train', 'test', 'validation']}
 
-
         """
-
-
         """
-
 
-
         if isinstance(obj, str):
-
-            if (
             try:
-                obj = json.loads(
             except:
-                pass
 
         if isinstance(obj, dict):
             for k, v in obj.items():
-
-                items.
-
-
-
 
         return items
 
     def inspect_dataset(self, dataset_id, config, split):
-        """
-        Scans first N rows to build a map of ALL available fields (including nested JSON).
-        """
         try:
             conf = config if config != 'default' else None
             ds_stream = load_dataset(dataset_id, name=conf, split=split, streaming=True, token=self.token)
 
             sample_rows = []
-
-
-            # We scan 20 rows to find schema variations (some JSON keys might be optional)
             for i, row in enumerate(ds_stream):
-                if i >=
 
-                #
                 clean_row = {}
                 for k, v in row.items():
-                    # Handle bytes/images
                     if not isinstance(v, (str, int, float, bool, list, dict, type(None))):
-                        clean_row[k] =
                     else:
                         clean_row[k] = v
                 sample_rows.append(clean_row)
 
-            #
-
-
-
-
-
-
-
-
-
-
-
-
 
             return {
-                "status": "success",
-                "samples": sample_rows
-                "
                 "dataset_id": dataset_id
             }
         except Exception as e:
             return {"status": "error", "message": str(e)}
 
-    def _get_value_by_path(self, row, path):
-        """
-        Extracts value using dot notation, parsing JSON strings on the fly if needed.
-        """
-        keys = path.split('.')
-        current_data = row
-
-        try:
-            for i, key in enumerate(keys):
-                # 1. If current_data is a JSON string, parse it
-                if isinstance(current_data, str):
-                    try:
-                        current_data = json.loads(current_data)
-                    except:
-                        return None # Parsing failed
-
-                # 2. Access key
-                if isinstance(current_data, dict) and key in current_data:
-                    current_data = current_data[key]
-                else:
-                    return None # Key missing
-
-            return current_data
-        except:
-            return None
-
     def _apply_projection(self, row, recipe):
-        """
-        Constructs a NEW row based on the target columns defined in recipe.
-        """
         new_row = {}
-        for
-
-
-
         return new_row
 
     def _passes_filter(self, row, filter_str):
-
-        Filters are applied to the SOURCE row structure (before projection).
-        """
-        if not filter_str or not filter_str.strip():
-            return True
         try:
-            # We must handle cases where 'row' has nested objects unparsed?
-            # For simplicity, we eval on the raw row dictionary.
-            # Users can use python: `json.loads(row['meta'])['url'] == ...`
-            # Or we can support the flattened context?
-            # Let's stick to raw row context for now.
             context = row.copy()
             return eval(filter_str, {}, context)
         except:
-            return False
 
     def process_and_push(self, source_id, config, split, target_id, recipe, max_rows=None):
-        logger.info(f"
         conf = config if config != 'default' else None
 
         def gen():
             ds_stream = load_dataset(source_id, name=conf, split=split, streaming=True, token=self.token)
             count = 0
             for row in ds_stream:
-                if max_rows and count >= int(max_rows):
-                    break
 
-                # 1. Filter (Source)
                 if self._passes_filter(row, recipe.get('filter_rule')):
-
-                    new_row = self._apply_projection(row, recipe)
-                    yield new_row
                     count += 1
-
         try:
-            # Create new dataset from generator (Auto-infers schema from first yielded dict)
             new_dataset = datasets.Dataset.from_generator(gen)
             new_dataset.push_to_hub(target_id, token=self.token)
             return {"status": "success", "rows_processed": len(new_dataset)}
         except Exception as e:
-
-
-
     def preview_transform(self, dataset_id, config, split, recipe):
         conf = config if config != 'default' else None
         ds_stream = load_dataset(dataset_id, name=conf, split=split, streaming=True, token=self.token)
         processed = []
-
         for row in ds_stream:
             if len(processed) >= 5: break
-
             if self._passes_filter(row, recipe.get('filter_rule')):
-
-                processed.append(new_row)
-
         return processed
@@ -41,172 +37,215 @@ class DatasetCommandCenter:
         except:
             return {"status": "success", "splits": ['train', 'test', 'validation']}
 
+    # --- HELPER: Recursive JSON/Dot Notation Getter ---
+    def _get_value_by_path(self, obj, path):
+        if not path: return obj
+        keys = path.split('.')
+        current = obj
+
+        for key in keys:
+            # Auto-parse JSON string if encountered
+            if isinstance(current, str):
+                s = current.strip()
+                if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
+                    try:
+                        current = json.loads(s)
+                    except:
+                        pass
+
+            if isinstance(current, dict) and key in current:
+                current = current[key]
+            else:
+                return None
+        return current
+
+    # --- HELPER: List Search Logic ---
+    def _extract_from_list_logic(self, row, source_col, filter_key, filter_val, target_path):
         """
+        Logic: Look inside row[source_col] (which is a list).
+        Find first item where item[filter_key] == filter_val.
+        Then extract item[target_path].
         """
+        # 1. Get the list (handling JSON string if needed)
+        data = row.get(source_col)
+        if isinstance(data, str):
+            try:
+                data = json.loads(data)
+            except:
+                return None
+
+        if not isinstance(data, list):
+            return None
+
+        # 2. Search the list
+        matched_item = None
+        for item in data:
+            # We treat values as strings for comparison to be safe
+            if str(item.get(filter_key, '')) == str(filter_val):
+                matched_item = item
+                break
+
+        if matched_item:
+            # 3. Extract the target (supporting nested json parsing via dot notation)
+            # e.g. target_path = "content.analysis"
+            return self._get_value_by_path(matched_item, target_path)
 
+        return None
+
+    def _flatten_schema(self, obj, parent='', visited=None):
+        if visited is None: visited = set()
+        items = []
+
+        # Avoid infinite recursion
+        if id(obj) in visited: return []
+        visited.add(id(obj))
+
+        # Handle JSON strings
         if isinstance(obj, str):
+            s = obj.strip()
+            if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
                 try:
+                    obj = json.loads(s)
                 except:
+                    pass
 
         if isinstance(obj, dict):
             for k, v in obj.items():
+                full_key = f"{parent}.{k}" if parent else k
+                items.append((full_key, type(v).__name__))
+                items.extend(self._flatten_schema(v, full_key, visited))
+        elif isinstance(obj, list) and len(obj) > 0:
+            # For lists, we just peek at the first item to guess schema
+            full_key = f"{parent}[]" if parent else "[]"
+            items.append((parent, "List")) # Mark the parent as a List
+            items.extend(self._flatten_schema(obj[0], full_key, visited))
 
         return items
 
     def inspect_dataset(self, dataset_id, config, split):
         try:
             conf = config if config != 'default' else None
             ds_stream = load_dataset(dataset_id, name=conf, split=split, streaming=True, token=self.token)
 
             sample_rows = []
+            schema_map = {} # stores { "col_name": { "is_list": bool, "keys": [] } }
+
             for i, row in enumerate(ds_stream):
+                if i >= 10: break
 
+                # Create clean sample for UI
                 clean_row = {}
                 for k, v in row.items():
                     if not isinstance(v, (str, int, float, bool, list, dict, type(None))):
+                        clean_row[k] = str(v)
                     else:
                         clean_row[k] = v
                 sample_rows.append(clean_row)
 
+                # Analyze Schema
+                for k, v in row.items():
+                    if k not in schema_map:
+                        schema_map[k] = {"is_list": False, "keys": set()}
+
+                    # Check if it's a list (or json-string list)
+                    val = v
+                    if isinstance(val, str):
+                        try:
+                            val = json.loads(val)
+                        except: pass
+
+                    if isinstance(val, list):
+                        schema_map[k]["is_list"] = True
+                        if len(val) > 0 and isinstance(val[0], dict):
+                            schema_map[k]["keys"].update(val[0].keys())
+                    elif isinstance(val, dict):
+                        schema_map[k]["keys"].update(val.keys())
+
+            # Format schema for UI
+            formatted_schema = {}
+            for k, info in schema_map.items():
+                formatted_schema[k] = {
+                    "type": "List" if info["is_list"] else "Object",
+                    "keys": list(info["keys"])
+                }
 
             return {
+                "status": "success",
+                "samples": sample_rows,
+                "schema": formatted_schema,
                 "dataset_id": dataset_id
             }
         except Exception as e:
             return {"status": "error", "message": str(e)}
 
     def _apply_projection(self, row, recipe):
         new_row = {}
+        for col_def in recipe['columns']:
+            t_type = col_def.get('type', 'simple')
+
+            if t_type == 'simple':
+                # Standard Dot Notation
+                new_row[col_def['name']] = self._get_value_by_path(row, col_def['source'])
+
+            elif t_type == 'list_search':
+                # GET x WHERE y=z
+                val = self._extract_from_list_logic(
+                    row,
+                    col_def['source'],
+                    col_def['filter_key'],
+                    col_def['filter_val'],
+                    col_def['target_key']
+                )
+                new_row[col_def['name']] = val
+
+            elif t_type == 'python':
+                # Advanced Python Eval
+                try:
+                    context = row.copy()
+                    # We inject 'json' module into context for user scripts
+                    context['json'] = json
+                    val = eval(col_def['expression'], {}, context)
+                    new_row[col_def['name']] = val
+                except:
+                    new_row[col_def['name']] = None
+
         return new_row
 
     def _passes_filter(self, row, filter_str):
+        if not filter_str: return True
         try:
             context = row.copy()
             return eval(filter_str, {}, context)
         except:
+            return False
 
     def process_and_push(self, source_id, config, split, target_id, recipe, max_rows=None):
+        logger.info(f"Job started: {source_id}")
         conf = config if config != 'default' else None
 
         def gen():
             ds_stream = load_dataset(source_id, name=conf, split=split, streaming=True, token=self.token)
             count = 0
             for row in ds_stream:
+                if max_rows and count >= int(max_rows): break
 
                 if self._passes_filter(row, recipe.get('filter_rule')):
+                    yield self._apply_projection(row, recipe)
                     count += 1
+
         try:
             new_dataset = datasets.Dataset.from_generator(gen)
             new_dataset.push_to_hub(target_id, token=self.token)
             return {"status": "success", "rows_processed": len(new_dataset)}
         except Exception as e:
+            return {"status": "error", "message": str(e)}
+
     def preview_transform(self, dataset_id, config, split, recipe):
         conf = config if config != 'default' else None
         ds_stream = load_dataset(dataset_id, name=conf, split=split, streaming=True, token=self.token)
         processed = []
         for row in ds_stream:
             if len(processed) >= 5: break
             if self._passes_filter(row, recipe.get('filter_rule')):
+                processed.append(self._apply_projection(row, recipe))
         return processed
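Below is a minimal usage sketch of the recipe-driven pipeline this commit introduces. The recipe shape (a filter_rule expression plus a columns list whose entries use type 'simple', 'list_search', or 'python' together with name, source, filter_key, filter_val, target_key, and expression) is taken from _apply_projection and _passes_filter above; the dataset ids, column names, and the token-taking constructor are illustrative assumptions, not part of this diff.

    from processor import DatasetCommandCenter  # hypothetical import path for this file

    center = DatasetCommandCenter(token="hf_...")  # constructor signature assumed

    recipe = {
        # Evaluated against the raw source row by _passes_filter (eval over the row dict).
        "filter_rule": "len(text) > 100",
        "columns": [
            # 'simple': dot-notation lookup; JSON strings are parsed on the fly.
            {"type": "simple", "name": "question", "source": "meta.question"},
            # 'list_search': find the item in row['messages'] where role == 'assistant',
            # then pull its 'content' field (GET target_key WHERE filter_key == filter_val).
            {"type": "list_search", "name": "assistant_reply", "source": "messages",
             "filter_key": "role", "filter_val": "assistant", "target_key": "content"},
            # 'python': arbitrary expression eval'd against the row; 'json' is injected.
            {"type": "python", "name": "n_turns", "expression": "len(messages)"},
        ],
    }

    # Preview the first five projected rows, then run the full job and push to the Hub.
    print(center.preview_transform("user/source-dataset", "default", "train", recipe))
    result = center.process_and_push("user/source-dataset", "default", "train",
                                     "user/target-dataset", recipe, max_rows=1000)
    print(result)

preview_transform stops after five projected rows, while process_and_push streams the whole split through the same filter and projection and pushes the generated dataset to target_id.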