Spaces:

broadfield-dev
/

HF-Dataset-Commander

Paused

App Files Files Community

broadfield-dev committed on Dec 30, 2025

Commit

2f3537a

verified ·

1 Parent(s): 5015673

Update processor.py

Browse files

Files changed (1) hide show

processor.py +274 -95

processor.py CHANGED Viewed

@@ -2,10 +2,11 @@ import json
 import logging
 import datasets
 import math
 from datasets import load_dataset, get_dataset_config_names, get_dataset_infos
 from huggingface_hub import HfApi, DatasetCard, DatasetCardData
-import re
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -14,41 +15,78 @@ class DatasetCommandCenter:
         self.token = token
         self.api = HfApi(token=token)
-    # --- 1. INSPECTION ---
     def get_dataset_metadata(self, dataset_id):
         configs = ['default']
         splits = ['train', 'test', 'validation']
         license_name = "unknown"
         try:
             try:
-                c = get_dataset_config_names(dataset_id, token=self.token)
-                if c: configs = c
-            except: pass
             try:
                 infos = get_dataset_infos(dataset_id, token=self.token)
-                sel = configs[0]
-                info = infos.get(sel) or infos.get('default') or (list(infos.values())[0] if infos else None)
                 if info:
                     splits = list(info.splits.keys())
                     license_name = info.license or "unknown"
-            except: pass
-            return {"status": "success", "configs": configs, "splits": splits, "license_detected": license_name}
         except Exception as e:
             return {"status": "error", "message": str(e)}
     def get_splits_for_config(self, dataset_id, config_name):
         try:
             infos = get_dataset_infos(dataset_id, config_name=config_name, token=self.token)
-            splits = list(infos[config_name].splits.keys())
             return {"status": "success", "splits": splits}
         except:
             return {"status": "success", "splits": ['train', 'test', 'validation']}
     def _sanitize_for_json(self, obj):
-        """Recursively cleans data for JSON serialization (Fixes NaN crash)."""
         if isinstance(obj, float):
             if math.isnan(obj) or math.isinf(obj):
                 return None
@@ -60,196 +98,337 @@ class DatasetCommandCenter:
         elif isinstance(obj, (str, int, bool, type(None))):
             return obj
         else:
             return str(obj)
     def _flatten_object(self, obj, parent_key='', sep='.'):
         items = {}
         if isinstance(obj, str):
             s = obj.strip()
             if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
-                try: obj = json.loads(s)
-                except: pass
         if isinstance(obj, dict):
             for k, v in obj.items():
                 new_key = f"{parent_key}{sep}{k}" if parent_key else k
-                items.update(self._flatten_object(v, new_key, sep))
         elif isinstance(obj, list):
-            items[parent_key or "list"] = "List"
         else:
             items[parent_key] = type(obj).__name__
         return items
     def inspect_dataset(self, dataset_id, config, split):
         try:
             conf = config if config != 'default' else None
             ds_stream = load_dataset(dataset_id, name=conf, split=split, streaming=True, token=self.token)
             sample_rows = []
             available_paths = set()
-            schema_map = {}
             for i, row in enumerate(ds_stream):
                 if i >= 10: break
-                # Sanitize entire row to prevent JSON crash on UI
                 clean_row = self._sanitize_for_json(row)
                 sample_rows.append(clean_row)
-                # Schema Discovery
                 flattened = self._flatten_object(row)
                 available_paths.update(flattened.keys())
-                # List Detection
                 for k, v in row.items():
-                    if k not in schema_map: schema_map[k] = {"type": "Object"}
                     val = v
                     if isinstance(val, str):
                         try: val = json.loads(val)
                         except: pass
-                    if isinstance(val, list): schema_map[k]["type"] = "List"
             sorted_paths = sorted(list(available_paths))
             schema_tree = {}
             for path in sorted_paths:
                 root = path.split('.')[0]
-                if root not in schema_tree: schema_tree[root] = []
                 schema_tree[root].append(path)
             return {
                 "status": "success",
                 "samples": sample_rows,
-                "schema_tree": schema_tree,
-                "schema": schema_map,
                 "dataset_id": dataset_id
             }
         except Exception as e:
             return {"status": "error", "message": str(e)}
-    # --- 2. LOGIC ---
     def _get_value_by_path(self, obj, path):
         if not path: return obj
         keys = path.split('.')
         current = obj
         for key in keys:
             if isinstance(current, str):
                 s = current.strip()
                 if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
-                    try: current = json.loads(s)
-                    except: pass
             if isinstance(current, dict) and key in current:
                 current = current[key]
-            else: return None
         return current
     def _extract_from_list_logic(self, row, source_col, filter_key, filter_val, target_path):
         data = row.get(source_col)
         if isinstance(data, str):
-            try: data = json.loads(data)
-            except: return None
-        if not isinstance(data, list): return None
-        matched = None
         for item in data:
             if str(item.get(filter_key, '')) == str(filter_val):
-                matched = item
                 break
-        if matched: return self._get_value_by_path(matched, target_path)
         return None
     def _apply_projection(self, row, recipe):
         new_row = {}
-        # Context
         eval_context = row.copy()
         eval_context['row'] = row
         eval_context['json'] = json
         eval_context['re'] = re
-        for col in recipe['columns']:
             try:
-                c_type = col.get('type', 'simple')
-                name = col['name']
-                if c_type == 'simple':
-                    new_row[name] = self._get_value_by_path(row, col['source'])
-                elif c_type == 'list_search':
-                    new_row[name] = self._extract_from_list_logic(row, col['source'], col['filter_key'], col['filter_val'], col['target_key'])
-                elif c_type == 'python':
-                    new_row[name] = eval(col['expression'], {}, eval_context)
             except Exception as e:
-                raise ValueError(f"Column '{col['name']}' error: {e}")
         return new_row
-    # --- 3. PREVIEW & PUSH ---
-    def preview_transform(self, dataset_id, config, split, recipe):
-        conf = config if config != 'default' else None
-        try:
-            ds = load_dataset(dataset_id, name=conf, split=split, streaming=True, token=self.token)
-            out = []
-            for i, row in enumerate(ds):
-                if len(out) >= 5: break
-                # Filter
-                if recipe.get('filter_rule'):
-                    try:
-                        ctx = row.copy()
-                        ctx['row'] = row
-                        ctx['json'] = json
-                        ctx['re'] = re
-                        if not eval(recipe['filter_rule'], {}, ctx): continue
-                    except: continue # Skip crashing filters in preview
-                try:
-                    # Apply & Sanitize
-                    proj = self._apply_projection(row, recipe)
-                    out.append(self._sanitize_for_json(proj))
-                except Exception as e:
-                    out.append({"_preview_error": str(e)})
-            return out
-        except Exception as e:
-            raise e
     def _generate_card(self, source_id, target_id, recipe, license_name):
-        content = f"# {target_id}\nDerived from [{source_id}](https://huggingface.co/datasets/{source_id}).\n\n## Recipe\n"
-        for c in recipe['columns']:
-            content += f"- **{c['name']}**: {c.get('type')} ({c.get('source') or c.get('expression')})\n"
-        content += f"\n**License:** {license_name}"
-        return DatasetCard.from_template(DatasetCardData(license=license_name, tags=["etl"]), content=content)
     def process_and_push(self, source_id, config, split, target_id, recipe, max_rows=None, new_license=None):
-        logger.info(f"Pushing {source_id} -> {target_id}")
         conf = config if config != 'default' else None
         def gen():
-            ds = load_dataset(source_id, name=conf, split=split, streaming=True, token=self.token)
             count = 0
-            for i, row in enumerate(ds):
-                if max_rows and count >= int(max_rows): break
-                # Filter
                 if recipe.get('filter_rule'):
                     try:
                         ctx = row.copy()
                         ctx['row'] = row
                         ctx['json'] = json
                         ctx['re'] = re
-                        if not eval(recipe['filter_rule'], {}, ctx): continue
-                    except Exception as e: raise ValueError(f"Filter error row {i}: {e}")
-                # Project
                 try:
                     yield self._apply_projection(row, recipe)
                     count += 1
-                except Exception as e: raise ValueError(f"Row {i} error: {e}")
         try:
-            new_ds = datasets.Dataset.from_generator(gen)
-            new_ds.push_to_hub(target_id, token=self.token)
             try:
                 card = self._generate_card(source_id, target_id, recipe, new_license or "unknown")
                 card.push_to_hub(target_id, token=self.token)
-            except: pass
-            return {"status": "success", "rows_processed": len(new_ds)}
         except Exception as e:
-            return {"status": "failed", "error": str(e)}

 import logging
 import datasets
 import math
+import re
 from datasets import load_dataset, get_dataset_config_names, get_dataset_infos
 from huggingface_hub import HfApi, DatasetCard, DatasetCardData
+# Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
         self.token = token
         self.api = HfApi(token=token)
+    # ==========================================
+    # 1. METADATA & SCHEMA INSPECTION
+    # ==========================================
     def get_dataset_metadata(self, dataset_id):
         """
         Fetch the config names, split names and license of a dataset.

         Both lookups are best-effort: when one fails, the corresponding
         default is kept and the failure is logged instead of being
         silently discarded.

         Args:
             dataset_id: Hub dataset id passed to ``get_dataset_config_names``
                 and ``get_dataset_infos``.

         Returns:
             ``{"status": "success", "configs", "splits", "license_detected"}``
             or ``{"status": "error", "message"}`` on an unexpected failure.
         """
         configs = ['default']
         splits = ['train', 'test', 'validation']
         license_name = "unknown"
         try:
             # 1. Fetch Configs
             try:
                 found_configs = get_dataset_config_names(dataset_id, token=self.token)
                 if found_configs:
                     configs = found_configs
             except Exception as e:
                 # Keep the default config, but leave a trace of why.
                 logger.warning(f"Could not list configs for {dataset_id}: {e}")
             # 2. Fetch Metadata (Splits & License)
             try:
                 selected = configs[0]
                 # This API call can fail on some datasets, so we wrap it safely
                 infos = get_dataset_infos(dataset_id, token=self.token)
                 info = None
                 if selected in infos:
                     info = infos[selected]
                 elif 'default' in infos:
                     info = infos['default']
                 elif infos:
                     info = next(iter(infos.values()))
                 if info:
                     # A missing splits mapping must not prevent the license
                     # from being picked up.
                     if info.splits is not None:
                         splits = list(info.splits.keys())
                     license_name = info.license or "unknown"
             except Exception as e:
                 # Keep defaults if metadata fails
                 logger.warning(f"Could not read dataset infos for {dataset_id}: {e}")
             return {
                 "status": "success",
                 "configs": configs,
                 "splits": splits,
                 "license_detected": license_name
             }
         except Exception as e:
             return {"status": "error", "message": str(e)}
     def get_splits_for_config(self, dataset_id, config_name):
         """
         Return the split names for one config (used to refresh the Split
         dropdown when the user changes the Config).

         Always answers with ``status == "success"``: if the metadata lookup
         fails, a generic list of split names is returned and the failure
         is logged.

         Returns:
             ``{"status": "success", "splits": [<split name>, ...]}`` where
             ``splits`` is always a plain list so it can be JSON-serialised.
         """
         try:
             infos = get_dataset_infos(dataset_id, config_name=config_name, token=self.token)
             if config_name in infos:
                 splits = list(infos[config_name].splits.keys())
             elif len(infos) > 0:
                 # Materialise the keys: a dict_keys view is not JSON-serialisable.
                 splits = list(next(iter(infos.values())).splits.keys())
             else:
                 splits = ['train', 'test']
             return {"status": "success", "splits": splits}
         except Exception as e:
             logger.warning(f"Could not read splits for {dataset_id}/{config_name}: {e}")
             return {"status": "success", "splits": ['train', 'test', 'validation']}
     def _sanitize_for_json(self, obj):
+        """
+        Recursively cleans data for JSON serialization.
+        CRITICAL FIX: Prevents 'Preview' crashes caused by NaN, Infinity, or Timestamps.
+        """
         if isinstance(obj, float):
             if math.isnan(obj) or math.isinf(obj):
                 return None
         elif isinstance(obj, (str, int, bool, type(None))):
             return obj
         else:
+            # Convert complex objects (Images, Dates) to string
             return str(obj)
     def _flatten_object(self, obj, parent_key='', sep='.'):
+        """
+        Recursively finds all keys in nested dicts or JSON strings
+        to populate the 'Simple Path' dropdown in the UI.
+        """
         items = {}
+        # Transparently parse JSON strings
         if isinstance(obj, str):
             s = obj.strip()
             if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
+                try:
+                    obj = json.loads(s)
+                except:
+                    pass
         if isinstance(obj, dict):
             for k, v in obj.items():
                 new_key = f"{parent_key}{sep}{k}" if parent_key else k
+                items.update(self._flatten_object(v, new_key, sep=sep))
         elif isinstance(obj, list):
+            # We mark lists but do not recurse infinitely
+            new_key = f"{parent_key}" if parent_key else "list_content"
+            items[new_key] = "List"
         else:
+            # Leaf node
             items[parent_key] = type(obj).__name__
         return items
     def inspect_dataset(self, dataset_id, config, split):
         """
         Scan the first 10 streamed rows of a split and build the schema
         structures used by the UI.

         Args:
             dataset_id: Dataset id passed to ``load_dataset``.
             config: Config name; ``'default'`` is translated to ``None``.
             split: Split name to stream.

         Returns:
             On success a dict with ``samples`` (sanitised rows),
             ``schema_tree`` (top-level key -> sorted list of flattened
             paths), ``schema`` (top-level key -> ``{"type": "Object" |
             "List"}``) and ``dataset_id``. On failure
             ``{"status": "error", "message"}``.
         """
         try:
             conf = config if config != 'default' else None
             ds_stream = load_dataset(dataset_id, name=conf, split=split, streaming=True, token=self.token)
             sample_rows = []
             available_paths = set()
             schema_map = {}  # Used for List Mode detection
             for i, row in enumerate(ds_stream):
                 if i >= 10:
                     break
                 # 1. Clean row for UI Preview (Handle NaN/Objects)
                 sample_rows.append(self._sanitize_for_json(row))
                 # 2. Deep Flattening for "Simple Path" dropdowns
                 available_paths.update(self._flatten_object(row))
                 # 3. Top Level Analysis for "List Mode" detection
                 for k, v in row.items():
                     schema_map.setdefault(k, {"type": "Object"})
                     val = v
                     if isinstance(val, str):
                         try:
                             val = json.loads(val)
                         except (ValueError, RecursionError):
                             # Plain (non-JSON) string column.
                             pass
                     if isinstance(val, list):
                         schema_map[k]["type"] = "List"
             # Reconstruct Schema Tree for UI grouping
             schema_tree = {}
             for path in sorted(available_paths):
                 schema_tree.setdefault(path.split('.')[0], []).append(path)
             return {
                 "status": "success",
                 "samples": sample_rows,
                 "schema_tree": schema_tree,  # Used by Simple Path Dropdown
                 "schema": schema_map,        # Used by List Mode Dropdown
                 "dataset_id": dataset_id
             }
         except Exception as e:
             return {"status": "error", "message": str(e)}
+    # ==========================================
+    # 2. CORE EXTRACTION LOGIC
+    # ==========================================
     def _get_value_by_path(self, obj, path):
+        """
+        Navigates dot notation (meta.user.id), automatically parsing
+        JSON strings if encountered along the path.
+        """
         if not path: return obj
         keys = path.split('.')
         current = obj
         for key in keys:
+            # Auto-parse JSON string if encountered
             if isinstance(current, str):
                 s = current.strip()
                 if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
+                    try:
+                        current = json.loads(s)
+                    except:
+                        pass
             if isinstance(current, dict) and key in current:
                 current = current[key]
+            else:
+                return None # Path broken
         return current
     def _extract_from_list_logic(self, row, source_col, filter_key, filter_val, target_path):
+        """
+        Logic for: FROM source_col FIND ITEM WHERE filter_key == filter_val EXTRACT target_path
+        """
         data = row.get(source_col)
+        # Parse if string
         if isinstance(data, str):
+            try:
+                data = json.loads(data)
+            except:
+                return None
+        if not isinstance(data, list):
+            return None
+        matched_item = None
         for item in data:
+            # String comparison for safety
             if str(item.get(filter_key, '')) == str(filter_val):
+                matched_item = item
                 break
+        if matched_item:
+            return self._get_value_by_path(matched_item, target_path)
         return None
     def _apply_projection(self, row, recipe):
+        """
+        Builds the new row based on the recipe.
+        Raises ValueError if user Python code fails (Fail Fast).
+        """
         new_row = {}
+        # Setup Eval Context (Variables available in Python Mode)
         eval_context = row.copy()
         eval_context['row'] = row
         eval_context['json'] = json
         eval_context['re'] = re
+        for col_def in recipe['columns']:
+            t_type = col_def.get('type', 'simple')
+            target_col = col_def['name']
             try:
+                if t_type == 'simple':
+                    new_row[target_col] = self._get_value_by_path(row, col_def['source'])
+                elif t_type == 'list_search':
+                    new_row[target_col] = self._extract_from_list_logic(
+                        row,
+                        col_def['source'],
+                        col_def['filter_key'],
+                        col_def['filter_val'],
+                        col_def['target_key']
+                    )
+                elif t_type == 'python':
+                    # Execute user code
+                    expression = col_def['expression']
+                    val = eval(expression, {}, eval_context)
+                    new_row[target_col] = val
             except Exception as e:
+                # Fail Fast: Stop the generator immediately if a column fails
+                raise ValueError(f"Column '{target_col}' failed: {str(e)}")
         return new_row
+    # ==========================================
+    # 3. DOCUMENTATION (MODEL CARD)
+    # ==========================================
     def _generate_card(self, source_id, target_id, recipe, license_name):
         """
         Create the README (dataset card) with a Markdown table of the
         operations in ``recipe``.

         The card text is assembled explicitly as a YAML metadata block
         followed by the Markdown body. Passing the body to
         ``DatasetCard.from_template(..., content=...)`` only hands it over
         as a template variable, which the default template does not render.

         Args:
             source_id: Id of the dataset the data was derived from.
             target_id: Id of the dataset being created; its last path
                 segment is used as the title.
             recipe: Dict with ``columns`` and an optional ``filter_rule``.
             license_name: License identifier written to metadata and text.

         Returns:
             A ``DatasetCard`` ready to be pushed.
         """
         def cell(value):
             # Keep user-supplied text inside its Markdown table cell.
             return str(value).replace("|", "\\|").replace("\n", " ")

         card_data = DatasetCardData(
             language="en",
             license=license_name,
             tags=["dataset-command-center", "etl", "generated-dataset"],
             # NOTE(review): `base_model` is a model-card field; confirm whether
             # `source_datasets` is the intended key for a dataset card.
             base_model=source_id,
         )
         lines = [
             "",
             f"# {target_id.split('/')[-1]}",
             "",
             f"This dataset is a transformation of [{source_id}](https://huggingface.co/datasets/{source_id}).",
             "It was generated using the **Hugging Face Dataset Command Center**.",
             "",
             "## Transformation Recipe",
             "",
             "The following operations were applied to the source data:",
             "",
             "| Target Column | Operation Type | Source / Logic |",
             "|---------------|----------------|----------------|",
         ]
         for col in recipe['columns']:
             c_type = col.get('type', 'simple')
             c_name = col['name']
             c_src = col.get('source', '-')
             logic = "-"
             if c_type == 'simple':
                 logic = f"Mapped from `{c_src}`"
             elif c_type == 'list_search':
                 logic = f"Get `{col['target_key']}` where `{col['filter_key']} == {col['filter_val']}`"
             elif c_type == 'python':
                 logic = f"Python: `{col.get('expression')}`"
             lines.append(f"| **{cell(c_name)}** | {cell(c_type)} | {cell(logic)} |")
         content = "\n".join(lines) + "\n"
         if recipe.get('filter_rule'):
             content += f"\n### Row Filtering\n**Filter Applied:** `{recipe['filter_rule']}`\n"
         content += f"\n## Original License\nThis dataset inherits the license: `{license_name}` from the source."
         # Metadata block + body, parsed by the card constructor.
         card = DatasetCard(f"---\n{card_data.to_yaml()}\n---\n{content}")
         return card
+    # ==========================================
+    # 4. EXECUTION
+    # ==========================================
     def process_and_push(self, source_id, config, split, target_id, recipe, max_rows=None, new_license=None):
         """
         Stream the source split, apply the recipe and push the result
         (plus a dataset card) to ``target_id``.

         Args:
             source_id: Dataset id to read from.
             config: Config name; ``'default'`` is translated to ``None``.
             split: Split name to stream.
             target_id: Repository id to push to.
             recipe: Dict with ``columns`` and an optional ``filter_rule``
                 expression evaluated per row.
             max_rows: Optional cap on the number of *emitted* rows; a falsy
                 value means no cap.
             new_license: License written to the card (``"unknown"`` if unset).

         Returns:
             ``{"status": "success", "rows_processed": n}`` or
             ``{"status": "failed", "error": message}``. A failure to push
             the card alone is logged and does not fail the job.
         """
         logger.info(f"Job started: {source_id} -> {target_id}")
         conf = config if config != 'default' else None
         filter_rule = recipe.get('filter_rule')

         def gen():
             # Convert the cap once instead of on every row.
             limit = int(max_rows) if max_rows else None
             ds_stream = load_dataset(source_id, name=conf, split=split, streaming=True, token=self.token)
             count = 0
             for i, row in enumerate(ds_stream):
                 if limit is not None and count >= limit:
                     break
                 # 1. Filter
                 if filter_rule:
                     try:
                         ctx = dict(row)
                         ctx['row'] = row
                         ctx['json'] = json
                         ctx['re'] = re
                         # SECURITY: eval() runs arbitrary recipe code; trusted
                         # recipes only. The context is passed as globals so
                         # row columns are visible inside generator
                         # expressions and lambdas used by the rule.
                         keep = eval(filter_rule, ctx)
                     except Exception as e:
                         raise ValueError(f"Filter crashed on row {i}: {e}") from e
                     if not keep:
                         continue
                 # 2. Projection
                 try:
                     projected = self._apply_projection(row, recipe)
                 except ValueError:
                     # Pass the specific column error up unchanged
                     raise
                 except Exception as e:
                     raise ValueError(f"Unexpected crash on row {i}: {e}") from e
                 yield projected
                 count += 1

         try:
             # 1. Process & Push Data
             new_dataset = datasets.Dataset.from_generator(gen)
             new_dataset.push_to_hub(target_id, token=self.token)
             # 2. Generate & Push Card
             try:
                 card = self._generate_card(source_id, target_id, recipe, new_license or "unknown")
                 card.push_to_hub(target_id, token=self.token)
             except Exception as e:
                 logger.error(f"Failed to push Dataset Card: {e}")
             return {"status": "success", "rows_processed": len(new_dataset)}
         except Exception as e:
             logger.error(f"Job Failed: {e}")
             return {"status": "failed", "error": str(e)}
+    # ==========================================
+    # 5. PREVIEW
+    # ==========================================
+    def preview_transform(self, dataset_id, config, split, recipe):
+        conf = config if config != 'default' else None
+        try:
+            ds_stream = load_dataset(dataset_id, name=conf, split=split, streaming=True, token=self.token)
+            processed = []
+            for i, row in enumerate(ds_stream):
+                if len(processed) >= 5: break
+                # Check Filter
+                passed = True
+                if recipe.get('filter_rule'):
+                    try:
+                        ctx = row.copy()
+                        ctx['row'] = row
+                        ctx['json'] = json
+                        ctx['re'] = re
+                        if not eval(recipe['filter_rule'], {}, ctx):
+                            passed = False
+                    except:
+                        passed = False # Skip invalid rows in preview
+                if passed:
+                    try:
+                        new_row = self._apply_projection(row, recipe)
+                        # CRITICAL: Sanitize for JSON (handles NaNs, Dates, Images)
+                        clean_new_row = self._sanitize_for_json(new_row)
+                        processed.append(clean_new_row)
+                    except Exception as e:
+                        # In preview, we want to see the error, not crash
+                        processed.append({"_preview_error": f"Error: {str(e)}"})
+            return processed
         except Exception as e:
+             # Return global error if loading fails
+             raise e