Update processor.py
processor.py
CHANGED (+115 −229)
@@ -3,13 +3,18 @@ import logging
 import datasets
 from datasets import load_dataset, get_dataset_config_names, get_dataset_infos
 from huggingface_hub import HfApi, DatasetCard, DatasetCardData
+import re
 
+# Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 class DatasetCommandCenter:
     def __init__(self, token=None):
         self.token = token
+        self.api = HfApi(token=token)
+
+    # --- 1. METADATA & INSPECTION ---
 
     def get_dataset_metadata(self, dataset_id):
         configs = []
@@ -20,26 +25,20 @@
         try:
             configs = get_dataset_config_names(dataset_id, token=self.token)
         except Exception as e:
-            logger.warning(f"Could not fetch configs: {e}")
-            # Fallback: if we can't get configs, assume 'default'
+            logger.warning(f"Could not fetch configs: {e}")
             configs = ['default']
 
         # 2. Get Splits & License
-        # Many datasets return 404 on dataset_infos.json. We must catch this.
         try:
             selected_config = configs[0] if configs else 'default'
-
-            # This API call frequently fails on datasets without metadata cards
             infos = get_dataset_infos(dataset_id, token=self.token)
 
-            # Attempt to find the info object for our config
             info_obj = None
             if selected_config in infos:
                 info_obj = infos[selected_config]
             elif 'default' in infos:
                 info_obj = infos['default']
             elif len(infos) > 0:
-                # Fallback to the first available if names don't match
                 info_obj = list(infos.values())[0]
 
             if info_obj:
@@ -47,12 +46,9 @@
             license_name = info_obj.license or "unknown"
 
         except Exception as e:
-            logger.warning(f"Could not fetch metadata: {e}")
-            # Safe Fallback if metadata fails
+            logger.warning(f"Metadata fetch fallback: {e}")
             splits = ['train', 'test', 'validation']
-            license_name = "unknown"
 
-        # Ensure we NEVER return None for lists
         return {
             "status": "success",
             "configs": configs if configs else ['default'],
@@ -61,107 +57,18 @@
         }
 
     def get_splits_for_config(self, dataset_id, config_name):
-        splits = []
         try:
             infos = get_dataset_infos(dataset_id, config_name=config_name, token=self.token)
-
             if config_name in infos:
                 splits = list(infos[config_name].splits.keys())
             elif len(infos) > 0:
-                # Fallback to first available
                 splits = list(infos.values())[0].splits.keys()
-
-        except Exception as e:
-            logger.warning(f"Could not fetch splits for config {config_name}: {e}")
-            # Fallback
-            splits = ['train', 'test', 'validation']
-
-        return {"status": "success", "splits": list(splits) if splits else ['train']}
-
-    # --- HELPER: Recursive JSON/Dot Notation Getter ---
-    def _get_value_by_path(self, obj, path):
-        if not path: return obj
-        keys = path.split('.')
-        current = obj
-
-        for key in keys:
-            # Auto-parse JSON string if encountered
-            if isinstance(current, str):
-                s = current.strip()
-                if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
-                    try:
-                        current = json.loads(s)
-                    except:
-                        pass
-
-            if isinstance(current, dict) and key in current:
-                current = current[key]
             else:
-                return None
-        return current
-
-    # --- HELPER: List Search Logic ---
-    def _extract_from_list_logic(self, row, source_col, filter_key, filter_val, target_path):
-        """
-        Logic: Look inside row[source_col] (which is a list).
-        Find first item where item[filter_key] == filter_val.
-        Then extract item[target_path].
-        """
-        # 1. Get the list (handling JSON string if needed)
-        data = row.get(source_col)
-        if isinstance(data, str):
-            try:
-                data = json.loads(data)
-            except:
-                return None
-
-        if not isinstance(data, list):
-            return None
-
-        # 2. Search the list
-        matched_item = None
-        for item in data:
-            # We treat values as strings for comparison to be safe
-            if str(item.get(filter_key, '')) == str(filter_val):
-                matched_item = item
-                break
-
-        if matched_item:
-            # 3. Extract the target (supporting nested json parsing via dot notation)
-            # e.g. target_path = "content.analysis"
-            return self._get_value_by_path(matched_item, target_path)
-
-        return None
-
-    def _flatten_schema(self, obj, parent='', visited=None):
-        if visited is None: visited = set()
-        items = []
-
-        # Avoid infinite recursion
-        if id(obj) in visited: return []
-        visited.add(id(obj))
-
-        # Handle JSON strings
-        if isinstance(obj, str):
-            s = obj.strip()
-            if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
-                try:
-                    obj = json.loads(s)
-                except:
-                    pass
-
-        if isinstance(obj, dict):
-            for k, v in obj.items():
-                full_key = f"{parent}.{k}" if parent else k
-                items.append((full_key, type(v).__name__))
-                items.extend(self._flatten_schema(v, full_key, visited))
-        elif isinstance(obj, list) and len(obj) > 0:
-            # For lists, we just peek at the first item to guess schema
-            full_key = f"{parent}[]" if parent else "[]"
-            items.append((parent, "List")) # Mark the parent as a List
-            items.extend(self._flatten_schema(obj[0], full_key, visited))
+                splits = ['train', 'test']
+        except:
+            splits = ['train', 'test', 'validation']
 
-        return items
+        return {"status": "success", "splits": splits}
 
     def inspect_dataset(self, dataset_id, config, split):
         try:
@@ -169,7 +76,7 @@
             ds_stream = load_dataset(dataset_id, name=conf, split=split, streaming=True, token=self.token)
 
             sample_rows = []
-            schema_map = {}
+            schema_map = {}
 
             for i, row in enumerate(ds_stream):
                 if i >= 10: break
@@ -177,6 +84,7 @@
                 # Create clean sample for UI
                 clean_row = {}
                 for k, v in row.items():
+                    # Convert objects to strings for display safety
                     if not isinstance(v, (str, int, float, bool, list, dict, type(None))):
                         clean_row[k] = str(v)
                     else:
@@ -188,8 +96,8 @@
                     if k not in schema_map:
                         schema_map[k] = {"is_list": False, "keys": set()}
 
-                    # Check if it's a list (or json-string list)
                     val = v
+                    # Check for JSON string
                     if isinstance(val, str):
                         try:
                             val = json.loads(val)
@@ -202,7 +110,6 @@
                 elif isinstance(val, dict):
                     schema_map[k]["keys"].update(val.keys())
 
-            # Format schema for UI
             formatted_schema = {}
             for k, info in schema_map.items():
                 formatted_schema[k] = {
@@ -219,19 +126,51 @@
         except Exception as e:
             return {"status": "error", "message": str(e)}
 
+    # --- 2. EXTRACTION LOGIC ---
+
+    def _get_value_by_path(self, obj, path):
+        if not path: return obj
+        keys = path.split('.')
+        current = obj
+
+        for key in keys:
+            if isinstance(current, str):
+                s = current.strip()
+                if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
+                    try:
+                        current = json.loads(s)
+                    except: pass
+
+            if isinstance(current, dict) and key in current:
+                current = current[key]
+            else:
+                return None
+        return current
+
+    def _extract_from_list_logic(self, row, source_col, filter_key, filter_val, target_path):
+        data = row.get(source_col)
+        if isinstance(data, str):
+            try:
+                data = json.loads(data)
+            except: return None
+
+        if not isinstance(data, list):
+            return None
+
+        matched_item = None
+        for item in data:
+            if str(item.get(filter_key, '')) == str(filter_val):
+                matched_item = item
+                break
+
+        if matched_item:
+            return self._get_value_by_path(matched_item, target_path)
+        return None
+
     def _apply_projection(self, row, recipe):
         new_row = {}
 
-        #
-        # We assume standard libraries are available.
-        # For safety, we only import them once inside the method scope if needed,
-        # but Python caches imports so doing it here is fine.
-        import re
-        import json
-
-        # 1. Context Creation
-        # We use a shallow copy. If deep nested edits are needed, users should handle that,
-        # but shallow copy prevents modifying the original 'row' variable accidentally in the context.
+        # Setup Context for Python/Eval
         eval_context = row.copy()
         eval_context['row'] = row
         eval_context['json'] = json
@@ -239,69 +178,44 @@
 
         for col_def in recipe['columns']:
             t_type = col_def.get('type', 'simple')
+            target_col = col_def['name']
 
-            try:
-                if t_type == 'simple':
-                    val = self._get_value_by_path(row, col_def['source'])
-                    new_row[col_def['name']] = val
-                elif t_type == 'list_search':
-                    val = self._extract_from_list_logic(
-                        row,
-                        col_def['source'],
-                        col_def['filter_key'],
-                        col_def['filter_val'],
-                        col_def['target_key']
-                    )
-                    new_row[col_def['name']] = val
-                elif t_type == 'python':
-                    val = eval(col_def['expression'], {}, eval_context)
-                    new_row[col_def['name']] = val
-            except Exception as e:
-                # Instead of silently setting None, we raise a custom error
-                # that the generator can catch to abort the job.
-                # We include the column name to help you debug.
-                raise RuntimeError(f"Python Error in column '{col_def['name']}': {str(e)}")
+            try:
+                if t_type == 'simple':
+                    new_row[target_col] = self._get_value_by_path(row, col_def['source'])
+
+                elif t_type == 'list_search':
+                    new_row[target_col] = self._extract_from_list_logic(
+                        row,
+                        col_def['source'],
+                        col_def['filter_key'],
+                        col_def['filter_val'],
+                        col_def['target_key']
+                    )
+
+                elif t_type == 'python':
+                    expression = col_def['expression']
+                    val = eval(expression, {}, eval_context)
+                    new_row[target_col] = val
+
+            except Exception as e:
+                # Fail Fast: Raise error to stop the generator
+                raise ValueError(f"Column '{target_col}' failed: {str(e)}")
 
-    def _passes_filter(self, row, filter_str):
-        if not filter_str or not filter_str.strip():
-            return True
-        try:
-            # Fix context here as well so filters like "len(row['text']) > 5" work
-            context = row.copy()
-            context['row'] = row
-            context['json'] = json
-            import re
-            context['re'] = re
-
-            return eval(filter_str, {}, context)
-        except:
-            return False
         return new_row
 
+    # --- 3. DOCUMENTATION (CARD) ---
 
     def _generate_card(self, source_id, target_id, recipe, license_name):
-        """
-        Generates a README.md with YAML metadata and a report of operations.
-        """
+        logger.info(f"Generating card for {target_id} with license {license_name}")
 
-        # 1. YAML Metadata
         card_data = DatasetCardData(
             language="en",
             license=license_name,
             tags=["dataset-command-center", "etl", "generated-dataset"],
-            base_model=source_id,
+            base_model=source_id,
         )
 
-        # 2. Description & Recipe Table
         content = f"""
 # {target_id.split('/')[-1]}
 
@@ -315,20 +229,15 @@ The following operations were applied to the source data:
 | Target Column | Source | Type | Logic / Filter |
 |---------------|--------|------|----------------|
 """
-
         for col in recipe['columns']:
            c_type = col.get('type', 'simple')
            c_name = col['name']
            c_src = col.get('source', '-')
 
-            if c_type == 'simple':
-                logic = "Direct Mapping"
-            elif c_type == 'list_search':
-                logic = f"Get `{col['target_key']}` where `{col['filter_key']} == {col['filter_val']}`"
-            elif c_type == 'python':
-                logic = f"`{col.get('expression')}`"
-            else:
-                logic = "-"
+            logic = "-"
+            if c_type == 'simple': logic = "Direct Mapping"
+            elif c_type == 'list_search': logic = f"Get `{col['target_key']}` where `{col['filter_key']} == {col['filter_val']}`"
+            elif c_type == 'python': logic = f"`{col.get('expression')}`"
 
            content += f"| **{c_name}** | `{c_src}` | {c_type} | {logic} |\n"
 
@@ -340,100 +249,77 @@ The following operations were applied to the source data:
         card = DatasetCard.from_template(card_data, content=content)
         return card
 
-        '''
-        logger.info(f"Job started: {source_id}")
-        conf = config if config != 'default' else None
-
-        def gen():
-            ds_stream = load_dataset(source_id, name=conf, split=split, streaming=True, token=self.token)
-            count = 0
-            for row in ds_stream:
-                if max_rows and count >= int(max_rows): break
-
-                if self._passes_filter(row, recipe.get('filter_rule')):
-                    yield self._apply_projection(row, recipe)
-                    count += 1
-
-        try:
-            new_dataset = datasets.Dataset.from_generator(gen)
-            new_dataset.push_to_hub(target_id, token=self.token)
-            # 2. GENERATE & PUSH CARD
-            try:
-                card = self._generate_card(source_id, target_id, recipe, new_license or "unknown")
-                card.push_to_hub(target_id, token=self.token)
-            except Exception as e:
-                logger.warning(f"Could not push dataset card: {e}")
-
-            return {"status": "success", "rows_processed": len(new_dataset)}
-        except Exception as e:
-            return {"status": "error", "message": str(e)}'''
+    # --- 4. EXECUTION ---
+
     def process_and_push(self, source_id, config, split, target_id, recipe, max_rows=None, new_license=None):
         logger.info(f"Job started: {source_id} -> {target_id}")
         conf = config if config != 'default' else None
 
-        # We need a way to bubble exceptions out of the generator
-        # to the main thread.
-
         def gen():
             ds_stream = load_dataset(source_id, name=conf, split=split, streaming=True, token=self.token)
             count = 0
-
             for i, row in enumerate(ds_stream):
-                if max_rows and count >= int(max_rows):
-                    break
+                if max_rows and count >= int(max_rows): break
 
-                #
+                # Filter
                 if recipe.get('filter_rule'):
                     try:
-                        # Re-create context for filter
-                        import re, json
                         ctx = row.copy()
                         ctx['row'] = row
-                        ctx['json'] = json
-                        ctx['re'] = re
                         if not eval(recipe['filter_rule'], {}, ctx):
-                            continue
+                            continue
                     except Exception as e:
-                        raise
+                        raise ValueError(f"Filter crashed on row {i}: {e}")
 
-                #
+                # Projection
                 try:
-                    projected_row = self._apply_projection(row, recipe)
-                    yield projected_row
+                    yield self._apply_projection(row, recipe)
                     count += 1
-                except RuntimeError as re_err:
-                    # We stop the generator immediately.
-                    raise re_err
+                except ValueError as ve:
+                    raise ve
                 except Exception as e:
-                    raise
+                    raise ValueError(f"Crash on row {i}: {e}")
 
         try:
-            #
-            # from_generator stops and re-raises it.
+            # 1. Push Data
             new_dataset = datasets.Dataset.from_generator(gen)
-
             new_dataset.push_to_hub(target_id, token=self.token)
 
-            #
+            # 2. Push Card
             try:
                 card = self._generate_card(source_id, target_id, recipe, new_license or "unknown")
                 card.push_to_hub(target_id, token=self.token)
-            except Exception as e:
-                logger.warning(f"Could not push dataset card: {e}")
+            except Exception as e:
+                logger.error(f"Failed to push Dataset Card: {e}")
+                # We do NOT fail the whole job, but we log it.
 
             return {"status": "success", "rows_processed": len(new_dataset)}
 
         except Exception as e:
-            # Return the specific error message to the UI
             logger.error(f"Job Failed: {e}")
             return {"status": "failed", "error": str(e)}
-
+
+    # --- 5. PREVIEW ---
     def preview_transform(self, dataset_id, config, split, recipe):
         conf = config if config != 'default' else None
         ds_stream = load_dataset(dataset_id, name=conf, split=split, streaming=True, token=self.token)
         processed = []
         for row in ds_stream:
             if len(processed) >= 5: break
-
-            processed.append(self._apply_projection(row, recipe))
+
+            # Filter
+            passed = True
+            if recipe.get('filter_rule'):
+                try:
+                    ctx = row.copy()
+                    ctx['row'] = row
+                    if not eval(recipe['filter_rule'], {}, ctx): passed = False
+                except: passed = False
+
+            if passed:
+                try:
+                    processed.append(self._apply_projection(row, recipe))
+                except Exception as e:
+                    processed.append({"error": str(e)}) # Show error in preview
+
         return processed
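Continuing the sketch: the push path is now fail-fast, so a filter or projection crash inside the generator aborts Dataset.from_generator and surfaces as a failed status instead of silent None values. The target repo id below is invented.

# Hypothetical driver for the fail-fast contract (repo ids made up).
result = center.process_and_push(
    source_id="some-org/some-dataset",
    config="default",
    split="train",
    target_id="my-user/my-transformed-dataset",
    recipe=recipe,
    max_rows=100,
    new_license="mit",
)

if result["status"] == "success":
    print(f"Pushed {result['rows_processed']} rows")
else:
    # Filter/projection errors arrive here as ValueError text, e.g.
    # "Filter crashed on row 3: ..." or "Column 'summary' failed: ..."
    print(f"Job failed: {result['error']}")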
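And a worked example of the two extraction helpers this commit relocates: _get_value_by_path auto-parses JSON strings at each step of the dot path, and _extract_from_list_logic compares filter values as strings before delegating to it. The row content is invented.

# Hypothetical row; note the list arrives as a JSON string.
row = {
    "annotations": '[{"kind": "summary", "content": {"analysis": "ok"}},'
                   ' {"kind": "label", "content": {"analysis": "noise"}}]'
}

# The string is json.loads-ed, items are matched with
# str(item.get('kind', '')) == str('summary'), and 'content.analysis'
# is then resolved by dot notation on the matched item.
value = center._extract_from_list_logic(
    row, "annotations", "kind", "summary", "content.analysis"
)
print(value)  # -> "ok"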