Spaces:

broadfield-dev
/

HF-Dataset-Commander

Paused

App Files Files Community

broadfield-dev committed on Dec 30, 2025

Commit

6baf7e5

verified ·

1 Parent(s): 917097f

Update processor.py

Browse files

Files changed (1) hide show

processor.py +81 -13

processor.py CHANGED Viewed

@@ -222,23 +222,28 @@ class DatasetCommandCenter:
     def _apply_projection(self, row, recipe):
         new_row = {}
-        # 1. Prepare Execution Context for Python Mode
-        # This allows users to write: row['text'] OR just text
         eval_context = row.copy()
-        eval_context['row'] = row  # <--- CRITICAL FIX: explicit 'row' variable
         eval_context['json'] = json
-        import re
-        eval_context['re'] = re  # Allow regex usage
         for col_def in recipe['columns']:
             t_type = col_def.get('type', 'simple')
             if t_type == 'simple':
-                # Standard Dot Notation
                 new_row[col_def['name']] = self._get_value_by_path(row, col_def['source'])
             elif t_type == 'list_search':
-                # GET x WHERE y=z
                 val = self._extract_from_list_logic(
                     row,
                     col_def['source'],
@@ -249,17 +254,21 @@ class DatasetCommandCenter:
                 new_row[col_def['name']] = val
             elif t_type == 'python':
-                # Advanced Python Eval
                 try:
-                    # Logic: eval("row['text'].upper()", globals, locals)
                     val = eval(col_def['expression'], {}, eval_context)
                     new_row[col_def['name']] = val
                 except Exception as e:
-                    # If it fails (e.g. key error on specific row), return None
-                    new_row[col_def['name']] = None
         return new_row
     def _passes_filter(self, row, filter_str):
         if not filter_str or not filter_str.strip():
             return True
@@ -331,7 +340,7 @@ The following operations were applied to the source data:
         card = DatasetCard.from_template(card_data, content=content)
         return card
-    def process_and_push(self, source_id, config, split, target_id, recipe, max_rows=None):
         logger.info(f"Job started: {source_id}")
         conf = config if config != 'default' else None
@@ -357,8 +366,67 @@ The following operations were applied to the source data:
             return {"status": "success", "rows_processed": len(new_dataset)}
         except Exception as e:
-            return {"status": "error", "message": str(e)}
     def preview_transform(self, dataset_id, config, split, recipe):
         conf = config if config != 'default' else None

     def _apply_projection(self, row, recipe):
         new_row = {}
+        # OPTIMIZATION: Prepare context once per row, but imports are global/cached
+        # We assume standard libraries are available.
+        # For safety, we only import them once inside the method scope if needed,
+        # but Python caches imports so doing it here is fine.
+        import re
+        import json
+        # 1. Context Creation
+        # We use a shallow copy. If deep nested edits are needed, users should handle that,
+        # but shallow copy prevents modifying the original 'row' variable accidentally in the context.
         eval_context = row.copy()
+        eval_context['row'] = row
         eval_context['json'] = json
+        eval_context['re'] = re
         for col_def in recipe['columns']:
             t_type = col_def.get('type', 'simple')
             if t_type == 'simple':
                 new_row[col_def['name']] = self._get_value_by_path(row, col_def['source'])
             elif t_type == 'list_search':
                 val = self._extract_from_list_logic(
                     row,
                     col_def['source'],
                 new_row[col_def['name']] = val
             elif t_type == 'python':
+                # --- CRITICAL FIX: Error Visibility ---
                 try:
+                    # Execute the user's python string
                     val = eval(col_def['expression'], {}, eval_context)
                     new_row[col_def['name']] = val
                 except Exception as e:
+                    # Instead of silently setting None, we raise a custom error
+                    # that the generator can catch to abort the job.
+                    # We include the column name to help you debug.
+                    raise RuntimeError(f"Python Error in column '{col_def['name']}': {str(e)}")
         return new_row
     def _passes_filter(self, row, filter_str):
         if not filter_str or not filter_str.strip():
             return True
         card = DatasetCard.from_template(card_data, content=content)
         return card
+    '''def process_and_push(self, source_id, config, split, target_id, recipe, max_rows=None):
         logger.info(f"Job started: {source_id}")
         conf = config if config != 'default' else None
             return {"status": "success", "rows_processed": len(new_dataset)}
         except Exception as e:
+            return {"status": "error", "message": str(e)}'''
+    def process_and_push(self, source_id, config, split, target_id, recipe, max_rows=None, new_license=None):
+        logger.info(f"Job started: {source_id} -> {target_id}")
+        conf = config if config != 'default' else None
+        # We need a way to bubble exceptions out of the generator
+        # to the main thread.
+        def gen():
+            ds_stream = load_dataset(source_id, name=conf, split=split, streaming=True, token=self.token)
+            count = 0
+            for i, row in enumerate(ds_stream):
+                if max_rows and count >= int(max_rows):
+                    break
+                # 1. Apply Filter (Safe Guarded)
+                if recipe.get('filter_rule'):
+                    try:
+                        # Re-create context for filter
+                        import re, json
+                        ctx = row.copy()
+                        ctx['row'] = row
+                        ctx['json'] = json
+                        ctx['re'] = re
+                        if not eval(recipe['filter_rule'], {}, ctx):
+                            continue # Skip this row naturally
+                    except Exception as e:
+                        raise RuntimeError(f"Error in Row Filter: {str(e)}")
+                # 2. Apply Projection (The heavy lifter)
+                try:
+                    projected_row = self._apply_projection(row, recipe)
+                    yield projected_row
+                    count += 1
+                except RuntimeError as re_err:
+                    # This is our custom error from above.
+                    # We stop the generator immediately.
+                    raise re_err
+                except Exception as e:
+                    raise RuntimeError(f"Unexpected error processing row {i}: {str(e)}")
+        try:
+            # from_generator will iterate `gen()`. If gen() raises an error,
+            # from_generator stops and re-raises it.
+            new_dataset = datasets.Dataset.from_generator(gen)
+            new_dataset.push_to_hub(target_id, token=self.token)
+            # Generate Card
+            try:
+                card = self._generate_card(source_id, target_id, recipe, new_license or "unknown")
+                card.push_to_hub(target_id, token=self.token)
+            except: pass
+            return {"status": "success", "rows_processed": len(new_dataset)}
+        except Exception as e:
+            # Return the specific error message to the UI
+            logger.error(f"Job Failed: {e}")
+            return {"status": "failed", "error": str(e)}
     def preview_transform(self, dataset_id, config, split, recipe):
         conf = config if config != 'default' else None