Update processor.py
Diff of processor.py (+46 lines added, −24 lines removed)
processor.py
CHANGED
|
@@ -179,35 +179,35 @@ class DatasetCommandCenter:
|
|
| 179 |
# 2. CORE EXTRACTION LOGIC
|
| 180 |
# ==========================================
|
| 181 |
|
| 182 |
-
|
| 183 |
"""
|
| 184 |
-
Retrieves
|
| 185 |
"""
|
| 186 |
if not path: return obj
|
| 187 |
|
| 188 |
-
# 1. Try Direct Access (
|
| 189 |
try:
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
|
|
|
|
|
|
| 193 |
|
| 194 |
-
# 2. Try Dot Notation
|
| 195 |
keys = path.split('.')
|
| 196 |
current = obj
|
| 197 |
|
| 198 |
for i, key in enumerate(keys):
|
| 199 |
try:
|
| 200 |
-
#
|
| 201 |
-
if isinstance(current,
|
| 202 |
-
current = current
|
| 203 |
else:
|
| 204 |
-
|
| 205 |
except:
|
| 206 |
return None
|
| 207 |
|
| 208 |
-
if
|
| 209 |
-
|
| 210 |
-
# Lazy Parsing: Only parse string if we need to go deeper
|
| 211 |
is_last_key = (i == len(keys) - 1)
|
| 212 |
if not is_last_key and isinstance(current, str):
|
| 213 |
s = current.strip()
|
|
@@ -219,6 +219,8 @@ class DatasetCommandCenter:
|
|
| 219 |
|
| 220 |
return current
|
| 221 |
|
|
|
|
|
|
|
| 222 |
def _extract_from_list_logic(self, row, source_col, filter_key, filter_val, target_path):
|
| 223 |
"""
|
| 224 |
FROM source_col FIND ITEM WHERE filter_key == filter_val EXTRACT target_path
|
|
@@ -249,11 +251,9 @@ class DatasetCommandCenter:
|
|
| 249 |
def _apply_projection(self, row, recipe):
|
| 250 |
new_row = {}
|
| 251 |
|
| 252 |
-
#
|
| 253 |
-
|
| 254 |
-
eval_context
|
| 255 |
-
eval_context['json'] = json
|
| 256 |
-
eval_context['re'] = re
|
| 257 |
|
| 258 |
for col_def in recipe['columns']:
|
| 259 |
t_type = col_def.get('type', 'simple')
|
|
@@ -261,9 +261,11 @@ class DatasetCommandCenter:
|
|
| 261 |
|
| 262 |
try:
|
| 263 |
if t_type == 'simple':
|
|
|
|
| 264 |
new_row[target_col] = self._get_value_by_path(row, col_def['source'])
|
| 265 |
|
| 266 |
elif t_type == 'list_search':
|
|
|
|
| 267 |
new_row[target_col] = self._extract_from_list_logic(
|
| 268 |
row,
|
| 269 |
col_def['source'],
|
|
@@ -273,6 +275,13 @@ class DatasetCommandCenter:
|
|
| 273 |
)
|
| 274 |
|
| 275 |
elif t_type == 'python':
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
val = eval(col_def['expression'], {}, eval_context)
|
| 277 |
new_row[target_col] = val
|
| 278 |
|
|
@@ -394,19 +403,24 @@ The following operations were applied to the source data:
|
|
| 394 |
conf = config if config != 'default' else None
|
| 395 |
|
| 396 |
try:
|
|
|
|
| 397 |
ds_stream = load_dataset(dataset_id, name=conf, split=split, streaming=True, token=self.token)
|
| 398 |
processed = []
|
| 399 |
|
| 400 |
for i, row in enumerate(ds_stream):
|
| 401 |
-
|
|
|
|
|
|
|
| 402 |
|
| 403 |
-
# CRITICAL
|
|
|
|
| 404 |
row = dict(row)
|
| 405 |
|
| 406 |
-
#
|
| 407 |
passed = True
|
| 408 |
if recipe.get('filter_rule'):
|
| 409 |
try:
|
|
|
|
| 410 |
ctx = row.copy()
|
| 411 |
ctx['row'] = row
|
| 412 |
ctx['json'] = json
|
|
@@ -414,17 +428,25 @@ The following operations were applied to the source data:
|
|
| 414 |
if not eval(recipe['filter_rule'], {}, ctx):
|
| 415 |
passed = False
|
| 416 |
except:
|
|
|
|
| 417 |
passed = False
|
| 418 |
|
| 419 |
if passed:
|
| 420 |
try:
|
|
|
|
| 421 |
new_row = self._apply_projection(row, recipe)
|
| 422 |
-
|
|
|
|
|
|
|
| 423 |
clean_new_row = self._sanitize_for_json(new_row)
|
|
|
|
| 424 |
processed.append(clean_new_row)
|
| 425 |
except Exception as e:
|
| 426 |
-
|
|
|
|
| 427 |
|
| 428 |
return processed
|
|
|
|
| 429 |
except Exception as e:
|
|
|
|
| 430 |
raise e
|
|
|
|
| 179 |
# 2. CORE EXTRACTION LOGIC
|
| 180 |
# ==========================================
|
| 181 |
|
| 182 |
+
def _get_value_by_path(self, obj, path):
|
| 183 |
"""
|
| 184 |
+
Retrieves value. PRIORITY: Direct Key Access (Fastest).
|
| 185 |
"""
|
| 186 |
if not path: return obj
|
| 187 |
|
| 188 |
+
# 1. Try Direct Access (Fastest, handles 99% of cases)
|
| 189 |
try:
|
| 190 |
+
# We treat obj as a dict-like object (works for dict, UserDict, LazyRow)
|
| 191 |
+
# We DO NOT check isinstance(dict) to allow duck-typing
|
| 192 |
+
return obj[path]
|
| 193 |
+
except:
|
| 194 |
+
pass
|
| 195 |
|
| 196 |
+
# 2. Try Dot Notation (Only if direct access failed)
|
| 197 |
keys = path.split('.')
|
| 198 |
current = obj
|
| 199 |
|
| 200 |
for i, key in enumerate(keys):
|
| 201 |
try:
|
| 202 |
+
# Array/List index access support (e.g. solutions.0.code)
|
| 203 |
+
if isinstance(current, list) and key.isdigit():
|
| 204 |
+
current = current[int(key)]
|
| 205 |
else:
|
| 206 |
+
current = current[key]
|
| 207 |
except:
|
| 208 |
return None
|
| 209 |
|
| 210 |
+
# Lazy Parsing: Only parse string if we strictly need to go deeper
|
|
|
|
|
|
|
| 211 |
is_last_key = (i == len(keys) - 1)
|
| 212 |
if not is_last_key and isinstance(current, str):
|
| 213 |
s = current.strip()
|
|
|
|
| 219 |
|
| 220 |
return current
|
| 221 |
|
| 222 |
+
|
| 223 |
+
|
| 224 |
def _extract_from_list_logic(self, row, source_col, filter_key, filter_val, target_path):
|
| 225 |
"""
|
| 226 |
FROM source_col FIND ITEM WHERE filter_key == filter_val EXTRACT target_path
|
|
|
|
| 251 |
def _apply_projection(self, row, recipe):
|
| 252 |
new_row = {}
|
| 253 |
|
| 254 |
+
# OPTIMIZATION: Only create eval_context if we actually have a Python column.
|
| 255 |
+
# This prevents expensive row.copy() calls for Simple Path operations.
|
| 256 |
+
eval_context = None
|
|
|
|
|
|
|
| 257 |
|
| 258 |
for col_def in recipe['columns']:
|
| 259 |
t_type = col_def.get('type', 'simple')
|
|
|
|
| 261 |
|
| 262 |
try:
|
| 263 |
if t_type == 'simple':
|
| 264 |
+
# Fast path - no context needed
|
| 265 |
new_row[target_col] = self._get_value_by_path(row, col_def['source'])
|
| 266 |
|
| 267 |
elif t_type == 'list_search':
|
| 268 |
+
# Fast path - no context needed
|
| 269 |
new_row[target_col] = self._extract_from_list_logic(
|
| 270 |
row,
|
| 271 |
col_def['source'],
|
|
|
|
| 275 |
)
|
| 276 |
|
| 277 |
elif t_type == 'python':
|
| 278 |
+
# Lazy Context Creation: Only pay the cost if used
|
| 279 |
+
if eval_context is None:
|
| 280 |
+
eval_context = row.copy()
|
| 281 |
+
eval_context['row'] = row
|
| 282 |
+
eval_context['json'] = json
|
| 283 |
+
eval_context['re'] = re
|
| 284 |
+
|
| 285 |
val = eval(col_def['expression'], {}, eval_context)
|
| 286 |
new_row[target_col] = val
|
| 287 |
|
|
|
|
| 403 |
conf = config if config != 'default' else None
|
| 404 |
|
| 405 |
try:
|
| 406 |
+
# Load dataset in streaming mode
|
| 407 |
ds_stream = load_dataset(dataset_id, name=conf, split=split, streaming=True, token=self.token)
|
| 408 |
processed = []
|
| 409 |
|
| 410 |
for i, row in enumerate(ds_stream):
|
| 411 |
+
# Stop after 5 successful rows
|
| 412 |
+
if len(processed) >= 5:
|
| 413 |
+
break
|
| 414 |
|
| 415 |
+
# CRITICAL: Force materialization from LazyRow to standard Dict.
|
| 416 |
+
# This fixes the interaction between Streaming datasets and JSON serialization.
|
| 417 |
row = dict(row)
|
| 418 |
|
| 419 |
+
# --- Filter Logic ---
|
| 420 |
passed = True
|
| 421 |
if recipe.get('filter_rule'):
|
| 422 |
try:
|
| 423 |
+
# Create context only for the filter check
|
| 424 |
ctx = row.copy()
|
| 425 |
ctx['row'] = row
|
| 426 |
ctx['json'] = json
|
|
|
|
| 428 |
if not eval(recipe['filter_rule'], {}, ctx):
|
| 429 |
passed = False
|
| 430 |
except:
|
| 431 |
+
# If filter errors out (e.g. missing column), treat as filtered out
|
| 432 |
passed = False
|
| 433 |
|
| 434 |
if passed:
|
| 435 |
try:
|
| 436 |
+
# --- Projection Logic ---
|
| 437 |
new_row = self._apply_projection(row, recipe)
|
| 438 |
+
|
| 439 |
+
# --- Sanitization ---
|
| 440 |
+
# Convert NaNs, Infinity, and complex objects to prevent browser/Flask crash
|
| 441 |
clean_new_row = self._sanitize_for_json(new_row)
|
| 442 |
+
|
| 443 |
processed.append(clean_new_row)
|
| 444 |
except Exception as e:
|
| 445 |
+
# Capture specific row errors for the UI
|
| 446 |
+
processed.append({"_preview_error": f"Row {i} Error: {str(e)}"})
|
| 447 |
|
| 448 |
return processed
|
| 449 |
+
|
| 450 |
except Exception as e:
|
| 451 |
+
# Raise global errors (like 404 Dataset Not Found) so the UI sees them
|
| 452 |
raise e
|